In [1]:
'''
This notebook is a submission to the zindi crop detection challenge. The aim of the challenge is to predict the
type of disease affecting a crop using YOLO (You Only Look Once) model.
'''

'\nThis notebook is a submission to the zindi crop detection challenge. The aim of the challenge is to predict the\ntype of disease affecting a crop using YOLO (You Only Look Once) model.\n'

In [2]:
#import required libraries
import requests
from PIL import Image
from zipfile import ZipFile
import os
import shutil
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#install YOLOv5
!git clone https://github.com/ultralytics/yolov5
%cd yolov5
%pip install -qr requirements.txt

# Add the yolov5 directory to the Python path
import sys
sys.path.append('/content/yolov5')


#import YOLO and Torch
import torch
from yolov5.models.yolo import Model as YOLOv5 # Import YOLOv5 from the correct location

#check if GPU is available
print(f'GPU Available: {torch.cuda.is_available()}')

Cloning into 'yolov5'...
remote: Enumerating objects: 16995, done.[K
remote: Counting objects: 100% (190/190), done.[K
remote: Compressing objects: 100% (130/130), done.[K
remote: Total 16995 (delta 101), reused 119 (delta 60), pack-reused 16805 (from 1)[K
Receiving objects: 100% (16995/16995), 15.72 MiB | 22.67 MiB/s, done.
Resolving deltas: 100% (11639/11639), done.
/content/yolov5
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m870.5/870.5 kB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCreating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help

In [3]:
'''
We have added YOLO and checked our GPU availability. Next we will see how we can read the images from the drive,
Convert the annotated data into YOLO format, train and make predictions.
'''

'\nWe have added YOLO and checked our GPU availability. Next we will see how we can read the images from the drive,\nConvert the annotated data into YOLO format, train and make predictions.\n'

#Set Kaggle API & Download Dataset

In [5]:
from google.colab import files
files.upload()  # This will prompt you to upload the kaggle.json file

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"eleosandyadenutsi","key":"fc13fc93ddfea2201bcf5dcfe5ae1a6a"}'}

In [6]:
# Create a Kaggle directory
os.makedirs('/root/.kaggle', exist_ok=True)

# Move the kaggle.json file to the appropriate location
!cp kaggle.json /root/.kaggle/

# Set permissions for the kaggle.json file
!chmod 600 /root/.kaggle/kaggle.json


In [7]:
# Download the dataset
!kaggle datasets download -d ohagwucollinspatrick/ghana-crop-disease

Dataset URL: https://www.kaggle.com/datasets/ohagwucollinspatrick/ghana-crop-disease
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading ghana-crop-disease.zip to /content/yolov5
100% 10.4G/10.4G [04:57<00:00, 42.9MB/s]
100% 10.4G/10.4G [04:57<00:00, 37.4MB/s]


In [8]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Upload the dataset
!mv ghana-crop-disease.zip /content/drive/MyDrive/

In [None]:
'''
The dataset has been downloaded and uploaded to google drive. Run the above cells to get the dataset.
Run the cells below to continue the project.
'''

#Data Preprocessing and Label Annotation

In [10]:
# Load the annotation csv
train = pd.read_csv('/content/drive/MyDrive/zindi_train.csv')

# Check the data
train.head()

Unnamed: 0,Image_ID,confidence,class,ymin,xmin,ymax,xmax
0,id_11543h.jpg,1.0,Pepper_Bacterial_Spot,194.649671,328.803454,208.10773,341.967928
1,id_11543h.jpg,1.0,Pepper_Bacterial_Spot,149.632401,256.768914,162.910362,266.195724
2,id_11543h.jpg,1.0,Pepper_Bacterial_Spot,234.046875,327.138158,252.712993,338.876645
3,id_11543h.jpg,1.0,Pepper_Bacterial_Spot,221.277138,340.411184,238.59375,354.651316
4,id_11ee1c.jpg,1.0,Pepper_Fusarium,2000.563598,989.588908,2184.252196,1401.748952


In [15]:
# Check the number of unique classes
train['class'].nunique()

23

In [16]:
'''
There are 23 different annotated classes indicating there are 23 different diseases identified from the images.
The ymin, ymax, xmin, xmax are the positions of the various bouding boxes around the disease.
We will need to convert the train set to YOLO format. The following preprocessing steps will take place:
1. Perform data validation to ensure features are of the correct data type.
2. We will convert the values under the class column to numeric
3. We will scale values of ymin, ymax, xmin, xmax
'''

'\nThere are 23 different annotated classes indicating there are 23 different diseases identified from the images.\nThe ymin, ymax, xmin, xmax are the positions of the various bouding boxes around the disease.\nWe will need to convert the train set to YOLO format. The following preprocessing steps will take place:\n1. Perform data validation to ensure features are of the correct data type.\n2. We will convert the values under the class column to numeric\n3. We will scale values of ymin, ymax, xmin, xmax\n'

In [17]:
# Check the data
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41029 entries, 0 to 41028
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Image_ID    41029 non-null  object 
 1   confidence  41029 non-null  float64
 2   class       41029 non-null  object 
 3   ymin        41029 non-null  float64
 4   xmin        41029 non-null  float64
 5   ymax        41029 non-null  float64
 6   xmax        41029 non-null  float64
dtypes: float64(5), object(2)
memory usage: 2.2+ MB


In [18]:
'''
The data is very clean and there are no missing values and the data types are correct.
'''

'\nThe data is very clean and there are no missing values and the data types are correct.\n'

In [19]:
# Convert class to numeric
le = LabelEncoder()
train['class'] = le.fit_transform(train['class'])

# Check the class column
train['class'].unique()

array([ 5,  8,  0,  1, 16, 14, 22, 12, 13,  4,  2,  9, 18, 10, 19,  6, 17, 11, 20, 15, 21,  7,  3])

In [None]:
# Define the Image dimensions
image_width = 640
image_height = 640

# Directory to save YOLO annotations
output_dir = '/content/dataset/labels'
os.makedirs(output_dir, exist_ok=True)

# Convert the data into YOLO format
for index, row in train.iterrows():
  image_name = row['image_id'].replace('.jpg', '')
  class_id = row['class']
  xmin, ymin, xmax, ymax = row['xmin'], row['ymin'], row['xmax'], row['ymax']

  # Calculate the center coordinates and dimensions
  x_center = ((xmax + xmin) / 2) / image_width
  y_center = ((ymax + ymin) / 2) / image_height
  width = (xmax - xmin) / image_width
  height = (ymax - ymin) / image_height