# Exploratory Analysis

In [7]:
import pandas as pd
import json

In [8]:
# Load the training annotations: one row per image with its file name and numeric class label.
train = pd.read_csv('data/train.csv')
# Preview the first rows to confirm the columns were parsed as expected.
train.head()

Unnamed: 0,image_id,label
0,1000015157.jpg,0
1,1000201771.jpg,3
2,100042118.jpg,1
3,1000723321.jpg,1
4,1000812911.jpg,3


In [9]:
# Inspect column dtypes and non-null counts to spot missing values early.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21397 entries, 0 to 21396
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   image_id  21397 non-null  object
 1   label     21397 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 334.5+ KB


## Enhance training data with human-readable labels

In [15]:
# Load the lookup table that translates numeric class ids into disease names.
# The encoding is given explicitly because JSON files are UTF-8, whereas open()
# would otherwise fall back to the platform's locale encoding.
with open('data/label_num_to_disease_map.json', encoding='utf-8') as f:
    label_num_to_disease_map = json.load(f)
label_num_to_disease_map

{'0': 'Cassava Bacterial Blight (CBB)',
 '1': 'Cassava Brown Streak Disease (CBSD)',
 '2': 'Cassava Green Mottle (CGM)',
 '3': 'Cassava Mosaic Disease (CMD)',
 '4': 'Healthy'}

In [16]:
# JSON object keys are always strings, so the freshly loaded mapping must be
# indexed with '0' rather than the integer 0.
label_num_to_disease_map['0']

'Cassava Bacterial Blight (CBB)'

In [21]:
# The JSON keys are strings while the `label` column holds integers; rebuild the
# lookup with integer keys so the two can be matched directly.
label_num_to_disease_map_int = {
    int(label_id): disease_name
    for label_id, disease_name in label_num_to_disease_map.items()
}
label_num_to_disease_map_int

{0: 'Cassava Bacterial Blight (CBB)',
 1: 'Cassava Brown Streak Disease (CBSD)',
 2: 'Cassava Green Mottle (CGM)',
 3: 'Cassava Mosaic Disease (CMD)',
 4: 'Healthy'}

In [22]:
# Translate numeric class ids into disease names with a vectorized dict lookup.
label_description = train['label'].map(label_num_to_disease_map_int)
# Series.map yields NaN for ids missing from the mapping; fail loudly here instead
# of silently carrying unlabeled rows into the rest of the analysis.
assert label_description.notna().all(), 'some labels have no entry in label_num_to_disease_map_int'
train['label_description'] = label_description

## Label distribution

In [24]:
# Count the images per class; grouping by both the id and its description keeps
# the readable disease name next to each numeric label in the result.
(
    train
    .groupby(['label', 'label_description'])
    .count()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,image_id
label,label_description,Unnamed: 2_level_1
0,Cassava Bacterial Blight (CBB),1087
1,Cassava Brown Streak Disease (CBSD),2189
2,Cassava Green Mottle (CGM),2386
3,Cassava Mosaic Disease (CMD),13158
4,Healthy,2577


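The training set is highly imbalanced: Cassava Mosaic Disease (CMD) accounts for 13,158 of the 21,397 images (about 61%), while every other class has fewer than 2,600 images.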

# Looking at images

## Are all images in the same format?

In [27]:
from pathlib import Path
from PIL import Image

def get_image_info(path, image_dir='data/train_images/'):
    """Return basic metadata for one image file.

    Args:
        path: Image file name, relative to ``image_dir``.
        image_dir: Directory containing the images. Defaults to the training
            image folder so existing single-argument calls keep working.

    Returns:
        dict with the keys ``format``, ``width``, ``height`` and ``mode`` as
        reported by PIL for the opened image.
    """
    # Joining via pathlib works whether or not image_dir ends with a separator.
    with Image.open(Path(image_dir) / path) as im:
        width, height = im.size
        return {'format': im.format, 'width': width, 'height': height, 'mode': im.mode}

In [28]:
# Collect the metadata dict of every training image; this opens each file listed
# in `train`, so the cell may take a while on the full dataset.
image_info = train['image_id'].apply(get_image_info)

In [38]:
# Expand the per-image metadata dicts into columns and summarize each column;
# include='all' also covers the non-numeric ones (format, mode).
image_info_summary = pd.DataFrame.from_records(image_info).describe(include='all')
image_info_summary

Unnamed: 0,format,width,height,mode
count,21397,21397.0,21397.0,21397
unique,1,,,1
top,JPEG,,,RGB
freq,21397,,,21397
mean,,800.0,600.0,
std,,0.0,0.0,
min,,800.0,600.0,
25%,,800.0,600.0,
50%,,800.0,600.0,
75%,,800.0,600.0,


All 21,397 images share the same format: 800x600 JPEG files in RGB mode.