## Import Packages ##


In [3]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import shutil
import numpy as np
import os

## Explore Data ##

In [4]:
data = json.load(open("instances_default.json"))

In [5]:
data.keys()

dict_keys(['licenses', 'info', 'annotations', 'images', 'categories'])

In [None]:
data

In [20]:
data['info']

{'year': '2020',
 'url': 'https://github.com/softwaremill/lemon-dataset',
 'version': '1',
 'date_created': '28.07.2020',
 'contributor': 'Maciej Adamiak',
 'institution': 'SoftwareMill',
 'description': 'Lemons quality control dataset'}

## Total number of Lemon Image ##


In [8]:
len(data['images'])

2690

In [9]:
annotations = data['annotations']

## Merge Category with Image ##
Found multiple categories PER image


In [10]:
data['categories']

[{'id': 1, 'supercategory': '', 'name': 'image_quality'},
 {'id': 2, 'supercategory': '', 'name': 'illness'},
 {'id': 3, 'supercategory': '', 'name': 'gangrene'},
 {'id': 4, 'supercategory': '', 'name': 'mould'},
 {'id': 5, 'supercategory': '', 'name': 'blemish'},
 {'id': 6, 'supercategory': '', 'name': 'dark_style_remains'},
 {'id': 7, 'supercategory': '', 'name': 'artifact'},
 {'id': 8, 'supercategory': '', 'name': 'condition'},
 {'id': 9, 'supercategory': '', 'name': 'pedicel'}]

In [11]:
rows = [[dicts['id'],dicts['area'], dicts['image_id'],dicts['category_id']] for dicts in annotations]

In [12]:
df = pd.DataFrame(rows, columns=["Annot_id","area","image_id","category"])
df

Unnamed: 0,Annot_id,area,image_id,category
0,1,539.0,0,9
1,2,622.0,0,5
2,3,809.0,0,5
3,4,30.0,100,5
4,5,31.0,100,2
...,...,...,...,...
33748,33749,1517.0,6502,5
33749,33750,75.0,6502,5
33750,33751,760.0,6502,5
33751,33752,84.0,6502,5


In [None]:
images = data['images']
images

In [14]:
image_rows = [[image['id'],image['file_name']] for image in images]

In [15]:
image_df = pd.DataFrame(image_rows, columns = ['image_id','file_name'])
image_df

Unnamed: 0,image_id,file_name
0,0,images/0001_A_H_0_A.jpg
1,100,images/0003_A_V_150_A.jpg
2,101,images/0003_A_V_15_A.jpg
3,102,images/0003_A_V_165_A.jpg
4,103,images/0003_A_V_30_A.jpg
...,...,...
2685,6098,images/0023_G_H_135_F.jpg
2686,6099,images/0023_G_H_150_F.jpg
2687,6500,images/0037_G_I_0_A.jpg
2688,6501,images/0037_G_I_105_A.jpg


In [16]:
merged_df = df.merge(image_df)
merged_df

Unnamed: 0,Annot_id,area,image_id,category,file_name
0,1,539.0,0,9,images/0001_A_H_0_A.jpg
1,2,622.0,0,5,images/0001_A_H_0_A.jpg
2,3,809.0,0,5,images/0001_A_H_0_A.jpg
3,4,30.0,100,5,images/0003_A_V_150_A.jpg
4,5,31.0,100,2,images/0003_A_V_150_A.jpg
...,...,...,...,...,...
33748,33749,1517.0,6502,5,images/0037_G_I_120_A.jpg
33749,33750,75.0,6502,5,images/0037_G_I_120_A.jpg
33750,33751,760.0,6502,5,images/0037_G_I_120_A.jpg
33751,33752,84.0,6502,5,images/0037_G_I_120_A.jpg


In [20]:
files=merged_df['file_name'].unique().tolist()
len(files)

2690

In [18]:
merged_df[merged_df['file_name']=="images/0001_E_H_0_C.jpg"]

Unnamed: 0,Annot_id,area,image_id,category,file_name
23781,23782,20.0,3407,2,images/0001_E_H_0_C.jpg
23782,23783,7.0,3407,2,images/0001_E_H_0_C.jpg
23783,23784,27.0,3407,5,images/0001_E_H_0_C.jpg
23784,23785,40.0,3407,2,images/0001_E_H_0_C.jpg
23785,23786,520.0,3407,2,images/0001_E_H_0_C.jpg
23786,23787,37.0,3407,2,images/0001_E_H_0_C.jpg
23787,23788,17.0,3407,2,images/0001_E_H_0_C.jpg
23788,23789,18.0,3407,2,images/0001_E_H_0_C.jpg
23789,23790,117.0,3407,5,images/0001_E_H_0_C.jpg
23790,23791,67.0,3407,2,images/0001_E_H_0_C.jpg


In [46]:
merged_df[merged_df['file_name']=="images/0001_E_H_0_C.jpg"].groupby('category').sum('area')

Unnamed: 0_level_0,Annot_id,area,image_id
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,356868,826.0,51105
5,118971,706.0,17035
9,23793,145.0,3407


## Split the data into train and test set (ration 70:30) ##

In [59]:
import random
random.seed(2)
random.shuffle(files)
train_img = files[:int((len(files)+1)*.70)] 
test_img= files[int((len(files)+1)*.70):] 

## Create Function to classify unhealthy and healthy lemon images ##

In [60]:
def classify_category(merged_df,file_name):
    targeted=merged_df.loc[merged_df['file_name'] == file_name].groupby('category').sum('area')
    result=[]
    for idx in targeted.index:
        if idx in [2,3,4]:
            result.append(True)
        else:
            result.append(False)
    if any(result):
        return("Unhealthy_lemon")
    else:
        return("Healthy_lemon")


## Loop through each lemon image and allocate it into categorised folder ##

In [61]:
for file in train_img:
        shutil.copy2(f"raw_data/lemon-dataset/{file}",f"raw_data/lemon-dataset/classified_images/Train_lemon/{classify_category(merged_df,file)}/{file.split('/')[-1]}")

In [62]:
for file in test_img:
    shutil.copy2(f"raw_data/lemon-dataset/{file}",f"raw_data/lemon-dataset/classified_images/Test_lemon/{classify_category(merged_df,file)}/{file.split('/')[-1]}")

In [63]:
Train_unhealthy=len(os.listdir("raw_data/lemon-dataset/classified_images/Train_lemon/Unhealthy_lemon"))
Train_healthy=len(os.listdir("raw_data/lemon-dataset/classified_images/Train_lemon/Healthy_lemon"))
Test_unhealthy=len(os.listdir("raw_data/lemon-dataset/classified_images/Test_lemon/Unhealthy_lemon"))
Test_healthy=len(os.listdir("raw_data/lemon-dataset/classified_images/Test_lemon/Healthy_lemon"))

In [64]:
print(f"Total number of train unhealthy lemon:{Train_unhealthy}")
print(f"Total number of train healthy lemon:{Train_healthy}")
print(f"Total number of test unhealthy lemon:{Test_unhealthy}")
print(f"Total number of test healthy lemon:{Test_healthy}")


Total number of train unhealthy lemon:1358
Total number of train healthy lemon:527
Total number of test unhealthy lemon:594
Total number of test healthy lemon:215


In [65]:
sum([Train_unhealthy, Train_healthy, Test_unhealthy, Test_healthy])

2694