# Fruit-360 preprocessor
This notebook will prepare the fruit-360 dataset for the Peltarion platform.

Note: This notebook requires installation of Sidekick. For more information about this package, see: https://github.com/Peltarion/sidekick

In [18]:
import os
import sidekick
import resource
import functools
import pandas as pd
from glob import glob
from PIL import Image
from sklearn.model_selection import train_test_split

In [12]:
# Path to the raw dataset
input_path = 'fruits-360/Training'
os.chdir(input_path)
# Path to the zip output
output_path = '/tmp/data.zip'

In [13]:
images_rel_path = glob(os.path.join('*', '*.jpg')) + glob(os.path.join('*', '*.png'))
print("Images found: ", len(images_rel_path))

Images found:  53177


## Create Dataframe
The class column values are derived from the names of the subfolders in the `input_path`.

The image column contains the relative path to the images in the subfolders.

In [14]:
df = pd.DataFrame({'image': images_rel_path})
df['class'] = df['image'].apply(lambda path: os.path.basename(os.path.dirname(path)))
df.head()

Unnamed: 0,image,class
0,Tomato 4/r_236_100.jpg,Tomato 4
1,Tomato 4/247_100.jpg,Tomato 4
2,Tomato 4/257_100.jpg,Tomato 4
3,Tomato 4/r_78_100.jpg,Tomato 4
4,Tomato 4/r_68_100.jpg,Tomato 4


### Check that all images have the same format, e.g., RGB

In [15]:
def get_mode(path):
    im = Image.open(path)
    im.close()
    return im.mode

df['image_mode'] = df['image'].apply(lambda path: get_mode(path))
print(df['image_mode'].value_counts())
df = df.drop(['image_mode'], axis=1)

RGB    53177
Name: image_mode, dtype: int64


## Create subsets for training and validation

In [16]:
def create_subsets(df, col='class', validation_size=0.20):
    train_data, validate_data = train_test_split(df, test_size=validation_size, random_state=42, stratify=df[[col]])
    train_data.insert(loc=2, column='subset', value='T')
    validate_data.insert(loc=2, column='subset', value='V')
    return train_data.append(validate_data, ignore_index=True)

df = create_subsets(df)
df['subset'].value_counts()
df.head()

Unnamed: 0,image,class,subset
0,Cocos/121_100.jpg,Cocos,T
1,Lemon/285_100.jpg,Lemon,T
2,Pineapple/279_100.jpg,Pineapple,T
3,Physalis/r_241_100.jpg,Physalis,T
4,Physalis/296_100.jpg,Physalis,T


## Create dataset bundle

In [17]:
'''
Available modes:
- crop_and_resize
- center_crop_or_pad
- resize_image
'''
image_processor = functools.partial(sidekick.process_image, mode='crop_and_resize', size=(100, 100), file_format='jpeg')
sidekick.create_dataset(
    output_path,
    df,
    path_columns=['image'],
    preprocess={
        'image': image_processor
    }
)