# Fruits-360 preprocessor
This notebook prepares the Fruits-360 dataset for the Peltarion platform.

Note: This notebook requires installation of Sidekick. For more information about this package, see: https://github.com/Peltarion/sidekick

In [1]:
import os
import sidekick
import resource
import functools
import pandas as pd
from glob import glob
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [15]:
# Show the working directory: the dataset and output paths used below are relative to it.
# os.getcwd() replaces the `!pwd` shell escape so the cell also runs where no `pwd` command exists.
os.getcwd()

/Users/joakim/rep/fruits


## Setup

### Paths

In [3]:
# Raw dataset directory.
# NOTE(review): the image-listing cell globs 'fruits-360/Training' directly and never reads
# this variable — confirm which folder is the intended input.
input_path = 'fruits-360/Training_added_granny_smith'
# Path of the zip file handed to sidekick.create_dataset
output_path = './data_modified3.zip'

### Progress bar for Pandas

In [4]:
# Hook tqdm into pandas; the `progress_apply` calls on the `image` column rely on this.
tqdm.pandas()

### Get list of image paths

In [19]:
# List every image one folder level below the training directory: JPEG matches first, then PNG.
# NOTE(review): the directory is hard-coded here and `input_path` is not used — confirm
# which dataset folder is intended.
images_rel_path = [
    found
    for pattern in ('fruits-360/Training/*/*.jpg', 'fruits-360/Training/*/*.png')
    for found in glob(pattern)
]
print("Images found: ", len(images_rel_path))

Images found:  53177


## Create Dataframe
The `class` column values are derived from the name of the subfolder that contains each image.

The `image` column contains the relative path to each image.

In [20]:
# One row per image path; the label is the name of the folder that directly contains the file.
df = pd.DataFrame({'image': images_rel_path})
df['class'] = df['image'].progress_apply(
    lambda image_path: os.path.split(os.path.dirname(image_path))[1]
)
df.head()

100%|██████████| 53177/53177 [00:00<00:00, 305172.91it/s]


Unnamed: 0,image,class
0,fruits-360/Training/Tomato 4/r_236_100.jpg,Tomato 4
1,fruits-360/Training/Tomato 4/247_100.jpg,Tomato 4
2,fruits-360/Training/Tomato 4/257_100.jpg,Tomato 4
3,fruits-360/Training/Tomato 4/r_78_100.jpg,Tomato 4
4,fruits-360/Training/Tomato 4/r_68_100.jpg,Tomato 4


### Check that all images have the same color mode, e.g., RGB

In [21]:
def get_mode(path):
    """Return the PIL mode string (e.g. 'RGB') of the image stored at `path`.

    The file is opened in a context manager so the mode is read while the
    image is still open and the file handle is released even if an error occurs.
    """
    with Image.open(path) as im:
        return im.mode

# Tally the image modes on a standalone Series instead of adding and then dropping a
# temporary column on `df`, so an interrupted or failed run cannot leave a stray
# `image_mode` column in the dataframe that is bundled later.
image_modes = df['image'].progress_apply(get_mode).rename('image_mode')
print(image_modes.value_counts())

100%|██████████| 53177/53177 [00:18<00:00, 2861.97it/s]

RGB    53177
Name: image_mode, dtype: int64





## Create subsets for training and validation

In [8]:
# Disabled cell: the train/validation split below is wrapped in a string literal, so it
# never runs and no `subset` column is added to `df`.
# NOTE(review): if this is re-enabled, `DataFrame.append` was removed in pandas 2.0 —
# combine the two parts with `pd.concat` instead.
'''
def create_subsets(df, col='class', validation_size=0.20):
    train_data, validate_data = train_test_split(df, test_size=validation_size, random_state=42, stratify=df[[col]])
    train_data.insert(loc=2, column='subset', value='T')
    validate_data.insert(loc=2, column='subset', value='V')
    return train_data.append(validate_data, ignore_index=True)

df = create_subsets(df)
df['subset'].value_counts()
df.head()
'''

"\ndef create_subsets(df, col='class', validation_size=0.20):\n    train_data, validate_data = train_test_split(df, test_size=validation_size, random_state=42, stratify=df[[col]])\n    train_data.insert(loc=2, column='subset', value='T')\n    validate_data.insert(loc=2, column='subset', value='V')\n    return train_data.append(validate_data, ignore_index=True)\n\ndf = create_subsets(df)\ndf['subset'].value_counts()\ndf.head()\n"

## View number of rows per class

In [8]:
# Let pandas display up to 150 rows before truncating, then list the number of images per class.
pd.options.display.max_rows = 150
df['class'].value_counts()

Grape Blue             984
Plum 3                 900
Melon Piel de Sapo     738
Peach 2                738
Tomato 3               738
Strawberry Wedge       738
Tomato 1               738
Cherry Rainier         738
Cherry 2               738
Walnut                 735
Tomato 2               672
Apple Red Yellow 2     672
Pepper Yellow          666
Pepper Red             666
Pear Red               666
Apple Granny Smith     502
Pineapple Mini         493
Peach                  492
Cherry 1               492
Apple Braeburn         492
Grapefruit White       492
Pear                   492
Peach Flat             492
Apple Red 1            492
Redcurrant             492
Cherry Wax Black       492
Strawberry             492
Cantaloupe 1           492
Pomegranate            492
Rambutan               492
Physalis with Husk     492
Nectarine              492
Apple Golden 1         492
Apricot                492
Mulberry               492
Cherry Wax Red         492
Apple Red Yellow 1     492
A

## Create dataset bundle

In [9]:
# Available modes:
#   - crop_and_resize
#   - center_crop_or_pad
#   - resize_image
image_processor = functools.partial(
    sidekick.process_image,
    mode='crop_and_resize',
    size=(100, 100),
    file_format='jpeg',
)

# Build the dataset bundle at `output_path` from `df`; the `image` column holds file
# paths and is passed through `image_processor`.
sidekick.create_dataset(
    output_path,
    df,
    path_columns=['image'],
    preprocess={'image': image_processor},
)