In [62]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

import os
from pathlib import Path

## Tomato Predictor - An introduction
The idea of this project is to learn about:

1. Convolutional Neural Networks (CNNs)
    - Convolution and pooling
    - Common architectures - LeNet, AlexNet, VGG, ResNet

2. Image Preprocessing
    - Normalization and standardization of pixels
    - Data Augmentation: from the same sample, generate more images
    - Noise reduction and contrast enhancement

3. Evaluation metrics
    - Precision, recall, F1-score
    - Confusion matrix
    - ROC curve and AUC (area under the curve)

4. Optimization and regularization
    - Optimization algorithms - SGD, Adam, RMSprop
    - Techniques to avoid overfitting - dropout, batch normalization, early stopping

## Data

### Extracting data from directories

In [87]:
from pathlib import Path

# Root folders of the training and validation splits.
route_train = 'content/ieee-mbl-cls/train'
route_test = 'content/ieee-mbl-cls/val'

# Path templates ending in a literal "{}" placeholder; the placeholder is
# filled in later (via str.format) with the name of a class folder.
dynamic_route_train = f"{route_train}/{{}}"
dynamic_route_test = f"{route_test}/{{}}"

# Class names discovered on disk; filled in place by extract_labels().
labels = []

def extract_labels(route: str) -> None:
    """
    Collect class names from the sub-directories of ``route``.

    The name of every visible sub-directory directly under ``route`` is
    appended to the module-level ``labels`` list. Plain files and hidden
    entries (names starting with ".", e.g. ``.ipynb_checkpoints``) are
    skipped because they are not class folders. Names already present in
    ``labels`` are not added again, so calling the function more than once
    does not create duplicates.

    Args:
        route: Directory whose immediate sub-directories are named after
            the classes.

    Returns:
        None. The result is stored in ``labels``.

    Raises:
        FileNotFoundError / NotADirectoryError: if ``route`` cannot be listed.
    """
    # Sorted so the label order does not depend on the file system's listing order.
    for entry in sorted(Path(route).iterdir(), key=lambda p: p.name):
        if not entry.is_dir() or entry.name.startswith("."):
            continue
        # path.name is already a str; no conversion from the Path object is needed.
        if entry.name not in labels:
            labels.append(entry.name)

def convert_data_to_df(dynamic_route):
    """
    Build a DataFrame listing the image files found in every class folder.

    For each entry of the module-level ``labels`` list, ``dynamic_route`` is
    formatted with that label to obtain a folder, and every visible regular
    file inside it becomes one row. Sub-folders and hidden files (names
    starting with ".") are skipped.

    Args:
        dynamic_route: Path template containing one ``{}`` placeholder that
            is replaced by the class folder name.

    Returns:
        DataFrame with two columns: ``name`` (the file name) and ``label``
        (the first character of the file name). File names are unique in
        the result; if the same name appears in several folders only one
        row is kept.
    """
    images = {}  # file name -> label (first character of the file name)

    for label in labels:
        class_dir = Path(dynamic_route.format(label))
        # Sorted so the row order is reproducible regardless of the file system.
        for entry in sorted(class_dir.iterdir(), key=lambda p: p.name):
            if not entry.is_file() or entry.name.startswith("."):
                continue
            images[entry.name] = entry.name[0]

    # Materialise the dict views so the constructor receives plain lists.
    return pd.DataFrame({
        "name": list(images.keys()),
        "label": list(images.values()),
    })
    
# Fill the module-level `labels` list from the entries of the training directory.
extract_labels(route_train)


In [88]:
# Build the training table, then shuffle it. The fixed random_state makes the
# shuffled order identical on every "Restart Kernel -> Run All".
df = convert_data_to_df(dynamic_route_train)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,name,label
0,r (2334).jpg,r
1,o (1645).jpg,o
2,u (425).jpg,u
3,o (760).jpg,o
4,r (705).jpg,r


In [89]:
# Build the validation table, then shuffle it. The fixed random_state makes the
# shuffled order identical on every "Restart Kernel -> Run All".
df_test = convert_data_to_df(dynamic_route_test)
df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)
df_test.head()

Unnamed: 0,name,label
0,r (875).jpg,r
1,r (978).jpg,r
2,o (900).jpg,o
3,o (1895).jpg,o
4,u (735).jpg,u


### From images to pixels

Our images are 256px x 256px, so we have to adjust our CNN to this input size. However, we first have to extract the pixel information from each image in order to be able to process the data in the neural network.

In [92]:
import pandas as pd
from PIL import Image
from numpy import asarray

def add_pixels_image_data(df, route, index, name, target_size=(256, 256), verbose=True):
    """
    Store the flattened pixel values of one image in ``df.at[index, 'image']``.

    The file is read from ``{route}/{class folder}/{name}``. The class folder
    is chosen from the first character of ``name``: 'r' -> "Ripe",
    'o' -> "Old", 'u' -> "Unripe", anything else -> "Damaged".

    The image is converted to RGB, resized to ``target_size``, cast to
    float32 and flattened to one dimension. Pixel values are NOT rescaled
    (no division by 255); any normalization has to happen downstream.

    Args:
        df: DataFrame to update. It is modified in place; an ``image`` column
            of dtype object is created on first use.
        route: Root directory that contains the class folders.
        index: Index label of the row that receives the pixel data.
        name: File name of the image; its first character selects the folder.
        target_size: ``(width, height)`` passed to ``Image.resize``.
        verbose: When True (default), print the format, size and mode of the
            file as stored on disk.

    Returns:
        The same ``df`` object that was passed in.
    """
    folder_by_prefix = {"r": "Ripe", "o": "Old", "u": "Unripe"}
    folder = folder_by_prefix.get(name[0], "Damaged")
    path = f"{route}/{folder}/{name}"

    # The context manager releases the file handle even if decoding fails.
    with Image.open(path) as image:
        if verbose:
            print(f"Format: {image.format}, pixels: {image.size}, mode: {image.mode}\n")
        # The dataset mixes RGB JPEGs with RGBA PNGs. Forcing three channels
        # gives every row a vector of the same length (width * height * 3).
        resized = image.convert("RGB").resize(target_size)
        flattened_image = asarray(resized).astype('float32').flatten()

    if 'image' not in df.columns:
        # Built on df.index so the column lines up even for a non-default index.
        df['image'] = pd.Series([None] * len(df), index=df.index, dtype=object)

    df.at[index, 'image'] = flattened_image

    return df

In [None]:
# Attach the flattened pixel array of every training image to its row.
# Only the file name is needed, so iterate over that column directly.
for index, name in df['name'].items():
    df = add_pixels_image_data(df, route_train, index, name)

Format: JPEG, pixels: (256, 256), mode: RGB

Format: JPEG, pixels: (256, 256), mode: RGB

Format: JPEG, pixels: (256, 256), mode: RGB

Format: JPEG, pixels: (256, 256), mode: RGB

Format: JPEG, pixels: (256, 256), mode: RGB

Format: PNG, pixels: (256, 256), mode: RGBA

Format: JPEG, pixels: (256, 256), mode: RGB

Format: JPEG, pixels: (256, 256), mode: RGB

Format: JPEG, pixels: (256, 256), mode: RGB

Format: JPEG, pixels: (256, 256), mode: RGB

Format: JPEG, pixels: (256, 256), mode: RGB

Format: PNG, pixels: (256, 256), mode: RGBA

Format: JPEG, pixels: (256, 256), mode: RGB

Format: JPEG, pixels: (256, 256), mode: RGB

Format: JPEG, pixels: (256, 256), mode: RGB

Format: JPEG, pixels: (256, 256), mode: RGB

Format: JPEG, pixels: (256, 256), mode: RGB

Format: JPEG, pixels: (256, 256), mode: RGB

Format: JPEG, pixels: (256, 256), mode: RGB

Format: JPEG, pixels: (256, 256), mode: RGB

Format: JPEG, pixels: (256, 256), mode: RGB

Format: JPEG, pixels: (256, 256), mode: RGB

Format: JP

[0. 0. 0. ... 0. 0. 0.]
