# Setting Google environment and importing libraries

In [29]:
# connecting google drive to google colab
# from google.colab import drive
# drive.mount('/content/drive')

In [30]:
# importing libraries
import pandas as pd
import numpy as np
import cv2
import os

# Exploratory

### Check image size

In [59]:
# list of folders with pictures (one sub-folder per apple kind)
# os.listdir returns entries in arbitrary order, so sort them: the position in
# this list is later used as the numeric class label and must be reproducible.
# Plain files are dropped because only directories can hold pictures.
list_of_apples = sorted(
    entry for entry in os.listdir('drive/MyDrive/Apples/')
    if os.path.isdir(os.path.join('drive/MyDrive/Apples/', entry))
)
list_of_apples

['Apple_B', 'Apple_C', 'Apple_F']

In [60]:
# build the path of every apple folder by prefixing its name with the root folder
folder_path = 'drive/MyDrive/Apples/'
list_of_folders = [f"{folder_path}{kind}" for kind in list_of_apples]
list_of_folders

['drive/MyDrive/Apples/Apple_B',
 'drive/MyDrive/Apples/Apple_C',
 'drive/MyDrive/Apples/Apple_F']

In [32]:
# Inspect every picture in the Apple_C folder:
# * convert it from BGR (the channel order cv2.imread produces) to RGB
# * record its shape and its flattened pixel values
picture_data_C = {}
folder_path_C = 'drive/MyDrive/Apples/Apple_C'
for pic in os.listdir(folder_path_C):
  pic_path = os.path.join(folder_path_C, pic)
  img = cv2.imread(pic_path)
  if img is None:
    # cv2.imread returns None (it does not raise) when a file cannot be read
    # as an image; skip it so cv2.cvtColor does not fail on it
    print(f"Skipping unreadable file: {pic_path}")
    continue
  img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
  shape = img_rgb.shape
  pixels = img_rgb.flatten()
  picture_data_C[pic] = {'shape': shape, 'pixels': pixels}

# turn the picture dictionary into a data frame: one row per picture, sorted by file name
df_C = (
    pd.DataFrame.from_dict(picture_data_C, orient="index")
    .sort_index()
    .reset_index()
    .rename(columns={"index": "name"})
)
# number of pictures per distinct shape
df_C.value_counts(subset="shape")

shape
(258, 320, 3)    666
(322, 480, 3)    336
dtype: int64

In [33]:
# Inspect every picture in the Apple_B folder:
# * convert it from BGR (the channel order cv2.imread produces) to RGB
# * record its shape and its flattened pixel values
picture_data_B = {}
folder_path_B = 'drive/MyDrive/Apples/Apple_B'
for pic in os.listdir(folder_path_B):
  pic_path = os.path.join(folder_path_B, pic)
  img = cv2.imread(pic_path)
  if img is None:
    # cv2.imread returns None (it does not raise) when a file cannot be read
    # as an image; skip it so cv2.cvtColor does not fail on it
    print(f"Skipping unreadable file: {pic_path}")
    continue
  img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
  shape = img_rgb.shape
  pixels = img_rgb.flatten()
  picture_data_B[pic] = {'shape': shape, 'pixels': pixels}

# turn the picture dictionary into a data frame: one row per picture, sorted by file name
df_B = (
    pd.DataFrame.from_dict(picture_data_B, orient="index")
    .sort_index()
    .reset_index()
    .rename(columns={"index": "name"})
)
# number of pictures per distinct shape
df_B.value_counts(subset="shape")

shape
(322, 480, 3)    740
dtype: int64

In [34]:
# Inspect every picture in the Apple_F folder:
# * convert it from BGR (the channel order cv2.imread produces) to RGB
# * record its shape and its flattened pixel values
picture_data_F = {}
folder_path_F = 'drive/MyDrive/Apples/Apple_F'
for pic in os.listdir(folder_path_F):
  pic_path = os.path.join(folder_path_F, pic)
  img = cv2.imread(pic_path)
  if img is None:
    # cv2.imread returns None (it does not raise) when a file cannot be read
    # as an image; skip it so cv2.cvtColor does not fail on it
    print(f"Skipping unreadable file: {pic_path}")
    continue
  img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
  shape = img_rgb.shape
  pixels = img_rgb.flatten()
  picture_data_F[pic] = {'shape': shape, 'pixels': pixels}

# turn the picture dictionary into a data frame: one row per picture, sorted by file name
df_F = (
    pd.DataFrame.from_dict(picture_data_F, orient="index")
    .sort_index()
    .reset_index()
    .rename(columns={"index": "name"})
)
# number of pictures per distinct shape
df_F.value_counts(subset="shape")

shape
(258, 320, 3)    2030
dtype: int64

**The first problem with the dataset is that the images have different shapes** <br>
* There are over 1000 samples whose shape differs from the rest
* The images need to be resampled (resized) to one common shape
* When resizing, aspect ratio and color need to be considered
* Color is especially important, because it makes it easier to differentiate apples than shape alone



**Some of the interpolation methods available when resizing an image:** <br>
https://www.geeksforgeeks.org/image-resizing-using-opencv-python/

# Resizing and storing into single dataframe

In [49]:
def store_pictures_info_in_dataframe(folder_path:str,
                                     fixed_size:tuple,
                                     apple_class:int) -> pd.DataFrame:
  """Load every picture in a folder, resize it and collect it in a dataframe.

  Each readable file is loaded with OpenCV, converted from BGR to RGB,
  resized to ``fixed_size`` and flattened into a 1-D pixel array.
  Files that OpenCV cannot read are skipped with a warning.

  Parameters
  ----------
  folder_path : str
      Directory whose entries are read as pictures.
  fixed_size : tuple
      Target size handed to ``cv2.resize`` as ``dsize``
      (OpenCV expects ``(width, height)``).
  apple_class : int
      Label written to the ``apple_class`` column of every row.

  Returns
  -------
  pd.DataFrame
      One row per readable picture, ordered by file name, with the columns
      ``name``, ``shape``, ``pixels`` and ``apple_class``. The columns are
      present even when the folder contains no readable picture.
  """
  import warnings  # local import: only used to report unreadable files

  records = []

  # iterate in sorted order so rows come out ordered by file name
  for pic in sorted(os.listdir(folder_path)):
    pic_path = os.path.join(folder_path, pic)
    img = cv2.imread(pic_path)
    if img is None:
      # cv2.imread returns None instead of raising when a file cannot be
      # read as an image; skip it so cv2.cvtColor does not fail on it
      warnings.warn(f"Skipping unreadable file: {pic_path}")
      continue
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Resize the image to the fixed size
    # interpolation=cv2.INTER_AREA works well with shrinking images
    img_resized = cv2.resize(src=img_rgb,
                             dsize=fixed_size,
                             interpolation=cv2.INTER_AREA)

    records.append({'name': pic,
                    'shape': img_resized.shape,
                    'pixels': img_resized.flatten()})

  # explicit columns keep the schema stable even when no picture was loaded
  df = pd.DataFrame(records, columns=["name", "shape", "pixels"])
  df["apple_class"] = apple_class

  return df

In [65]:
# Build one resized dataframe per apple folder and keep them in a dict.
# The position of a folder in list_of_folders is used as its class label.
dfs = {}
for apple_class, path in enumerate(list_of_folders):
  df_name = f"df_{apple_class}"
  df = store_pictures_info_in_dataframe(path, (256, 256), apple_class)
  dfs.update({df_name: df})

In [68]:
# Stack the per-class dataframes into a single one.
# ignore_index=True already gives the result a fresh 0..n-1 index, so the
# former trailing reset_index() is dropped: it only added a redundant
# "index" column duplicating that index.
df = pd.concat(dfs.values(), ignore_index=True)
df

Unnamed: 0,name,shape,pixels,apple_class
0,102red applee00901102.png,"(256, 256, 3)","[57, 58, 56, 53, 53, 48, 58, 61, 57, 61, 64, 5...",0
1,103red applee00916103.png,"(256, 256, 3)","[34, 43, 46, 43, 43, 49, 38, 47, 47, 38, 47, 5...",0
2,107red applee01001107.png,"(256, 256, 3)","[50, 57, 48, 47, 50, 45, 54, 58, 50, 57, 61, 4...",0
3,108red applee01006108.png,"(256, 256, 3)","[30, 47, 56, 36, 48, 52, 33, 41, 47, 35, 45, 5...",0
4,109red applee01021109.png,"(256, 256, 3)","[33, 45, 56, 40, 47, 53, 37, 40, 48, 39, 44, 5...",0
...,...,...,...,...
3767,scene07801.png,"(256, 256, 3)","[45, 38, 41, 61, 55, 56, 69, 66, 64, 69, 68, 6...",2
3768,scene07821.png,"(256, 256, 3)","[48, 55, 48, 65, 69, 61, 74, 74, 64, 76, 76, 7...",2
3769,scene07841.png,"(256, 256, 3)","[43, 50, 45, 61, 66, 61, 64, 65, 58, 70, 71, 6...",2
3770,scene07861.png,"(256, 256, 3)","[40, 47, 42, 59, 64, 58, 64, 65, 58, 70, 71, 6...",2
