# Trash Classifier Data Prep

Before running this notebook, make sure to run the following scripts:
- `setup_folders.py`
- `feature_extraction/edge_detect.py`

In [1]:
import os
import pandas as pd
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import cv2

from feature_extraction import color_hist, edge_detect, utils
# Class labels of the dataset (not referenced by the cells visible in this notebook section).
DATA_CATEGORIES = ['cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash']
# Presumably the edge-detection variants produced by edge_detect.py; the names match the
# `vectorized_<method>_*` columns seen after loading the outputs — TODO confirm.
METHODS = ['gaussian', 'mean', 'median']
# Root folder passed to utils.create_image_paths and utils.load_output_images below.
data_dir = 'dataset-resized'

## Loading the RGB + edge detection histogram images into a df

In [2]:
def extract_color_histogram(row):
    """Attach the merged colour/edge histogram of one image to its row.

    Builds a ``color_hist.ColorHist`` for ``row['im_path']`` (passing
    ``'outputs'`` as its second argument), runs its histogram steps in order
    and stores the resulting ``merged`` attribute under the ``'merged'`` key.

    Parameters
    ----------
    row : pandas.Series or other mapping with a ``copy()`` method
        One record, as supplied by ``DataFrame.apply(..., axis=1)``.
        Must contain ``'im_path'``.

    Returns
    -------
    A copy of ``row`` with an additional ``'merged'`` entry. The object passed
    in is left unmodified.
    """
    hist_builder = color_hist.ColorHist(row['im_path'], 'outputs')
    # Call order is kept exactly as before; the later steps presumably depend
    # on state prepared by the earlier ones — verify against ColorHist.
    hist_builder.get_color_hist()
    hist_builder.get_corresponding_edge()
    hist_builder.get_edge_map()
    hist_builder.merge_hists()

    # pandas does not support apply() callbacks that mutate the object they
    # receive, so write the new field onto a copy instead.
    enriched = row.copy()
    enriched['merged'] = hist_builder.merged
    return enriched

In [3]:
# Build one record per image under `data_dir`, then compute the merged
# histogram for every row (axis=1 hands each row to the function).
colorHist_df = (
    pd.DataFrame(
        utils.create_image_paths(data_dir, load_images=True, vectorize_images=True)
    )
    .apply(extract_color_histogram, axis=1)
)

In [4]:
# Preview the first rows of the colour-histogram frame.
colorHist_df.head()

Unnamed: 0,im_path,class,im_arr,im_shape,vectorized_R,vectorized_G,vectorized_B,merged
0,dataset-resized/paper/paper283.jpg,paper,"[[[236, 232, 220], [236, 232, 220], [236, 232,...","(384, 512, 3)","[236, 236, 236, 236, 236, 236, 236, 236, 235, ...","[232, 232, 232, 232, 232, 232, 232, 232, 231, ...","[220, 220, 220, 220, 220, 220, 220, 220, 219, ...","[[[[0.68229167], [0.68489583], [1.36197917], [..."
1,dataset-resized/paper/paper297.jpg,paper,"[[[255, 255, 255], [255, 255, 255], [255, 255,...","(384, 512, 3)","[255, 255, 255, 255, 255, 255, 255, 255, 255, ...","[255, 255, 255, 255, 255, 255, 255, 255, 255, ...","[255, 255, 255, 255, 255, 255, 255, 255, 255, ...","[[[[7.46354167], [12.18489583], [16.08072917],..."
2,dataset-resized/paper/paper526.jpg,paper,"[[[240, 240, 238], [240, 240, 238], [240, 240,...","(384, 512, 3)","[240, 240, 240, 239, 239, 238, 238, 238, 237, ...","[240, 240, 240, 239, 239, 238, 238, 238, 237, ...","[238, 238, 238, 237, 237, 236, 236, 236, 235, ...","[[[[2.03645833], [4.71614583], [4.04947917], [..."
3,dataset-resized/paper/paper240.jpg,paper,"[[[255, 255, 255], [255, 255, 255], [255, 255,...","(384, 512, 3)","[255, 255, 255, 255, 255, 255, 254, 254, 255, ...","[255, 255, 255, 255, 255, 255, 254, 254, 255, ...","[255, 255, 255, 255, 255, 255, 254, 254, 255, ...","[[[[1.38020833], [2.05729167], [2.70572917], [..."
4,dataset-resized/paper/paper254.jpg,paper,"[[[240, 244, 245], [240, 244, 245], [240, 244,...","(384, 512, 3)","[240, 240, 240, 240, 240, 240, 240, 240, 239, ...","[244, 244, 244, 244, 244, 244, 244, 244, 243, ...","[245, 245, 245, 245, 245, 245, 245, 245, 244, ...","[[[[0.], [0.], [0.], [0.], [0.], [0.], [0.], [..."


## Loading the Edges into a df

In [5]:
# Load the edge-detection output images that correspond to `data_dir`
# into a frame, one record per entry returned by the helper.
edgeDetected_df = pd.DataFrame(
    utils.load_output_images(
        source_file_names=data_dir,
        load_images=True,
        vectorize_images=True,
    )
)
    

## Merging datasets together

In [6]:
# Derive the bare file name from each path; it is the key the two frames are
# merged on. os.path.basename follows the platform's path-separator rules,
# whereas splitting on a hard-coded '/' fails for backslash-separated paths.
colorHist_df['image_name'] = colorHist_df['im_path'].map(os.path.basename)
edgeDetected_df['image_name'] = edgeDetected_df['im_path'].map(os.path.basename)

In [7]:
# Join the colour-histogram and edge-detection frames on the shared file name.
# Columns present in both frames receive a per-source suffix; only one copy of
# the class label is kept, under the plain name `class`.
combined_full_df = (
    colorHist_df
    .merge(
        edgeDetected_df,
        on="image_name",
        suffixes=("_colorHist", "_edgeDetected"),
    )
    .drop(columns=['class_colorHist'])
    .rename(columns={'class_edgeDetected': 'class'})
)
def extract_individual(row):
    """Split ``row['merged']`` into four separately named entries.

    Positions 0-3 of ``row['merged']`` are stored under ``'edges_hist'``,
    ``'red_hist'``, ``'green_hist'`` and ``'blue_hist'`` respectively.

    Parameters
    ----------
    row : pandas.Series or other mapping with a ``copy()`` method
        One record, as supplied by ``DataFrame.apply(..., axis=1)``.
        Must contain ``'merged'`` with at least four indexable positions.

    Returns
    -------
    A copy of ``row`` carrying the four additional entries. The object passed
    in is left unmodified.
    """
    merged = row['merged']
    # pandas does not support apply() callbacks that mutate the object they
    # receive, so the new fields are written onto a copy.
    expanded = row.copy()
    hist_keys = ('edges_hist', 'red_hist', 'green_hist', 'blue_hist')
    for position, key in enumerate(hist_keys):
        expanded[key] = merged[position]
    return expanded

# Run extract_individual on every row (axis=1) to add the four *_hist columns.
combined_full_df = combined_full_df.apply(extract_individual, axis=1)


In [8]:
# Preview the first rows of the merged frame.
combined_full_df.head()

Unnamed: 0,im_path_colorHist,im_arr_colorHist,im_shape_colorHist,vectorized_R,vectorized_G,vectorized_B,merged,image_name,class,im_path_edgeDetected,...,vectorized_gaussian_R,vectorized_gaussian_G,vectorized_gaussian_B,vectorized_median_R,vectorized_median_G,vectorized_median_B,edges_hist,red_hist,green_hist,blue_hist
0,dataset-resized/paper/paper283.jpg,"[[[236, 232, 220], [236, 232, 220], [236, 232,...","(384, 512, 3)","[236, 236, 236, 236, 236, 236, 236, 236, 235, ...","[232, 232, 232, 232, 232, 232, 232, 232, 231, ...","[220, 220, 220, 220, 220, 220, 220, 220, 219, ...","[[[[0.68229167], [0.68489583], [1.36197917], [...",paper283.jpg,paper,outputs/paper/median/paper283.jpg,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0.6822916666666666], [0.6848958333333334], ...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0..."
1,dataset-resized/paper/paper297.jpg,"[[[255, 255, 255], [255, 255, 255], [255, 255,...","(384, 512, 3)","[255, 255, 255, 255, 255, 255, 255, 255, 255, ...","[255, 255, 255, 255, 255, 255, 255, 255, 255, ...","[255, 255, 255, 255, 255, 255, 255, 255, 255, ...","[[[[7.46354167], [12.18489583], [16.08072917],...",paper297.jpg,paper,outputs/paper/median/paper297.jpg,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 255, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255, 0...","[[[7.463541666666667], [12.184895833333334], [...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0..."
2,dataset-resized/paper/paper526.jpg,"[[[240, 240, 238], [240, 240, 238], [240, 240,...","(384, 512, 3)","[240, 240, 240, 239, 239, 238, 238, 238, 237, ...","[240, 240, 240, 239, 239, 238, 238, 238, 237, ...","[238, 238, 238, 237, 237, 236, 236, 236, 235, ...","[[[[2.03645833], [4.71614583], [4.04947917], [...",paper526.jpg,paper,outputs/paper/median/paper526.jpg,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[2.0364583333333335], [4.716145833333333], [...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0..."
3,dataset-resized/paper/paper240.jpg,"[[[255, 255, 255], [255, 255, 255], [255, 255,...","(384, 512, 3)","[255, 255, 255, 255, 255, 255, 254, 254, 255, ...","[255, 255, 255, 255, 255, 255, 254, 254, 255, ...","[255, 255, 255, 255, 255, 255, 254, 254, 255, ...","[[[[1.38020833], [2.05729167], [2.70572917], [...",paper240.jpg,paper,outputs/paper/median/paper240.jpg,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[1.3802083333333333], [2.0572916666666665], ...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0..."
4,dataset-resized/paper/paper254.jpg,"[[[240, 244, 245], [240, 244, 245], [240, 244,...","(384, 512, 3)","[240, 240, 240, 240, 240, 240, 240, 240, 239, ...","[244, 244, 244, 244, 244, 244, 244, 244, 243, ...","[245, 245, 245, 245, 245, 245, 245, 245, 244, ...","[[[[0.], [0.], [0.], [0.], [0.], [0.], [0.], [...",paper254.jpg,paper,outputs/paper/median/paper254.jpg,...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0..."


In [9]:
# List every column available in the merged frame.
combined_full_df.columns

Index(['im_path_colorHist', 'im_arr_colorHist', 'im_shape_colorHist',
       'vectorized_R', 'vectorized_G', 'vectorized_B', 'merged', 'image_name',
       'class', 'im_path_edgeDetected', 'im_arr_edgeDetected',
       'im_shape_edgeDetected', 'vectorized_mean_R', 'vectorized_mean_G',
       'vectorized_mean_B', 'vectorized_gaussian_R', 'vectorized_gaussian_G',
       'vectorized_gaussian_B', 'vectorized_median_R', 'vectorized_median_G',
       'vectorized_median_B', 'edges_hist', 'red_hist', 'green_hist',
       'blue_hist'],
      dtype='object')

## Train Models below



In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Columns used as model features. Commented-out entries are alternative
# feature sets that are currently disabled.
# NOTE(review): 'merged' is the source that extract_individual slices the four
# *_hist columns from, so the active features overlap — confirm this is intended.
X_columns = [
#     "vectorized_R", # red band of raw image
#     "vectorized_G", # green band of raw image
#     "vectorized_B", # blue band of raw image
    'merged',
    'edges_hist',
    'red_hist',
    'green_hist',
    'blue_hist'
    # edge detection using mean
#     'vectorized_mean_R','vectorized_mean_G','vectorized_mean_B',
    # edge detection using gaussian
#     'vectorized_gaussian_R','vectorized_gaussian_G','vectorized_gaussian_B',
    # edge detection using median
#     'vectorized_median_R','vectorized_median_G','vectorized_median_B'
]
# Name of the target (label) column.
Y_column = "class"


In [12]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    combined_full_df.loc[:, X_columns],
    combined_full_df.loc[:, Y_column],
    test_size=0.2,
    random_state=42,
)


In [13]:
# Display the training features.
X_train

Unnamed: 0,merged,edges_hist,red_hist,green_hist,blue_hist
157,"[[[[0.], [0.], [0.], [0.], [0.], [0.], [0.], [...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0..."
1173,"[[[[8.73958333], [9.44270833], [13.4140625], [...","[[[8.739583333333334], [9.442708333333334], [1...","[[[1.0], [0.0], [1.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [4...","[[[1.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0..."
836,"[[[[0.], [0.], [0.], [0.], [0.], [0.], [0.], [...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0..."
2073,"[[[[0.], [0.], [0.], [0.], [0.], [0.], [0.], [...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0..."
135,"[[[[0.], [0.], [0.], [0.], [0.], [0.], [0.], [...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[5.0], [0.0], [0.0], [0.0], [0.0], [0.0], [2...","[[[4.0], [0.0], [2.0], [0.0], [0.0], [0.0], [0...","[[[1.0], [0.0], [3.0], [0.0], [0.0], [0.0], [1..."
...,...,...,...,...,...
1638,"[[[[0.], [0.], [0.], [0.], [0.], [0.], [0.], [...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0..."
1095,"[[[[2.09895833], [6.73177083], [7.38541667], [...","[[[2.0989583333333335], [6.731770833333333], [...","[[[1.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[1.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[1.0], [0.0], [1.0], [0.0], [0.0], [0.0], [0..."
1130,"[[[[0.], [0.], [0.], [0.], [0.], [0.], [0.], [...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0..."
1294,"[[[[0.], [0.], [0.], [0.], [0.], [0.], [0.], [...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0..."
