In [1]:
import os
import random
import re

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import tensorflow as tf
from tensorflow.keras import models, layers, Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

In [2]:
# Contains txt files
path_data = r'C:\Users\hnkev\Downloads\W207 Final Project\\'

label_dir = path_data + 'garbage_txt\\'
# Contains img files
image_dir = path_data + 'garbage_img\\'

In [3]:
label_dir

'C:\\Users\\hnkev\\Downloads\\W207 Final Project\\\\garbage_txt\\'

In [4]:
# Read train, validation, test split from txt files
train_file = label_dir + 'one-indexed-files-notrash_train.txt'
val_file   = label_dir + 'one-indexed-files-notrash_val.txt'
test_file  = label_dir + 'one-indexed-files-notrash_test.txt'

df_train = pd.read_csv(train_file, sep=' ', header=None, names=['path', 'label'])
df_valid = pd.read_csv(val_file,   sep=' ', header=None, names=['path', 'label'])
df_test  = pd.read_csv(test_file,   sep=' ', header=None, names=['path', 'label'])

# Converts label id to name
def label_id_to_name(id):
  label_map = {1: 'glass', 2: 'paper', 3: 'cardboard', 4: 'plastic', 5: 'metal', 6: 'trash'}
  return label_map[id]

df_train['label'] = df_train['label'].apply(label_id_to_name)
df_valid['label'] = df_valid['label'].apply(label_id_to_name)
df_test['label'] = df_test['label'].apply(label_id_to_name)

# Change file name to file path i.e. cardboard114.jpg	 to cardboard/cardboard114.jpg	
df_train.path = df_train.path.str.extract(r'([a-z]+)')[0] + "\\" + df_train.path
df_valid.path = df_valid.path.str.extract(r'([a-z]+)')[0] + "\\" + df_valid.path
df_test.path = df_test.path.str.extract(r'([a-z]+)')[0] + "\\" + df_test.path

In [5]:
df_train.label.value_counts()

paper        403
glass        354
plastic      347
cardboard    287
metal        286
trash         91
Name: label, dtype: int64

In [6]:
train_img_list = []
train_img_arr_list = []

for suffix in list(df_train.path):
    img = load_img(image_dir + suffix)
    
    train_img_list.append(img)
    train_img_arr_list.append(img_to_array(img))

In [7]:
train_img_flatten = []
image_size = (224, 224)

for img_arr in train_img_arr_list:
    img = tf.image.resize(img_arr, size=image_size)
    train_img_flatten.append(img.numpy().flatten())
    
X = np.array(train_img_flatten, dtype = 'int8')

In [8]:
test_img_list = []
test_img_arr_list = []

for suffix in list(df_test.path):
    img = load_img(image_dir + suffix)
    
    test_img_list.append(img)
    test_img_arr_list.append(img_to_array(img))

test_img_flatten = []
image_size = (224, 224)

for img_arr in test_img_arr_list:
    img = tf.image.resize(img_arr, size=image_size)
    test_img_flatten.append(img.numpy().flatten())
    
X_test = np.array(test_img_flatten, dtype = 'int8')

In [9]:
maj_baseline = sum(df_test.label == 'paper')/len(df_valid.label)
maj_baseline

0.32926829268292684

In [10]:
df_test_freqs = df_test.label.value_counts()/sum(df_test.label.value_counts())

In [11]:
sum([i**2 for i in df_test_freqs])

0.1842636506048094

In [12]:
# Fit k-nearest neighbors

knn=KNeighborsClassifier(n_neighbors=20)
knn.fit(X, df_train.label)

y_pred_knn=knn.predict(X_test)
y_pred_knn

array(['paper', 'glass', 'cardboard', 'glass', 'plastic', 'metal',
       'paper', 'metal', 'plastic', 'glass', 'glass', 'metal', 'metal',
       'metal', 'glass', 'metal', 'plastic', 'glass', 'paper', 'glass',
       'glass', 'trash', 'plastic', 'glass', 'plastic', 'plastic',
       'glass', 'glass', 'metal', 'plastic', 'glass', 'metal', 'metal',
       'glass', 'glass', 'metal', 'plastic', 'glass', 'paper', 'glass',
       'cardboard', 'plastic', 'glass', 'plastic', 'metal', 'plastic',
       'metal', 'metal', 'plastic', 'paper', 'glass', 'glass', 'metal',
       'glass', 'metal', 'trash', 'paper', 'glass', 'glass', 'glass',
       'paper', 'metal', 'plastic', 'paper', 'glass', 'glass', 'glass',
       'plastic', 'metal', 'plastic', 'glass', 'plastic', 'plastic',
       'plastic', 'plastic', 'metal', 'glass', 'glass', 'glass', 'glass',
       'metal', 'metal', 'metal', 'metal', 'plastic', 'trash', 'trash',
       'glass', 'plastic', 'glass', 'plastic', 'glass', 'glass', 'metal',
    

In [13]:
knn_20_test_acc = sum(df_test.label == y_pred_knn)/len(df_test.label == y_pred_knn)
knn_20_test_acc

0.3874709976798144

In [14]:
#Fit random forest
rf=RandomForestClassifier(random_state = 0)
rf.fit(X, df_train.label)

y_pred_rf=rf.predict(X_test)
y_pred_rf

array(['cardboard', 'paper', 'cardboard', 'glass', 'trash', 'cardboard',
       'paper', 'paper', 'glass', 'metal', 'cardboard', 'glass', 'glass',
       'paper', 'metal', 'plastic', 'cardboard', 'paper', 'paper',
       'cardboard', 'glass', 'trash', 'paper', 'metal', 'plastic',
       'glass', 'glass', 'glass', 'cardboard', 'plastic', 'plastic',
       'paper', 'metal', 'metal', 'paper', 'cardboard', 'plastic',
       'glass', 'paper', 'cardboard', 'cardboard', 'plastic', 'cardboard',
       'plastic', 'glass', 'plastic', 'paper', 'paper', 'paper', 'paper',
       'paper', 'metal', 'metal', 'metal', 'glass', 'trash', 'plastic',
       'glass', 'cardboard', 'glass', 'paper', 'paper', 'plastic',
       'plastic', 'paper', 'paper', 'paper', 'plastic', 'metal', 'paper',
       'metal', 'glass', 'paper', 'plastic', 'plastic', 'glass', 'paper',
       'cardboard', 'metal', 'glass', 'cardboard', 'metal', 'cardboard',
       'paper', 'plastic', 'metal', 'trash', 'trash', 'plastic', 'glass',


In [15]:
rf_test_acc = sum(df_test.label == y_pred_rf)/len(df_test.label == y_pred_rf)
rf_test_acc

0.6635730858468677