In [1]:
import os
import random
import re

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import tensorflow as tf
from tensorflow.keras import models, layers, Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

In [2]:
# Root folder of the dataset. Set the GARBAGE_DATA_DIR environment variable to
# point somewhere else; the original local folder is kept as the fallback.
# Joining with '' appends exactly one trailing separator, so the directories
# below can still be concatenated with file names using `+`.
path_data = os.path.join(
    os.environ.get('GARBAGE_DATA_DIR', r'C:\Users\hnkev\Downloads\W207 Final Project'),
    '')

# Contains txt files
label_dir = os.path.join(path_data, 'garbage_txt', '')
# Contains img files
image_dir = os.path.join(path_data, 'garbage_img', '')

In [3]:
# Display the resolved label directory to sanity-check the path built above
label_dir

'C:\\Users\\hnkev\\Downloads\\W207 Final Project\\\\garbage_txt\\'

In [4]:
# Locate the three split definition files (train / validation / test)
train_file = label_dir + 'one-indexed-files-notrash_train.txt'
val_file   = label_dir + 'one-indexed-files-notrash_val.txt'
test_file  = label_dir + 'one-indexed-files-notrash_test.txt'

# Each split file is space-separated with no header row; the two columns are
# read as the image file name ('path') and the numeric class id ('label').
df_train, df_valid, df_test = [
    pd.read_csv(split_file, sep=' ', header=None, names=['path', 'label'])
    for split_file in (train_file, val_file, test_file)
]

def label_id_to_name(id):
  """Translate a 1-based class id (1..6) into its class name.

  Raises:
    KeyError: if `id` is not one of the six known class ids.
  """
  class_names = ('glass', 'paper', 'cardboard', 'plastic', 'metal', 'trash')
  names_by_id = dict(enumerate(class_names, start=1))
  return names_by_id[id]

for split_df in (df_train, df_valid, df_test):
    # Replace the numeric class id with its class name
    split_df['label'] = split_df['label'].apply(label_id_to_name)

    # Prefix each file name with its class folder, taken from the leading run of
    # lowercase letters, e.g. cardboard114.jpg -> cardboard\cardboard114.jpg
    class_folder = split_df['path'].str.extract(r'([a-z]+)')[0]
    split_df['path'] = class_folder + "\\" + split_df['path']

In [5]:
# Class balance of the training split (number of images per label)
df_train['label'].value_counts()

paper        403
glass        354
plastic      347
cardboard    287
metal        286
trash         91
Name: label, dtype: int64

In [6]:
# Load every training image from disk, keeping both the image object returned
# by load_img and its array form from img_to_array.
train_img_list = [load_img(image_dir + suffix) for suffix in df_train.path]
train_img_arr_list = [img_to_array(img) for img in train_img_list]

In [7]:
image_size = (224, 224)

# Resize every training image to a common size and unroll it into a 1-D
# feature vector so all samples have the same length.
train_img_flatten = [
    tf.image.resize(img_arr, size=image_size).numpy().flatten()
    for img_arr in train_img_arr_list
]

# NOTE(review): int8 only holds -128..127. If the resized pixel values span
# 0..255 (typical for 8-bit images -- confirm), values above 127 do not survive
# this cast and 'uint8' is probably what was intended. Changing it requires the
# same change for X_test and re-fitting the k-means model.
X = np.array(train_img_flatten, dtype = 'int8')

In [8]:
# K-means configuration. Six clusters -- the same count as the six class names
# used for the labels (presumably chosen to match; confirm).
n_clusters = 6
random_state = 0  # makes sure you get the same results each time

def fit_kmeans(X, n_clusters, random_state, batch_size=16384):
  """Fit a MiniBatchKMeans clustering model on X.

  Args:
    X: samples to cluster, one row per sample.
    n_clusters: number of clusters (centroids) to form.
    random_state: seed handed to the estimator so repeated runs give the
      same clustering.
    batch_size: mini-batch size handed to MiniBatchKMeans. Defaults to
      16384, the value that was previously hard-coded.

  Returns:
    The fitted MiniBatchKMeans instance.
  """
  model = MiniBatchKMeans(n_clusters=n_clusters,
                          random_state=random_state,
                          batch_size=batch_size)
  model.fit(X)
  return model

# Cluster the flattened training images
model = fit_kmeans(X, n_clusters=n_clusters, random_state=random_state)



In [17]:
# Class name -> 0-based integer code (note: the ids in the split files are 1-based)
label_map_reverse = {
    name: code
    for code, name in enumerate(['glass', 'paper', 'cardboard', 'plastic', 'metal', 'trash'])
}

In [18]:
# Encode every training label as its integer code from label_map_reverse, so the
# labels inside each k-means cluster can be counted numerically.
label_nums = np.array([label_map_reverse[name] for name in df_train['label']])

In [19]:
# Map every k-means cluster id to the label code that occurs most often among
# the training samples assigned to that cluster.
reference_labels = {
    cluster_id: np.bincount(label_nums[model.labels_ == cluster_id]).argmax()
    for cluster_id in np.unique(model.labels_)
}

reference_labels

{0: 1, 1: 3, 2: 0, 3: 2, 4: 1, 5: 2}

In [20]:
# Cluster id -> class name. Derived from the computed reference_labels instead
# of being transcribed by hand from its printed output, so the mapping cannot go
# stale when the clustering changes.
label_names_by_num = {num: name for name, num in label_map_reverse.items()}
reference_labels_verbal = {
    int(cluster_id): label_names_by_num[int(label_num)]
    for cluster_id, label_num in reference_labels.items()
}

In [9]:
# Load every test image from disk, keeping both the image object returned by
# load_img and its array form from img_to_array.
test_img_list = [load_img(image_dir + suffix) for suffix in df_test.path]
test_img_arr_list = [img_to_array(img) for img in test_img_list]

image_size = (224, 224)

# Resize every test image to a common size and unroll it into a 1-D feature
# vector so all samples have the same length.
test_img_flatten = [
    tf.image.resize(img_arr, size=image_size).numpy().flatten()
    for img_arr in test_img_arr_list
]

# NOTE(review): int8 only holds -128..127. If the resized pixel values span
# 0..255 (typical for 8-bit images -- confirm), values above 127 do not survive
# this cast and 'uint8' is probably what was intended. The dtype must stay the
# same as the one used for the training matrix X.
X_test = np.array(test_img_flatten, dtype = 'int8')

In [11]:
# Assign every test image to a k-means cluster; the result holds cluster ids,
# not class labels (they are translated to label names in a later cell).
test_preds = model.predict(X_test)

In [21]:
# NOTE(review): no copy is made here -- df_kmeans_test is the same object as
# df_test, so the new column also appears on df_test. Use df_test.copy() if the
# two frames are meant to be independent.
df_kmeans_test = df_test

# Translate each predicted cluster id into the label name assigned to that cluster
df_kmeans_test['kmeans_preds'] = [
    reference_labels_verbal[cluster_id] for cluster_id in test_preds
]

In [22]:
# Inspect the test frame with the k-means predictions next to the true labels
df_kmeans_test

Unnamed: 0,path,label,kmeans_preds
0,paper\paper70.jpg,paper,cardboard
1,paper\paper380.jpg,paper,glass
2,cardboard\cardboard31.jpg,cardboard,cardboard
3,glass\glass12.jpg,glass,paper
4,paper\paper169.jpg,paper,paper
...,...,...,...
426,metal\metal389.jpg,metal,cardboard
427,paper\paper303.jpg,paper,plastic
428,paper\paper405.jpg,paper,glass
429,paper\paper465.jpg,paper,glass


In [23]:
# Test accuracy: share of test images whose mapped cluster label equals the true label
kmeans_hits = df_test.label == df_kmeans_test.kmeans_preds
kmeans_6_test_acc = sum(kmeans_hits) / len(kmeans_hits)
kmeans_6_test_acc

0.3805104408352668