# Training a classifier for each value of K using saved features

In [1]:
import os 
import numpy as np 
import pickle
import pandas as pd
import gc


# All pickled feature frames are read from the "BlobStorage" folder under the
# current working directory.
data_dir = os.path.join(os.getcwd(), 'BlobStorage')

# Read the training split first, then the validation split.
train_data_df, val_data_df = map(
    pd.read_pickle,
    (data_dir+'/train_data_features_df.pkl', data_dir+'/val_data_features_df.pkl'),
)

In [2]:
# Stack the training rows and the validation rows (in that order) into the
# single frame that every per-K classifier is trained on.
train_val_data_df = pd.concat((train_data_df, val_data_df))

In [3]:
# Load the test split. Each row's `img_features` entry is converted to a
# pd.Series, so X_test ends up with one column per feature element.
test_data_df = pd.read_pickle(data_dir+'/test_data_features_df.pkl')
X_test = test_data_df['img_features'].apply(pd.Series)
# The test labels are not used in this section:
#y_test = test_data_df['class_name'].astype('category')

In [None]:
# Sanity check: print the (rows, columns) shape of the train, validation and
# test frames, in that order.
for split_df in (train_data_df, val_data_df, test_data_df):
    print(split_df.shape)

In [4]:
#Training a classifier for each value of K.
from sklearn.ensemble import RandomForestClassifier

# Each non-empty line of clusterCenters.txt has the form
#   <modelName> <class> <class> ...
# Reading inside a context manager guarantees the handle is closed even when
# training aborts part-way through (e.g. with a MemoryError).
with open("fasttext/clusterCenters.txt",'r') as f:
    lines = f.readlines()

# Every prediction frame is indexed by test image path; hoisted out of the loop.
test_index = test_data_df['image_paths']

# NOTE(review): the first 12 lines are skipped here -- presumably those models
# were handled in an earlier run. Confirm before re-running from scratch.
for line in lines[12:]:

    line = line.split()
    if not line:
        # Blank line (e.g. a trailing newline at end of file): nothing to train.
        continue
    modelName = line[0]
    classesNow = line[1:]
    print(modelName)

    #Subsetting dataframe for only the classes being used now.
    train_now_df = train_val_data_df[train_val_data_df['class_name'].isin(classesNow)]

    X_train_val = train_now_df.img_features.apply(pd.Series)
    y_train_val = train_now_df['class_name'].astype('category')

    #training randomforest
    # max_features='sqrt' is what the former 'auto' setting meant for
    # classifiers; 'auto' is rejected by recent scikit-learn releases.
    mdl_rf = RandomForestClassifier(
        n_estimators=700,
        random_state=0,
        verbose=1,
        n_jobs=-1,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='sqrt',
        max_depth=40,
        bootstrap=False,
    )

    # fit() returns the estimator itself, so clf_fit and mdl_rf are the same object.
    clf_fit = mdl_rf.fit(X_train_val, y_train_val)

    #Saving baseline model (disabled)
    #pickle.dump(clf_fit, open('trained_models/'+ modelName + '.sav', 'wb'))

    # evaluate the model on test data: single most likely class per image
    yhat_clf = clf_fit.predict(X_test)

    pred_df = pd.DataFrame(data=yhat_clf, index=test_index, columns=['max_prob'])
    pred_df.to_pickle('predictions/'+modelName+'.pkl')

    #Finding prob predictions for all classes (one column per class seen in training)
    yhat_clf_prob = clf_fit.predict_proba(X_test)

    pred_df = pd.DataFrame(data=yhat_clf_prob, index=test_index, columns=clf_fit.classes_)
    pred_df.to_pickle('predictions/all_categories/'+modelName+'.pkl')

    # Drop *both* references to the forest (clf_fit and mdl_rf alias it) plus
    # the per-model arrays, so the memory is really released before the next
    # model's features are expanded.
    del clf_fit, mdl_rf, train_now_df, X_train_val, y_train_val
    del yhat_clf, yhat_clf_prob, pred_df
    gc.collect()

model70


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   58.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed: 19.4min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    1.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    5.1s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:   11.7s
[Parallel(n_jobs=12)]: Done 700 out of 700 | elapsed:   18.9s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    4.9s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:   11.4s
[Parallel(n_jobs=12)]: Done 700 out of 700 | elapsed: 

model75


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 15.3min


MemoryError: could not allocate 78643200 bytes

# Using the trained classifiers to predict on test data for each K and saving the predictions

In [None]:
import os 
import numpy as np 
import pickle
import pandas as pd

# Feature pickles are read from the "BlobStorage" folder under the current
# working directory.
data_dir = os.path.join(os.getcwd(), 'BlobStorage')

# Only the test split is needed in this section.
test_features_path = data_dir+'/test_data_features_df.pkl'
test_data_df = pd.read_pickle(test_features_path)

In [None]:
# Feature matrix: each row's `img_features` entry is converted to a pd.Series,
# giving one column per feature element.
X_test = test_data_df['img_features'].apply(pd.Series)
# Labels of the test rows, stored with the pandas 'category' dtype.
y_test = test_data_df['class_name'].astype('category')

In [None]:
# Each non-empty line of clusterCenters.txt has the form
#   <modelName> <class> <class> ...
# The context manager closes the file even if a later step raises.
with open("fasttext/clusterCenters.txt",'r') as f:
    lines = f.readlines()

# Every prediction frame is indexed by test image path; hoisted out of the loop.
test_index = test_data_df['image_paths']

for line in lines:
    line = line.split()
    if not line:
        # Blank line (e.g. a trailing newline at end of file): no model to load.
        continue
    modelName = line[0]
    classesNow = line[1:]
    print(modelName)

    # Load the fitted classifier saved for this model name. Closing the handle
    # explicitly avoids leaking one open file per model.
    # NOTE: pickle.load executes arbitrary code -- only load files you produced.
    with open('trained_models/'+ modelName + '.sav', 'rb') as model_file:
        clf_fit = pickle.load(model_file)

    # evaluate the model on test data: single most likely class per image
    yhat_clf = clf_fit.predict(X_test)

    pred_df = pd.DataFrame(data=yhat_clf, index=test_index, columns=['max_prob'])
    pred_df.to_pickle('predictions/'+modelName+'.pkl')

    #Finding prob predictions for all classes (one column per class known to the model)
    yhat_clf_prob = clf_fit.predict_proba(X_test)

    pred_df = pd.DataFrame(data=yhat_clf_prob, index=test_index, columns=clf_fit.classes_)
    pred_df.to_pickle('predictions/all_categories/'+modelName+'.pkl')

# Generating close word dict from FastText for each K

In [None]:
#Finding closest words to top predictions on testing set
import math
import pickle
from scipy.spatial import distance
#from itertools import islice

#def take(n, iterable):
#    "Return first n items of the iterable as a list"
#    return list(islice(iterable, n))

def scipy_distance(v, u):
    """Return the Euclidean distance between the vectors ``v`` and ``u``.

    Thin wrapper around ``scipy.spatial.distance.euclidean``.
    """
    euclidean_dist = distance.euclidean(v, u)
    return euclidean_dist

#Reading the fasttext dictionary populated at clustering phase
# The context manager closes the pickle file as soon as it has been read.
# NOTE: pickle.load executes arbitrary code -- only load files you produced.
with open("fasttext/fastext_dict.pkl","rb") as dict_file:
    fastext_dict = pickle.load(dict_file)
print(len(fastext_dict))
#print(fastext_dict.keys())
#print(fastext_dict['car'])

#total_classes = 379

# Snapshot of the keys as a list; the later cells iterate over it repeatedly.
dict_keys = list(fastext_dict.keys())

In [None]:
#Generating the close words dictionary for all dictionary keys

# Number of nearest words kept for every key.
closeWords_Count = 6

# Maps each key of fastext_dict to the list of its closest keys, nearest first.
closeWord_dict = {}

for word in dict_keys:
    word_vec = fastext_dict[word]
    # Rank every key by Euclidean distance to `word`. sorted() is stable, so
    # keys at equal distance keep their dict_keys order. `word` itself is one
    # of the candidates (distance 0 to its own vector).
    nearest_words = sorted(
        dict_keys,
        key=lambda fast_word: scipy_distance(word_vec, fastext_dict[fast_word]),
    )
    closeWord_dict[word] = nearest_words[:closeWords_Count]

# Write inside a context manager so the file is flushed and closed
# deterministically instead of whenever the handle is garbage-collected.
with open('close_word_dict/closeWord_dict.pkl', 'wb') as out_file:
    pickle.dump(closeWord_dict, out_file)

In [None]:
#Generating the close words dictionary for each model

# Number of nearest words kept for every class.
closeWords_Count = 6

# Each non-empty line of clusterCenters.txt has the form
#   <modelName> <class> <class> ...
# The context manager closes the file even if a later step raises.
with open("fasttext/clusterCenters.txt",'r') as f:
    lines = f.readlines()

for line in lines:

    line = line.split()
    if not line:
        # Blank line (e.g. a trailing newline at end of file): nothing to do.
        continue
    modelName = line[0]
    print(modelName)
    classesNow = line[1:]

    # Maps each class of this model to the list of its closest fastText keys,
    # nearest first.
    closeWord_dict = {}

    for word in classesNow:
        word_vec = fastext_dict[word]
        # Candidates are *all* fastText keys, not only this model's classes.
        # sorted() is stable, so keys at equal distance keep dict_keys order.
        nearest_words = sorted(
            dict_keys,
            key=lambda fast_word: scipy_distance(word_vec, fastext_dict[fast_word]),
        )
        closeWord_dict[word] = nearest_words[:closeWords_Count]

    # One pickle per model; the context manager flushes and closes it
    # deterministically.
    with open('close_word_dict/'+ modelName + '_closeWord_dict.pkl', 'wb') as out_file:
        pickle.dump(closeWord_dict, out_file)

# Running final predictions from classifier and close word dict

In [1]:
import os 
import numpy as np 
import pickle
import pandas as pd

# Feature pickles are read from the "BlobStorage" folder under the current
# working directory.
data_dir = os.path.join(os.getcwd(),'BlobStorage')

test_data_df = pd.read_pickle(data_dir+'/test_data_features_df.pkl')
# True labels as a one-column frame ('class_name') indexed by image path, so
# they can be merged with the prediction frames on the index.
y_test_df = pd.DataFrame(test_data_df.set_index('image_paths').class_name)

# Close-word lists computed over all fastText keys. The context manager closes
# the pickle file as soon as it has been read.
# NOTE: pickle.load executes arbitrary code -- only load files you produced.
with open('close_word_dict/closeWord_dict.pkl',"rb") as dict_file:
    closeWord_dict = pickle.load(dict_file)

In [2]:
#Running final predictions for top 3 predictions from classifier

# Each non-empty line of clusterCenters.txt starts with the model name.
with open("fasttext/clusterCenters.txt",'r') as f:
    lines = f.readlines()

# One "<modelName>,<accuracy>" line is written per model. Both files are
# handled by context managers so they are closed even if a model fails.
with open("Kmodels_final_accuracy.txt", "w") as h:

    # NOTE(review): only the first 12 models are scored -- presumably the ones
    # whose prediction files exist. Confirm before extending to all lines.
    for line in lines[0:12]:

        line = line.split()
        if not line:
            # Blank line: no model to score.
            continue
        modelName = line[0]
        print(modelName)

        #Reading the predictions for each model (rows: images, columns: class probabilities)
        pred_df = pd.read_pickle('predictions/all_categories/'+modelName+'.pkl')

        #Finding top 3 predictions
        # argsort is ascending, so the three largest probabilities are the last
        # three positions; reversing them makes 'top1' the most probable class.
        top_n_predictions = np.argsort(pred_df.values, axis = 1)[:, -3:][:, ::-1]
        #then find the associated class name for each prediction
        # Index through a plain array: indexing a pandas Index with a 2-D key
        # is not supported by recent pandas versions.
        top_class = pred_df.columns.values[top_n_predictions]
        top_class_df = pd.DataFrame(data=top_class,columns=['top1','top2','top3'],index = pred_df.index)

        results = pd.merge(y_test_df, top_class_df, left_index=True, right_index=True)

        #closeWord_dict = pickle.load(open('close_word_dict/'+ modelName + '_closeWord_dict.pkl',"rb"))

        # Replace each predicted class by its list of close words.
        results['guesses_1'] = results['top1'].map(closeWord_dict)
        results['guesses_2'] = results['top2'].map(closeWord_dict)
        results['guesses_3'] = results['top3'].map(closeWord_dict)

        # A row counts as correct (1) when the true class appears in the close
        # words of any of the three predictions, otherwise 0.
        results['pred_check'] = [
            int(any(actual in guesses for guesses in (g1, g2, g3)))
            for actual, g1, g2, g3 in zip(
                results['class_name'],
                results['guesses_1'],
                results['guesses_2'],
                results['guesses_3'],
            )
        ]

        total_right = results['pred_check'].sum()
        # NOTE(review): the denominator is the prediction count, not
        # len(results); the two agree only if the merge keeps every row once.
        total_rows = len(pred_df)
        accuracy = round(total_right/total_rows,4)

        h.write(str(modelName) + ',' + str(accuracy) + '\n')

model10
model15
model20
model25
model30
model35
model40
model45
model50
model55
model60
model65


In [None]:
#Running final predictions for single predictions

# Each non-empty line of clusterCenters.txt starts with the model name.
with open("fasttext/clusterCenters.txt",'r') as f:
    lines = f.readlines()

# One "<modelName>,<accuracy>" line is written per model. Both files are
# handled by context managers so they are closed even if a model fails.
with open("Kmodels_singlePred_final_accuracy.txt", "w") as h:

    for line in lines:

        line = line.split()
        if not line:
            # Blank line: no model to score.
            continue
        modelName = line[0]
        print(modelName)

        #Reading the predictions for each model (column 'max_prob' holds the predicted class)
        pred_df = pd.read_pickle('predictions/'+modelName+'.pkl')

        results = pd.merge(y_test_df, pred_df, left_index=True, right_index=True)

        # Per-model close-word lists. This rebinds the notebook-level name
        # closeWord_dict on every iteration.
        # NOTE: pickle.load executes arbitrary code -- only load files you produced.
        with open('close_word_dict/'+ modelName + '_closeWord_dict.pkl',"rb") as dict_file:
            closeWord_dict = pickle.load(dict_file)

        # Replace the predicted class by its list of close words.
        results['guesses'] = results['max_prob'].map(closeWord_dict)

        # A row counts as correct (1) when the true class appears among the
        # close words of the predicted class, otherwise 0.
        results['pred_check'] = [
            1 if actual in guesses else 0
            for actual, guesses in zip(results['class_name'], results['guesses'])
        ]

        total_right = results['pred_check'].sum()
        # NOTE(review): the denominator is the prediction count, not
        # len(results); the two agree only if the merge keeps every row once.
        total_rows = len(pred_df)
        accuracy = round(total_right/total_rows,4)

        h.write(str(modelName) + ',' + str(accuracy) + '\n')