# Creating Train, Validation Features and Saving

In [None]:
#Reading ClusterCenters and finding the classes to train on.
import os 
import numpy as np 

data_dir = os.path.join(os.getcwd(),'BlobStorage')
validation_data_dir = os.path.join(data_dir, 'validation_data')
train_data_dir = os.path.join(data_dir, 'train_data')
test_data_dir = os.path.join(data_dir, 'test_data')
f = open("fasttext/clusterCenters.txt",'r')

lines = f.readlines()

#print(lines)

line = lines[0].split()
print(line)
modelName = line[0]
kValue = int(modelName[5:])
classesNow = line[1:]
print(modelName)
print(kValue)
print(classesNow)
f.close() 

In [None]:
#Creating list of all image paths for train and validation data.
from os import walk
from os.path import normpath, basename

train_path_list = []
train_dir_list = []
val_path_list = []
val_dir_list = []

#Generating Train data path list
for (dirpath, dirnames, filenames) in walk(train_data_dir):
    for filename in filenames:
        train_dir_list.append(basename(normpath(dirpath)))
        train_path_list.append(os.path.join(dirpath, filename))
    
#print(len(train_path_list))
#print(train_path_list[2])
#print(len(train_dir_list))
#print(train_dir_list[2])

#Generating Validation data path list
for (dirpath, dirnames, filenames) in walk(validation_data_dir):
    for filename in filenames:
        val_dir_list.append(basename(normpath(dirpath)))
        val_path_list.append(os.path.join(dirpath, filename))
    
#print(len(val_path_list))
#print(val_path_list[2])

In [None]:
#Converting lists into dataframes
import pandas as pd

train_data_df = pd.DataFrame()
train_data_df['image_paths'] = train_path_list
train_data_df['class_name'] = train_dir_list

val_data_df = pd.DataFrame()
val_data_df['image_paths'] = val_path_list
val_data_df['class_name'] = val_dir_list

#train_data_df.describe()
#val_data_df.describe()
#print(train_data_df.loc[[243000]])
#print(val_data_df.loc[[15100]])

print(train_data_df.shape)
print(val_data_df.shape)

#Subsetting dataframes for only the classes being used now.
train_data_df = train_data_df[train_data_df['class_name'].isin(classesNow)]
val_data_df = val_data_df[val_data_df['class_name'].isin(classesNow)]

print(train_data_df.shape)
print(val_data_df.shape)

In [None]:
#Checking if GPU is available
#from keras import backend as K
#K.tensorflow_backend._get_available_gpus()



In [None]:
#Function for extracting features using VGG19 for a given image path
from keras.applications.vgg19 import VGG19
from keras.preprocessing import image
from keras.applications.vgg19 import preprocess_input
from keras.models import Model
import numpy as np
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

base_model = VGG19(weights='imagenet')
#print(base_model.summary())
model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output)

def feature_extract_vgg19(image_path,model_keras):

    img_path = image_path
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)

    flatten_features = model_keras.predict(x)
    return(flatten_features[0])

In [None]:
#Extracting features and creating dataframe 

train_data_df['img_features'] = [feature_extract_vgg19(x,model_keras = model) for x in train_data_df['image_paths']]

val_data_df['img_features'] = [feature_extract_vgg19(x,model_keras = model) for x in val_data_df['image_paths']]

print(train_data_df.shape)
print(val_data_df.shape)

In [None]:
#Saving Train and Validation features dataframes
import pickle

train_data_df.to_pickle(data_dir+'/train_data_df.pkl')
val_data_df.to_pickle(data_dir+'/val_data_df.pkl')

# Reading Train,Validation feature dataframes and Training SVM

In [None]:
#Loading Saved Train and Validation feature dataframes
import pickle
import os 
import pandas as pd
import numpy as np

data_dir = os.path.join(os.getcwd(),'BlobStorage')

train_data_df = pd.read_pickle(data_dir+'/train_data_df.pkl')
val_data_df = pd.read_pickle(data_dir+'/val_data_df.pkl')

In [None]:
#Splitting array column of features into multiple columns

#feature_set = np.split(train_data_df['img_features'],len(train_data_df),axis = 0)
#feature_set

X_train = train_data_df.img_features.apply(pd.Series)
y_train = train_data_df['class_name'].astype('category')

X_val = val_data_df.img_features.apply(pd.Series)
y_val = val_data_df['class_name'].astype('category')

X_train_val = pd.concat([X_train,X_val])
y_train_val = pd.concat([y_train,y_val])

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(df3, train_data_df['class_name'], random_state = 0) 

In [None]:
print(X_train.shape)
print(X_val.shape)
print(X_train_val.shape)
print(y_train_val.shape)

In [None]:
from sklearn.svm import SVC 
  
# training a linear SVM classifier 
#svm_model_linear = SVC(kernel = 'linear', C = 1, verbose = 1)

svm_model_linear = SVC(kernel = 'rbf', C = 10, verbose = 1,probability = True)

clf_fit = svm_model_linear.fit(X_train_val, y_train_val)

#from sklearn.ensemble import RandomForestClassifier

#training randomforest
#mdl_rf = RandomForestClassifier(n_estimators=1000,random_state=0,verbose=1,n_jobs=-1, min_samples_split= 2, min_samples_leaf= 1, max_features= 'auto', max_depth= 60, bootstrap= False)
    
#clf_fit = mdl_rf.fit(X_train_val, y_train_val)



In [None]:
#GridSearch for SVM and RandomForest
from sklearn.model_selection import GridSearchCV
#from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier

#param_grid = {'C': [0.1,1, 10, 100],'kernel': ['rbf', 'linear', 'sigmoid']}

#grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
#grid.fit(X_train_val,y_train_val)
#print(grid.best_estimator_)



# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [5,30,90],
    'max_features': [2, 3],
    'min_samples_leaf': [2,5,10],
    'min_samples_split': [5,10,15,100],
    'n_estimators': [500,1000,1500],
}
# Create a based model
rf = RandomForestClassifier(random_state=0,verbose=1,n_jobs=-1)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X_train_val, y_train_val)
grid_search.best_params_


In [None]:
#Using RamdomizedSearchCV for RF parameter tuning

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
              'n_jobs': [-1]}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 2, verbose=2, random_state=42, n_jobs = None)
# Fit the random search model
rf_random.fit(X_train_val, y_train_val)

print(rf_random.best_params_)

In [None]:
#from sklearn.metrics import confusion_matrix 
# model accuracy for X_test   
#predictions = clf_fit.predict(X_train) 
  
# creating a confusion matrix 
#cm = confusion_matrix(y_train, predictions) 
#print(cm)

#from sklearn.metrics import classification_report 

#print(classification_report(y_train, predictions))

In [None]:
print(rf_random.best_params_)

In [None]:
#from sklearn.calibration import CalibratedClassifierCV

# calibrate model on validation data
#calibrator = CalibratedClassifierCV(clf_fit, cv='prefit').fit(X_val, y_val)

# evaluate the model
#yhat = calibrator.predict(testX)

In [None]:
#Saving SVM and Calibrator
#clf_fit.to_pickle(data_dir+'/rf_trained.pkl')
#calibrator.to_pickle(data_dir+'/calibrator_trained.pkl')

#pickle.dump(clf_fit, open(data_dir+'/rf_trained.sav', 'wb'))

pickle.dump(clf_fit, open(data_dir+'/svm_trained_prob.sav', 'wb'))

#pickle.dump(calibrator, open(data_dir+'/calibrator_trained.sav', 'wb'))

# Loading Test data,extracting Features and using saved model to predict

In [None]:
#Creating list of all image paths for test data
from os import walk
from os.path import normpath, basename
import pickle
import os 
import pandas as pd
import numpy as np


test_path_list = []
test_dir_list = []

data_dir = os.path.join(os.getcwd(),'BlobStorage')
test_data_dir = os.path.join(data_dir, 'test_data_20')

#Generating Train data path list
for (dirpath, dirnames, filenames) in walk(test_data_dir):
    for filename in filenames:
        test_dir_list.append(basename(normpath(dirpath)))
        test_path_list.append(os.path.join(dirpath, filename))

In [None]:
#Converting lists into dataframes
test_data_df = pd.DataFrame()
test_data_df['image_paths'] = test_path_list
test_data_df['class_name'] = test_dir_list

print(test_data_df.shape)

In [None]:
#Function for extracting features using VGG19 for a given image path
from keras.applications.vgg19 import VGG19
from keras.preprocessing import image
from keras.applications.vgg19 import preprocess_input
from keras.models import Model
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

base_model = VGG19(weights='imagenet')
#print(base_model.summary())
model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output)

def feature_extract_vgg19(image_path,model_keras):

    img_path = image_path
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)

    flatten_features = model_keras.predict(x)
    return(flatten_features[0])

In [None]:
#Extracting features and creating dataframe

test_data_df['img_features'] = [feature_extract_vgg19(x,model_keras = model) for x in test_data_df['image_paths']]

test_data_df.to_pickle(data_dir+'/test_data_df.pkl')

In [10]:
#Loading Saved Test feature dataframe
import pickle
import os 
import pandas as pd
import numpy as np

data_dir = os.path.join(os.getcwd(),'BlobStorage')

test_data_df = pd.read_pickle(data_dir+'/test_data_df.pkl')

In [11]:
#Loading Saved Models
#svm_model_linear = pickle.load(data_dir+'/svm_trained.pkl')
#calibrator = pickle.load(data_dir+'/calibrator_trained.pkl')

clf_fit = pickle.load(open(data_dir+'/rf_trained.sav', 'rb')) 

#clf_fit = pickle.load(open(data_dir+'/svm_trained.sav', 'rb'))
#clf_fit = pickle.load(open(data_dir+'/svm_trained_prob.sav', 'rb'))
#calibrator = pickle.load(open(data_dir+'/calibrator_trained.sav', 'rb'))

In [3]:
X_test = test_data_df.img_features.apply(pd.Series)
y_test = test_data_df['class_name'].astype('category')

In [12]:
# evaluate the model on test data
yhat_clf = clf_fit.predict(X_test)
#yhat_calibrator = calibrator.predict(X_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:    0.7s finished


In [None]:
# creating a confusion matrix on predictions
#from sklearn.metrics import confusion_matrix 
#cm = confusion_matrix(y_test, yhat)
#print(cm)

from sklearn.metrics import classification_report 
from sklearn.metrics import accuracy_score 

print(classification_report(y_test, yhat_clf))
print(accuracy_score(y_test, yhat_clf))

#print(classification_report(y_test, yhat_calibrator))
#print(accuracy_score(y_test, yhat_calibrator))

In [None]:
print(clf_fit.classes_)



In [13]:
yhat_clf_prob = clf_fit.predict_proba(X_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:    0.7s finished


In [14]:
top_n_predictions = np.argsort(yhat_clf_prob, axis = 1)[:,-3:]
    
#then find the associated SOC code for each prediction
top_class = clf_fit.classes_[top_n_predictions]
top_class_df = pd.DataFrame(data=top_class,columns=['top1','top2','top3'])
print(top_class_df.shape)
print(top_class_df.head(10))

    #merge it up with the validation labels and descriptions
results = pd.merge(pd.DataFrame(y_test), top_class_df, left_index=True, right_index=True)
print(results.shape)
print(results.head(10))


(7530, 3)
    top1     top2    top3
0  video  weather  coffee
1  video  weather  coffee
2  video  weather  coffee
3  video  weather  coffee
4  video  weather  coffee
5  video  weather  coffee
6  video  weather  coffee
7  video  weather  coffee
8  video  weather  coffee
9  video  weather  coffee
(7530, 4)
  class_name   top1     top2    top3
0     coffee  video  weather  coffee
1     coffee  video  weather  coffee
2     coffee  video  weather  coffee
3     coffee  video  weather  coffee
4     coffee  video  weather  coffee
5     coffee  video  weather  coffee
6     coffee  video  weather  coffee
7     coffee  video  weather  coffee
8     coffee  video  weather  coffee
9     coffee  video  weather  coffee
