# Extracting VGG19 features for the train, validation, and test sets and saving them as pickle files

In [None]:
#Creating lists of all image paths for train, validation, and test data.
from os import walk
from os.path import normpath, basename
import os
import pandas as pd

def collect_image_paths(root_dir):
    """Walk ``root_dir`` and list every file with the name of its parent folder.

    Args:
        root_dir: Directory to scan recursively.

    Returns:
        A ``(path_list, dir_list)`` tuple of equal-length lists: the full path
        of every file found, and the name of the directory that directly
        contains that file. Both lists are empty when ``root_dir`` does not
        exist, because ``os.walk`` yields nothing in that case.
    """
    path_list = []
    dir_list = []
    for dirpath, _dirnames, filenames in walk(root_dir):
        # Every file in this directory shares the same parent folder name.
        folder_name = basename(normpath(dirpath))
        for filename in filenames:
            dir_list.append(folder_name)
            path_list.append(os.path.join(dirpath, filename))
    return path_list, dir_list


data_dir = os.path.join(os.getcwd(), 'BlobStorage')
train_data_dir = os.path.join(data_dir, 'train_data')
validation_data_dir = os.path.join(data_dir, 'validation_data')
test_data_dir = os.path.join(data_dir, 'test_data')

# One (paths, parent-folder names) pair per split; the same helper replaces
# three copy-pasted walk loops.
train_path_list, train_dir_list = collect_image_paths(train_data_dir)
val_path_list, val_dir_list = collect_image_paths(validation_data_dir)
test_path_list, test_dir_list = collect_image_paths(test_data_dir)


In [None]:
#Converting lists into dataframes
import pandas as pd

# One frame per split: the image path and the class name taken from the
# matching *_dir_list, added as columns in that order.
train_data_df = pd.DataFrame().assign(image_paths=train_path_list, class_name=train_dir_list)
val_data_df = pd.DataFrame().assign(image_paths=val_path_list, class_name=val_dir_list)
test_data_df = pd.DataFrame().assign(image_paths=test_path_list, class_name=test_dir_list)

# (rows, columns) of each split, in train / validation / test order.
print(train_data_df.shape)
print(val_data_df.shape)
print(test_data_df.shape)

In [None]:
#Function for extracting features using VGG19 for a given image path
from keras.applications.vgg19 import VGG19
from keras.preprocessing import image
from keras.applications.vgg19 import preprocess_input
from keras.models import Model
import numpy as np
from PIL import ImageFile
# Ask PIL to load truncated image files rather than fail on them.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# VGG19 with ImageNet weights; `model` exposes the output of its 'fc2' layer
# so that predictions are feature vectors instead of class scores.
base_model = VGG19(weights='imagenet')
#print(base_model.summary())
model = Model(
    inputs=base_model.input,
    outputs=base_model.get_layer('fc2').output,
)

def feature_extract_vgg19(image_path, model_keras):
    """Return the feature vector ``model_keras`` predicts for a single image.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis, and passed through the VGG19 ``preprocess_input`` before
    prediction.

    Args:
        image_path: Path of the image file to load.
        model_keras: Model whose ``predict`` is called on the one-image batch.

    Returns:
        The first (and only) row of the prediction output.
    """
    loaded = image.load_img(image_path, target_size=(224, 224))
    batch = np.expand_dims(image.img_to_array(loaded), axis=0)
    predictions = model_keras.predict(preprocess_input(batch))
    return predictions[0]

In [None]:
#print(val_data_df.shape)
#print(val_data_df.columns)
#sub_df = val_data_df[val_data_df['class_name']=='valley']
#print(sub_df.shape)

#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)
#pd.set_option('display.width', None)
#pd.set_option('display.max_colwidth', -1)

#new_sub_df = sub_df[~sub_df.image_paths.str.contains('/.jpg',regex = False)]

#print(new_sub_df.shape)
#print(new_sub_df['image_paths'])

# Drop rows whose path contains the literal text '/.jpg', i.e. a path
# component that starts with '.jpg'.
# .copy() makes each filtered frame an independent object, so adding the
# 'img_features' column afterwards does not raise SettingWithCopyWarning.
train_data_df = train_data_df[~train_data_df.image_paths.str.contains('/.jpg', regex=False)].copy()

val_data_df = val_data_df[~val_data_df.image_paths.str.contains('/.jpg', regex=False)].copy()

test_data_df = test_data_df[~test_data_df.image_paths.str.contains('/.jpg', regex=False)].copy()


In [None]:
# Shapes of the train, validation, and test frames, one per line.
print(train_data_df.shape, val_data_df.shape, test_data_df.shape, sep='\n')

In [None]:
#Extracting features and creating dataframe 

# Run every image of each split through the extractor and store the result in
# a new 'img_features' column (train first, then validation, then test).
for split_df in (train_data_df, val_data_df, test_data_df):
    split_df['img_features'] = [
        feature_extract_vgg19(path, model_keras=model)
        for path in split_df['image_paths']
    ]
del split_df

# Shapes after the extra column was added, one per line.
print(train_data_df.shape, val_data_df.shape, test_data_df.shape, sep='\n')

In [None]:
#Saving Train, Validation, and Test features dataframes
import pickle

# Each frame is written into data_dir under its own file name.
for frame, pkl_suffix in (
    (train_data_df, '/train_data_features_df.pkl'),
    (val_data_df, '/val_data_features_df.pkl'),
    (test_data_df, '/test_data_features_df.pkl'),
):
    frame.to_pickle(data_dir + pkl_suffix)
del frame, pkl_suffix

# Training baseline model and saving it (entire data)

In [None]:
#Loading Saved Train and Validation feature dataframes
import pickle
import os 
import pandas as pd
import numpy as np

# Folder that holds the pickled feature frames.
data_dir = os.path.join(os.getcwd(), 'BlobStorage')

# Locations of the train and validation feature pickles inside data_dir.
train_features_file = data_dir + '/train_data_features_df.pkl'
val_features_file = data_dir + '/val_data_features_df.pkl'

train_data_df = pd.read_pickle(train_features_file)
val_data_df = pd.read_pickle(val_features_file)

In [None]:
# reset_index returns a new frame instead of modifying the caller, so the
# result has to be assigned back for the index to actually become 0..n-1.
train_data_df = train_data_df.reset_index(drop=True)
print(len(train_data_df))
print(train_data_df.index)

In [None]:
# Split the training frame into positional row chunks of 100,000 rows each.
# The last chunk is open-ended: the previous fixed upper bound (615827) would
# silently drop any rows beyond it if the frame were longer.
df1 = train_data_df.iloc[0:100000]
df2 = train_data_df.iloc[100000:200000]
df3 = train_data_df.iloc[200000:300000]
df4 = train_data_df.iloc[300000:400000]
df5 = train_data_df.iloc[400000:500000]
df6 = train_data_df.iloc[500000:]

In [None]:
def expand_feature_column(chunk):
    """Spread the arrays in ``chunk['img_features']`` into one column per element.

    All row arrays are stacked into a single 2-D array in one call, instead of
    constructing a ``pd.Series`` for every row as ``.apply(pd.Series)`` does.

    Args:
        chunk: DataFrame with an ``img_features`` column; assumes every entry
            is a 1-D array of the same length.

    Returns:
        DataFrame that keeps ``chunk``'s index and has one integer-labelled
        column per array element; a column-less frame with that index when
        ``chunk`` has no rows (``np.vstack`` cannot stack an empty sequence).
    """
    if chunk.empty:
        return pd.DataFrame(index=chunk.index)
    stacked = np.vstack(chunk['img_features'].to_numpy())
    return pd.DataFrame(stacked, index=chunk.index)


df1_train = expand_feature_column(df1)
df2_train = expand_feature_column(df2)
df3_train = expand_feature_column(df3)
df4_train = expand_feature_column(df4)
df5_train = expand_feature_column(df5)
df6_train = expand_feature_column(df6)

In [None]:
# Stack the six expanded chunks row-wise, in their original order.
X_train = pd.concat(
    [df1_train, df2_train, df3_train, df4_train, df5_train, df6_train],
    axis=0,
)

In [None]:
print(len(X_train))

# The chunk frames are no longer needed once X_train exists; unbind them.
del df1_train, df2_train, df3_train, df4_train, df5_train, df6_train
del df1, df2, df3, df4, df5, df6

In [None]:
#import multiprocessing as mp

#print(train_data_df.shape)

#p = mp.Pool(mp.cpu_count())

In [None]:
#Splitting array column of features into multiple columns

#X_train = train_data_df.img_features.apply(pd.Series)
#X_train = p.map(pd.Series, train_data_df['img_features'])
# Class names of both splits as pandas categoricals.
y_train = train_data_df.class_name.astype('category')
y_val = val_data_df.class_name.astype('category')

# Expand each validation row's feature array into one column per element.
X_val = val_data_df['img_features'].apply(pd.Series)

# Train rows followed by validation rows, for features and labels alike.
X_train_val = pd.concat([X_train, X_val], axis=0)
y_train_val = pd.concat([y_train, y_val], axis=0)

In [None]:
#Saving Train and Validation features dataframes
import pickle

# Write the combined features and labels into data_dir.
for combined, pkl_suffix in (
    (X_train_val, '/train_val_x_df.pkl'),
    (y_train_val, '/train_val_y_df.pkl'),
):
    combined.to_pickle(data_dir + pkl_suffix)
del combined, pkl_suffix

In [None]:
import gc
# Run a full garbage-collection pass; this only reclaims objects that are no
# longer referenced by any name.
gc.collect()

In [1]:
import pickle
import os 
import pandas as pd
import numpy as np

# Folder that holds the pickled frames.
data_dir = os.path.join(os.getcwd(), 'BlobStorage')

# Reload the combined features first, then the combined labels.
X_train_val, y_train_val = (
    pd.read_pickle(data_dir + pkl_suffix)
    for pkl_suffix in ('/train_val_x_df.pkl', '/train_val_y_df.pkl')
)

In [None]:
#Training RandomForest Classifier
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest fitted on the combined train + validation data.
# max_features='sqrt' is what the former 'auto' alias meant for classifiers;
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
mdl_rf = RandomForestClassifier(
    n_estimators=1000,
    random_state=0,
    verbose=1,
    n_jobs=-1,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=60,
    bootstrap=False,
)

clf_fit = mdl_rf.fit(X_train_val, y_train_val)

# Persist the fitted model under the file name that the prediction section
# loads; without this step that file is never produced by the notebook.
with open(data_dir+'/rf_baseline_trained.sav', 'wb') as model_file:
    pickle.dump(clf_fit, model_file)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed: 19.3min


# Using saved baseline model to predict

In [None]:
#Loading Saved Test feature dataframe
import pickle
import os 
import pandas as pd
import numpy as np

# Folder that holds the pickled feature frames.
data_dir = os.path.join(os.getcwd(), 'BlobStorage')

# Location of the test feature pickle inside data_dir.
test_features_file = data_dir + '/test_data_features_df.pkl'
test_data_df = pd.read_pickle(test_features_file)

In [None]:
#Loading Saved baseline Model

# Open the file in a with-block so the handle is closed once the model is read.
# NOTE: pickle can execute arbitrary code while loading; only load model files
# that were produced by a trusted source.
with open(data_dir+'/rf_baseline_trained.sav', 'rb') as model_file:
    clf_fit = pickle.load(model_file)

In [None]:
# Expand each test row's feature array into one column per element.
X_test = test_data_df['img_features'].apply(pd.Series)
# Class names of the test rows as a pandas categorical.
y_test = test_data_df.class_name.astype('category')

In [None]:
# Predict a class for every row of the test feature matrix with the loaded model.
yhat_clf = clf_fit.predict(X_test)

In [None]:
# Print the classification report and the accuracy score of the test predictions
from sklearn.metrics import classification_report 
from sklearn.metrics import accuracy_score 

print(classification_report(y_test, yhat_clf))
print(accuracy_score(y_test, yhat_clf))

In [None]:
# Class probability estimates for every test row; the columns are looked up
# against clf_fit.classes_ when the top predictions are extracted.
yhat_clf_prob = clf_fit.predict_proba(X_test)

In [None]:
# argsort sorts ascending, so the last three columns hold the three most
# probable classes with the best one LAST; reverse them so that column 0
# ('top1') is the most probable class and column 2 ('top3') the third.
top_n_predictions = np.argsort(yhat_clf_prob, axis=1)[:, -3:][:, ::-1]

# Translate the probability-column positions into the model's class labels.
top_class = clf_fit.classes_[top_n_predictions]
# Reuse y_test's index so the index-based merge below pairs every prediction
# with its own row even when that index is not 0..n-1 (e.g. after filtering).
top_class_df = pd.DataFrame(
    data=top_class,
    columns=['top1', 'top2', 'top3'],
    index=y_test.index,
)
print(top_class_df.shape)
print(top_class_df.head(10))

# Put the true test labels next to the top-3 predicted classes.
results = pd.merge(pd.DataFrame(y_test), top_class_df, left_index=True, right_index=True)
print(results.shape)
print(results.head(10))