In [136]:
import pandas as pd
import numpy as np
import glob
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

### Before ensembling: single model accuracy

Six models were produced in the transfer learning process: the best model from each cross-validation fold (cv0, cv1, cv2, cv3, cv4) and the best model trained on the whole training dataset. The validation accuracy of the individual models was around 70%. Later we show that an averaging ensemble can lift the accuracy.

In [235]:
# Load the pickled cross-validation score table and display it
# (rows: train/val loss and accuracy; columns: CV rounds 0-4 plus mean and std).
cv_score = pd.read_pickle("/Users/zhanglingling/Desktop/ML1030/boston_train_evaluate/cv_score.pickle")
cv_score 

CV round,0,1,2,3,4,mean,std
train_loss,0.167847,0.070979,0.085303,0.051624,0.052649,0.08568,0.042939
train_acc,0.9471,0.972844,0.970884,0.980929,0.979752,0.970302,0.012226
val_loss,1.802394,1.540489,1.852702,1.808886,1.562203,1.713335,0.133567
val_acc,0.669805,0.672003,0.691994,0.672214,0.692622,0.679727,0.010308


In [236]:
# Load the pickled score table (loss, acc) stored as wholedata_score.pickle and display it.
# NOTE(review): presumably the scores of the model trained on the whole training dataset - confirm.
wholedata_score = pd.read_pickle("/Users/zhanglingling/Desktop/ML1030/boston_train_evaluate/wholedata_score.pickle")
wholedata_score 

Unnamed: 0,loss,acc
0,0.058055,0.978464


## 1. Averaging ensemble
### 1.1 Averaging ensemble of all single models

In [338]:
# Collect every prediction CSV in the prediction directory, in lexicographic order.
predict_dir = '/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/'
file_list = sorted(glob.glob(predict_dir + "*.csv*"))
file_list

['/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/boston.test_bestmodel.hdf5.cv0.prediction.csv',
 '/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/boston.test_bestmodel.hdf5.cv1.prediction.csv',
 '/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/boston.test_bestmodel.hdf5.cv2.prediction.csv',
 '/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/boston.test_bestmodel.hdf5.cv3.prediction.csv',
 '/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/boston.test_bestmodel.hdf5.cv4.prediction.csv',
 '/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/boston.test_bestmodel.wholedata.hdf5.prediction.csv']

In [292]:
# Ground-truth labels of the test images, reduced to the image name and the target.
test_csv = "/Users/zhanglingling/Desktop/ML1030/us_safety/boston_test_fetched_with_target.csv"
target = "safety"
img_name_col = "_file"
test_df = pd.read_csv(test_csv).sort_values(img_name_col)[[img_name_col, target]]

# Build the averaged prediction here so this cell (and the merge cell that follows)
# runs on a fresh kernel; previously `averaged_prediction` only existed as leftover
# kernel state. The per-model prediction CSVs listed in `file_list` are stacked and
# their columns averaged per image.
averaged_prediction = (
    pd.concat([pd.read_csv(f) for f in file_list])
    .groupby(img_name_col)
    .mean()
    .reset_index()
)
# Class 0 when the averaged '0' column exceeds 0.5, otherwise class 1.
averaged_prediction['pred_safety'] = np.where(averaged_prediction['0'] > 0.5, 0, 1)

print(test_df.shape)
print(averaged_prediction.shape)
test_df.head()

(3976, 2)
(3975, 4)


Unnamed: 0,_file,safety
0,gsv_0.jpg,1
1,gsv_1.jpg,1
10,gsv_10.jpg,0
99,gsv_100.jpg,1
992,gsv_1000.jpg,1


Note that `test_df` has 3976 samples whereas `averaged_prediction` has 3975 samples; this is because one image could not be fetched by the Google Street View API.

In [293]:
# Left-join the labels with the averaged predictions and show the label rows
# that have no matching prediction.
df = test_df.merge(averaged_prediction, how='left', on='_file', indicator=True)
df.loc[df['_merge'] == 'left_only']

Unnamed: 0,_file,safety,0,1,pred_safety,_merge
640,gsv_1578.jpg,0,,,,left_only


In [343]:
def matrix_performance(test_df, file_list):
    """Evaluate an averaging ensemble built from per-model prediction CSVs.

    Every file in ``file_list`` is read with ``pd.read_csv``; the rows are grouped
    by ``'_file'`` and the remaining columns are averaged across files. An image
    is predicted as class 0 when its averaged ``'0'`` column exceeds 0.5 and as
    class 1 otherwise (the ``'0'`` column is presumably the model's score for
    class 0 - confirm against the prediction export).

    Parameters
    ----------
    test_df : pandas.DataFrame
        Ground truth; must contain the columns ``'_file'`` and ``'safety'``.
    file_list : list of str
        Paths of the prediction CSVs to average. Each CSV must contain the
        columns ``'_file'`` and ``'0'``.

    Returns
    -------
    matrix : pandas.DataFrame
        One row with the confusion-matrix counts ``tn``, ``fp``, ``fn``, ``tp``.
    performance : pandas.DataFrame
        One row with ``Accuracy``, ``Precision``, ``Recall`` and ``F1 score``.

    Raises
    ------
    ValueError
        If ``file_list`` is empty.
    """
    file_list = list(file_list)
    if not file_list:
        raise ValueError("file_list must contain at least one prediction file")

    # Averaged prediction of the ensemble. groupby sorts by '_file' itself, so the
    # individual frames do not need to be sorted first.
    averaged_prediction = (
        pd.concat([pd.read_csv(f) for f in file_list])
        .groupby('_file')
        .mean()
        .reset_index()
    )
    averaged_prediction['pred_safety'] = np.where(averaged_prediction['0'] > 0.5, 0, 1)

    # Keep only images that have both a label and a prediction. An inner merge
    # yields the same rows as a left merge filtered on _merge == 'both', without
    # assigning into a filtered slice afterwards.
    scored = pd.merge(test_df, averaged_prediction, how='inner', on='_file')
    y_true = scored['safety']
    y_pred = scored['pred_safety'].astype(int)

    # Pin the label order so the matrix is always 2x2 and the unpacking below
    # cannot fail when only one class happens to be present.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    matrix = pd.DataFrame([{'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp}])

    performance = pd.DataFrame([{
        'Accuracy': accuracy_score(y_true, y_pred),    # (tp + tn) / (p + n)
        'Precision': precision_score(y_true, y_pred),  # tp / (tp + fp)
        'Recall': recall_score(y_true, y_pred),        # tp / (tp + fn)
        'F1 score': f1_score(y_true, y_pred),          # 2 tp / (2 tp + fp + fn)
    }])

    return matrix, performance

In [344]:
# matrix_performance reads each prediction CSV itself, so it must receive the list
# of file paths (file_list, all six prediction files), not a list of DataFrames.
conf_matrix, performance = matrix_performance(test_df, file_list)
display(conf_matrix, performance)

Unnamed: 0,fn,fp,tn,tp
0,440,606,1547,1382


Unnamed: 0,Accuracy,F1 score,Precision,Recall
0,0.736855,0.725459,0.695171,0.758507


### 1.2 Averaging ensemble of best models only (cv2, cv4)

In [345]:
# Ensemble restricted to the cv2 and cv4 models.
file_list = [
    '/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/boston.test_bestmodel.hdf5.cv2.prediction.csv',
    '/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/boston.test_bestmodel.hdf5.cv4.prediction.csv',
]
conf_matrix, performance = matrix_performance(test_df=test_df, file_list=file_list)
display(conf_matrix, performance)

Unnamed: 0,fn,fp,tn,tp
0,472,653,1500,1350


Unnamed: 0,Accuracy,F1 score,Precision,Recall
0,0.716981,0.705882,0.673989,0.740944


### 1.3 Averaging ensemble of best models only (cv2, cv4, wholedata)

In [348]:
# Ensemble restricted to the cv2, cv4 and whole-data models.
file_list = [
    '/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/boston.test_bestmodel.hdf5.cv2.prediction.csv',
    '/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/boston.test_bestmodel.hdf5.cv4.prediction.csv',
    '/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/boston.test_bestmodel.wholedata.hdf5.prediction.csv',
]
conf_matrix, performance = matrix_performance(test_df=test_df, file_list=file_list)
display(conf_matrix, performance)

Unnamed: 0,fn,fp,tn,tp
0,478,602,1551,1344


Unnamed: 0,Accuracy,F1 score,Precision,Recall
0,0.728302,0.713376,0.690647,0.737651


### 1.4 Conclusion of averaging ensemble

The results above show that the averaging ensemble performs better than the single models, and that averaging all single models together performs better than averaging only the best models.

## 2. Conditional ensemble
Next, we print the confusion matrix of each single model, design a conditional weighting rule, and check whether a conditional ensemble further boosts model performance.

In [349]:
# Back to the full set of prediction CSVs, in lexicographic order.
predict_dir = '/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/'
file_list = sorted(glob.glob(predict_dir + "*.csv*"))
file_list

['/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/boston.test_bestmodel.hdf5.cv0.prediction.csv',
 '/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/boston.test_bestmodel.hdf5.cv1.prediction.csv',
 '/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/boston.test_bestmodel.hdf5.cv2.prediction.csv',
 '/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/boston.test_bestmodel.hdf5.cv3.prediction.csv',
 '/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/boston.test_bestmodel.hdf5.cv4.prediction.csv',
 '/Users/zhanglingling/Desktop/ML1030/boston_test_prediction/boston.test_bestmodel.wholedata.hdf5.prediction.csv']

In [350]:
# One prediction frame per model, each ordered by image name.
df_list = []
for f in file_list:
    df = pd.read_csv(f).sort_values("_file")
    df_list += [df]

In [367]:
def matrix_performance_singlemodel(test_df, data):
    """Evaluate the predictions of one model against the ground truth.

    An image is predicted as class 0 when its ``'0'`` column exceeds 0.5 and as
    class 1 otherwise (the ``'0'`` column is presumably the model's score for
    class 0 - confirm against the prediction export).

    Parameters
    ----------
    test_df : pandas.DataFrame
        Ground truth; must contain the columns ``'_file'`` and ``'safety'``.
    data : pandas.DataFrame
        Predictions of a single model; must contain the columns ``'_file'`` and
        ``'0'``. Side effect: a ``'pred_safety'`` column is written into this
        frame in place (existing behavior, kept so callers that read the column
        afterwards keep working).

    Returns
    -------
    matrix : pandas.DataFrame
        One row with the confusion-matrix counts ``tn``, ``fp``, ``fn``, ``tp``.
    performance : pandas.DataFrame
        One row with ``Accuracy``, ``Precision``, ``Recall`` and ``F1 score``.
    """
    # For a single model there is nothing to average; threshold directly.
    prediction = data
    prediction['pred_safety'] = np.where(prediction['0'] > 0.5, 0, 1)

    # Keep only images that have both a label and a prediction. An inner merge
    # yields the same rows as a left merge filtered on _merge == 'both', without
    # assigning into a filtered slice afterwards.
    scored = pd.merge(test_df, prediction, how='inner', on='_file')
    y_true = scored['safety']
    y_pred = scored['pred_safety'].astype(int)

    # Pin the label order so the matrix is always 2x2 and the unpacking below
    # cannot fail when only one class happens to be present.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    matrix = pd.DataFrame([{'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp}])

    performance = pd.DataFrame([{
        'Accuracy': accuracy_score(y_true, y_pred),    # (tp + tn) / (p + n)
        'Precision': precision_score(y_true, y_pred),  # tp / (tp + fp)
        'Recall': recall_score(y_true, y_pred),        # tp / (tp + fn)
        'F1 score': f1_score(y_true, y_pred),          # 2 tp / (2 tp + fp + fn)
    }])

    return matrix, performance

In [436]:
# Empty result tables: one for confusion-matrix counts, one for the metrics.
conf_matrix_table, performance_table = (
    pd.DataFrame(columns=column_names)
    for column_names in (
        ['fn', 'fp', 'tn', 'tp'],
        ['Accuracy', 'F1 score', 'Precision', 'Recall'],
    )
)

In [437]:
# Evaluate every single model, collect the one-row result tables and concatenate
# them once. Rebuilding the tables from scratch keeps this cell idempotent:
# re-running it no longer appends duplicate rows to the previous result.
conf_matrices, performances = [], []
for model_prediction in df_list:
    conf_matrix, performance = matrix_performance_singlemodel(test_df, model_prediction)
    conf_matrices.append(conf_matrix)
    performances.append(performance)

# Fix the column order explicitly so it matches the displayed tables.
conf_matrix_table = pd.concat(conf_matrices)[['fn', 'fp', 'tn', 'tp']]
performance_table = pd.concat(performances)[['Accuracy', 'F1 score', 'Precision', 'Recall']]
    

In [438]:
# Label the rows of both tables with the model names.
conf_matrix_table.index = performance_table.index = ['cv0', 'cv1', 'cv2', 'cv3', 'cv4', 'whole']

In [439]:
# Display the confusion-matrix counts (fn, fp, tn, tp), one row per single model.
conf_matrix_table

Unnamed: 0,fn,fp,tn,tp
cv0,485,704,1449,1337
cv1,442,775,1378,1380
cv2,537,649,1504,1285
cv3,653,558,1595,1169
cv4,505,684,1469,1317
whole,579,536,1617,1243


In [440]:
# Display accuracy, F1 score, precision and recall, one row per single model.
performance_table

Unnamed: 0,Accuracy,F1 score,Precision,Recall
cv0,0.700881,0.692208,0.655071,0.733809
cv1,0.693836,0.69399,0.640371,0.757409
cv2,0.701635,0.684239,0.664426,0.705269
cv3,0.695346,0.658777,0.676896,0.641603
cv4,0.700881,0.688988,0.658171,0.722832
whole,0.719497,0.690364,0.698707,0.682217
