In [112]:
# Import libraries
import pandas as pd
import numpy as np
import os
from joblib import dump, load


In [100]:
# Gather list of filepaths and UIDs
# os.listdir returns entries in arbitrary, filesystem-dependent order, so the
# names are sorted to make the row order reproducible across machines/runs.
# Only *.csv entries are kept so stray files or folders in the directory
# (checkpoints, OS metadata, ...) are not passed to read_csv later.
data_path = "data/blind_test/blind_test/"  # trailing slash kept; os.path.join does not duplicate it
file_names = sorted(
    name for name in os.listdir(data_path) if name.lower().endswith(".csv")
)

file_paths = [os.path.join(data_path, name) for name in file_names]
# UID is the part of the file name before the first "."
uids = [name.split(".")[0] for name in file_names]

df = pd.DataFrame({"uid": uids, "file_path": file_paths})
display(df)

Unnamed: 0,uid,file_path
0,AFULZ1019,data/blind_test/blind_test/AFULZ1019.csv
1,AHTVXZ206,data/blind_test/blind_test/AHTVXZ206.csv
2,AKXTJD546,data/blind_test/blind_test/AKXTJD546.csv
3,APSJQV812,data/blind_test/blind_test/APSJQV812.csv
4,ARMMIP966,data/blind_test/blind_test/ARMMIP966.csv
...,...,...
102,ZENCJG630,data/blind_test/blind_test/ZENCJG630.csv
103,ZFSFZW942,data/blind_test/blind_test/ZFSFZW942.csv
104,ZJQMJR272,data/blind_test/blind_test/ZJQMJR272.csv
105,ZQKBXR902,data/blind_test/blind_test/ZQKBXR902.csv


In [101]:
# Load features from individual CSVs into a single dataframe
def get_features(file_path):
    """Return the first row of a header-less CSV file as a plain Python list.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read (parsed with ``header=None``).

    Returns
    -------
    list
        The values of row 0, one entry per column.
    """
    table = pd.read_csv(file_path, header=None)
    first_row = table.iloc[0]
    return first_row.to_numpy().tolist()


# Read every file listed in df and expand its feature list into one column
# per feature (result_type="expand").
# The path is looked up by column label: an integer key on a label-indexed
# row (row[0]) relies on a positional fallback that pandas has deprecated.
features_df = df[["file_path"]].apply(
    lambda row: get_features(row["file_path"]), axis=1, result_type="expand"
)
display(features_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0.000312,0.002733,-0.001607,0.003544,-0.040561,0.056610,0.000575,0.001112,-0.368693,0.000344,...,-0.798802,0.190185,-1.267598,-1.149763,1.569592,0.564020,-0.854228,1.970371,0.328199,-1.551044
1,0.000208,0.007524,-0.000101,0.001876,-0.052381,-0.247116,0.000866,-0.001215,-0.235077,0.000467,...,-0.166672,0.473651,-0.671415,-0.525698,-0.099970,0.020691,-0.777866,1.508321,0.378072,-0.841764
2,0.000327,0.005883,-0.001441,0.002858,-0.008328,-0.497405,0.001089,-0.000971,-0.282730,0.000512,...,1.370655,-0.761339,0.124354,0.371866,-0.941755,-0.674214,-1.563971,1.721614,-0.286807,-0.449043
3,0.000423,0.007362,-0.000603,0.002156,-0.028289,-0.077637,0.000712,0.000173,-0.211316,0.000496,...,0.784321,-0.700878,-0.444130,0.028558,1.128646,0.714054,-0.565383,0.648561,0.642141,-1.094233
4,0.000316,0.004430,0.000411,0.001363,-0.125821,-0.346757,0.000852,-0.000216,-0.209535,0.000505,...,-1.183529,-0.014170,-0.522831,0.946102,-0.041580,0.890518,-0.969357,1.662105,1.019175,-1.004081
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,0.000320,0.005695,-0.000903,0.003247,-0.038171,0.151869,0.000685,0.001118,-0.326524,0.000371,...,2.830519,-1.112213,-0.348413,0.102431,-0.185106,0.047154,-0.307782,0.551432,0.806686,-1.421993
103,0.000536,0.005143,0.002053,0.001889,-0.068980,-0.358939,0.000896,-0.000794,-0.193036,0.000469,...,0.447034,-0.739529,-0.019063,-0.118387,1.496582,-0.285316,-0.867392,1.297851,0.517406,-0.724908
104,0.000466,0.005755,-0.002297,0.003087,-0.095429,-0.091939,0.000955,0.001986,-0.257270,0.000412,...,1.837692,-1.553680,-0.385746,0.236822,-1.207960,-0.415188,-1.619983,1.191690,0.358602,-1.534049
105,0.000394,0.007342,0.001139,0.001530,-0.106957,-0.331865,0.000851,-0.001169,-0.237243,0.000430,...,-0.348795,-0.336277,-0.443603,0.381027,0.061047,-0.136507,-1.193353,3.047893,0.385818,-0.626281


In [102]:
# Merge dataframes and fix column names
num_features = features_df.shape[1]
feature_names = [f"f{i}" for i in range(num_features)]

# Rename the positional columns 0..N-1 to f0..f{N-1}
features_df.columns = feature_names

# Concatenate onto the identifier columns only (not onto the whole of df), so
# that re-running this cell does not append a second copy of the f* columns.
df = pd.concat([df[["uid", "file_path"]], features_df], axis=1)
display(df)


Unnamed: 0,uid,file_path,f0,f1,f2,f3,f4,f5,f6,f7,...,f1014,f1015,f1016,f1017,f1018,f1019,f1020,f1021,f1022,f1023
0,AFULZ1019,data/blind_test/blind_test/AFULZ1019.csv,0.000312,0.002733,-0.001607,0.003544,-0.040561,0.056610,0.000575,0.001112,...,-0.798802,0.190185,-1.267598,-1.149763,1.569592,0.564020,-0.854228,1.970371,0.328199,-1.551044
1,AHTVXZ206,data/blind_test/blind_test/AHTVXZ206.csv,0.000208,0.007524,-0.000101,0.001876,-0.052381,-0.247116,0.000866,-0.001215,...,-0.166672,0.473651,-0.671415,-0.525698,-0.099970,0.020691,-0.777866,1.508321,0.378072,-0.841764
2,AKXTJD546,data/blind_test/blind_test/AKXTJD546.csv,0.000327,0.005883,-0.001441,0.002858,-0.008328,-0.497405,0.001089,-0.000971,...,1.370655,-0.761339,0.124354,0.371866,-0.941755,-0.674214,-1.563971,1.721614,-0.286807,-0.449043
3,APSJQV812,data/blind_test/blind_test/APSJQV812.csv,0.000423,0.007362,-0.000603,0.002156,-0.028289,-0.077637,0.000712,0.000173,...,0.784321,-0.700878,-0.444130,0.028558,1.128646,0.714054,-0.565383,0.648561,0.642141,-1.094233
4,ARMMIP966,data/blind_test/blind_test/ARMMIP966.csv,0.000316,0.004430,0.000411,0.001363,-0.125821,-0.346757,0.000852,-0.000216,...,-1.183529,-0.014170,-0.522831,0.946102,-0.041580,0.890518,-0.969357,1.662105,1.019175,-1.004081
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,ZENCJG630,data/blind_test/blind_test/ZENCJG630.csv,0.000320,0.005695,-0.000903,0.003247,-0.038171,0.151869,0.000685,0.001118,...,2.830519,-1.112213,-0.348413,0.102431,-0.185106,0.047154,-0.307782,0.551432,0.806686,-1.421993
103,ZFSFZW942,data/blind_test/blind_test/ZFSFZW942.csv,0.000536,0.005143,0.002053,0.001889,-0.068980,-0.358939,0.000896,-0.000794,...,0.447034,-0.739529,-0.019063,-0.118387,1.496582,-0.285316,-0.867392,1.297851,0.517406,-0.724908
104,ZJQMJR272,data/blind_test/blind_test/ZJQMJR272.csv,0.000466,0.005755,-0.002297,0.003087,-0.095429,-0.091939,0.000955,0.001986,...,1.837692,-1.553680,-0.385746,0.236822,-1.207960,-0.415188,-1.619983,1.191690,0.358602,-1.534049
105,ZQKBXR902,data/blind_test/blind_test/ZQKBXR902.csv,0.000394,0.007342,0.001139,0.001530,-0.106957,-0.331865,0.000851,-0.001169,...,-0.348795,-0.336277,-0.443603,0.381027,0.061047,-0.136507,-1.193353,3.047893,0.385818,-0.626281


In [103]:
# Load the fitted StandardScaler from the training notebook.
# `%store -r` restores a variable previously persisted with `%store scaler_std`;
# presumably the training notebook has to be run (and store it) first — verify.
%store -r scaler_std

In [104]:
# Apply standardization to blind test data
raw_test_features = df[feature_names]

# Transform testing data using the fit restored into scaler_std
# (no re-fitting happens here).
test_features = scaler_std.transform(raw_test_features)

# Get back into pandas df. The index of df is carried over explicitly so that
# the later index-aligned concat with df matches rows one-to-one.
test_features_df = pd.DataFrame(
    test_features, columns=feature_names, index=df.index
)
display(test_features_df)

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f1014,f1015,f1016,f1017,f1018,f1019,f1020,f1021,f1022,f1023
0,-0.146899,-0.676596,-0.740728,1.837886,0.625696,0.983566,-1.079453,1.150700,-1.417336,-0.539970,...,-0.851326,0.457279,-1.878643,-1.768336,1.292483,0.071074,-0.042549,0.253941,-0.001729,-1.890603
1,-1.013119,1.395241,0.301525,-0.558722,0.338027,-0.326136,0.513413,-1.153209,0.114025,0.497178,...,-0.182936,0.830347,-0.392857,-0.901061,-0.586506,-0.529588,0.136946,-0.187076,0.084178,-0.352899
2,-0.017153,0.685652,-0.625928,0.852583,1.410126,-1.405410,1.733257,-0.911580,-0.432119,0.879492,...,1.442575,-0.795017,1.590329,0.346298,-1.533882,-1.297821,-1.710858,0.016507,-1.061093,0.498513
3,0.780873,1.325301,-0.046211,-0.155941,0.924339,0.404677,-0.328637,0.220810,0.386342,0.738955,...,0.822608,-0.715444,0.173575,-0.130802,0.796225,0.236940,0.636406,-1.007700,0.539045,-0.900247
4,-0.113965,0.057133,0.655038,-1.296029,-1.449227,-0.755799,0.438547,-0.163942,0.406753,0.816920,...,-1.258121,0.188329,-0.022562,1.144324,-0.520791,0.432024,-0.313169,-0.040293,1.188495,-0.704798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,-0.073752,0.604065,-0.253760,1.411310,0.683850,1.394334,-0.479618,1.156485,-0.934049,-0.310083,...,2.986179,-1.256799,0.412118,-0.028140,-0.682321,-0.500333,1.241917,-1.100407,0.822478,-1.610822
103,1.723507,0.365452,1.791379,-0.539942,-0.065914,-0.808329,0.678450,-0.736460,0.595854,0.513626,...,0.465974,-0.766312,1.232912,-0.335015,1.210314,-0.867886,-0.073491,-0.387966,0.324186,-0.099559
104,1.134510,0.630180,-1.218027,1.181417,-0.709584,0.343008,0.998858,2.015977,-0.140325,0.034281,...,1.936402,-1.837812,0.319077,0.158626,-1.833479,-1.011462,-1.842519,-0.489294,0.050641,-1.853758
105,0.540822,1.316665,1.158963,-1.055112,-0.990139,-0.691582,0.432350,-1.108253,0.089196,0.189212,...,-0.375506,-0.235594,0.174887,0.359029,-0.405291,-0.703374,-0.839691,1.282413,0.097521,0.114263


In [105]:
# Load the names of the features selected in the training notebook.
# `%store -r` restores a variable previously persisted with
# `%store extracted_feature_names`; it is used below to pick columns of
# test_features_df.
%store -r extracted_feature_names

In [106]:
# Keep just the columns chosen by feature selection (all rows, label-based
# column lookup), then show the reduced frame.
test_features_df = test_features_df.loc[:, extracted_feature_names]
display(test_features_df)

Unnamed: 0,f0,f4,f6,f11,f12,f14,f15,f16,f20,f24,...,f994,f995,f998,f1000,f1002,f1003,f1008,f1012,f1015,f1018
0,-0.146899,0.625696,-1.079453,1.109644,-0.530666,-0.610915,0.102099,-0.990751,-1.205252,-0.964032,...,0.175115,-0.211398,-0.743569,-1.328760,-1.059856,0.338097,0.319857,-0.197834,0.457279,1.292483
1,-1.013119,0.338027,0.513413,0.069209,0.035065,0.119403,-0.296364,0.281088,0.327934,-0.338220,...,1.113632,-0.297296,-0.900834,-1.908068,0.666708,-0.103931,-0.221016,-0.378205,0.830347,-0.586506
2,-0.017153,1.410126,1.733257,-0.441265,-1.100930,0.988179,0.211648,-0.585983,1.735698,-0.405427,...,-0.801915,-1.215820,-0.901971,-0.678460,0.069009,-0.138985,-0.433189,-0.379577,-0.795017,-1.533882
3,0.780873,0.924339,-0.328637,-0.695412,-1.464071,1.390128,2.066784,-1.402021,-0.926916,0.131901,...,0.528962,-1.467276,-1.058958,-0.906097,-0.667262,-1.853970,0.270810,0.405335,-0.715444,0.796225
4,-0.113965,-1.449227,0.438547,-2.232116,1.980808,0.447333,-0.864879,0.646908,1.124951,1.627869,...,-1.573356,-0.995015,0.704553,0.322442,-0.199035,-0.223754,-0.141572,0.966109,0.188329,-0.520791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,-0.073752,0.683850,-0.479618,0.551994,-0.710469,-0.081720,0.155407,-1.915931,-1.605506,-0.408096,...,0.724467,0.055185,-1.827125,-0.149113,-0.292149,-2.085941,-0.319074,1.118447,-1.256799,-0.682321
103,1.723507,-0.065914,0.678450,-1.286878,0.186071,0.121855,0.414350,-0.891285,-0.275452,0.700659,...,1.188535,0.154092,-0.892598,-1.207726,1.829099,-0.362955,0.832229,-1.052162,-0.766312,1.210314
104,1.134510,-0.709584,0.998858,1.559704,2.005952,0.315484,-0.408060,-0.902374,-0.924769,-0.738259,...,1.129969,0.122589,-1.612577,-1.149437,-1.393835,-0.933912,-0.197377,0.989398,-1.837812,-1.833479
105,0.540822,-0.990139,0.432350,-0.576376,-0.545066,1.052978,0.890297,0.116937,-1.242391,1.286926,...,0.541301,-0.039223,-0.263439,-1.102963,0.980728,0.439252,-0.184565,0.072733,-0.235594,-0.405291


In [107]:
# Load the final model from the training notebook.
# `%store -r` restores a variable previously persisted with
# `%store svm_model_final`; this is the object whose .predict() is called in
# the prediction cell.
%store -r svm_model_final

In [None]:
# Optionally load Emma's model (a joblib dump).
# The original call passed an empty path, which cannot be opened and would
# abort a Restart & Run All; loading is skipped until a real path is set.
# NOTE: `svm_model` is not used by the prediction cell, which calls
# `svm_model_final.predict`.
# SECURITY: joblib's load unpickles arbitrary objects — only load trusted files.
emma_model_path = ""  # TODO: set to the path of Emma's saved model
svm_model = load(emma_model_path) if emma_model_path else None

In [108]:
# Make predictions
# The model inputs are selected explicitly by name: a later cell adds a
# "class" column to test_features_df, and without this selection re-running
# this cell would pass one column too many to the model.
ypred = svm_model_final.predict(test_features_df[extracted_feature_names])


In [109]:
# Add prediction to df
# .assign builds a new frame instead of writing into a frame that was obtained
# by column selection (the pattern that can trigger SettingWithCopyWarning).
# "class" is a Python keyword, hence the dict unpacking.
test_features_df = test_features_df.assign(**{"class": ypred})
display(test_features_df)

Unnamed: 0,f0,f4,f6,f11,f12,f14,f15,f16,f20,f24,...,f995,f998,f1000,f1002,f1003,f1008,f1012,f1015,f1018,class
0,-0.146899,0.625696,-1.079453,1.109644,-0.530666,-0.610915,0.102099,-0.990751,-1.205252,-0.964032,...,-0.211398,-0.743569,-1.328760,-1.059856,0.338097,0.319857,-0.197834,0.457279,1.292483,0
1,-1.013119,0.338027,0.513413,0.069209,0.035065,0.119403,-0.296364,0.281088,0.327934,-0.338220,...,-0.297296,-0.900834,-1.908068,0.666708,-0.103931,-0.221016,-0.378205,0.830347,-0.586506,1
2,-0.017153,1.410126,1.733257,-0.441265,-1.100930,0.988179,0.211648,-0.585983,1.735698,-0.405427,...,-1.215820,-0.901971,-0.678460,0.069009,-0.138985,-0.433189,-0.379577,-0.795017,-1.533882,0
3,0.780873,0.924339,-0.328637,-0.695412,-1.464071,1.390128,2.066784,-1.402021,-0.926916,0.131901,...,-1.467276,-1.058958,-0.906097,-0.667262,-1.853970,0.270810,0.405335,-0.715444,0.796225,0
4,-0.113965,-1.449227,0.438547,-2.232116,1.980808,0.447333,-0.864879,0.646908,1.124951,1.627869,...,-0.995015,0.704553,0.322442,-0.199035,-0.223754,-0.141572,0.966109,0.188329,-0.520791,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,-0.073752,0.683850,-0.479618,0.551994,-0.710469,-0.081720,0.155407,-1.915931,-1.605506,-0.408096,...,0.055185,-1.827125,-0.149113,-0.292149,-2.085941,-0.319074,1.118447,-1.256799,-0.682321,0
103,1.723507,-0.065914,0.678450,-1.286878,0.186071,0.121855,0.414350,-0.891285,-0.275452,0.700659,...,0.154092,-0.892598,-1.207726,1.829099,-0.362955,0.832229,-1.052162,-0.766312,1.210314,1
104,1.134510,-0.709584,0.998858,1.559704,2.005952,0.315484,-0.408060,-0.902374,-0.924769,-0.738259,...,0.122589,-1.612577,-1.149437,-1.393835,-0.933912,-0.197377,0.989398,-1.837812,-1.833479,0
105,0.540822,-0.990139,0.432350,-0.576376,-0.545066,1.052978,0.890297,0.116937,-1.242391,1.286926,...,-0.039223,-0.263439,-1.102963,0.980728,0.439252,-0.184565,0.072733,-0.235594,-0.405291,4


In [110]:
# Get final predictions
# Only the values of interest are combined: the identifier from df and the
# predicted class from test_features_df (aligned on the row index). This
# avoids building a wide intermediate frame with duplicated f* column labels.
final_df = pd.concat([df["uid"], test_features_df["class"]], axis=1)
display(final_df)

# Save to csv (create the output folder first so a fresh checkout does not fail)
output_path = "outputs/group9_blind_predictions.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# NOTE(review): the row index is written as an unnamed first column; pass
# index=False if the expected submission format is just uid,class — confirm.
final_df.to_csv(output_path)

Unnamed: 0,uid,class
0,AFULZ1019,0
1,AHTVXZ206,1
2,AKXTJD546,0
3,APSJQV812,0
4,ARMMIP966,2
...,...,...
102,ZENCJG630,0
103,ZFSFZW942,1
104,ZJQMJR272,0
105,ZQKBXR902,4


In [111]:
# Distribution of predicted classes: number of rows per class label
predicted_classes = final_df["class"]
predicted_classes.value_counts()

0    48
4    32
3    17
1     6
2     4
Name: class, dtype: int64