# SVM on X-ray embeddings

In [30]:
# Import modules
import os
import csv
import pandas as pd
import pickle
import re
import numpy as np
from glob import glob
from sklearn.model_selection import KFold
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC

### Set up working directory

In [19]:
working_dir = '.'
embd_lst = os.listdir(os.path.join(working_dir, 'embeddings'))
tsv_file = open(os.path.join(working_dir, 'labels_covid19_posi.tsv'))
read_tsv =  csv.reader(tsv_file, delimiter="\t")

### Load embeddings and do basic pre-processing

In [20]:
x = []
y = []
is_first = True

for row in read_tsv:
    if not is_first:
        data = row[4].replace('\t',',').strip('[').strip(']').split(',')
        data = [item.strip().strip("'").strip() for item in data]
        embadding_paths = glob(os.path.join(working_dir,'embeddings',row[1]+'_'+row[2]+'*'))
        if (len(embadding_paths)>0):
            for embadding_path in embadding_paths:
                with open(embadding_path, 'rb') as pickle_file:
                    x.append(pickle.load(pickle_file).detach().numpy())
                    temp_y = ''
                    if 'COVID 19' in data:
                        temp_y = 0 #covid19
                    elif 'normal' in data:
                        temp_y = 1 #normal
                    else:
                        temp_y = 2 #No COVID but abnormal
                    y.append(temp_y)
    is_first = False
                    
x = np.asarray(x)
y = np.asarray(y)

In [108]:
%%time
# get shape of feature matrix
print('Feature matrix shape is: ', x.shape)

# Standard scaler - Need to figure out how best to make feature vector lengths uniform 
ss = StandardScaler()
# run this on our feature matrix
data_scaled = ss.fit_transform(x)

# use fit_transform to run PCA on our standardized matrix
# pca = PCA(n_components=1000)
# data_pca = ss.fit_transform(data_scaled)

# # look at new shape
# print('PCA matrix shape is: ', data_pca.shape)

Feature matrix shape is:  (2114, 2048)
PCA matrix shape is:  (2114, 2048)
CPU times: user 110 ms, sys: 18 ms, total: 128 ms
Wall time: 126 ms


### Generate training and test sets, check class distribution

In [116]:
kf = KFold(n_splits=5)
fold_ind = 0
best_test_index = []
best_train_index = []
running_score = 0
x = data_scaled

for train_index, test_index in kf.split(x):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = SVC(decision_function_shape='ovo')
    clf.fit(x_train, y_train)
    clf_score = clf.score(x_test, y_test)
    print('Fold '+ str(fold_ind) + "(Train index: " + str(train_index[0]) + ")" + ' Results', clf_score)
    if running_score < clf_score:
        best_test_index = test_index
        best_train_index = train_index
        running_score = clf_score
    fold_ind = fold_ind + 1

Fold 0(Train index: 423) Results 0.5130023640661938
Fold 1(Train index: 0) Results 0.6643026004728132
Fold 2(Train index: 0) Results 0.6572104018912529
Fold 3(Train index: 0) Results 0.5791962174940898
Fold 4(Train index: 0) Results 0.6042654028436019


In [87]:
# Use best train/test split
X_train, X_test = x[best_train_index], x[best_test_index]
y_train, y_test = y[best_train_index], y[best_test_index]
clf = SVC(decision_function_shape='ovo')
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.6619385342789598

In [117]:
X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=.2,
                                                    random_state=1234123)
 
# look at the distrubution of labels in the train set
print("COVID: ", pd.Series(y_train).value_counts()[0])
print("Normal: ", pd.Series(y_train).value_counts()[1])
print("No COVID but abnormal: ", pd.Series(y_train).value_counts()[2])

COVID:  517
Normal:  154
No COVID but abnormal:  1020


## Run the SVM with 5-fold cross validation, draw out best performing

In [118]:
from sklearn.model_selection import GridSearchCV 
  
# defining parameter range 
param_grid = {'C': [0.1, 1, 10, 100],  
              'gamma': [0.1, 0.01, 0.001], 
              'kernel': ['rbf', 'linear']}  
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3, cv = 5) 
  
# fitting the model for grid search 
grid.fit(X_train, y_train) 

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.602, total=   7.6s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.6s remaining:    0.0s


[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.604, total=   7.5s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   15.1s remaining:    0.0s


[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.604, total=   7.5s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.604, total=   7.5s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.604, total=   7.5s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] ..... C=0.1, gamma=0.1, kernel=linear, score=0.540, total=   5.9s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] ..... C=0.1, gamma=0.1, kernel=linear, score=0.521, total=   5.8s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] ..... C=0.1, gamma=0.1, kernel=linear, score=0.550, total=   5.9s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] ..... C=0.1, gamma=0.1, kernel=linear, score=0.541, total=   5.8s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] .

[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.617, total=   7.4s
[CV] C=10, gamma=0.1, kernel=rbf .....................................
[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.615, total=   7.4s
[CV] C=10, gamma=0.1, kernel=rbf .....................................
[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.633, total=   7.5s
[CV] C=10, gamma=0.1, kernel=rbf .....................................
[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.615, total=   7.5s
[CV] C=10, gamma=0.1, kernel=rbf .....................................
[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.615, total=   7.4s
[CV] C=10, gamma=0.1, kernel=linear ..................................
[CV] ...... C=10, gamma=0.1, kernel=linear, score=0.540, total=   5.8s
[CV] C=10, gamma=0.1, kernel=linear ..................................
[CV] ...... C=10, gamma=0.1, kernel=linear, score=0.521, total=   5.8s
[CV] C=10, gamma=0.1, kernel=linear ..................................
[CV] .

[CV] ... C=100, gamma=0.001, kernel=linear, score=0.541, total=   6.1s
[CV] C=100, gamma=0.001, kernel=linear ...............................
[CV] ... C=100, gamma=0.001, kernel=linear, score=0.518, total=   6.3s


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed: 18.0min finished


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100], 'gamma': [0.1, 0.01, 0.001],
                         'kernel': ['rbf', 'linear']},
             verbose=3)

In [119]:
# print best parameter after tuning 
print(grid.best_params_) 
  
# print how our model looks after hyper-parameter tuning 
print(grid.best_estimator_) 

{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
SVC(C=10, gamma=0.001)


## Final output metrics - Model Accuracy + ROC scores

In [120]:
grid_predictions = grid.predict(X_test) 
  
# calculate accuracy
accuracy = accuracy_score(y_test, grid_predictions)
print('Model accuracy is: ', accuracy)

Model accuracy is:  0.6217494089834515


In [121]:
import numpy as np
from sklearn.metrics import roc_auc_score

params_with_prob = grid.best_params_ 
params_with_prob['probability'] = True
svm = SVC(**params_with_prob)
svm.fit(X_train, y_train)

y_prob = svm.predict_proba(X_test)

macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo",
                                  average="macro")
weighted_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo",
                                     average="weighted")
macro_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr",
                                  average="macro")
weighted_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr",
                                     average="weighted")
print("One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
      "(weighted by prevalence)"
      .format(macro_roc_auc_ovo, weighted_roc_auc_ovo))
print("One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
      "(weighted by prevalence)"
      .format(macro_roc_auc_ovr, weighted_roc_auc_ovr))

One-vs-One ROC AUC scores:
0.718105 (macro),
0.711593 (weighted by prevalence)
One-vs-Rest ROC AUC scores:
0.713709 (macro),
0.693480 (weighted by prevalence)
