In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the OASIS CSV export into a DataFrame; the path is relative, so it is
# resolved against the kernel's current working directory.
df = pd.read_csv('Visualization/OASIS/oasis_3.csv')

In [3]:
# Clean the raw table in a single chained pipeline (each step returns a new frame).
df = (
    df
    # Remove columns that contain no data at all.
    .dropna(axis=1, how='all')
    # Remove every row that still has at least one missing value.
    .dropna(axis=0, how='any')
    # Give the identifier, diagnosis and hippocampus columns friendlier names.
    .rename(columns={
        'id': 'Freesurfer ID',
        'dx1': 'Diagnosis',
        'TOTAL_HIPPOCAMPUS_VOLUME': 'TotalHippocampusVol',
    })
    # One row per subject: keep the first occurrence. Per the original author's
    # note this is the first visit because the file is sorted by age
    # (NOTE(review): the sort order is not enforced here -- confirm).
    .drop_duplicates(subset='Subject', keep='first')
    # Renumber the rows 0..n-1 after the filtering above.
    .reset_index(drop=True)
)
def label_disease(row):
    """Return the binary dementia label for one record.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['cdr']`` and ``row['Diagnosis']``.

    Returns
    -------
    int
        0 when ``cdr`` is below 0.5 or the diagnosis is exactly
        'Cognitively normal'; 1 otherwise.
    """
    # `or` short-circuits, so 'Diagnosis' is only read when cdr >= 0.5.
    is_healthy = row['cdr'] < 0.5 or row['Diagnosis'] == 'Cognitively normal'
    return 0 if is_healthy else 1
# Add the numeric diagnosis label (computed row by row with label_disease)
# as a new 'Dementia' column at position 6.
df.insert(6, 'Dementia', df.apply(label_disease, axis=1))

# Encode sex numerically in place: 'M' -> 0, 'F' -> 1. Other values are left untouched.
for sex_label, sex_code in (('M', 0), ('F', 1)):
    df.loc[df['M/F'] == sex_label, 'M/F'] = sex_code

In [4]:
# Class balance of the derived 'Dementia' label (0/1 as assigned by label_disease).
df['Dementia'].value_counts()

0    712
1    310
Name: Dementia, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler 
from sklearn.model_selection import cross_val_score

In [6]:
# List the column names available after cleaning and labelling.
df.columns

Index(['Subject', 'MR ID', 'Freesurfer ID', 'Age', 'M/F', 'Diagnosis',
       'Dementia', 'mmse', 'cdr', 'apoe', 'TotalHippocampusVol',
       'IntraCranialVol', 'lhCortexVol', 'rhCortexVol', 'CortexVol',
       'SubCortGrayVol', 'TotalGrayVol', 'SupraTentorialVol',
       'lhCorticalWhiteMatterVol', 'rhCorticalWhiteMatterVol',
       'CorticalWhiteMatterVol', 'L.SurfArea', 'R.SurfArea'],
      dtype='object')

In [7]:
# Target: the binary 'Dementia' label.
Y = df['Dementia']

# Features: every remaining measurement column. The identifier columns are
# dropped, and so are 'Diagnosis' and 'cdr' because the 'Dementia' label is
# computed directly from them (keeping them would leak the target).
# `columns=` is used instead of a positional axis argument: pandas 2.0 made
# every argument of DataFrame.drop except `labels` keyword-only.
# A smaller alternative feature set that was tried:
#X = df[['Age', 'M/F', 'mmse', 'apoe', 'TotalHippocampusVol', 'IntraCranialVol', 'CortexVol']]
X = df.drop(columns=['Subject', 'MR ID', 'Freesurfer ID', 'Diagnosis', 'Dementia', 'cdr'])

# Split into two sets: train+validation and a held-out test set.
# (Validation folds are carved out of the train+validation part later by
# cross-validation.) random_state pins the shuffle so the split is reproducible.
X_trainval, X_test, Y_trainval, Y_test = train_test_split(X, Y, random_state=0)

# Feature scaling: fit the min/max on the training portion only, then apply
# the same transform to both portions so no test-set statistics leak in.
scaler = MinMaxScaler().fit(X_trainval)
X_trainval_scaled = scaler.transform(X_trainval)
X_test_scaled = scaler.transform(X_test)
X

Unnamed: 0,Age,M/F,mmse,apoe,TotalHippocampusVol,IntraCranialVol,lhCortexVol,rhCortexVol,CortexVol,SubCortGrayVol,TotalGrayVol,SupraTentorialVol,lhCorticalWhiteMatterVol,rhCorticalWhiteMatterVol,CorticalWhiteMatterVol,L.SurfArea,R.SurfArea
0,73.0,1,30.0,23.0,6861.9,1.186091e+06,181943.698754,178031.558882,359975.257636,48400.000000,491102.257636,7.736716e+05,174372.329393,173244.012238,347616.341631,67598.1,67185.8
1,73.0,0,29.0,34.0,7833.2,1.714636e+06,227101.503019,230240.532783,457342.035802,56773.000000,607473.035802,1.051714e+06,239168.338419,245361.377267,484529.715686,83138.1,85742.3
2,66.0,1,29.0,33.0,7983.5,1.405092e+06,204825.718573,209641.219733,414466.938306,59379.000000,557900.938306,9.299305e+05,213905.159729,222232.368895,436137.528624,76695.5,78697.9
3,61.0,1,30.0,23.0,8525.1,1.443177e+06,213861.671106,206884.661369,420746.332475,53910.000000,566477.332475,9.709780e+05,242595.702097,233016.992108,475612.694206,87710.1,84634.9
4,54.0,1,30.0,33.0,9298.2,1.554566e+06,225743.655875,224311.450543,450055.106418,63545.000000,611117.106418,9.867349e+05,229534.963360,230927.823126,460462.786487,82224.0,81421.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017,51.0,0,30.0,33.0,8307.6,1.523360e+06,234230.577917,234561.404619,468791.982536,175293.167969,644085.150505,1.008741e+06,252444.625000,255790.125000,508234.750000,86945.6,87200.2
1018,65.0,0,30.0,44.0,7981.8,1.619949e+06,233083.969716,235478.553430,468562.523147,57431.000000,617912.523147,1.012598e+06,229996.472108,227638.300745,457634.772852,86355.3,85926.8
1019,64.0,1,29.0,34.0,6828.5,1.635620e+06,222900.570985,227573.947792,450474.518776,49085.000000,593751.518776,9.897836e+05,208218.439241,211361.618639,419580.057880,79639.5,80013.5
1020,71.0,1,18.0,34.0,4501.3,1.407411e+06,187058.783023,196406.732825,383465.515848,45633.000000,506151.515848,8.747602e+05,206082.644662,214673.053871,420755.698533,76344.0,79045.4


In [8]:
# Sanity-check the split: shapes of the train+validation and test features/targets.
print(X_trainval.shape, X_test.shape, Y_trainval.shape, Y_test.shape)

(766, 17) (256, 17) (766,) (256,)


In [9]:
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, roc_curve, auc
from IPython.display import clear_output
from sklearn.ensemble import BaggingClassifier

In [10]:
# Exhaustive grid search over C, gamma and kernel for an SVM classifier,
# scored by mean k-fold cross-validated accuracy on the scaled training data.
best_score = 0
kfolds = 5
# Size of each bagging ensemble; every base SVC is trained on 1/n_estimators
# of the samples to keep the search fast. Loop-invariant, so defined once.
n_estimators = 10

for c_paramter in [0.001, 0.01, 0.1, 1, 10, 100, 1000]: # values to try for the parameter C
    for gamma_paramter in [0.001, 0.01, 0.1, 1, 10, 100, 1000]: # values to try for the parameter gamma
        for k_parameter in ['rbf', 'poly', 'sigmoid', 'linear']: # values to try for the kernel parameter
            # random_state pins the bootstrap sampling so the search is
            # reproducible from one run to the next.
            svmModel = BaggingClassifier(
                SVC(kernel=k_parameter, C=c_paramter, gamma=gamma_paramter, cache_size=7000),
                max_samples=1.0 / n_estimators,
                n_estimators=n_estimators,
                n_jobs=-1,
                random_state=0,
            )

            # Cross-validation: the training set is split internally into
            # training and validation folds.
            scores = cross_val_score(svmModel, X_trainval_scaled, Y_trainval, cv=kfolds, scoring='accuracy')

            # Mean cross-validation accuracy for this parameter combination.
            score = np.mean(scores)
            # Keep the combination if it is at least as good as the best so far.
            # Because the comparison is >=, a later grid point wins any tie.
            if score >= best_score:
                print(f'Currently on: {c_paramter}, {gamma_paramter}, {k_parameter}')
                print(f'{k_parameter}: {score}')
                best_score = score
                best_parameter_c = c_paramter
                best_parameter_gamma = gamma_paramter
                best_parameter_k = k_parameter


# Refit with the best parameters on the full train+validation data.
# NOTE(review): the search scored a bagged ensemble of SVCs, while this final
# model is a single SVC trained on all samples -- confirm that is intended.
SelectedSVMmodel = SVC(C=best_parameter_c, gamma=best_parameter_gamma, kernel=best_parameter_k).fit(X_trainval_scaled, Y_trainval)

test_score = SelectedSVMmodel.score(X_test_scaled, Y_test)
PredictedOutput = SelectedSVMmodel.predict(X_test_scaled)
test_recall = recall_score(Y_test, PredictedOutput, pos_label=1)
# ROC/AUC needs continuous scores; hard 0/1 predictions would collapse the
# curve to a single operating point, so use the signed distance to the
# separating hyperplane instead.
test_decision_scores = SelectedSVMmodel.decision_function(X_test_scaled)
fpr, tpr, thresholds = roc_curve(Y_test, test_decision_scores, pos_label=1)
test_auc = auc(fpr, tpr)
print("Best accuracy on cross validation set is:", best_score)
print("Best parameter for c is: ", best_parameter_c)
print("Best parameter for gamma is: ", best_parameter_gamma)
print("Best parameter for kernel is: ", best_parameter_k)
print("Test accuracy with the best parameters is", test_score)
print("Test recall with the best parameters is", test_recall)
print("Test AUC with the best parameters is", test_auc)

Currently on: 0.001, 0.001, rbf
rbf: 0.6879891350479588
Currently on: 0.001, 0.001, poly
poly: 0.6879891350479588
Currently on: 0.001, 0.001, sigmoid
sigmoid: 0.6879891350479588
Currently on: 0.001, 0.001, linear
linear: 0.6879891350479588
Currently on: 0.001, 0.01, rbf
rbf: 0.6879891350479588
Currently on: 0.001, 0.01, poly
poly: 0.6879891350479588
Currently on: 0.001, 0.01, sigmoid
sigmoid: 0.6879891350479588
Currently on: 0.001, 0.01, linear
linear: 0.6879891350479588
Currently on: 0.001, 0.1, rbf
rbf: 0.6879891350479588
Currently on: 0.001, 0.1, poly
poly: 0.6879891350479588
Currently on: 0.001, 0.1, sigmoid
sigmoid: 0.6879891350479588
Currently on: 0.001, 0.1, linear
linear: 0.6879891350479588
Currently on: 0.001, 1, rbf
rbf: 0.6879891350479588
Currently on: 0.001, 1, poly
poly: 0.6879891350479588
Currently on: 0.001, 1, sigmoid
sigmoid: 0.6879891350479588
Currently on: 0.001, 1, linear
linear: 0.6879891350479588
Currently on: 0.001, 10, rbf
rbf: 0.6879891350479588
Currently on: 0