### Import Libraries and set random seed

In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

### Set Random Number Seed
np.random.seed(112)

### Define helper methods

In [2]:
def load_data(filename):
    data = pd.read_excel(filename)
    print("Loaded data with shape: ", data.shape)
    return data

In [3]:
def filter_data(data, columns=None, include_control=True, combine_p_groups=False):
    df = data.copy()
    
    if not include_control:
        df = df[df.GroupID!=0]
        
    if columns:
        df = df[columns]

    if combine_p_groups:
        df.loc[df['GroupID'] != 0, 'GroupID'] = 1 

    return df

In [4]:
def resample_data(df):
    df_control =df[df.GroupID==0]
    df_park = df[df.GroupID==1]
    df_msa = df[df.GroupID==2]
    df_psp = df[df.GroupID==3]

    max_length = max([len(df_park), len(df_msa), len(df_psp), len(df_control)])

    df_control_upsampled = resample(df_control, replace=True, n_samples=max_length, random_state=3)
    df_msa_upsampled = resample(df_msa, replace=True, n_samples=max_length, random_state=1)
    df_psp_upsampled = resample(df_psp, replace=True, n_samples=max_length, random_state=2)

    return pd.concat([df_control, df_park, df_msa_upsampled, df_psp_upsampled])


In [5]:
def split_x_y_data(df, ylabel="GroupID"):
    x_cols = [col for col in df.columns if col != ylabel]
    Xd = pd.DataFrame(df, columns= x_cols)
    Yd = df[ylabel]
    return Xd, Yd

In [6]:
def standardize_data(X_train, X_test):
    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)
    return X_train_std, X_test_std

In [1]:
def svm_grid_search(X_train, X_test, y_train, y_test):

    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1,1e-1,1e-2,1e-3, 1e-4],
                         'C': [1, 10, 100, 1000]},
                        {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

    print("# Tuning hyper-parameters for f1")
    print()

    clf = GridSearchCV(svm.SVC(), tuned_parameters, cv=5,
                       scoring='f1_macro', verbose=99999,
                       n_jobs = -1 )
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(set(y_test) - set(y_pred))
    print(classification_report(y_true, y_pred))
    print()

### All Groups, 7 Regions and UPDRS, Resampled

In [None]:
# Load the excel file
data = load_data("real_data2.xlsx")

# Only grab the columns of interest.  TODO: Replace with list derived from feature selection
columns_of_interest = ["GroupID","SCP_FW","MCP_FW","Putamen_FA","Caudate_FA","STN_FW", "RN_FW", "Thalamus_FA", "UPDRS"]
filtered_data = filter_data(data, columns=columns_of_interest)

# Resample the data so that there are even numbers of each label
resampled_data = resample_data(filtered_data)

# Split predictor and response data
X, y = split_x_y_data(resampled_data)

# Shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)

# standardize the data
X_train_std, X_test_std = standardize_data(X_train, X_test)

# Perform a grid search to find best SVC model
svm_grid_search(X_train, X_test, y_train, y_test)

Loaded data with shape:  (746, 39)
# Tuning hyper-parameters for f1

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV]  C=1, gamma=1, kernel=rbf, score=0.6860251263504361, total=   0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV]  C=1, gamma=1, kernel=rbf, score=0.6741612901656254, total=   0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV]  C=1, gamma=1, kernel=rbf, score=0.7083240205854711, total=   0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV]  C=1, gamma=1, kernel=rbf, score=0.7322783452822978, total=   0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  C=1, gamma=0.001, kernel=rbf, score=0.506084656084656, total=   0.0s
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    1.7s remaining:    0.0s
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV]  C=1, gamma=0.001, kernel=rbf, score=0.5392618934724198, total=   0.0s
[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed:    1.7s remaining:    0.0s
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV]  C=1, gamma=0.001, kernel=rbf, score=0.5079716349564115, total=   0.0s
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    1.8s remaining:    0.0s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  C=1, gamma=0.0001, kernel=rbf, score=0.5125682384254215, total=   0.0s
[Parallel(n_jobs=1)]: Done  21 out of  21 | elapsed:    1.9s remaining:    0.0s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV]  C=1, gamma=0.0001, kernel=rbf, score=0.5074483726727617, total=   0.0s
[Parallel(n_jobs=1)]: Done  22 out of  22 | elapsed:    2.0s remaining:    0.0s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV]  C=1, gamma=0.0001, kernel=rbf, score=0.49945975011153465, total=   0.0s
[Parallel(n_jobs=1)]: Done  23 out of  23 | elapsed:    2.0s remaining:    0.0s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  C=1, gamma=0.0001, kernel=rbf, score=0.5165817289882646, total=   0.0s
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:    2.1s remaining:    0.0s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV]  C=1, gamma=0.0001, kernel=rbf, score=0.5115352312503064, total=   0.0s
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    2.2s remaining:    0.0s
[CV] C=10, gamma=1, kernel=rbf .......................................
[CV]  C=10, gamma=1, kernel=rbf, score=0.8711469835563308, total=   0.0s
[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed:    2.2s remaining:    0.0s
[CV] C=10, gamma=1, kernel=rbf .......................................
[CV]  C=10, gamma=1, kernel=rbf, score=0.88591542925696, total=   0.0s
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    2.3s remaining:    0.0s
[CV] C=10, gamma=1, kernel=rbf .......................................
[CV]  C=10, gamma=1, kernel=rbf, score=0.8871778995018432, total=   0.0s
[Parallel(n_jobs=1)]: Don

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  C=10, gamma=0.001, kernel=rbf, score=0.5243263940985247, total=   0.0s
[Parallel(n_jobs=1)]: Done  42 out of  42 | elapsed:    3.6s remaining:    0.0s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV]  C=10, gamma=0.001, kernel=rbf, score=0.5173397875112081, total=   0.0s
[Parallel(n_jobs=1)]: Done  43 out of  43 | elapsed:    3.7s remaining:    0.0s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV]  C=10, gamma=0.001, kernel=rbf, score=0.5392618934724198, total=   0.0s
[Parallel(n_jobs=1)]: Done  44 out of  44 | elapsed:    3.8s remaining:    0.0s
[CV] C=10, gamma=0.001, kernel=rbf ...................................


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  C=10, gamma=0.001, kernel=rbf, score=0.5185073408768246, total=   0.0s
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    3.9s remaining:    0.0s
[CV] C=10, gamma=0.0001, kernel=rbf ..................................
[CV]  C=10, gamma=0.0001, kernel=rbf, score=0.5066742081447964, total=   0.0s
[Parallel(n_jobs=1)]: Done  46 out of  46 | elapsed:    4.0s remaining:    0.0s
[CV] C=10, gamma=0.0001, kernel=rbf ..................................
[CV]  C=10, gamma=0.0001, kernel=rbf, score=0.5267314921920185, total=   0.0s
[Parallel(n_jobs=1)]: Done  47 out of  47 | elapsed:    4.1s remaining:    0.0s
[CV] C=10, gamma=0.0001, kernel=rbf ..................................


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  C=10, gamma=0.0001, kernel=rbf, score=0.506084656084656, total=   0.0s
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:    4.2s remaining:    0.0s
[CV] C=10, gamma=0.0001, kernel=rbf ..................................
[CV]  C=10, gamma=0.0001, kernel=rbf, score=0.5387739788937393, total=   0.0s
[Parallel(n_jobs=1)]: Done  49 out of  49 | elapsed:    4.2s remaining:    0.0s
[CV] C=10, gamma=0.0001, kernel=rbf ..................................
[CV]  C=10, gamma=0.0001, kernel=rbf, score=0.5079716349564115, total=   0.0s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    4.3s remaining:    0.0s
[CV] C=100, gamma=1, kernel=rbf ......................................


  'precision', 'predicted', average, warn_for)


[CV]  C=100, gamma=1, kernel=rbf, score=0.9208853425034675, total=   0.0s
[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:    4.4s remaining:    0.0s
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV]  C=100, gamma=1, kernel=rbf, score=0.9064557206323538, total=   0.0s
[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:    4.5s remaining:    0.0s
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV]  C=100, gamma=1, kernel=rbf, score=0.9152092195414926, total=   0.0s
[Parallel(n_jobs=1)]: Done  53 out of  53 | elapsed:    4.5s remaining:    0.0s
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV]  C=100, gamma=1, kernel=rbf, score=0.9258570367183965, total=   0.0s
[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:    4.6s remaining:    0.0s
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV]  C=100, gamma=1, kernel=rbf, score=0.8943338861249309, total=   0.0s
[Parallel(n_jobs=1)]: Done

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  C=100, gamma=0.0001, kernel=rbf, score=0.50274358555251, total=   0.0s
[Parallel(n_jobs=1)]: Done  73 out of  73 | elapsed:    6.7s remaining:    0.0s
[CV] C=100, gamma=0.0001, kernel=rbf .................................
[CV]  C=100, gamma=0.0001, kernel=rbf, score=0.5369704342307082, total=   0.0s
[Parallel(n_jobs=1)]: Done  74 out of  74 | elapsed:    6.8s remaining:    0.0s
[CV] C=100, gamma=0.0001, kernel=rbf .................................
[CV]  C=100, gamma=0.0001, kernel=rbf, score=0.5080602756552228, total=   0.0s
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    6.9s remaining:    0.0s
[CV] C=1000, gamma=1, kernel=rbf .....................................


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  C=1000, gamma=1, kernel=rbf, score=0.9309977793114326, total=   0.0s
[Parallel(n_jobs=1)]: Done  76 out of  76 | elapsed:    7.0s remaining:    0.0s
[CV] C=1000, gamma=1, kernel=rbf .....................................
[CV]  C=1000, gamma=1, kernel=rbf, score=0.906570091452919, total=   0.0s
[Parallel(n_jobs=1)]: Done  77 out of  77 | elapsed:    7.1s remaining:    0.0s
[CV] C=1000, gamma=1, kernel=rbf .....................................
[CV]  C=1000, gamma=1, kernel=rbf, score=0.918407010535093, total=   0.0s
[Parallel(n_jobs=1)]: Done  78 out of  78 | elapsed:    7.2s remaining:    0.0s
[CV] C=1000, gamma=1, kernel=rbf .....................................
[CV]  C=1000, gamma=1, kernel=rbf, score=0.9211840924169692, total=   0.0s
[Parallel(n_jobs=1)]: Done  79 out of  79 | elapsed:    7.3s remaining:    0.0s
[CV] C=1000, gamma=1, kernel=rbf .....................................
[CV]  C=1000, gamma=1, kernel=rbf, score=0.8904880179971422, total=   0.0s
[Parallel(n_jobs=1)]: D

### All Groups, All Data, Resampled

In [None]:
# Load the excel file
data = load_data("real_data2.xlsx")

# Resample the data so that there are even numbers of each label
resampled_data = resample_data(filtered_data)

# Split predictor and response data
X, y = split_x_y_data(resampled_data)

# Shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)

# standardize the data
X_train_std, X_test_std = standardize_data(X_train, X_test)

# Perform a grid search to find best SVC model
svm_grid_search(X_train, X_test, y_train, y_test)

Loaded data with shape:  (746, 39)
# Tuning hyper-parameters for f1



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

### Control vs. All Parkinsons, All Data, Resampled

In [None]:
# Load the excel file
data = load_data("real_data2.xlsx")

# Combine Parkinson's Groups
filtered_data = filter_data(data, combine_p_groups=True)

# Resample the data so that there are even numbers of each label
resampled_data = resample_data(filtered_data)

# Split predictor and response data
X, y = split_x_y_data(resampled_data)

# Shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)

# standardize the data
X_train_std, X_test_std = standardize_data(X_train, X_test)

# Perform a grid search to find best SVC model
svm_grid_search(X_train, X_test, y_train, y_test)

### Ignore Controls, All Data, Resampled

In [None]:
# Load the excel file
data = load_data("real_data2.xlsx")

# Ignore the Control Group
filtered_data = filter_data(data, include_control=False)

# Resample the data so that there are even numbers of each label
resampled_data = resample_data(filtered_data)

# Split predictor and response data
X, y = split_x_y_data(resampled_data)

# Shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)

# standardize the data
X_train_std, X_test_std = standardize_data(X_train, X_test)

# Perform a grid search to find best SVC model
svm_grid_search(X_train, X_test, y_train, y_test)

### Ignore Controls, 7 Regions and UPDRS, Resampled

In [None]:
# Load the excel file
data = load_data("real_data2.xlsx")

# Only grab the columns of interest.  TODO: Replace with list derived from feature selection
columns_of_interest = ["GroupID","SCP_FW","MCP_FW","Putamen_FA","Caudate_FA","STN_FW", "RN_FW", "Thalamus_FA", "UPDRS"]
filtered_data = filter_data(data, columns=columns_of_interest, include_control=False)

# Resample the data so that there are even numbers of each label
resampled_data = resample_data(filtered_data)

# Split predictor and response data
X, y = split_x_y_data(resampled_data)

# Shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)

# standardize the data
X_train_std, X_test_std = standardize_data(X_train, X_test)

# Perform a grid search to find best SVC model
svm_grid_search(X_train, X_test, y_train, y_test)