In this notebook I want  to try to recreate the feature subset selection that Derek did using ANOVA to find the most significant features when evaluating differences between certain classes.  Namely Control/Parkinsons,  PD/MSA&PSP,  MSA/PSP

In [49]:
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

def svm_grid_search(X_train, X_test, y_train, y_test, cv=5):

    tuned_parameters = [{'kernel': ['linear'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}, {'kernel': ['rbf'], 'C': [1, 10, 100, 1000], 'gamma': [1e-4, 1e-3, 1e-2, 1e-1, 1]}]

    print("# Tuning hyper-parameters for f1")
    print()

    clf = GridSearchCV(svm.SVC(), tuned_parameters, cv=cv,
                       n_jobs = -1 )
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
    
    return clf

In [51]:
from sklearn.utils import resample

def resample_to_equal_class_sizes(X,y):
    df = pd.DataFrame(X)
    df['group'] = [int(i) for i in y]
    groups = []
    for v in set(df['group']):
        groups.append(df[df['group'] == v])
           
    max_length = max([len(group) for group in groups])
    print("Maximum class size is %s" %max_length)
    
    final_groups = []
    for group in groups:
        if len(group) < max_length:
            print("Class %s size is %s. Resampling with replacement to %s" %(max(group['group']),len(group), max_length))
            final_groups.append(resample(group, replace=True, n_samples=max_length))
        else:
            print("Class %s size has max class size (%s)." %(max(group['group']), max_length))
            final_groups.append(group)
    df = pd.concat(final_groups)
    return df.drop('group', axis=1).values, df['group'].values
    

#print(X_train_std.shape)
#print(y_train_group_park.shape)
#xr, yr = resample_to_equal_class_sizes(X_train_std, y_train_group_park)
#print(xr.shape, yr.shape)


In [12]:
# Read in the data
import pandas as pd

raw_data = pd.read_excel('data/training_data.xlsx')

# remove unneeded subject ID column
data = raw_data.drop('Subject', axis=1)

### Control Vs. All Parkinsons

In [48]:
cvp = data.copy()
cvp.loc[cvp['GroupID'] != 0,'GroupID'] = 1

# Feature Extraction with Univariate Statistical Tests (ANOVA F-value for classification)
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif

y_cvp = cvp['GroupID']
X_cvp = cvp.drop('GroupID', axis=1)

# feature extraction
test_cvp = SelectKBest(score_func=f_classif, k=10)
fit_cvp = test_cvp.fit(X_cvp, y_cvp)

# summarize scores
#np.set_printoptions(suppress=True)
#for x in pd.DataFrame([cvp.columns, fit.scores_]).transpose().sort_values(by=1, ascending=False).transpose().values[1]:
#    print(x)

In [70]:
from sklearn.cross_validation import train_test_split

X_cvp_fss = fit_cvp.transform(X_cvp)
X_cvp_fss_train, X_cvp_fss_test, y_train, y_test = train_test_split(X_cvp_fss, y_cvp)

X_cvp_fss_train_res, y_train_res = resample_to_equal_class_sizes(X_cvp_fss_train, y_train)

clf_cvp = svm_grid_search(X_cvp_fss_train_res, X_cvp_fss_test, y_train_res, y_test)

Maximum class size is 377
Class 0 size is 182. Resampling with replacement to 377
Class 1 size has max class size (377).
# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'C': 100, 'gamma': 1, 'kernel': 'rbf'}

Grid scores on development set:

0.935 (+/-0.043) for {'C': 0.001, 'kernel': 'linear'}
0.938 (+/-0.045) for {'C': 0.01, 'kernel': 'linear'}
0.935 (+/-0.049) for {'C': 0.1, 'kernel': 'linear'}
0.936 (+/-0.045) for {'C': 1, 'kernel': 'linear'}
0.944 (+/-0.028) for {'C': 10, 'kernel': 'linear'}
0.942 (+/-0.034) for {'C': 100, 'kernel': 'linear'}
0.940 (+/-0.035) for {'C': 1000, 'kernel': 'linear'}
0.928 (+/-0.056) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.939 (+/-0.047) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.935 (+/-0.046) for {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
0.939 (+/-0.047) for {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
0.951 (+/-0.037) for {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
0.936 (+/-0.044) for {'C': 10, 'gamma': 0.0001, 'k

In [99]:
# import the validation data
import pandas as pd

raw_validation_data = pd.read_excel('data/Validation.xlsx')

# remove unneeded subject ID column
validation_data = raw_validation_data[data.columns]
validation_data.loc[validation_data['GroupID'] != 0, 'GroupID'] = 1

# split x and y data
y_valid = validation_data['GroupID']
X_valid = validation_data.drop('GroupID', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [100]:
#transform the validation data (feature selection)
X_valid_fss = fit_cvp.transform(X_valid)

clf_cvp.score(X_valid_fss, y_valid)

0.92

### PD vs. MSA/PSP

In [86]:
pva = data.copy()
pva = pva.loc[pva['GroupID'] != 0]
pva.loc[pva['GroupID'] != 1,'GroupID'] = 2

# Feature Extraction with Univariate Statistical Tests (ANOVA F-value for classification)
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

y_pva = pva['GroupID']
X_pva = pva.drop('GroupID', axis=1)

# feature extraction
test_pva = SelectKBest(score_func=f_classif, k=10)
fit_pva = test_pva.fit(X_pva, y_pva)

# summarize scores
np.set_printoptions(suppress=True)
#for x in pd.DataFrame([pva.columns, fit.scores_]).transpose().sort_values(by=1, ascending=False).transpose().values[1]:
#    print(x)

In [87]:
from sklearn.cross_validation import train_test_split

X_pva_fss = fit_pva.transform(X_pva)
X_pva_fss_train, X_pva_fss_test, y_train, y_test = train_test_split(X_pva_fss, y_pva)

X_pva_fss_train_res, y_train_res = resample_to_equal_class_sizes(X_pva_fss_train, y_train)

clf_pva = svm_grid_search(X_pva_fss_train_res, X_pva_fss_test, y_train_res, y_test)

Maximum class size is 300
Class 1 size has max class size (300).
Class 2 size is 79. Resampling with replacement to 300
# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'C': 100, 'gamma': 1, 'kernel': 'rbf'}

Grid scores on development set:

0.663 (+/-0.079) for {'C': 0.001, 'kernel': 'linear'}
0.668 (+/-0.064) for {'C': 0.01, 'kernel': 'linear'}
0.742 (+/-0.063) for {'C': 0.1, 'kernel': 'linear'}
0.812 (+/-0.037) for {'C': 1, 'kernel': 'linear'}
0.842 (+/-0.042) for {'C': 10, 'kernel': 'linear'}
0.845 (+/-0.043) for {'C': 100, 'kernel': 'linear'}
0.840 (+/-0.065) for {'C': 1000, 'kernel': 'linear'}
0.655 (+/-0.076) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.660 (+/-0.071) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.700 (+/-0.071) for {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
0.715 (+/-0.027) for {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
0.795 (+/-0.034) for {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
0.663 (+/-0.070) for {'C': 10, 'gamma': 0.0001, 'ke

In [97]:
# import the validation data
import pandas as pd

raw_validation_data = pd.read_excel('data/Validation.xlsx')

# remove unneeded subject ID column
validation_data = raw_validation_data[data.columns]
validation_data = validation_data.loc[validation_data['GroupID'] != 0]
validation_data.loc[validation_data['GroupID'] != 1,'GroupID'] = 2

# split x and y data
y_valid = validation_data['GroupID']
X_valid = validation_data.drop('GroupID', axis=1)

In [98]:
#transform the validation data (feature selection)
X_valid_fss = fit_pva.transform(X_valid)

print(clf_pva.score(X_valid_fss, y_valid.values))
print("Actual:   ", y_valid.values)
print("Predicted:",clf_pva.predict(X_valid_fss))

0.75
Actual:    [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
 2 2 2 2 2 2 2]
Predicted: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 2 1 1 1
 1 1 2 1 1 2 1]


### MSA vs PSP

In [104]:
mvp = data.copy()
mvp = mvp.loc[mvp['GroupID'] != 0]
mvp = mvp.loc[mvp['GroupID'] != 1]

# Feature Extraction with Univariate Statistical Tests (ANOVA F-value for classification)
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

y_mvp = mvp['GroupID']
X_mvp = mvp.drop('GroupID', axis=1)

# feature extraction
test_mvp = SelectKBest(score_func=f_classif, k=10)
fit_mvp = test_mvp.fit(X_mvp, y_mvp)

# summarize scores
#np.set_printoptions(suppress=True)
#for x in pd.DataFrame([mvp.columns, fit.scores_]).transpose().sort_values(by=1, ascending=False).transpose().values[1]:
#    print(x)
    
#pd.DataFrame([mvp.columns, fit.scores_]).transpose().sort_values(by=1, ascending=False)

In [105]:
from sklearn.cross_validation import train_test_split

X_mvp_fss = fit_mvp.transform(X_mvp)
X_mvp_fss_train, X_mvp_fss_test, y_train, y_test = train_test_split(X_mvp_fss, y_mvp)

X_mvp_fss_train_res, y_train_res = resample_to_equal_class_sizes(X_mvp_fss_train, y_train)

clf_mvp = svm_grid_search(X_mvp_fss_train_res, X_mvp_fss_test, y_train_res, y_test)

Maximum class size is 41
Class 2 size is 39. Resampling with replacement to 41
Class 3 size has max class size (41).
# Tuning hyper-parameters for f1

Best parameters set found on development set:

{'C': 1000, 'kernel': 'linear'}

Grid scores on development set:

0.671 (+/-0.149) for {'C': 0.001, 'kernel': 'linear'}
0.671 (+/-0.149) for {'C': 0.01, 'kernel': 'linear'}
0.671 (+/-0.149) for {'C': 0.1, 'kernel': 'linear'}
0.768 (+/-0.049) for {'C': 1, 'kernel': 'linear'}
0.805 (+/-0.124) for {'C': 10, 'kernel': 'linear'}
0.841 (+/-0.188) for {'C': 100, 'kernel': 'linear'}
0.866 (+/-0.146) for {'C': 1000, 'kernel': 'linear'}
0.671 (+/-0.149) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.671 (+/-0.149) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.671 (+/-0.149) for {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
0.671 (+/-0.149) for {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
0.768 (+/-0.049) for {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
0.671 (+/-0.149) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf

In [107]:
# import the validation data
import pandas as pd

raw_validation_data = pd.read_excel('data/Validation.xlsx')

# remove unneeded subject ID column
validation_data = raw_validation_data[data.columns]
validation_data = validation_data.loc[validation_data['GroupID'] != 0]
validation_data = validation_data.loc[validation_data['GroupID'] != 1]

# split x and y data
y_valid = validation_data['GroupID']
X_valid = validation_data.drop('GroupID', axis=1)

In [108]:
#transform the validation data (feature selection)
X_valid_fss = fit_mvp.transform(X_valid)

print(clf_mvp.score(X_valid_fss, y_valid.values))
print("Actual:   ", y_valid.values)
print("Predicted:",clf_mvp.predict(X_valid_fss))

0.9166666666666666
Actual:    [2 2 2 2 3 3 3 3 3 3 3 3]
Predicted: [2 2 3 2 3 3 3 3 3 3 3 3]
