In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn import svm
import networkx as nx
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
import numpy as np
import itertools
from sklearn.metrics import precision_score

In [2]:
df = pd.read_csv('/Users/harshil.dadlani/Desktop/UW_Courses/2021_Fall/560/Project/Datasets/Extracted_Data/final_extracted_a0f66459.csv')

In [3]:
electrode_col_names = [col for col in df.columns if 'GRID' in col ]

In [4]:
X = df[electrode_col_names]
Y = df['mvmt']
mapping = {'mv_0':0, 'r_arm_1':1}
Y = Y.map(mapping)
scoring_metrics = ['balanced_accuracy', 'accuracy', 'average_precision']
random_state=1

In [5]:
# Separate out the test-set as hold-out (80:20)
X_train, X_test, y_train, y_test = train_test_split(X, Y,stratify=Y, test_size=0.20)

In [6]:
#test_size=len(round(y_test))/2
int(len(y_test)/2)

168

In [7]:
# Define utility functions
def get_scores(X,y,classifier,scoring_metrics):
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=random_state)
    pipeline = make_pipeline(classifier)
    scores = cross_validate(pipeline, X, y, scoring=scoring_metrics, cv=cv, n_jobs=-1)
    return {'average_precision':scores['test_average_precision'].mean(),
            'balanced_accuracy':scores['test_balanced_accuracy'].mean(),
            'accuracy':scores['test_accuracy'].mean()}
            
def get_positively_correlated_features(df, threshold=0.6):
    cols = X_train.corr().unstack()
    cols = cols.sort_values(kind="quicksort",ascending=False)[64:]
    corr_elems = list(cols[cols>threshold].index)
    correlated_features = [list(pair) for pair in corr_elems[1::2]]
    g = nx.Graph()
    ipath = correlated_features
    for p in ipath:
        g.add_edges_from(zip(p, p[1:]))
    correlated_features = []
    for c in nx.connected_components(g):
        correlated_features.append(list(c))
    return correlated_features

In [8]:
# Random guessing
y_train.value_counts()/len(y_train)

1    0.639344
0    0.360656
Name: mvmt, dtype: float64

In [9]:
# Iteration-0, Baseline model
clf = LogisticRegression(C=10,random_state=random_state)
get_scores(X_train,y_train,clf,scoring_metrics=scoring_metrics)


{'average_precision': 0.6645852570139653,
 'balanced_accuracy': 0.4971932465090107,
 'accuracy': 0.635369620299987}

In [10]:
# Iteration-0, Test Set performance
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy_score(y_test,pred)

0.6398809523809523

In [30]:
# Utility functions
def tune_rf(X_train, y_train, criteria='accuracy'):
    grid_combo = list(itertools.product(n_estimators, max_features, max_depth,min_samples_split,min_samples_leaf,bootstrap))
    best_res = None
    best_model = None
    max_iter=100
    iter_1=0
    best_metric=0
    criteria = criteria
    for combo in grid_combo:
        clf = RandomForestClassifier(n_estimators=combo[0], max_features=combo[1], max_depth=combo[2],min_samples_split=combo[3],min_samples_leaf=combo[4],bootstrap=combo[5])
        res = get_scores(X_train,y_train,clf,scoring_metrics=scoring_metrics)
        if res[criteria]>best_metric:
            best_metric=res[criteria]
            best_res=res
            best_model=combo
        if iter_1==max_iter:
            break
        iter_1+=1
    return best_model, best_res

def perform_dimensionality_reduction(X_train,X_val,threshold=0.7):
    correlated_features = get_positively_correlated_features(X_train,threshold)
    combined_features=[]
    X_train_temp=pd.DataFrame()
    X_val_temp = pd.DataFrame()
    for group in correlated_features:
        combined_features.extend(group)
        if len(group)==2:
            pca = PCA(n_components=1,random_state=random_state)
            pca.fit(X_train[group])
            X_train_temp['_'.join(group)+'_PC1']=pd.Series(pca.transform(X_train[group]).T[0],index=X_train.index)
            X_val_temp['_'.join(group)+'_PC1']=pd.Series(pca.transform(X_val[group]).T[0],index=X_val.index)
        else:
            pca = PCA(n_components=2,random_state=random_state)
            pca.fit(X_train[group])
            X_train_temp['_'.join(group)+'_PC1']=pd.Series(pca.transform(X_train[group]).T[0],index=X_train.index)
            X_train_temp['_'.join(group)+'_PC2']=pd.Series(pca.transform(X_train[group]).T[1],index=X_train.index) 
            
            X_val_temp['_'.join(group)+'_PC1']=pd.Series(pca.transform(X_val[group]).T[0],index=X_val.index)        
            X_val_temp['_'.join(group)+'_PC2']=pd.Series(pca.transform(X_val[group]).T[1],index=X_val.index)
    
    return X_train_temp.join(X_train.drop(combined_features,axis=1)), X_val_temp.join(X_val.drop(combined_features,axis=1))

In [12]:
# Iteration-1, Random Forest tuning

# Number of trees in random forest
n_estimators = [100]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [10,20,30,40,50,60,70,80,90,100,110,120]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]

best_model, best_score=tune_rf(X_train, y_train)


In [20]:
# Iteration-1, Test Set performance
clf=RandomForestClassifier(n_estimators=best_model[0], max_features=best_model[1], max_depth=best_model[2],min_samples_split=best_model[3],min_samples_leaf=best_model[4],bootstrap=best_model[5],random_state=random_state)
clf.fit(X_train,y_train)
pred=clf.predict(X_test)
accuracy_score(y_test,pred)

0.6755952380952381

In [31]:
# Iteration-2, Perform Dimensionality reduction and tune Random Forest Classifier

X_train_2, X_test_2 = perform_dimensionality_reduction(X_train, X_test, threshold=0.55)
# Number of trees in random forest
n_estimators = [100]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [5,10,15,20,25,30,35,40,45,50,60,70,80,90,100]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]

best_model, best_score=tune_rf(X_train_2, y_train)

In [35]:
# Test Set Performance 2
clf=RandomForestClassifier(n_estimators=best_model[0], max_features=best_model[1], max_depth=best_model[2],min_samples_split=best_model[3],min_samples_leaf=best_model[4],bootstrap=best_model[5],random_state=random_state)
clf.fit(X_train_2,y_train)
pred=clf.predict(X_test_2)
accuracy_score(y_test,pred)

0.7053571428571429

In [44]:
# Iteration-3, Perform Dimensionality reduction and tune SVM
C = [0.1,1,10,10000]
gamma = [0.1,0.01,0.001]
kernel = ['rbf']
grid_combo = list(itertools.product(C, gamma, kernel))

for combo in grid_combo:
    clf = svm.SVC(C=combo[0],gamma=combo[1],kernel=combo[2])
    clf.fit(X_train_2,y_train)
    pred = clf.predict(X_test_2)
    score = accuracy_score(y_test,pred)
    print(score)

0.6398809523809523
0.6398809523809523
0.6398809523809523
0.6398809523809523
0.6398809523809523
0.6398809523809523
0.6398809523809523
0.6398809523809523
0.6398809523809523
0.6398809523809523
0.6398809523809523
0.6398809523809523
