In [866]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn import svm
import networkx as nx
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

In [2]:
df = pd.read_csv('/Users/harshil.dadlani/Desktop/UW_Courses/2021_Fall/560/Project/Datasets/Extracted_Data/final_extracted_a0f66459.csv')

In [33]:
electrode_col_names = [col for col in df.columns if 'GRID' in col ]

In [39]:
X = df[electrode_col_names]
Y = df['mvmt']

In [831]:
# Separate out the test-set as hold-out (80:20)
X_train, X_test, y_train, y_test = train_test_split(X, Y,stratify=Y, test_size=0.20)

In [718]:
#test_size=len(round(y_test))/2
int(len(y_test)/2)

168

In [719]:
# Separate out the train and validation set (70:10)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,stratify=y_train, test_size=int(len(y_test)/2))

In [848]:
# Define utility functions
def get_scores(X,y,classifier,scoring_metrics=['accuracy','balanced_accuracy']):
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    pipeline = make_pipeline(classifier)
    scores = cross_validate(pipeline, X, y, scoring=scoring_metrics, cv=cv, n_jobs=-1)
    return {"balanced_accuracy": scores['test_balanced_accuracy'].mean(),
            "accuracy":scores['test_accuracy'].mean()}
            
def get_positively_correlated_features(df, threshold=0.6):
    cols = X_train.corr().unstack()
    cols = cols.sort_values(kind="quicksort",ascending=False)[64:]
    corr_elems = list(cols[cols>threshold].index)
    correlated_features = [list(pair) for pair in corr_elems[1::2]]
    g = nx.Graph()
    ipath = correlated_features
    for p in ipath:
        g.add_edges_from(zip(p, p[1:]))
    correlated_features = []
    for c in nx.connected_components(g):
        correlated_features.append(list(c))
    return correlated_features

In [849]:
# Random guessing
y_train.value_counts()/len(y_train)

r_arm_1    0.639344
mv_0       0.360656
Name: mvmt, dtype: float64

In [895]:
# Iteration-0, Baseline model
clf = LogisticRegression(random_state=0,C=10)
get_scores(X_train,y_train,clf,scoring_metrics=['accuracy','balanced_accuracy'])


{'balanced_accuracy': 0.4982501473321547, 'accuracy': 0.637109064343709}

In [896]:
# Test Set performance
clf.fit(X_train,y_train)
pred=clf.predict(X_test)
balanced_accuracy_score(pred,y_test)

0.3194029850746269

In [904]:
# Iteration-1, Random Forest tuning

# Number of trees in random forest
n_estimators = [100]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [5,10,15,20,25,30,35,40,45,50,60,70,80,90,100]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]

grid_combo = list(itertools.product(n_estimators, max_features, max_depth,min_samples_split,min_samples_leaf,bootstrap))
best_score = None
best_model = None
max_iter=2
best_bac=0
iter_1=10000
for combo in grid_combo:
    clf = RandomForestClassifier(n_estimators=combo[0], max_features=combo[1], max_depth=combo[2],min_samples_split=combo[3],min_samples_leaf=combo[4],bootstrap=combo[5])
    res = get_scores(X_train,y_train,clf,scoring_metrics=['accuracy','balanced_accuracy'])
    if res['balanced_accuracy']>best_bac:
        best_bac=res['balanced_accuracy']
        best_score=res
        best_model=combo
    if iter_1==max_iter:
        break
    iter_1+=1
print(best_score)
print(best_model)

{'balanced_accuracy': 0.6001262035436362, 'accuracy': 0.6731398768240583}
(100, 'sqrt', 30, 2, 1, False)


In [905]:
# Test Set Performance
combo=best_model
clf=RandomForestClassifier(n_estimators=combo[0], max_features=combo[1], max_depth=combo[2],min_samples_split=combo[3],min_samples_leaf=combo[4],bootstrap=combo[5])
clf.fit(X_train,y_train)
pred=clf.predict(X_test)
balanced_accuracy_score(pred,y_test)

0.6334586466165413

In [734]:
# Iteration-2, Peform Dimensionality reduction

def perform_dimensionality_reduction(X_train,X_val,threshold=0.7):
    correlated_features = get_positively_correlated_features(X_train,threshold)
    combined_features=[]
    X_train_temp=pd.DataFrame()
    X_val_temp = pd.DataFrame()
    for group in correlated_features:
        combined_features.extend(group)
        if len(group)==2:
            pca = PCA(n_components=1)
            pca.fit(X_train[pair])
            X_train_temp['_'.join(group)+'_PC1']=pd.Series(pca.transform(X_train[pair]).T[0],index=X_train.index)
            X_val_temp['_'.join(group)+'_PC1']=pd.Series(pca.transform(X_val[pair]).T[0],index=X_val.index)
        else:
            pca = PCA(n_components=2)
            pca.fit(X_train[pair])
            X_train_temp['_'.join(group)+'_PC1']=pd.Series(pca.transform(X_train[pair]).T[0],index=X_train.index)
            X_train_temp['_'.join(group)+'_PC2']=pd.Series(pca.transform(X_train[pair]).T[1],index=X_train.index) 
            
            X_val_temp['_'.join(group)+'_PC1']=pd.Series(pca.transform(X_val[pair]).T[0],index=X_val.index)        
            X_val_temp['_'.join(group)+'_PC2']=pd.Series(pca.transform(X_val[pair]).T[1],index=X_val.index)
    
    return X_train_temp, X_val_temp

In [743]:
# C = [0.1,1,10,10000]
# gamma = [0.1,0.01,0.001]
# kernel = ['rbf', 'poly', 'sigmoid']
# grid_combo = list(itertools.product(C, gamma, kernel))

# cols=['GRID1','GRID2','GRID3','GRID4','GRID5']
# for combo in grid_combo:
#     print(combo)
#     clf = svm.SVC(C=combo[0],gamma=combo[1],kernel=combo[2])
#     clf.fit(X_train[cols],y_train)
#     pred = clf.predict(X_val[cols])
#     score = balanced_accuracy_score(y_val,pred)
#     print(score)