## CS 155 Project 1 Modelling Shenanigans!
Let's start modelling for fun! :)

In [261]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score, make_scorer
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn import linear_model
from sklearn.model_selection import KFold

In [281]:
df = pd.read_csv("trainingData2.csv")
df = df.sample(frac=1)

trainsize = int(len(df.index)*0.75)
df_train = df.iloc[:trainsize,:]
df_test = df.iloc[trainsize:,:]
weights = (((df_train['lab']==1)+0.1).values)*6.9
labtest = df_test[df_test['lab']==1.0]

df_train=df_train.drop('lab', axis=1)
df_test=df_test.drop('lab', axis=1)
labtest=labtest.drop('lab', axis=1)
df = df.drop('lab',axis=1)

In [282]:
# Normalizing data
NPTX = df_train.drop('label', axis=1).values
scaler = StandardScaler()
scaler.fit(NPTX)
NPTX=scaler.transform(NPTX)
NPTY = df_test.drop('label', axis=1).values
NPTY=scaler.transform(NPTY)
NPTL = labtest.drop('label', axis=1).values
NPTL=scaler.transform(NPTL)

normalizedTrainDF= pd.DataFrame(NPTX, columns = df_train.loc[:, df.columns != 'label'].columns.values)
normalizedTestDF= pd.DataFrame(NPTY, columns = df_test.loc[:, df.columns != 'label'].columns.values)
normalizedTestLabDF= pd.DataFrame(NPTL, columns = df_test.loc[:, df.columns != 'label'].columns.values)

### Stepwise Logistic Regression

In [289]:
ftwo_scorer = make_scorer(fbeta_score, beta=2)

#Stepwise log reg?
sfs = SequentialFeatureSelector(linear_model.LogisticRegression(max_iter=12000),
                                k_features='best',
                                forward=True,
                                scoring=ftwo_scorer,
                                cv=KFold(n_splits=5, shuffle=True,random_state=False))
selected_features = sfs.fit(normalizedTrainDF, df_train['label'].values).k_feature_names_

model = LogisticRegression(multi_class='multinomial', solver='lbfgs',max_iter=12000, class_weight=[1,10])
model.fit(normalizedTrainDF[np.asarray(selected_features)], df_train['label'].values, sample_weight = weights)

# Check accuracy on testing data
yhat = model.predict(normalizedTestDF[np.asarray(selected_features)])
testy= df_test['label'].values
yhat0 = model.predict(normalizedTrainDF[np.asarray(selected_features)])
trainy = df_train['label'].values

yhat2 = model.predict(normalizedTestLabDF[np.asarray(selected_features)])
testy2= labtest['label'].values

print('Train Accuracy: '+ str(np.sum(yhat0==trainy)/len(trainy)))
print( fbeta_score(trainy, yhat0, average='weighted', beta=2))

print('Test Accuracy: '+ str(np.sum(yhat==testy)/len(testy)))
print(fbeta_score(testy, yhat, average='weighted', beta=2))

print('Lab Test Accuracy: '+ str(np.sum(yhat==testy)/len(testy)))
print(fbeta_score(testy2, yhat2, average='weighted', beta=2))

print(selected_features)

InvalidParameterError: The 'class_weight' parameter of LogisticRegression must be an instance of 'dict', a str among {'balanced'} or None. Got {1, 10} instead.

In [284]:
sfs = SequentialFeatureSelector(linear_model.LogisticRegression(max_iter=12000),
                                k_features='parsimonious',
                                forward=True,
                                scoring=ftwo_scorer,
                                cv=KFold(n_splits=5, shuffle=True,random_state=False))
selected_features = sfs.fit(normalizedTrainDF, df_train['label'].values).k_feature_names_
model = LogisticRegression(multi_class='multinomial', solver='lbfgs',max_iter=12000)
model.fit(normalizedTrainDF[np.asarray(selected_features)], df_train['label'].values, sample_weight = weights)

# Check accuracy on testing data
yhat = model.predict(normalizedTestDF[np.asarray(selected_features)])
testy= df_test['label'].values
yhat0 = model.predict(normalizedTrainDF[np.asarray(selected_features)])
trainy = df_train['label'].values

yhat2 = model.predict(normalizedTestLabDF[np.asarray(selected_features)])
testy2= labtest['label'].values

print('Train Accuracy: '+ str(np.sum(yhat0==trainy)/len(trainy)))
print( fbeta_score(trainy, yhat0, average='weighted', beta=2))

print('Test Accuracy: '+ str(np.sum(yhat==testy)/len(testy)))
print(fbeta_score(testy, yhat, average='weighted', beta=2))

print('Lab Test Accuracy: '+ str(np.sum(yhat==testy)/len(testy)))
print(fbeta_score(testy2, yhat2, average='weighted', beta=2))

print(selected_features)

Train Accuracy: 0.9842454394693201
0.984221614045162
Test Accuracy: 0.9833333333333333
0.9832947656429426
Lab Test Accuracy: 0.9833333333333333
0.8978721642656069
('meanStepDisplacement', 'sdStepDisplacement', 'shapiroDisplacement', 'meanAngle', 'meanVideoStepTravel')


In [285]:
sfs = SequentialFeatureSelector(linear_model.LogisticRegression(max_iter=12000),
                                k_features='best',
                                forward=False,
                                scoring=ftwo_scorer,
                                cv=KFold(n_splits=5, shuffle=True,random_state=False))
selected_features = sfs.fit(normalizedTrainDF, df_train['label'].values).k_feature_names_
model = LogisticRegression(multi_class='multinomial', solver='lbfgs',max_iter=12000, class_weight='balanced')
model.fit(normalizedTrainDF[np.asarray(selected_features)], df_train['label'].values, sample_weight = weights)

yhat = model.predict(normalizedTestDF[np.asarray(selected_features)])
testy= df_test['label'].values
yhat0 = model.predict(normalizedTrainDF[np.asarray(selected_features)])
trainy = df_train['label'].values


# Check accuracy on testing data
yhat = model.predict(normalizedTestDF[np.asarray(selected_features)])
testy= df_test['label'].values
yhat0 = model.predict(normalizedTrainDF[np.asarray(selected_features)])
trainy = df_train['label'].values

yhat2 = model.predict(normalizedTestLabDF[np.asarray(selected_features)])
testy2= labtest['label'].values

print('Train Accuracy: '+ str(np.sum(yhat0==trainy)/len(trainy)))
print( fbeta_score(trainy, yhat0, average='weighted', beta=2))

print('Test Accuracy: '+ str(np.sum(yhat==testy)/len(testy)))
print(fbeta_score(testy, yhat, average='weighted', beta=2))

print('Lab Test Accuracy: '+ str(np.sum(yhat==testy)/len(testy)))
print(fbeta_score(testy2, yhat2, average='weighted', beta=2))

print(selected_features)

Train Accuracy: 0.9875621890547264
0.9875423429381873
Test Accuracy: 0.9875621890547264
0.9875366978452786
Lab Test Accuracy: 0.9875621890547264
0.9318181818181818
('meanStepDisplacement', 'meanStepTravel', 'sdStepDisplacement', 'sdStepTravel', 'shapiroDisplacement', 'meanAngle', 'sdAngle', 'sdVideoMeanStepDisplacement', 'meanVideoStepTravel', 'sdVideoMeanStepTravel', 'meanVideoAngle', 'sdVideoAngle')


In [286]:
sfs = SequentialFeatureSelector(linear_model.LogisticRegression(max_iter=12000),
                                k_features='parsimonious',
                                forward=False,
                                scoring=ftwo_scorer,
                                cv=KFold(n_splits=5, shuffle=True,random_state=False))
selected_features = sfs.fit(normalizedTrainDF, df_train['label'].values).k_feature_names_
model = LogisticRegression(multi_class='multinomial', solver='lbfgs',max_iter=12000, class_weight='balanced')
model.fit(normalizedTrainDF[np.asarray(selected_features)], df_train['label'].values, sample_weight = weights)
yhat = model.predict(normalizedTestDF[np.asarray(selected_features)])
testy= df_test['label'].values

yhat = model.predict(normalizedTestDF[np.asarray(selected_features)])
testy= df_test['label'].values
yhat0 = model.predict(normalizedTrainDF[np.asarray(selected_features)])
trainy = df_train['label'].values


# Check accuracy on testing data
yhat = model.predict(normalizedTestDF[np.asarray(selected_features)])
testy= df_test['label'].values
yhat0 = model.predict(normalizedTrainDF[np.asarray(selected_features)])
trainy = df_train['label'].values

yhat2 = model.predict(normalizedTestLabDF[np.asarray(selected_features)])
testy2= labtest['label'].values

print('Train Accuracy: '+ str(np.sum(yhat0==trainy)/len(trainy)))
print( fbeta_score(trainy, yhat0, average='weighted', beta=2))

print('Test Accuracy: '+ str(np.sum(yhat==testy)/len(testy)))
print(fbeta_score(testy, yhat, average='weighted', beta=2))

print('Lab Test Accuracy: '+ str(np.sum(yhat==testy)/len(testy)))
print(fbeta_score(testy2, yhat2, average='weighted', beta=2))

print(selected_features)

Train Accuracy: 0.9835820895522388
0.9835427292910329
Test Accuracy: 0.9823383084577114
0.9822712661392045
Lab Test Accuracy: 0.9823383084577114
0.8815927602658195
('meanStepDisplacement', 'meanStepTravel', 'shapiroDisplacement', 'meanAngle', 'sdVideoMeanStepDisplacement', 'meanVideoAngle', 'sdVideoAngle')


### SVM

In [149]:
from sklearn import svm
clf = svm.SVC(C=10, kernel = 'sigmoid', degree = 3)
clf = svm.SVC(C=10, kernel = 'poly', degree = 3, probability = True)
model2 = svm.SVC(C=15, kernel = 'poly', degree = 3)
clf = CalibratedClassifierCV(model2, method='isotonic', cv=10, class_weight='balanced')
clf.fit(normalizedTrainDF,df_train['label'].values)

NameError: name 'CalibratedClassifierCV' is not defined

In [875]:
yhat = clf.predict(normalizedTestDF)
testy= df_test['label'].values

yhat0 = clf.predict(normalizedTrainDF)
trainy = df_train['label'].values


print('Train Accuracy: '+ str(np.sum(yhat0==trainy)/len(trainy)))
print( fbeta_score(trainy, yhat0, average='weighted', beta=2))

print('Test Accuracy: '+ str(np.sum(yhat==testy)/len(testy)))
print( fbeta_score(testy, yhat, average='weighted', beta=2))

Train Accuracy: 0.9973569651741293
0.9973565150656055
Test Accuracy: 0.9934701492537313
0.9934696422037985


## Decision Trees

In [151]:
from sklearn.ensemble import RandomForestClassifier
np.random.seed(1)

n_estimators = 1000
clf = RandomForestClassifier(n_estimators = n_estimators, criterion = 'gini', class_weight='balanced')
clf.set_params(max_depth = 100).fit(normalizedTrainDF,df_train['label'].values, sample_weight = weights)

In [152]:
yhat = clf.predict(normalizedTestDF)
testy= df_test['label'].values

yhat0 = clf.predict(normalizedTrainDF)
trainy = df_train['label'].values


print('Train Accuracy: '+ str(np.sum(yhat0==trainy)/len(trainy)))
print( fbeta_score(trainy, yhat0, average='weighted', beta=2))

print('Test Accuracy: '+ str(np.sum(yhat==testy)/len(testy)))
print( fbeta_score(testy, yhat, average='weighted', beta=2))

Train Accuracy: 0.999822316986496
0.999822309551156
Test Accuracy: 0.9954394693200663
0.9954393388563515


In [166]:
def f2(yhat, testy):
    tp = 0
    fp = 0
    fn = 0
    
    for i in range(0,len(yhat)):
        if yhat[i]==1 and testy[i] ==1:
            tp += 1
        if yhat[i]==1 and testy[i] !=1:
            fp += 1
        if yhat[i]==0 and testy[i] !=0:
            fn += 1
    
    p=tp/(tp+fp)
    r=(tp)/(tp+fn)
    
    return 5*p*r/(4*p+r)

In [167]:
f2(yhat, testy)

0.9958418194161572

## Converting to data for Kaggle Upload.

In [256]:
TESTdf = pd.read_csv("testingData2.csv")
NPTEST = TESTdf.drop(['lab','UID'], axis=1).values
NPTEST = scaler.transform(NPTEST)
normalizedTESTDF= pd.DataFrame(NPTEST, columns = TESTdf.drop(['lab','UID'], axis=1).columns.values)

In [132]:
# RUN FOR NON STEPWISE REGRESSION
UIDs = TESTdf['UID'].values
preds = model.predict(NPTEST).astype(int)



In [273]:
# RUN FOR STEPWISE REGRESSION
UIDs = TESTdf['UID'].values
preds = model.predict(normalizedTESTDF[np.asarray(selected_features)]).astype(int)

In [272]:
# RUN FOR SVMs AND DECISION TREES
UIDs = TESTdf['UID'].values
preds = clf.predict(normalizedTESTDF).astype(int)

In [274]:
submission = pd.DataFrame({'UID': UIDs, 'label': preds})
submission.to_csv("submission.csv",index=False)

In [275]:
submission

Unnamed: 0,UID,label
0,lab_19_0,0
1,lab_19_1,0
2,lab_19_10,0
3,lab_19_11,0
4,lab_19_12,1
...,...,...
472,lab_42_5,0
473,lab_42_6,0
474,lab_42_7,1
475,lab_42_8,1


## Historical Comparison

Also good to check we haven't replicated a worse result

In [135]:
baddf = pd.read_csv("submissionbad.csv")
gooddf = pd.read_csv("submissionbest.csv")
predsbad = baddf['label'].values
predsgood = gooddf['label'].values

In [276]:
print("overlap with worst prediction")
print(np.sum(preds==predsbad)/len(preds))

overlap with worst prediction
0.7484276729559748


In [277]:
print("overlap with best prediction")
print(np.sum(preds==predsgood)/len(preds))

overlap with best prediction
0.7672955974842768


In [None]:
vals = [0.7425, 0.55397, 0.63186, 0.74453, 0.75172]
for i in vals:
    