In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import datetime

from scipy.io import arff

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

from cuml.svm import SVC
from cuml.ensemble import RandomForestClassifier
from cuml.linear_model import LogisticRegression

from mlproject.helperfunc import *

# EEG Data

In [45]:
# Load data from arff
data = arff.loadarff('data/eeg/EEG Eye State.arff')

In [46]:
# Create dataframe from the records returned by arff.loadarff
df = pd.DataFrame(data[0])

# Encode eyeDetection column as int32
df['eyeDetection'] = df['eyeDetection'].astype('int32')

# Rename eyeDetection to our desired label 'y'
df = df.rename(columns = {'eyeDetection' : 'y'})

# Shuffle dataframe. A fixed seed keeps the row order handed to the
# classifiers identical on every "Restart & Run All"; the original shuffle
# was unseeded, so the data order changed on each run.
df = df.sample(frac = 1, random_state = 42).reset_index(drop = True)

# Set X and y: every column except the trailing label is a feature
X = df.iloc[:, :-1]
y = df['y']

df

Unnamed: 0,AF3,F7,F3,FC5,T7,P7,O1,O2,P8,T8,FC6,F4,F8,AF4,y
0,4313.33,4042.56,4275.90,4156.41,4343.59,4630.26,4066.67,4591.79,4180.51,4218.97,4185.13,4268.21,4577.44,4352.82,1
1,4300.00,4023.59,4253.33,4136.41,4341.54,4604.10,4056.41,4611.79,4196.41,4224.10,4200.51,4275.38,4596.92,4351.79,0
2,4398.97,4099.49,4377.44,4232.31,4443.59,4750.26,4157.44,4714.87,4290.26,4328.21,4303.59,4379.49,4709.23,4456.92,0
3,4298.46,4013.85,4266.15,4106.67,4338.46,4619.49,4060.00,4613.85,4200.00,4238.97,4163.08,4266.67,4602.05,4358.97,0
4,4288.72,3988.21,4243.59,4093.85,4319.49,4602.56,4093.33,4615.38,4196.92,4216.41,4196.41,4270.26,4592.31,4354.87,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14975,4292.82,4005.13,4265.64,4123.59,4337.44,4618.46,4048.72,4614.36,4196.41,4231.79,4197.95,4276.92,4606.67,4347.69,0
14976,4293.33,4009.23,4245.64,4120.51,4327.18,4586.15,4089.74,4607.69,4191.79,4225.13,4177.44,4274.87,4593.85,4350.77,0
14977,4289.74,3998.97,4263.59,4107.69,4333.33,4614.87,4055.90,4620.00,4210.77,4227.69,4192.31,4278.46,4607.18,4358.46,0
14978,4282.05,3966.67,4240.00,4106.67,4313.33,4590.77,4067.69,4594.87,4192.31,4225.64,4178.97,4271.28,4604.62,4353.33,1


In [47]:
# Scale the features by a robust scaler due to large outliers
scaler = RobustScaler().fit(X)

# Build scaled_df as its own frame and leave X untouched. The original did
# `X = pd.DataFrame(...); scaled_df = X`, so adding 'y' to scaled_df also
# wrote the label into X, and re-running this cell would have fed the label
# to the scaler as if it were a feature.
scaled_df = pd.DataFrame(scaler.transform(X))
scaled_df['y'] = y

In [48]:
scaled_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,y
0,0.606458,1.142680,0.650414,1.489627,0.300195,0.827285,-0.140016,-1.105749,-1.028169,-0.540569,-0.731684,-0.446896,-0.961380,-0.066623,1
1,0.180307,0.555556,-0.450024,0.659751,0.166992,-0.930780,-0.540172,-0.079055,-0.166847,-0.270285,0.000000,-0.079015,-0.230971,-0.100097,0
2,3.344309,2.904673,5.601170,4.639004,6.797921,8.891801,3.400156,5.212526,4.917118,5.214963,4.903901,5.262699,3.980127,3.316542,0
3,0.131074,0.254101,0.175037,-0.574274,-0.033138,0.103495,-0.400156,0.026694,0.027627,0.513172,-1.780685,-0.525911,-0.038620,0.133247,0
4,-0.180307,-0.539461,-0.924915,-1.106224,-1.265757,-1.034274,0.899766,0.105236,-0.139220,-0.675448,-0.195052,-0.341714,-0.403825,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14975,-0.049233,-0.015785,0.150171,0.127801,-0.099415,0.034274,-0.840094,0.052875,-0.166847,0.134879,-0.121789,0.000000,0.134608,-0.233344,0
14976,-0.032928,0.111111,-0.824963,0.000000,-0.766082,-2.137097,0.759750,-0.289528,-0.417118,-0.216017,-1.097526,-0.105182,-0.346082,-0.133247,0
14977,-0.147698,-0.206438,0.050219,-0.531950,-0.366472,-0.206989,-0.560062,0.342402,0.611051,-0.081138,-0.390105,0.079015,0.153731,0.116672,0
14978,-0.393542,-1.206128,-1.099951,-0.574274,-1.666017,-1.826613,-0.100234,-0.947639,-0.388949,-0.189146,-1.024738,-0.289379,0.057743,-0.050049,1


Run all classifiers and store data as csv:

In [49]:
eeg_svm_train_df, eeg_svm_test_df = run_svm(scaled_df)

In [50]:
eeg_svm_train_df

Unnamed: 0,accuracy,f1,auc
0,0.9748,0.970968,0.973812
1,0.9716,0.968289,0.970761
2,0.9732,0.969266,0.972232
3,0.9758,0.97326,0.975084
4,0.9922,0.991162,0.96878


In [51]:
eeg_svm_test_df

Unnamed: 0,accuracy,f1,auc
0,0.93978,0.932706,0.937903
1,0.93517,0.927279,0.934005
2,0.938677,0.931924,0.93747
3,0.932665,0.923879,0.931148
4,0.941283,0.935105,0.935907


In [52]:
eeg_rf_train_df, eeg_rf_test_df = run_random_forest(scaled_df)

In [53]:
eeg_rf_train_df

Unnamed: 0,accuracy,f1,auc
0,0.9796,0.977173,0.977944
1,0.9782,0.975385,0.975561
2,0.9804,0.977637,0.980423
3,0.979,0.975654,0.976768
4,0.9862,0.984491,0.984793


In [54]:
eeg_rf_test_df

Unnamed: 0,accuracy,f1,auc
0,0.86503,0.837884,0.858867
1,0.87515,0.850168,0.867713
2,0.868337,0.849517,0.863875
3,0.87014,0.84058,0.860245
4,0.878657,0.856804,0.872555


In [55]:
eeg_lg_train_df, eeg_lg_test_df = run_log_reg(scaled_df)

In [56]:
eeg_lg_train_df

Unnamed: 0,accuracy,f1,auc
0,0.648,0.547558,0.630797
1,0.6334,0.543689,0.620211
2,0.6322,0.543559,0.619418
3,0.643,0.544991,0.627018
4,0.636,0.564593,0.626622


In [57]:
eeg_lg_test_df

Unnamed: 0,accuracy,f1,auc
0,0.64479,0.549956,0.630371
1,0.643287,0.554889,0.629345
2,0.629459,0.535777,0.614934
3,0.642385,0.54897,0.628003
4,0.638277,0.555638,0.625318


In [58]:
# Tag every per-fold score table with the classifier that produced it
for clf_label, score_frames in (
    ('SVM', (eeg_svm_train_df, eeg_svm_test_df)),
    ('rf', (eeg_rf_train_df, eeg_rf_test_df)),
    ('LogReg', (eeg_lg_train_df, eeg_lg_test_df)),
):
    for scores in score_frames:
        scores['CLF'] = clf_label

In [59]:
eeg_comb_train_df = pd.concat([eeg_svm_train_df, eeg_rf_train_df, eeg_lg_train_df])
eeg_comb_train_df

Unnamed: 0,accuracy,f1,auc,CLF
0,0.9748,0.970968,0.973812,SVM
1,0.9716,0.968289,0.970761,SVM
2,0.9732,0.969266,0.972232,SVM
3,0.9758,0.97326,0.975084,SVM
4,0.9922,0.991162,0.96878,SVM
0,0.9796,0.977173,0.977944,rf
1,0.9782,0.975385,0.975561,rf
2,0.9804,0.977637,0.980423,rf
3,0.979,0.975654,0.976768,rf
4,0.9862,0.984491,0.984793,rf


In [60]:
grouped_eeg_train = eeg_comb_train_df.groupby('CLF').mean()
grouped_eeg_train

Unnamed: 0_level_0,accuracy,f1,auc
CLF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogReg,0.63852,0.548878,0.624813
SVM,0.97752,0.974589,0.972134
rf,0.98068,0.978068,0.979098


In [61]:
grouped_eeg_train.to_csv('raw_out/grouped_eeg_train.csv')

In [62]:
eeg_comb_test_df = pd.concat([eeg_svm_test_df, eeg_rf_test_df, eeg_lg_test_df])
eeg_comb_test_df

Unnamed: 0,accuracy,f1,auc,CLF
0,0.93978,0.932706,0.937903,SVM
1,0.93517,0.927279,0.934005,SVM
2,0.938677,0.931924,0.93747,SVM
3,0.932665,0.923879,0.931148,SVM
4,0.941283,0.935105,0.935907,SVM
0,0.86503,0.837884,0.858867,rf
1,0.87515,0.850168,0.867713,rf
2,0.868337,0.849517,0.863875,rf
3,0.87014,0.84058,0.860245,rf
4,0.878657,0.856804,0.872555,rf


In [63]:
grouped_eeg_test = eeg_comb_test_df.groupby('CLF').mean()
grouped_eeg_test

Unnamed: 0_level_0,accuracy,f1,auc
CLF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogReg,0.639639,0.549046,0.625594
SVM,0.937515,0.930179,0.935286
rf,0.871463,0.84699,0.864651


In [64]:
grouped_eeg_test.to_csv('raw_out/grouped_eeg_test.csv')

# Occupancy Data

In [66]:
# Load all occupancy data
data1 = pd.read_csv('data/occupancy_data/datatest.txt')
data2 = pd.read_csv('data/occupancy_data/datatest2.txt')
data3 = pd.read_csv('data/occupancy_data/datatraining.txt')

# Concatenate all data into one dataframe
data = pd.concat([data1, data2, data3])

# Drop the date column: it could not be encoded effectively (cyclical
# encoding was considered, see
# https://ianlondon.github.io/blog/encoding-cyclical-features-24hour-time/),
# and the remaining 5 of 6 features work fine for this study
df = data.drop(columns = ['date'])

# Shuffle dataframe. A fixed seed keeps the row order handed to the
# classifiers identical on every "Restart & Run All"; the original shuffle
# was unseeded, so the data order changed on each run.
df = df.sample(frac = 1, random_state = 42).reset_index(drop = True)

# Rename occupancy to desired label 'y'
df = df.rename(columns = {'Occupancy' : 'y'})

# Select X and y: every column except the trailing label is a feature
X = df.iloc[:, :-1]
y = df['y']

df

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,y
0,21.000000,24.600000,0.0,441.000000,0.003779,0
1,21.200000,31.340000,433.0,961.000000,0.004883,1
2,19.390000,31.100000,0.0,434.000000,0.004329,0
3,23.100000,17.066667,266.0,446.333333,0.002976,0
4,19.290000,31.000000,6.0,430.000000,0.004288,0
...,...,...,...,...,...,...
20555,20.600000,30.390000,0.0,784.000000,0.004561,0
20556,22.390000,28.200000,489.0,1053.000000,0.004724,1
20557,20.500000,33.290000,0.0,596.500000,0.004969,0
20558,22.890000,25.600000,58.0,560.000000,0.004418,0


In [67]:
# Scale the features by a robust scaler due to large outliers
scaler = RobustScaler().fit(X)

# Build scaled_df as its own frame and leave X untouched. The original did
# `X = pd.DataFrame(...); scaled_df = X`, so adding 'y' to scaled_df also
# wrote the label into X, and re-running this cell would have fed the label
# to the scaler as if it were a feature.
scaled_df = pd.DataFrame(scaler.transform(X))
scaled_df['y'] = y

In [68]:
scaled_df

Unnamed: 0,0,1,2,3,4,y
0,0.226415,-0.396171,0.000000,-0.360977,-0.460665,0
1,0.377358,0.596465,1.438538,1.147727,0.531091,1
2,-0.988679,0.561119,0.000000,-0.381286,0.033184,0
3,1.811321,-1.505646,0.883721,-0.345503,-1.182298,0
4,-1.064151,0.546392,0.019934,-0.392892,-0.003645,0
...,...,...,...,...,...,...
20555,-0.075472,0.456554,0.000000,0.634188,0.241779,0
20556,1.275472,0.134021,1.624585,1.414652,0.388114,1
20557,-0.150943,0.883652,0.000000,0.090184,0.608207,0
20558,1.652830,-0.248895,0.192691,-0.015716,0.113532,0


Run all classifiers and store data as csv:

In [69]:
occ_svm_train_df, occ_svm_test_df = run_svm(scaled_df)

In [70]:
occ_svm_train_df

Unnamed: 0,accuracy,f1,auc
0,0.991,0.980973,0.992768
1,0.9906,0.980474,0.993533
2,0.9932,0.985618,0.99536
3,0.9896,0.978041,0.991987
4,0.9932,0.985822,0.995255


In [71]:
occ_svm_test_df

Unnamed: 0,accuracy,f1,auc
0,0.989396,0.977468,0.992188
1,0.989267,0.97707,0.988229
2,0.988625,0.97579,0.991298
3,0.989653,0.977996,0.99152
4,0.989139,0.97677,0.991575


In [72]:
occ_rf_train_df, occ_rf_test_df = run_random_forest(scaled_df)

In [73]:
occ_rf_train_df

Unnamed: 0,accuracy,f1,auc
0,0.9908,0.980721,0.991633
1,0.9878,0.974185,0.988159
2,0.9912,0.981197,0.991557
3,0.9898,0.977893,0.991224
4,0.988,0.97331,0.991003


In [74]:
occ_rf_test_df

Unnamed: 0,accuracy,f1,auc
0,0.987532,0.973322,0.988666
1,0.988625,0.97583,0.991339
2,0.987982,0.974338,0.988
3,0.987018,0.972577,0.988362
4,0.987211,0.973134,0.987088


In [75]:
occ_lg_train_df, occ_lg_test_df = run_log_reg(scaled_df)

In [76]:
occ_lg_train_df

Unnamed: 0,accuracy,f1,auc
0,0.9894,0.977647,0.992798
1,0.988,0.975021,0.990389
2,0.9896,0.977052,0.991068
3,0.9894,0.977243,0.991898
4,0.9896,0.978387,0.992315


In [77]:
occ_lg_test_df

Unnamed: 0,accuracy,f1,auc
0,0.988753,0.976063,0.990545
1,0.989396,0.977351,0.991841
2,0.98856,0.976004,0.991007
3,0.988689,0.976054,0.990606
4,0.988753,0.975945,0.990735


In [78]:
# Tag every per-fold score table with the classifier that produced it
for clf_label, score_frames in (
    ('SVM', (occ_svm_train_df, occ_svm_test_df)),
    ('rf', (occ_rf_train_df, occ_rf_test_df)),
    ('LogReg', (occ_lg_train_df, occ_lg_test_df)),
):
    for scores in score_frames:
        scores['CLF'] = clf_label

In [79]:
occ_comb_train_df = pd.concat([occ_svm_train_df, occ_rf_train_df, occ_lg_train_df])
occ_comb_train_df

Unnamed: 0,accuracy,f1,auc,CLF
0,0.991,0.980973,0.992768,SVM
1,0.9906,0.980474,0.993533,SVM
2,0.9932,0.985618,0.99536,SVM
3,0.9896,0.978041,0.991987,SVM
4,0.9932,0.985822,0.995255,SVM
0,0.9908,0.980721,0.991633,rf
1,0.9878,0.974185,0.988159,rf
2,0.9912,0.981197,0.991557,rf
3,0.9898,0.977893,0.991224,rf
4,0.988,0.97331,0.991003,rf


In [80]:
grouped_occ_train = occ_comb_train_df.groupby('CLF').mean()
grouped_occ_train

Unnamed: 0_level_0,accuracy,f1,auc
CLF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogReg,0.9892,0.97707,0.991694
SVM,0.99152,0.982185,0.99378
rf,0.98952,0.977461,0.990715


In [81]:
grouped_occ_train.to_csv('raw_out/grouped_occ_train.csv')

In [82]:
occ_comb_test_df = pd.concat([occ_svm_test_df, occ_rf_test_df, occ_lg_test_df])
occ_comb_test_df

Unnamed: 0,accuracy,f1,auc,CLF
0,0.989396,0.977468,0.992188,SVM
1,0.989267,0.97707,0.988229,SVM
2,0.988625,0.97579,0.991298,SVM
3,0.989653,0.977996,0.99152,SVM
4,0.989139,0.97677,0.991575,SVM
0,0.987532,0.973322,0.988666,rf
1,0.988625,0.97583,0.991339,rf
2,0.987982,0.974338,0.988,rf
3,0.987018,0.972577,0.988362,rf
4,0.987211,0.973134,0.987088,rf


In [83]:
grouped_occ_test = occ_comb_test_df.groupby('CLF').mean()
grouped_occ_test

Unnamed: 0_level_0,accuracy,f1,auc
CLF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogReg,0.98883,0.976284,0.990947
SVM,0.989216,0.977019,0.990962
rf,0.987674,0.97384,0.988691


In [84]:
grouped_occ_test.to_csv('raw_out/grouped_occ_test.csv')

# League Data

In [3]:
# Load data
df = pd.read_csv('data/league/high_diamond_ranked_10min.csv')

# Drop gameId unique identifier
df = df.drop(columns = 'gameId')

# Shuffle dataframe. A fixed seed keeps the row order handed to the
# classifiers identical on every "Restart & Run All"; the original shuffle
# was unseeded, so the data order changed on each run.
df = df.sample(frac = 1, random_state = 42).reset_index(drop = True)

# Choose X and y: the first remaining column is the label, the rest are features
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

df

Unnamed: 0,blueWins,blueWardsPlaced,blueWardsDestroyed,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueEliteMonsters,blueDragons,blueHeralds,...,redTowersDestroyed,redTotalGold,redAvgLevel,redTotalExperience,redTotalMinionsKilled,redTotalJungleMinionsKilled,redGoldDiff,redExperienceDiff,redCSPerMin,redGoldPerMin
0,1,18,3,1,14,5,12,1,1,0,...,0,14489,5.8,13682,178,28,-5805,-4777,17.8,1448.9
1,1,19,5,1,11,6,12,0,0,0,...,0,16579,6.8,17751,208,54,-1775,5,20.8,1657.9
2,0,13,4,0,5,4,4,0,0,0,...,0,15923,7.0,18503,221,60,-1116,-1123,22.1,1592.3
3,1,18,4,1,5,1,5,0,0,0,...,0,14253,6.8,17005,248,51,-2861,-2287,24.8,1425.3
4,1,20,4,0,7,5,8,0,0,0,...,0,15689,6.6,16609,198,46,-1058,-1735,19.8,1568.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9874,0,53,3,1,7,5,7,1,0,1,...,0,15915,7.0,18064,253,59,-936,-386,25.3,1591.5
9875,0,16,3,1,3,2,4,1,1,0,...,0,13937,6.6,16273,197,54,-1559,-131,19.7,1393.7
9876,1,44,7,0,8,5,9,0,0,0,...,0,15924,7.0,18903,207,68,-1421,792,20.7,1592.4
9877,0,21,2,1,11,11,13,0,0,0,...,0,17666,7.0,18533,212,40,-1212,257,21.2,1766.6


In [4]:
# Scale the features by a robust scaler due to large outliers
scaler = RobustScaler().fit(X)

# Build scaled_df as its own frame and leave X untouched. The original did
# `X = pd.DataFrame(...); scaled_df = X`, so adding 'y' to scaled_df also
# wrote the label into X, and re-running this cell would have fed the label
# to the scaler as if it were a feature.
scaled_df = pd.DataFrame(scaler.transform(X))
scaled_df['y'] = y

In [5]:
scaled_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,y
0,0.333333,0.000000,0.0,2.00,-0.25,1.2,1.0,1.0,0.0,0.0,...,-0.948769,-3.0,-2.760129,-1.333333,-1.769231,-1.820211,-1.920080,-1.333333,-0.948769,1
1,0.500000,0.666667,0.0,1.25,0.00,1.2,0.0,0.0,0.0,0.0,...,0.100954,-0.5,-0.143408,-0.333333,0.230769,-0.553512,-0.009191,-0.333333,0.100954,1
2,-0.500000,0.333333,-1.0,-0.25,-0.50,-0.4,0.0,0.0,0.0,0.0,...,-0.228528,0.0,0.340193,0.100000,0.692308,-0.346377,-0.459940,0.100000,-0.228528,0
3,0.333333,0.333333,0.0,-0.25,-1.25,-0.2,0.0,0.0,0.0,0.0,...,-1.067303,-0.5,-0.623151,1.000000,0.000000,-0.894861,-0.925075,1.000000,-1.067303,1
4,0.666667,0.333333,-1.0,0.25,-0.25,0.4,0.0,0.0,0.0,0.0,...,-0.346057,-1.0,-0.877814,-0.666667,-0.384615,-0.328147,-0.704496,-0.666667,-0.346057,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9874,6.166667,0.000000,0.0,0.25,-0.25,0.2,1.0,0.0,1.0,0.0,...,-0.232546,0.0,0.057878,1.166667,0.615385,-0.289800,-0.165435,1.166667,-0.232546,0
9875,0.000000,0.000000,0.0,-0.75,-1.00,-0.4,1.0,1.0,0.0,0.0,...,-1.226017,-1.0,-1.093891,-0.700000,0.230769,-0.485620,-0.063536,-0.700000,-1.226017,0
9876,4.666667,1.333333,-1.0,0.50,-0.25,0.6,0.0,0.0,0.0,0.0,...,-0.228026,0.0,0.597428,-0.366667,1.307692,-0.442244,0.305295,-0.366667,-0.228026,1
9877,0.833333,-0.333333,0.0,1.25,1.25,1.4,0.0,0.0,0.0,0.0,...,0.646911,0.0,0.359486,-0.200000,-0.846154,-0.376552,0.091508,-0.200000,0.646911,0


Run all classifiers and store data as csv:

In [6]:
lea_svm_train_df, lea_svm_test_df = run_svm(scaled_df)

In [7]:
lea_svm_train_df

Unnamed: 0,accuracy,f1,auc
0,0.7354,0.741462,0.735414
1,0.7362,0.738449,0.73605
2,0.7378,0.741319,0.739732
3,0.7256,0.723052,0.724379
4,0.7282,0.722166,0.729209


In [8]:
lea_svm_test_df

Unnamed: 0,accuracy,f1,auc
0,0.726583,0.733069,0.726525
1,0.723509,0.723396,0.726042
2,0.725559,0.721658,0.726286
3,0.737651,0.740576,0.737473
4,0.732732,0.726283,0.733711


In [9]:
lea_rf_train_df, lea_rf_test_df = run_random_forest(scaled_df)

In [10]:
lea_rf_train_df

Unnamed: 0,accuracy,f1,auc
0,0.9994,0.999398,0.999399
1,1.0,1.0,1.0
2,0.9996,1.0,0.9996
3,0.9992,0.999007,0.999404
4,0.9998,0.999797,0.999595


In [11]:
lea_rf_test_df

Unnamed: 0,accuracy,f1,auc
0,0.71941,0.711139,0.721248
1,0.729863,0.72731,0.730101
2,0.725968,0.715812,0.72484
3,0.714901,0.709436,0.713977
4,0.723714,0.721641,0.722758


In [12]:
lea_lg_train_df, lea_lg_test_df = run_log_reg(scaled_df)

In [13]:
lea_lg_train_df

Unnamed: 0,accuracy,f1,auc
0,0.7324,0.727606,0.732487
1,0.732,0.732961,0.731996
2,0.74,0.736201,0.739935
3,0.7322,0.727458,0.732101
4,0.7254,0.723576,0.72539


In [14]:
lea_lg_test_df

Unnamed: 0,accuracy,f1,auc
0,0.730068,0.729368,0.730364
1,0.733757,0.735922,0.733825
2,0.726173,0.728124,0.726167
3,0.728633,0.725653,0.728837
4,0.740316,0.741164,0.740324


In [15]:
# Tag every per-fold score table with the classifier that produced it
for clf_label, score_frames in (
    ('SVM', (lea_svm_train_df, lea_svm_test_df)),
    ('rf', (lea_rf_train_df, lea_rf_test_df)),
    ('LogReg', (lea_lg_train_df, lea_lg_test_df)),
):
    for scores in score_frames:
        scores['CLF'] = clf_label

In [16]:
lea_comb_train_df = pd.concat([lea_svm_train_df, lea_rf_train_df, lea_lg_train_df])
lea_comb_train_df

Unnamed: 0,accuracy,f1,auc,CLF
0,0.7354,0.741462,0.735414,SVM
1,0.7362,0.738449,0.73605,SVM
2,0.7378,0.741319,0.739732,SVM
3,0.7256,0.723052,0.724379,SVM
4,0.7282,0.722166,0.729209,SVM
0,0.9994,0.999398,0.999399,rf
1,1.0,1.0,1.0,rf
2,0.9996,1.0,0.9996,rf
3,0.9992,0.999007,0.999404,rf
4,0.9998,0.999797,0.999595,rf


In [17]:
grouped_lea_train = lea_comb_train_df.groupby('CLF').mean()
grouped_lea_train

Unnamed: 0_level_0,accuracy,f1,auc
CLF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogReg,0.7324,0.72956,0.732382
SVM,0.73264,0.73329,0.732957
rf,0.9996,0.99964,0.9996


In [18]:
grouped_lea_train.to_csv('raw_out/grouped_lea_train.csv')

In [19]:
lea_comb_test_df = pd.concat([lea_svm_test_df, lea_rf_test_df, lea_lg_test_df])
lea_comb_test_df

Unnamed: 0,accuracy,f1,auc,CLF
0,0.726583,0.733069,0.726525,SVM
1,0.723509,0.723396,0.726042,SVM
2,0.725559,0.721658,0.726286,SVM
3,0.737651,0.740576,0.737473,SVM
4,0.732732,0.726283,0.733711,SVM
0,0.71941,0.711139,0.721248,rf
1,0.729863,0.72731,0.730101,rf
2,0.725968,0.715812,0.72484,rf
3,0.714901,0.709436,0.713977,rf
4,0.723714,0.721641,0.722758,rf


In [20]:
grouped_lea_test = lea_comb_test_df.groupby('CLF').mean()
grouped_lea_test

Unnamed: 0_level_0,accuracy,f1,auc
CLF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogReg,0.731789,0.732046,0.731903
SVM,0.729207,0.728996,0.730008
rf,0.722771,0.717068,0.722585


In [21]:
grouped_lea_test.to_csv('raw_out/grouped_lea_test.csv')

# CSGO Data

In [22]:
# Load data
df = pd.read_csv('data/csgo/csgo_round_snapshots.csv')

In [23]:
# Convert label of winner and loser to 0 and 1
def conv_winner(winner):
    """Encode a round-winner label as an integer class.

    Args:
        winner: Team label, expected to be 'T' or 'CT'.

    Returns:
        0 for 'T', 1 for 'CT'.

    Raises:
        ValueError: If ``winner`` is anything else. The original fell through
            and implicitly returned None, which silently put missing values
            into the target column.
    """
    if winner == 'T':
        return 0
    if winner == 'CT':
        return 1
    raise ValueError(f"Unexpected round winner label: {winner!r}")
    
# Apply conversion of the winner label to 0 / 1
df['round_winner'] = df['round_winner'].apply(conv_winner)

# Cast column of bomb_planted as int32
df['bomb_planted'] = df['bomb_planted'].astype('int32')

# One hot encode 'map'.
# The column is selected with df['map'] because attribute access (df.map)
# resolves to the DataFrame.map method on pandas >= 2.1 instead of the column.
# `axis` is passed by keyword because pandas.concat only accepts `objs`
# positionally on pandas >= 2.0.
df = pd.concat((df, pd.get_dummies(df['map'])), axis = 1)
df = df.drop(columns = ['map'])

# Shuffle dataframe. A fixed seed keeps the row order handed to the
# classifiers identical on every "Restart & Run All"; the original shuffle
# was unseeded, so the data order changed on each run.
df = df.sample(frac = 1, random_state = 42).reset_index(drop = True)

In [24]:
# Split the frame into the label (round_winner) and the feature matrix
label_column = 'round_winner'
X = df.drop(columns = [label_column])
y = df[label_column]

df

Unnamed: 0,time_left,ct_score,t_score,bomb_planted,ct_health,t_health,ct_armor,t_armor,ct_money,t_money,...,t_grenade_decoygrenade,round_winner,de_cache,de_dust2,de_inferno,de_mirage,de_nuke,de_overpass,de_train,de_vertigo
0,74.95,6.0,7.0,0,300.0,351.0,300.0,382.0,200.0,8050.0,...,0.0,0,0,0,1,0,0,0,0,0
1,158.45,8.0,12.0,0,500.0,500.0,500.0,485.0,5600.0,800.0,...,0.0,0,0,0,1,0,0,0,0,0
2,175.00,14.0,14.0,0,500.0,500.0,0.0,380.0,15600.0,8150.0,...,0.0,0,0,0,1,0,0,0,0,0
3,114.91,7.0,11.0,0,500.0,500.0,473.0,400.0,13600.0,8900.0,...,0.0,1,0,0,0,1,0,0,0,0
4,34.94,4.0,4.0,0,362.0,85.0,379.0,98.0,5100.0,50.0,...,0.0,1,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122405,29.45,0.0,1.0,1,200.0,294.0,100.0,386.0,900.0,2750.0,...,1.0,0,0,0,1,0,0,0,0,0
122406,11.67,10.0,9.0,1,201.0,0.0,285.0,0.0,4500.0,0.0,...,0.0,1,0,0,0,0,1,0,0,0
122407,30.93,7.0,11.0,1,300.0,144.0,300.0,258.0,150.0,11350.0,...,0.0,0,0,0,1,0,0,0,0,0
122408,54.95,6.0,8.0,0,435.0,458.0,497.0,440.0,300.0,17550.0,...,0.0,1,0,0,1,0,0,0,0,0


In [25]:
# Scale the features by a robust scaler due to large outliers
# For this data, using a robust scaler does not affect our one hot encode as seen below
scaler = RobustScaler().fit(X)

# Build scaled_df as its own frame and leave X untouched. The original did
# `X = pd.DataFrame(...); scaled_df = X`, so adding 'y' to scaled_df also
# wrote the label into X, and re-running this cell would have fed the label
# to the scaler as if it were a feature.
scaled_df = pd.DataFrame(scaler.transform(X))
scaled_df['y'] = y

In [26]:
scaled_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,y
0,-0.178218,0.000000,0.142857,0.0,-1.333333,-0.837079,-0.263699,0.163265,-0.398496,0.054711,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
1,0.567334,0.285714,0.857143,0.0,0.000000,0.000000,0.421233,0.513605,0.007519,-0.386018,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
2,0.715105,1.142857,1.142857,0.0,0.000000,0.000000,-1.291096,0.156463,0.759398,0.060790,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
3,0.178575,0.142857,0.714286,0.0,0.000000,0.000000,0.328767,0.224490,0.609023,0.106383,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
4,-0.535458,-0.285714,-0.285714,0.0,-0.920000,-2.331461,0.006849,-0.802721,-0.030075,-0.431611,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122405,-0.584477,-0.857143,-0.714286,1.0,-2.000000,-1.157303,-0.948630,0.176871,-0.345865,-0.267477,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
122406,-0.743231,0.571429,0.428571,1.0,-1.993333,-2.808989,-0.315068,-1.136054,-0.075188,-0.434650,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
122407,-0.571263,0.142857,0.714286,1.0,-1.333333,-2.000000,-0.263699,-0.258503,-0.402256,0.255319,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
122408,-0.356794,0.000000,0.285714,0.0,-0.433333,-0.235955,0.410959,0.360544,-0.390977,0.632219,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1


Run all classifiers and store data as csv:

In [27]:
csg_svm_train_df, csg_svm_test_df = run_svm(scaled_df)

In [28]:
csg_svm_train_df

Unnamed: 0,accuracy,f1,auc
0,0.7706,0.784439,0.771225
1,0.767,0.777884,0.816936
2,0.7736,0.777341,0.774018
3,0.7608,0.776782,0.774176
4,0.7704,0.759127,0.770525


In [29]:
csg_svm_test_df

Unnamed: 0,accuracy,f1,auc
0,0.749272,0.763448,0.750778
1,0.749842,0.759898,0.749681
2,0.750549,0.751653,0.750933
3,0.750158,0.764003,0.752989
4,0.747849,0.737486,0.747974


In [30]:
csg_rf_train_df, csg_rf_test_df = run_random_forest(scaled_df)

In [31]:
csg_rf_train_df

Unnamed: 0,accuracy,f1,auc
0,0.9802,0.980465,0.98005
1,0.9812,0.980632,0.980337
2,0.9838,0.98165,0.982
3,0.978,0.977386,0.975786
4,0.9758,0.974307,0.973962


In [32]:
csg_rf_test_df

Unnamed: 0,accuracy,f1,auc
0,0.755907,0.754278,0.755563
1,0.760923,0.76747,0.76177
2,0.758564,0.759314,0.758181
3,0.753122,0.759358,0.754384
4,0.756809,0.764524,0.757458


In [33]:
csg_lg_train_df, csg_lg_test_df = run_log_reg(scaled_df)

In [34]:
csg_lg_train_df

Unnamed: 0,accuracy,f1,auc
0,0.7584,0.763153,0.754293
1,0.7444,0.71986,0.759217
2,0.766,0.760892,0.5
3,0.7668,0.764895,0.7648
4,0.7516,0.739566,0.751338


In [35]:
csg_lg_test_df

Unnamed: 0,accuracy,f1,auc
0,0.745149,0.745162,0.749359
1,0.741785,0.722887,0.744417
2,0.745158,0.739861,0.5
3,0.745856,0.743666,0.74621
4,0.743949,0.737893,0.746008


In [36]:
# Tag every per-fold score table with the classifier that produced it
for clf_label, score_frames in (
    ('SVM', (csg_svm_train_df, csg_svm_test_df)),
    ('rf', (csg_rf_train_df, csg_rf_test_df)),
    ('LogReg', (csg_lg_train_df, csg_lg_test_df)),
):
    for scores in score_frames:
        scores['CLF'] = clf_label

In [37]:
csg_comb_train_df = pd.concat([csg_svm_train_df, csg_rf_train_df, csg_lg_train_df])
csg_comb_train_df

Unnamed: 0,accuracy,f1,auc,CLF
0,0.7706,0.784439,0.771225,SVM
1,0.767,0.777884,0.816936,SVM
2,0.7736,0.777341,0.774018,SVM
3,0.7608,0.776782,0.774176,SVM
4,0.7704,0.759127,0.770525,SVM
0,0.9802,0.980465,0.98005,rf
1,0.9812,0.980632,0.980337,rf
2,0.9838,0.98165,0.982,rf
3,0.978,0.977386,0.975786,rf
4,0.9758,0.974307,0.973962,rf


In [38]:
grouped_csg_train = csg_comb_train_df.groupby('CLF').mean()
grouped_csg_train

Unnamed: 0_level_0,accuracy,f1,auc
CLF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogReg,0.75744,0.749673,0.70593
SVM,0.76848,0.775115,0.781376
rf,0.9798,0.978888,0.978427


In [39]:
grouped_csg_train.to_csv('raw_out/grouped_csg_train.csv')

In [40]:
csg_comb_test_df = pd.concat([csg_svm_test_df, csg_rf_test_df, csg_lg_test_df])
csg_comb_test_df

Unnamed: 0,accuracy,f1,auc,CLF
0,0.749272,0.763448,0.750778,SVM
1,0.749842,0.759898,0.749681,SVM
2,0.750549,0.751653,0.750933,SVM
3,0.750158,0.764003,0.752989,SVM
4,0.747849,0.737486,0.747974,SVM
0,0.755907,0.754278,0.755563,rf
1,0.760923,0.76747,0.76177,rf
2,0.758564,0.759314,0.758181,rf
3,0.753122,0.759358,0.754384,rf
4,0.756809,0.764524,0.757458,rf


In [41]:
grouped_csg_test = csg_comb_test_df.groupby('CLF').mean()
grouped_csg_test

Unnamed: 0_level_0,accuracy,f1,auc
CLF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogReg,0.74438,0.737894,0.697199
SVM,0.749534,0.755298,0.750471
rf,0.757065,0.760989,0.757471


In [42]:
grouped_csg_test.to_csv('raw_out/grouped_csg_test.csv')

In [85]:
# Persist the per-fold test scores of every dataset, one CSV per dataset
test_score_exports = {
    'raw_out/eeg_comb_test.csv': eeg_comb_test_df,
    'raw_out/occ_comb_test.csv': occ_comb_test_df,
    'raw_out/lea_comb_test.csv': lea_comb_test_df,
    'raw_out/csg_comb_test.csv': csg_comb_test_df,
}
for csv_path, comb_test_scores in test_score_exports.items():
    comb_test_scores.to_csv(csv_path)