# Notebook 2 - Benchmarking

In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import pandas as pd
import numpy as np

In [3]:
import pickle
from sklearn.externals import joblib

In [4]:
# Load the pickled UCI Madelon frame from the local ./pickles directory.
# NOTE: joblib.load unpickles the file, so only point it at pickles from a trusted source.
uci_madelon = joblib.load('./pickles/uci_madelonALL.pkl')

In [5]:
# Sanity check: display (rows, columns) of the loaded frame.
uci_madelon.shape

(2000, 501)

In [6]:
# Separate the label column from the predictors: `y` is the 'target' column,
# `X` is every other column of the frame.
y = uci_madelon['target']
X = uci_madelon.drop('target', axis=1)

In [7]:
# Hold out 33% of the rows for testing.
# The seed is fixed so that Restart & Run All reproduces the same train/test partition;
# without it every run shuffles differently and the benchmark numbers drift.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X, y, test_size=0.33, random_state=42)

## Naive Models

In [10]:
def _scaled(estimator):
    """Return a pipeline that standardizes the features and then fits `estimator`."""
    return make_pipeline(StandardScaler(), estimator)


# Untuned baseline models, each preceded by a StandardScaler step.
# LogisticRegression and SVC are given a very large C (1e10); the tree and
# k-nearest-neighbours classifiers use their default settings.
logr_sc = _scaled(LogisticRegression(C=1e10))
dectree_sc = _scaled(DecisionTreeClassifier())
knc_sc = _scaled(KNeighborsClassifier())
svc_sc = _scaled(SVC(C=1e10))

## Benchmark Results

### UCI Madelon

These benchmark models use the entire UCI Madelon set (2,000 rows by 501 columns, including the target column), split 67/33 into training and test sets.

In [12]:
# Fit each scaled baseline pipeline on the UCI training split, in the order
# logistic regression, decision tree, k-nearest neighbours, SVC.
logr, dectree, knc, svc = (
    pipeline.fit(X_train_1, y_train_1)
    for pipeline in (logr_sc, dectree_sc, knc_sc, svc_sc)
)

In [13]:
# (features, labels) pairs in the order the scores are unpacked: train first, then test.
_uci_splits = ((X_train_1, y_train_1), (X_test_1, y_test_1))

# `.score` is evaluated once per split for every fitted model.
logr_train_score, logr_test_score = (logr.score(*pair) for pair in _uci_splits)
dectree_train_score, dectree_test_score = (dectree.score(*pair) for pair in _uci_splits)
knc_train_score, knc_test_score = (knc.score(*pair) for pair in _uci_splits)
svc_train_score, svc_test_score = (svc.score(*pair) for pair in _uci_splits)

In [30]:
# One (model name, train score, test score) entry per benchmarked model,
# listed in the order the rows should appear in the results table.
_uci_model_scores = [
    ('LogisticRegression', logr_train_score, logr_test_score),
    ('DecisionTreeClassifier', dectree_train_score, dectree_test_score),
    ('KNeighborsClassifier', knc_train_score, knc_test_score),
    ('SVC', svc_train_score, svc_test_score),
]

# Expand every entry into two records (train row first, then test row).
results = [
    {'model': model_name,
     'dataset': split_name,
     'preprocessing': 'scaled',
     'score': split_score}
    for model_name, train_score, test_score in _uci_model_scores
    for split_name, split_score in (('train', train_score), ('test', test_score))
]

In [31]:
# Turn the list of score records into a table; the bare name on the last line displays it.
uci_madelon_benchmark_df = pd.DataFrame(results)
uci_madelon_benchmark_df

Unnamed: 0,dataset,model,preprocessing,score
0,train,LogisticRegression,scaled,0.815672
1,test,LogisticRegression,scaled,0.525758
2,train,DecisionTreeClassifier,scaled,1.0
3,test,DecisionTreeClassifier,scaled,0.715152
4,train,KNeighborsClassifier,scaled,0.723881
5,test,KNeighborsClassifier,scaled,0.55
6,train,SVC,scaled,1.0
7,test,SVC,scaled,0.545455


### Josh's Madelon

**The same benchmark models as above are run on a random sample of 2,000 rows from Josh's Madelon set, again split 67/33 into training and test sets.**

Shape: `(2000, 1002)`

In [16]:
# Load the pickled 2,000-row sample of Josh's Madelon data from the local ./pickles directory.
# NOTE: joblib.load unpickles the file, so only point it at pickles from a trusted source.
j_madelon_test = joblib.load('./pickles/j_madelon_test.pkl')

In [17]:
# Sanity check: display (rows, columns) of the loaded sample.
j_madelon_test.shape

(2000, 1002)

In [18]:
# `yj` is the 'target' column; `Xj` is the frame with both the '_id' column
# and the label removed, so only predictor columns remain.
yj = j_madelon_test['target']
_non_feature_columns = ['_id', 'target']
Xj = j_madelon_test.drop(_non_feature_columns, axis=1)

In [19]:
# Hold out 33% of the sampled rows for testing.
# The seed is fixed so that Restart & Run All reproduces the same train/test partition;
# without it every run shuffles differently and the benchmark numbers drift.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    Xj, yj, test_size=0.33, random_state=42)

In [20]:
# Re-fit the baseline pipelines on the training split of Josh's Madelon sample.
# NOTE: these are the same pipeline objects that were fitted earlier in the notebook,
# so this call replaces their previous fit rather than creating independent models.
logr_j, dectree_j, knc_j, svc_j = (
    pipeline.fit(X_train_2, y_train_2)
    for pipeline in (logr_sc, dectree_sc, knc_sc, svc_sc)
)

In [21]:
# Score the models fitted on Josh's Madelon through their own `*_j` handles.
# This cell used to call `logr`, `dectree`, `knc` and `svc` (the names bound when the
# pipelines were fitted on the UCI data). That only produced the right numbers because
# `.fit` hands back the very same pipeline object; using the `*_j` names makes the
# intent explicit and keeps the cell correct if the models are ever made independent.
logr_train_score_j = logr_j.score(X_train_2, y_train_2)
logr_test_score_j  = logr_j.score(X_test_2, y_test_2)

dectree_train_score_j = dectree_j.score(X_train_2, y_train_2)
dectree_test_score_j  = dectree_j.score(X_test_2, y_test_2)

knc_train_score_j = knc_j.score(X_train_2, y_train_2)
knc_test_score_j  = knc_j.score(X_test_2, y_test_2)

svc_train_score_j = svc_j.score(X_train_2, y_train_2)
svc_test_score_j  = svc_j.score(X_test_2, y_test_2)

In [24]:
# One (model name, train score, test score) entry per benchmarked model,
# listed in the order the rows should appear in the results table.
_j_model_scores = [
    ('LogisticRegression', logr_train_score_j, logr_test_score_j),
    ('DecisionTreeClassifier', dectree_train_score_j, dectree_test_score_j),
    ('KNeighborsClassifier', knc_train_score_j, knc_test_score_j),
    ('SVC', svc_train_score_j, svc_test_score_j),
]

# Expand every entry into two records (train row first, then test row).
results_j = [
    {'model': model_name,
     'dataset': split_name,
     'preprocessing': 'scaled',
     'score': split_score}
    for model_name, train_score, test_score in _j_model_scores
    for split_name, split_score in (('train', train_score), ('test', test_score))
]

In [25]:
# Turn the list of score records into a table; the bare name on the last line displays it.
j_madelon_benchmark_results_df = pd.DataFrame(results_j)
j_madelon_benchmark_results_df

Unnamed: 0,dataset,model,preprocessing,score
0,train,LogisticRegression,scaled,1.0
1,test,LogisticRegression,scaled,0.530303
2,train,DecisionTreeClassifier,scaled,1.0
3,test,DecisionTreeClassifier,scaled,0.624242
4,train,KNeighborsClassifier,scaled,0.721642
5,test,KNeighborsClassifier,scaled,0.518182
6,train,SVC,scaled,1.0
7,test,SVC,scaled,0.595455


small change