#### Jupyter Notebook, Step 4 

In [1]:
cd ..

/Users/johnphillips/Desktop/DSI-Class-Stuff/Project03_on_AWS/Project_03_on_AWS


In [2]:
# Standard Imports

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy import stats
%matplotlib inline

In [3]:
# Start with reading the data from the pickle
train_data = pd.read_pickle('data/train_data.p')
huge = pd.read_pickle('data/huge.p')
new_huge = pd.read_pickle('data/new_huge.p')

### Basic Benchmark:

In [4]:
new_huge['target'].isnull().value_counts()

False    10000
Name: target, dtype: int64

In [5]:
new_huge['target'].value_counts()

0    5009
1    4991
Name: target, dtype: int64

In [6]:
print((5009.0/10000)) # What % 0?
print((4991.0/10000)) # What % 1?

# Still very balanced classes.

0.5009
0.4991


In [7]:
# Several imports:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVR
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso #, Ridge
from sklearn.feature_selection import SelectFromModel, SelectKBest, RFE, f_regression, chi2, f_classif
from sklearn.model_selection import GridSearchCV

## Big Data Work
Can the model perform on the new bigger data set?

In [8]:
# This is for the NEW BIG Dataset with 13 features and 10,000 rows
# Now take 10% samples of the new_huge.DataFrame

big_data1 = new_huge.sample(frac=0.1, replace=False)
big_data2 = new_huge.sample(frac=0.1, replace=False)
big_data3 = new_huge.sample(frac=0.1, replace=False)

#Need to Separate X, y to build models...

X_1_big = big_data1.drop('target', axis=1)
X_2_big = big_data2.drop('target', axis=1)
X_3_big = big_data3.drop('target', axis=1)

y_1_big = big_data1['target']
y_2_big = big_data2['target']
y_3_big = big_data3['target']

In [9]:
# Train-test split big_data ...
from sklearn.model_selection import train_test_split
X_1b_train, X_1b_test, y_1b_train, y_1b_test = train_test_split(X_1_big, y_1_big, random_state = 42)
X_2b_train, X_2b_test, y_2b_train, y_2b_test = train_test_split(X_2_big, y_2_big, random_state = 42)
X_3b_train, X_3b_test, y_3b_train, y_3b_test = train_test_split(X_3_big, y_3_big, random_state = 42)

In [10]:
# Let's try Recursive Feature Elimination with Logistic Regression to find some features
rfe1_new = RFE(LogisticRegression(C=1), n_features_to_select=5, step=1, verbose=0)
scaler = StandardScaler()
X1b_train_scaled = scaler.fit_transform(X_1b_train)
X1b_test_scaled = scaler.transform(X_1b_test)

In [11]:
rfe2_new = RFE(LogisticRegression(C=1), n_features_to_select=5, step=1, verbose=0)
scaler = StandardScaler()
X2b_train_scaled = scaler.fit_transform(X_2b_train)
X2b_test_scaled = scaler.transform(X_2b_test)

In [12]:
rfe3_new = RFE(LogisticRegression(C=1), n_features_to_select=5, step=10, verbose=0)
scaler = StandardScaler()
X3b_train_scaled = scaler.fit_transform(X_3b_train)
X3b_test_scaled = scaler.transform(X_3b_test)

In [13]:
rfe1_new.fit(X1b_train_scaled, y_1b_train)
rfe2_new.fit(X2b_train_scaled, y_2b_train)
rfe3_new.fit(X3b_train_scaled, y_3b_train)

RFE(estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
  n_features_to_select=5, step=10, verbose=0)

In [14]:
# Print the feature index...
rfe1_new_feats = np.where(rfe1_new.get_support())[0]
print("The features are :", rfe1_new_feats)

rfe2_new_feats = np.where(rfe2_new.get_support())[0]
print("The features are :", rfe2_new_feats)

rfe3_new_feats = np.where(rfe3_new.get_support())[0]
print("The features are :", rfe3_new_feats)


The features are : [ 0  1  4  7 12]
The features are : [ 0  2  4  7 10]
The features are : [ 1  6  7  9 12]


In [15]:
# Recall the column names we are working with:
new_huge.columns

Index(['feat_269', 'feat_315', 'feat_341', 'feat_345', 'feat_429', 'feat_504',
       'feat_623', 'feat_681', 'feat_701', 'feat_769', 'feat_808', 'feat_829',
       'feat_920', 'target'],
      dtype='object')

So Here the top features would be:
    feat_829, feat_808, feat_623, feat_681, and feat_769.

In [16]:
print("The RFE score is : {:.2f}".format(rfe1_new.score(X1b_train_scaled, y_1b_train)))
print("The RFE score is : {:.2f}".format(rfe2_new.score(X2b_train_scaled, y_2b_train)))
print("The RFE score is : {:.2f}".format(rfe3_new.score(X3b_train_scaled, y_3b_train)))

The RFE score is : 0.61
The RFE score is : 0.60
The RFE score is : 0.59


Not as accurate as I'd like, so let's try another way...

In [17]:
# Let's look at KNC ...

knc_params = {
    'n_neighbors': range(3,13,1)
}
knc_gs_new1 = GridSearchCV(KNeighborsClassifier(), param_grid= knc_params, cv=5)

In [18]:
knc_gs_new1.fit(X1b_train_scaled, y_1b_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': range(3, 13)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [20]:
cv1_results = pd.DataFrame(knc_gs_new1.cv_results_)
cv1_results.sort_values('mean_test_score', ascending=False).head(10)



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_n_neighbors,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
8,0.00104,0.0041,0.68,0.747012,11,{'n_neighbors': 11},1,0.668874,0.754591,0.622517,...,0.7,0.745,0.718121,0.75208,0.691275,0.725458,0.000133,0.000394,0.032927,0.011584
6,0.000768,0.002905,0.678667,0.751346,9,{'n_neighbors': 9},2,0.675497,0.75793,0.615894,...,0.68,0.765,0.731544,0.74376,0.691275,0.733777,7.4e-05,8.1e-05,0.037186,0.011137
9,0.000823,0.003784,0.673333,0.726343,12,{'n_neighbors': 12},3,0.668874,0.72621,0.609272,...,0.666667,0.731667,0.697987,0.725458,0.724832,0.710483,7.3e-05,0.000207,0.038559,0.009101
0,0.001772,0.003436,0.669333,0.830336,3,{'n_neighbors': 3},4,0.668874,0.828047,0.655629,...,0.64,0.816667,0.697987,0.836938,0.684564,0.826955,0.001549,0.00105,0.020473,0.009048
5,0.000726,0.002922,0.669333,0.740679,8,{'n_neighbors': 8},4,0.662252,0.747913,0.629139,...,0.693333,0.741667,0.684564,0.740433,0.677852,0.72213,1.7e-05,0.000108,0.022607,0.010092
7,0.000848,0.003441,0.668,0.737016,10,{'n_neighbors': 10},6,0.682119,0.737896,0.622517,...,0.66,0.745,0.677852,0.733777,0.697987,0.712146,0.000229,0.000705,0.025833,0.014587
4,0.000725,0.00281,0.661333,0.757006,7,{'n_neighbors': 7},7,0.688742,0.761269,0.615894,...,0.64,0.758333,0.718121,0.753744,0.644295,0.750416,1.9e-05,0.000144,0.036828,0.004294
2,0.000767,0.002736,0.657333,0.777347,5,{'n_neighbors': 5},8,0.688742,0.776294,0.635762,...,0.653333,0.775,0.684564,0.77371,0.624161,0.762063,7.2e-05,7.4e-05,0.025718,0.01226
3,0.000703,0.002765,0.644,0.747679,6,{'n_neighbors': 6},9,0.668874,0.752922,0.629139,...,0.64,0.746667,0.651007,0.748752,0.630872,0.728785,1.3e-05,0.000167,0.014705,0.010689
1,0.000833,0.003095,0.636,0.764997,4,{'n_neighbors': 4},10,0.655629,0.767947,0.615894,...,0.613333,0.746667,0.684564,0.777038,0.610738,0.767055,9.5e-05,0.000721,0.029275,0.009956


Here 11 neighbors perfoms best, but not a huge difference between that and even 8th Ranking option of 5 neighbors [less than 3% difference in mean test scores].