##### Jupyter Notebook, Step 2 - Identify Features
- Build feature selection pipelines using at least three different techniques
- **NOTE**: these pipelines are being used for feature selection not prediction

In [1]:
cd ..

/home/jovyan/Project_03_on_AWS


In [2]:
# Standard Imports

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy import stats
%matplotlib inline

In [3]:
# Start with reading the data from the pickle
train_data = pd.read_pickle('data/train_data.p')

In [4]:
#Need to Separate X, y to build models

X = train_data.drop('Label', axis=1)
y = train_data['Label']

In [5]:
# Train-test split the our data.
from sklearn.model_selection import train_test_split

# Train-test split some data...
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

Build some Pipelines for Feature Selection!

In [68]:
# I'll start with LR, DTC, and KNN

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVR
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso #, Ridge
from sklearn.feature_selection import SelectFromModel, SelectKBest, RFE, f_regression, chi2, f_classif
from sklearn.model_selection import GridSearchCV

In [7]:
# Make first Pipeline with the train_data.

train_data_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(C=100)),
    ('dectre', DecisionTreeClassifier()),
    ('knn', KNeighborsClassifier(n_neighbors=17)),
    ])

In [8]:
train_data_pipeline.fit(X_train, y_train)



Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logreg', LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear'...wski',
           metric_params=None, n_jobs=1, n_neighbors=17, p=2,
           weights='uniform'))])

In [9]:
# What are the train and test r2 scores?
print("Training set score: {:.2f}".format(train_data_pipeline.score(X_train, y_train)))
print("Test set score: {:.2f}".format(train_data_pipeline.score(X_test, y_test)))

Training set score: 0.79
Test set score: 0.73




This is best score so far with roughly 83% Train and 80% Test scores

In [13]:
# Make a Pipeline with SVC
data_pipeline_svc = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(C=10)),
    ('dectre', DecisionTreeClassifier()),
    ('svc', svm.SVC(C=10))
    ])

In [14]:
data_pipeline_svc.fit(X_train, y_train)



Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logreg', LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear',...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [15]:
# What about those train & test r2 scores?
print("Training set score: {:.2f}".format(data_pipeline_svc.score(X_train, y_train)))
print("Test set score: {:.2f}".format(data_pipeline_svc.score(X_test, y_test)))

Training set score: 0.99
Test set score: 0.74




With a train set score of 99%, this is over fit...

Not performing as good at the first pipeline... hmm...

Lots of warnings! Everybody gets a warning!

## More pipelines for experiments.

For references of these experiements, I'm using this lovely diagram: http://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

And also 'Into to Machine Learning with Python' by Müller & Guido.

In [16]:
# Experiment Pipeline3
train_data_pipeline3 = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression()),
    ('linear_svc', svm.LinearSVC(C=10)),
    ('svc', svm.SVC(C=10))
    ])

In [17]:
train_data_pipeline3.fit(X_train, y_train)



Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logreg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear'...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [18]:
# What are the train and test r2 scores?
print("Training set score: {:.2f}".format(train_data_pipeline3.score(X_train, y_train)))
print("Test set score: {:.2f}".format(train_data_pipeline3.score(X_test, y_test)))
# 



Training set score: 1.00
Test set score: 0.67


In [19]:
#Another experiment with RandomForestClassifier

train_data_pipeline_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression()),
    ('rfc', RandomForestClassifier()),
    ])

In [20]:
train_data_pipeline_rf.fit(X_train, y_train)



Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logreg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear'...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])

In [21]:
# What are the train and test r2 scores?
print("Training set score: {:.2f}".format(train_data_pipeline_rf.score(X_train, y_train)))
print("Test set score: {:.2f}".format(train_data_pipeline_rf.score(X_test, y_test)))

Training set score: 0.99
Test set score: 0.64




Interesting, this has the best test score so far!
But is it really overfit on the Train data?
And I didn't even tune the hyperparameters of RFC yet...

# Time to get some features!

First let's split some of the data:

In [97]:
data1 = train_data.sample(frac=0.1, replace=False)
data2 = train_data.sample(frac=0.1, replace=False)
data3 = train_data.sample(frac=0.1, replace=False)

X_1 = data1.drop('Label', axis=1)
X_2 = data2.drop('Label', axis=1)
X_3 = data3.drop('Label', axis=1)

y_1 = data1['Label']
y_2 = data2['Label']
y_3 = data3['Label']

In [98]:
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(X_1, y_1, random_state = 42)
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(X_2, y_2, random_state = 42)
X_3_train, X_3_test, y_3_train, y_3_test = train_test_split(X_3, y_3, random_state = 42)

In [106]:
# Let's try Recursive Feature Elimination with Logistic Regression to find some features
rfe1 = RFE(LogisticRegression(C=10), n_features_to_select=9, step=10, verbose=0)
scaler = StandardScaler()
X1_train_scaled = scaler.fit_transform(X_1_train)
X1_test_scaled = scaler.transform(X_1_test)

In [107]:
rfe2 = RFE(LogisticRegression(C=10), n_features_to_select=9, step=10, verbose=0)
scaler = StandardScaler()
X2_train_scaled = scaler.fit_transform(X_2_train)
X2_test_scaled = scaler.transform(X_2_test)

In [108]:
rfe3 = RFE(LogisticRegression(C=10), n_features_to_select=9, step=10, verbose=0)
scaler = StandardScaler()
X3_train_scaled = scaler.fit_transform(X_3_train)
X3_test_scaled = scaler.transform(X_3_test)

In [109]:
rfe1.fit(X1_train_scaled, y_1_train)
rfe2.fit(X2_train_scaled, y_2_train)
rfe3.fit(X3_train_scaled, y_3_train)

RFE(estimator=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
  n_features_to_select=9, step=10, verbose=0)

In [110]:
# Print the features.  These are different from when I ran this for the entire dataset.
rfe1_feats = np.where(rfe1.get_support())[0]
print("The features are :", rfe1_feats)

rfe2_feats = np.where(rfe2.get_support())[0]
print("The features are :", rfe2_feats)

rfe3_feats = np.where(rfe3.get_support())[0]
print("The features are :", rfe3_feats)

The features are : [  2 188 209 264 328 375 377 398 452]
The features are : [ 10  65 181 232 240 304 309 466 475]
The features are : [ 40 226 229 241 248 304 388 439 444]


In [111]:
print("The RFE score is : {:.2f}".format(rfe1.score(X1_train_scaled, y_1_train)))
print("The RFE score is : {:.2f}".format(rfe2.score(X2_train_scaled, y_2_train)))
print("The RFE score is : {:.2f}".format(rfe3.score(X3_train_scaled, y_3_train)))

The RFE score is : 0.72
The RFE score is : 0.79
The RFE score is : 0.77


Use Select K Best to get features.

In [112]:
# Build SKB two ways, one with f_classif score function, another with chi2 score function.

# Here I am working with the entire data set rather than samples
skb_FC = SelectKBest(f_classif, k=13)
skb_CH = SelectKBest(chi2, k=13)

In [113]:
skb_FC.fit(X_train, y_train)
skb_CH.fit(X_train, y_train)

SelectKBest(k=13, score_func=<function chi2 at 0x7f378b047d08>)

In [114]:
# Print top 15 Features for each score function using .argsort().
print(skb_FC.pvalues_.argsort()[:9])
print(skb_CH.pvalues_.argsort()[:9])

[475 241 336  64  48 105 128 378 338]
[475 336 105  64 493 241 453 338 442]


In [115]:
# Put those results in a DF.
pd.DataFrame([skb_FC.pvalues_.argsort()[:9], skb_CH.pvalues_.argsort()[:9]], 
             index=['f_classif','chi2']).T

Unnamed: 0,f_classif,chi2
0,475,475
1,241,336
2,336,105
3,64,64
4,48,493
5,105,241
6,128,453
7,378,338
8,338,442


Here are the top 9 features.  This gave me the most consistent output so far.  We are looking for the top 5 features, but I wanted to include just beyond the top 5 since each method gave slightly different output.

In [116]:
# Alternate Pipeline to transform the data to only important features.

In [117]:
transformer_pipe = make_pipeline(SelectKBest(score_func=f_regression, k=9),
                                 StandardScaler(),
                                SelectFromModel(Lasso(), threshold='mean'))

In [118]:
transformer_pipe.fit(X_train, y_train)



Pipeline(steps=[('selectkbest', SelectKBest(k=9, score_func=<function f_regression at 0x7f378b047d90>)), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('selectfrommodel', SelectFromModel(estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
        prefit=False, threshold='mean'))])

In [119]:
features_skb_scaled_sfm = transformer_pipe.transform(X_train)



In [120]:
X_train.shape

(1500, 500)

In [121]:
features_skb_scaled_sfm.shape

(1500, 9)

In [122]:
# Use .get_support to get the features
skb_support = transformer_pipe.named_steps['selectkbest'].get_support()
sfm_support = transformer_pipe.named_steps['selectfrommodel'].get_support()

In [123]:
# Find the columns with the features:
X_train.columns[skb_support][sfm_support]

Index([48, 64, 105, 128, 241, 336, 338, 378, 475], dtype='object')

# Let's Do Gridsearch with KNC

In [124]:
knc_params = {
    'n_neighbors': range(3,19,2)
}
knc_gs = GridSearchCV(KNeighborsClassifier(), param_grid= knc_params, cv=5)

In [125]:
knc_gs.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': range(3, 19, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [126]:
cv_results = pd.DataFrame(knc_gs.cv_results_)

In [127]:
cv_results.sort_values('mean_test_score', ascending=False).head(10)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_n_neighbors,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
5,0.014749,0.321094,0.582667,0.655501,13,{'n_neighbors': 13},1,0.601329,0.659716,0.573333,...,0.58,0.660833,0.566667,0.655,0.591973,0.65612,0.000117,0.003648,0.012538,0.005297
4,0.014924,0.319898,0.578667,0.666668,11,{'n_neighbors': 11},2,0.58804,0.671393,0.583333,...,0.58,0.675,0.57,0.671667,0.571906,0.665279,0.000553,0.003922,0.006817,0.008905
2,0.014798,0.319776,0.572667,0.6935,7,{'n_neighbors': 7},3,0.581395,0.689741,0.58,...,0.55,0.705,0.62,0.700833,0.531773,0.690258,0.000343,0.00271,0.03017,0.008375
6,0.014596,0.319997,0.570667,0.65017,15,{'n_neighbors': 15},4,0.594684,0.661384,0.593333,...,0.573333,0.651667,0.523333,0.650833,0.568562,0.641965,0.0002,0.005528,0.025863,0.006671
7,0.015272,0.323704,0.568,0.646167,17,{'n_neighbors': 17},5,0.58804,0.645538,0.58,...,0.573333,0.643333,0.54,0.655833,0.558528,0.645296,0.000598,0.00513,0.017022,0.00512
3,0.015104,0.323192,0.567333,0.681333,9,{'n_neighbors': 9},6,0.581395,0.685571,0.58,...,0.573333,0.691667,0.556667,0.685833,0.545151,0.688593,0.000547,0.002091,0.014135,0.01335
0,0.014914,0.315133,0.562667,0.77933,3,{'n_neighbors': 3},7,0.594684,0.770642,0.543333,...,0.563333,0.780833,0.603333,0.775833,0.508361,0.789342,0.000791,0.005585,0.034629,0.006175
1,0.014889,0.319736,0.556,0.716165,5,{'n_neighbors': 5},8,0.601329,0.71643,0.543333,...,0.533333,0.715833,0.58,0.710833,0.521739,0.72856,0.000542,0.006484,0.029939,0.0068


In [95]:
knc_gs.best_params_

{'n_neighbors': 13}

In [96]:
knc_gs.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=13, p=2,
           weights='uniform')

Really interesting results.  13 scores the best, but not a huge difference in the scores (ie: just 1% difference in score for 7 neighbors).  Interesting that 7 neighbors scores above 15, 17, and 9 neighbors.  Also 3 scores above 5 neighbors!  