##### Jupyter Notebook, Step 3 - Feature Importance
- Use the results from step 2 to discuss feature importance in the dataset
- Considering these results, develop a strategy for building a final predictive model

In [1]:
cd ..

/Users/johnphillips/Desktop/DSI-Class-Stuff/Project03_on_AWS/Project_03_on_AWS


In [2]:
# Standard Imports

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy import stats
%matplotlib inline

In [3]:
# Start with reading the data from the pickle
train_data = pd.read_pickle('data/train_data.p')
huge = pd.read_pickle('data/huge.p')

In [4]:
huge.shape

(2300, 1002)

In [5]:
huge.head(2)

Unnamed: 0,_id,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_991,feat_992,feat_993,feat_994,feat_995,feat_996,feat_997,feat_998,feat_999,target
0,81264,-0.619314,-0.980879,0.260013,0.109861,-1.09166,-2.345588,0.727887,0.189447,-0.400514,...,0.524966,1.865985,0.47681,-0.562234,0.295281,-0.128997,0.679676,0.085488,-0.375616,0
1,81265,-0.254716,-0.507283,0.586206,0.522276,0.689763,0.083975,1.165854,-0.269793,0.509566,...,-1.476176,0.742824,-0.388359,-0.536324,1.268221,0.015912,-1.016712,0.072405,1.152787,0


### Basic Benchmarking:

In [6]:
# Check for nulls and counts:
huge['target'].isnull().value_counts()

False    2300
Name: target, dtype: int64

In [7]:
# Find count of each value:
huge['target'].value_counts()

0    1151
1    1149
Name: target, dtype: int64

In [8]:
print((1151.0/2300)) # What % are '0'?
print(1149.0/2300) # What % are '1'?

0.5004347826086957
0.49956521739130433


### Now separate target column...

In [9]:
X_big = huge.drop('target', axis =1)
y_big = huge['target']
X_big.shape, y_big.shape

((2300, 1001), (2300,))

In [10]:
# Train-test split big_data ...
from sklearn.model_selection import train_test_split
X_b_train, X_b_test, y_b_train, y_b_test = train_test_split(X_big, y_big, random_state = 42)

## Let's reduce the number of features!

In [11]:
# Imports
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVR
from sklearn import svm
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel, SelectKBest, RFE, f_regression, chi2, f_classif
from sklearn.model_selection import GridSearchCV

Let's make a transformer Pipeline to reduce the features

In [12]:
transformer_pipe_big = make_pipeline(SelectKBest(score_func=f_regression, k=13),
                                 StandardScaler(),
                                SelectFromModel(Lasso(), threshold='mean'))

In [13]:
transformer_pipe_big.fit(X_b_train, y_b_train)

Pipeline(memory=None,
     steps=[('selectkbest', SelectKBest(k=13, score_func=<function f_regression at 0x1172ec9d8>)), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('selectfrommodel', SelectFromModel(estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
        norm_order=1, prefit=False, threshold='mean'))])

In [14]:
features_skb_scaled_sfm_big = transformer_pipe_big.transform(X_b_train)

In [15]:
X_b_train.shape, features_skb_scaled_sfm_big.shape

((1725, 1001), (1725, 13))

In [16]:
# Use .get_support to get the features
skb_support_big = transformer_pipe_big.named_steps['selectkbest'].get_support()
sfm_support_big = transformer_pipe_big.named_steps['selectfrommodel'].get_support()

In [17]:
# Find the columns with the features:
new_cols = X_b_train.columns[skb_support_big][sfm_support_big]
new_cols

Index(['feat_269', 'feat_315', 'feat_341', 'feat_345', 'feat_429', 'feat_504',
       'feat_623', 'feat_681', 'feat_701', 'feat_769', 'feat_808', 'feat_829',
       'feat_920'],
      dtype='object')

Ok, using my first SQL query of the full Madelon dataset (in Notebook 00) with LIMIT 2300, I have narrowed it down to these 21 features.  Now I want to do a larger SQL query with more rows using only these columns.

In [None]:
# Let's connect and get more rows with these columns

In [15]:
!conda install psycopg2 --yes

Fetching package metadata ...........
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda:
#
psycopg2                  2.7.3.2                  py36_0    conda-forge


In [16]:
# Imports to connect to madelon data:

import psycopg2 as pg2
from psycopg2.extras import RealDictCursor
import pandas as pd

In [17]:
# Connection to the huge madelon data set

# Careful on changing 'LIMIT' ... its 200000 rows!!!

connection = pg2.connect(host='34.211.227.227',
                  dbname='postgres',
                  user='postgres')

curs = connection.cursor(cursor_factory=RealDictCursor)
curs.execute("SELECT feat_269, feat_315, feat_341, feat_345, feat_429, feat_504, feat_623, feat_681,feat_701, feat_769, feat_808, feat_829,feat_920, target FROM madelon LIMIT 10000")  

# Change LIMIT based upon how much I want, this time up the limit to 10,000
results = curs.fetchall()
connection.close()  # Close the connection, ALWAYS!

In [18]:
# Create a DataFrame from results
new_huge = pd.DataFrame(results)

In [19]:
new_huge.shape # Shape of the 13 features plus the Target column.

(10000, 14)

In [20]:
new_huge.columns

Index(['feat_269', 'feat_315', 'feat_341', 'feat_345', 'feat_429', 'feat_504',
       'feat_623', 'feat_681', 'feat_701', 'feat_769', 'feat_808', 'feat_829',
       'feat_920', 'target'],
      dtype='object')

In [21]:
new_huge.head(5)

Unnamed: 0,feat_269,feat_315,feat_341,feat_345,feat_429,feat_504,feat_623,feat_681,feat_701,feat_769,feat_808,feat_829,feat_920,target
0,-1.83504,-0.971776,-0.421223,0.650474,1.466596,0.225235,0.581141,-0.857985,2.229031,0.002163,-0.262653,-0.867325,1.746534,1
1,-2.961621,-3.457891,0.731378,-0.591778,-0.502956,0.886778,2.21584,0.005938,3.417978,-0.058692,-1.118034,2.523811,0.06378,1
2,-1.129669,-2.230469,2.668291,1.126456,-0.764214,1.622566,2.456739,0.33881,2.803537,-1.661122,0.011031,1.974763,-0.288689,1
3,-0.232256,-0.158282,2.196579,0.14878,1.87699,1.681118,0.149048,-1.05736,1.402952,-0.913604,-0.192377,1.0534,0.455959,0
4,0.814023,0.80887,2.653546,-0.394069,-0.471106,1.998847,0.372262,-1.485622,-0.207665,-0.292007,-0.794153,2.512284,-0.543798,1


In [22]:
# Now Pickle Time
# Make the pickle to use in Notebook04
new_huge.to_pickle('data/new_huge.p')