In [32]:
from lib.project_5 import load_data_from_database, make_data_dict, general_model, general_transformer

In [33]:
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVC



# Step 3 - Build Model



### Domain and Data

We will be using the same data from steps 1 and 2.

### Problem Statement

We want to see if using SelectKBest and/or a gridsearch will improve the scores of our models.

### Solution Statement

We will develop a binary classification model and attempt to augment its performance using automatic feature selection techniques.

### Metric

We will compare the train and test scores of all models to determine which one performs best on our data.

### Benchmark

We will use the benchmark determined in Step 1 to evaluate the new models we have generated.

## Implementation

Implement the following code pipeline using the functions you write in `lib/project_5.py`.

<img src="assets/build_model.png" width="600px">

In [34]:
# Load the data set for this step through the project helper.
# NOTE(review): presumably returns the Madelon data as a DataFrame — confirm in lib/project_5.py.
mandelon_df = load_data_from_database()

In [35]:
# Build the dict of data that the transformer/model helpers below take as input.
# random_state=77 pins whatever randomness the helper uses (presumably the
# train/test split — confirm in lib/project_5.py).
data_dict = make_data_dict(mandelon_df, random_state=77)

In [36]:
# Apply a StandardScaler (default settings) through general_transformer.
# NOTE(review): confirm whether general_transformer returns a new dict or
# updates data_dict in place; the later steps depend on which one it is.
scaled_mandelon = general_transformer(StandardScaler(), data_dict)

In [56]:
# Run SelectKBest (default settings) on the output of the scaling step.
# The scaler result is passed in explicitly so that this step follows the
# scaler regardless of whether general_transformer modifies data_dict in place.
# NOTE(review): confirm general_transformer's return semantics in lib/project_5.py.
kbest = general_transformer(SelectKBest(), scaled_mandelon)
kbest

{'X_test': array([[-0.58706238, -0.49447172,  0.97103468, ..., -0.11839187,
         -0.10567   , -0.31523019],
        [-1.0773316 , -0.27210065, -0.5492734 , ...,  0.82775579,
          0.6173749 , -0.73006544],
        [-0.54037007, -0.49447172, -0.3107937 , ..., -0.7457289 ,
         -0.69314398,  0.86236667],
        ..., 
        [-0.4236393 , -0.04972958, -0.35550864, ...,  1.62992445,
          1.74713254, -1.05122822],
        [ 1.14055298,  1.0287701 , -0.57908336, ...,  0.81747157,
          0.59477974, -0.74344723],
        [-0.65710084, -0.12755946, -0.29588872, ..., -1.34221329,
         -1.23542765,  0.87574845]]),
 'X_train': array([[ 0.79036068,  0.39501256, -1.0187803 , ..., -0.00526552,
          0.02990092,  0.52782211],
        [-1.35748544, -0.39440474, -1.14547264, ..., -0.01554973,
         -0.01528939,  0.78207597],
        [-0.93725468, -1.2171777 ,  0.05437834, ...,  0.86889264,
          1.0014925 ,  0.50105854],
        ..., 
        [ 1.02382222,  0.506198

In [57]:
# Fit and score a default LogisticRegression on the SelectKBest output.
# NOTE(review): echoing kbest_LR displays the whole dict, including the
# X_train/X_test arrays; the scores alone are printed in the next cell.
kbest_LR = general_model(LogisticRegression(),kbest)
kbest_LR

{'X_test': array([[-0.58706238, -0.49447172,  0.97103468, ..., -0.11839187,
         -0.10567   , -0.31523019],
        [-1.0773316 , -0.27210065, -0.5492734 , ...,  0.82775579,
          0.6173749 , -0.73006544],
        [-0.54037007, -0.49447172, -0.3107937 , ..., -0.7457289 ,
         -0.69314398,  0.86236667],
        ..., 
        [-0.4236393 , -0.04972958, -0.35550864, ...,  1.62992445,
          1.74713254, -1.05122822],
        [ 1.14055298,  1.0287701 , -0.57908336, ...,  0.81747157,
          0.59477974, -0.74344723],
        [-0.65710084, -0.12755946, -0.29588872, ..., -1.34221329,
         -1.23542765,  0.87574845]]),
 'X_train': array([[ 0.79036068,  0.39501256, -1.0187803 , ..., -0.00526552,
          0.02990092,  0.52782211],
        [-1.35748544, -0.39440474, -1.14547264, ..., -0.01554973,
         -0.01528939,  0.78207597],
        [-0.93725468, -1.2171777 ,  0.05437834, ...,  0.86889264,
          1.0014925 ,  0.50105854],
        ..., 
        [ 1.02382222,  0.506198

In [58]:
# Report the logistic regression's train score, then its test score, one per line.
print('\n'.join(str(kbest_LR[split]) for split in ('train_score', 'test_score')))

0.612
0.608


In [72]:
# Fit and score a default KNeighborsClassifier on the SelectKBest output.
# NOTE(review): echoing knn_model displays the whole dict, including the
# X_train/X_test arrays; the scores alone are printed in the next cell.
knn_model = general_model(KNeighborsClassifier(), kbest)
knn_model

{'X_test': array([[-0.58706238, -0.49447172,  0.97103468, ..., -0.11839187,
         -0.10567   , -0.31523019],
        [-1.0773316 , -0.27210065, -0.5492734 , ...,  0.82775579,
          0.6173749 , -0.73006544],
        [-0.54037007, -0.49447172, -0.3107937 , ..., -0.7457289 ,
         -0.69314398,  0.86236667],
        ..., 
        [-0.4236393 , -0.04972958, -0.35550864, ...,  1.62992445,
          1.74713254, -1.05122822],
        [ 1.14055298,  1.0287701 , -0.57908336, ...,  0.81747157,
          0.59477974, -0.74344723],
        [-0.65710084, -0.12755946, -0.29588872, ..., -1.34221329,
         -1.23542765,  0.87574845]]),
 'X_train': array([[ 0.79036068,  0.39501256, -1.0187803 , ..., -0.00526552,
          0.02990092,  0.52782211],
        [-1.35748544, -0.39440474, -1.14547264, ..., -0.01554973,
         -0.01528939,  0.78207597],
        [-0.93725468, -1.2171777 ,  0.05437834, ...,  0.86889264,
          1.0014925 ,  0.50105854],
        ..., 
        [ 1.02382222,  0.506198

In [68]:
# Report the KNN model's train score, then its test score, one per line.
print('\n'.join(str(knn_model[split]) for split in ('train_score', 'test_score')))

0.884666666667
0.808


In [61]:
# Hyperparameter grid for the logistic-regression grid search:
# four values of C crossed with both penalty types (8 candidates).
params = dict(
    C=[0.001, 0.01, 0.1, 1.0],
    penalty=["l1", "l2"],
)

In [70]:
# Grid-search C and the penalty type for logistic regression, then fit and
# score the search object through general_model.
# solver="liblinear" is set explicitly because the grid contains both "l1" and
# "l2": liblinear supports both, whereas scikit-learn releases whose default
# solver is "lbfgs" reject the "l1" candidates.
# NOTE(review): the search is run on data_dict rather than on kbest — confirm
# which feature set is intended here.
gridsearch_model = general_model(
    GridSearchCV(LogisticRegression(solver="liblinear"), params, n_jobs=-1, verbose=1),
    data_dict
)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    0.4s finished


In [63]:
# Show the grid search's (train score, test score) pair as the cell output.
tuple(gridsearch_model[split] for split in ('train_score', 'test_score'))

(0.6166666666666667, 0.60999999999999999)

In [73]:
# Fit and score a LogisticRegression with default settings on the dict that
# general_model returned for the grid search.
# NOTE(review): this estimator is not given the hyperparameters found by the
# grid search — if the intent was to refit with the best C/penalty, they need
# to be passed in explicitly; confirm against lib/project_5.py.
gridsearchLR = general_model(LogisticRegression(), gridsearch_model)

In [75]:
# Show the (train score, test score) pair for the model stored in gridsearchLR.
# Both values are read from gridsearchLR so the pair describes a single model
# instead of mixing in the test score of gridsearch_model.
gridsearchLR['train_score'], gridsearchLR['test_score']

(0.61199999999999999, 0.60799999999999998)