<a href="https://colab.research.google.com/github/kcompher/test/blob/master/refs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Refs

## Feature Engineering and Feature Selection
* Principal Component Analysis (PCA)
* Non-Negative Matrix Factorization (NMF)
* Latent Dirichlet Allocation (LDA)
* Independent component analysis (ICA)
* SelectKBest
* Dimensionality expansion
* Polynomial Features
* One-Hot Encoding
* Scaling with StandardScaler, RobustScaler, MinMaxScaler, Normalizer, and others
* Binning values with quantiles or binarize

In [None]:
# dimension reduction w breast cancer 

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler

# Load the breast cancer dataset; column names are available as cancer.feature_names
cancer = load_breast_cancer()

# Standardize every feature (fit() returns the scaler itself, so it can be chained).
# NOTE(review): the scaler is fit on the full dataset before the split below, so
# test-set statistics leak into the training features.
scaler = StandardScaler().fit(cancer.data)
X_scaled = scaler.transform(cancer.data)

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(X_scaled, cancer.target, random_state=1)

# Baseline on all scaled features. The two numbers are not on the same scale:
# LinearRegression.score is R^2, KNeighborsClassifier.score is mean accuracy.
print(LinearRegression().fit(X_train, y_train).score(X_test, y_test))
print(KNeighborsClassifier().fit(X_train, y_train).score(X_test, y_test))

0.7222686011972143
0.951048951048951


## PCA

In [None]:
# Project the scaled features onto their first two principal components
from sklearn.decomposition import PCA

# n_components=2 is just a starting point; how to choose it is left as an open question
pca = PCA(n_components=2).fit(X_scaled)
X_pca = pca.transform(X_scaled)

print("Original shape: %s" % str(X_scaled.shape))
print("Reduced shape: %s" % str(X_pca.shape))

Original shape: (569, 30)
Reduced shape: (569, 2)


In [None]:
# Re-split using the two PCA components as features (same random_state as the baseline split)
X_train, X_test, y_train, y_test = train_test_split(X_pca, cancer.target, random_state=1)

In [None]:
# R^2 of a linear regression trained on the two PCA components
LinearRegression().fit(X_train, y_train).score(X_test, y_test)

0.6279864206554074

In [None]:
# Accuracy of k-nearest-neighbours trained on the two PCA components
KNeighborsClassifier().fit(X_train, y_train).score(X_test, y_test)

0.9440559440559441

## NMF 
+ non-negative matrices 
+ good for topic extraction

In [None]:
from sklearn.decomposition import NMF

# NMF works on non-negative matrices, so it is fit on the raw cancer.data rather
# than on the standardized X_scaled (standardizing produces negative values)
nmf = NMF(n_components=2)
nmf.fit(cancer.data)
X_nmf = nmf.transform(cancer.data)

print("Original shape: %s" % str(cancer.data.shape))
# Report the shape of the NMF projection itself (not the earlier PCA result)
print("Reduced shape: %s" % str(X_nmf.shape))

Original shape: (569, 30)
Reduced shape: (569, 2)


In [None]:
# Score both models on the two NMF components
X_train, X_test, y_train, y_test = train_test_split(X_nmf, cancer.target, random_state=1)

print(LinearRegression().fit(X_train, y_train).score(X_test, y_test))    # R^2
print(KNeighborsClassifier().fit(X_train, y_train).score(X_test, y_test))  # accuracy

0.48745290456283197
0.9370629370629371


## LDA 
+ Bayesian generative probabilistic model for collections of discrete data, such as text corpora
+ also a topic model, used for discovering abstract topics in a collection of documents


In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 2-component LDA on the raw features and project every sample onto it.
# NOTE(review): no random_state is passed, so results may differ between runs.
lda = LatentDirichletAllocation(n_components=2).fit(cancer.data)
X_lda = lda.transform(cancer.data)

print("Original shape: %s" % str(cancer.data.shape))
print("Reduced shape: %s" % str(X_lda.shape))

Original shape: (569, 30)
Reduced shape: (569, 2)


In [None]:
# Score both models on the two LDA components
X_train, X_test, y_train, y_test = train_test_split(X_lda, cancer.target, random_state=1)

print(LinearRegression().fit(X_train, y_train).score(X_test, y_test))    # R^2
print(KNeighborsClassifier().fit(X_train, y_train).score(X_test, y_test))  # accuracy

0.5211595015092328
0.8741258741258742


## ICA 
+ separates a multivariate signal into additive subcomponents
+ used to separate superimposed signals rather than for dimension reduction

## t-distributed Stochastic Neighbor Embedding
+ visualize high-dimensional data 
+ good for separating clusters after PCA

## Feature Selection 

### Univariate KBest
+ used on full dataset not just split

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Keep the 5 features with the highest chi-squared score against the target
selector = SelectKBest(score_func=chi2, k=5)
X_new = selector.fit_transform(cancer.data, cancer.target)

cancer.data.shape, X_new.shape

((569, 30), (569, 5))

In [None]:
# Rank every feature by its univariate chi-squared score, best first
kbest = pd.Series(selector.scores_, index=cancer.feature_names)
kbest = kbest.sort_values(ascending=False)
kbest.head(10)


worst area         112598.431564
mean area           53991.655924
area error           8758.504705
worst perimeter      3665.035416
mean perimeter       2011.102864
worst radius          491.689157
mean radius           266.104917
perimeter error       250.571896
worst texture         174.449400
mean texture           93.897508
dtype: float64

In [None]:
# Reference scores on all raw (unscaled) features, for comparison with the selected subset
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=1)

print(LinearRegression().fit(X_train, y_train).score(X_test, y_test))    # R^2
print(KNeighborsClassifier().fit(X_train, y_train).score(X_test, y_test))  # accuracy

0.7222686011972117
0.9370629370629371


In [None]:
# Same two models on the 5 SelectKBest features: neither beats the full feature set
X_train, X_test, y_train, y_test = train_test_split(X_new, cancer.target, random_state=1)

print(LinearRegression().fit(X_train, y_train).score(X_test, y_test))    # R^2
print(KNeighborsClassifier().fit(X_train, y_train).score(X_test, y_test))  # accuracy

0.5857835011372032
0.9230769230769231


## Recursive feature elimination

In [None]:
from sklearn.feature_selection import RFE

# Drop one feature per iteration (step=1) until only 5 remain
lr = LinearRegression()
rfe = RFE(lr, n_features_to_select=5, step=1)
rfe.fit(cancer.data, cancer.target)

# ranking_ is 1 for the selected features; larger numbers were eliminated earlier,
# so this descending sort lists the features that were discarded first
rfe_ranking = pd.Series(rfe.ranking_, index=cancer.feature_names)
rfe_ranking.sort_values(ascending=False).head(10)



mean area            26
worst area           25
mean texture         24
area error           23
worst perimeter      22
texture error        21
worst texture        20
worst compactness    19
mean perimeter       18
mean radius          17
dtype: int64

In [None]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

# RFECV is RFE with cross-validation, used to pick the number of features to keep
rf = RandomForestClassifier(max_depth=7, random_state=1)
rfecv = RFECV(estimator=rf).fit(cancer.data, cancer.target)

# Number of features the cross-validated search settled on
rfecv.n_features_

19

## Dimensionality Expansion - Polynomial
 

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree 2 is a reasonable starting point. It adds a constant column, every squared
# term and every pairwise product: 1 + 30 + 30 + 435 = 496 columns
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(cancer.data)

cancer.data.shape, X_poly.shape

((569, 30), (569, 496))

In [None]:
# Build readable column names for the expanded features (e.g. "mean radius^2").
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer versions
# need get_feature_names_out() instead.
poly_names = poly.get_feature_names(cancer.feature_names)

# Preview the expanded matrix with its generated column names
pd.DataFrame(X_poly, columns=poly_names).head()

Unnamed: 0,1,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,...,worst concavity^2,worst concavity worst concave points,worst concavity worst symmetry,worst concavity worst fractal dimension,worst concave points^2,worst concave points worst symmetry,worst concave points worst fractal dimension,worst symmetry^2,worst symmetry worst fractal dimension,worst fractal dimension^2
0,1.0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,0.506802,0.188938,0.327545,0.084645,0.070437,0.122111,0.031556,0.211692,0.054706,0.014137
1,1.0,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,0.058371,0.044938,0.06644,0.021507,0.034596,0.05115,0.016558,0.075625,0.024481,0.007925
2,1.0,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,0.20286,0.109447,0.16273,0.039446,0.059049,0.087796,0.021282,0.130538,0.031643,0.00767
3,1.0,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,0.471832,0.176877,0.455964,0.118834,0.066306,0.170928,0.044547,0.44063,0.114837,0.029929
4,1.0,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,0.16,0.065,0.09456,0.030712,0.026406,0.038415,0.012477,0.055885,0.018151,0.005895


In [None]:
# Recursive feature elimination driven by a random forest:
# keep the 50 most useful of the 496 polynomial features
rfc = RandomForestClassifier(max_depth=7, random_state=1)
rfe = RFE(rfc, n_features_to_select=50, step=1)

X_poly_top = rfe.fit_transform(X_poly, cancer.target)
X_poly_top.shape

(569, 50)

In [None]:
# RFE gives every selected feature rank 1; higher numbers were eliminated earlier.
# Sorting descending therefore shows the 5 features that were discarded first
# (the least useful ones), not the 5 best.
pd.Series(rfe.ranking_, index=poly_names).sort_values(ascending=False).head(5)



1                                    447
mean radius                          446
mean compactness^2                   445
radius error concave points error    444
perimeter error worst symmetry       443
dtype: int64

In [None]:
# Scale the 50 RFE-selected polynomial features (makes little difference for this
# model, but is good practice). The input is X_poly_top so that the selection made
# above is actually used, rather than all 496 columns of X_poly.
X_poly_top_scaled = StandardScaler().fit_transform(X_poly_top)

X_train, X_test, y_train, y_test = train_test_split(
    X_poly_top_scaled, cancer.target, random_state=42)

# Accuracy of the random forest on the held-out split
rfc.fit(X_train, y_train).score(X_test, y_test)

0.9790209790209791

### One-Hot Encoding

We have looked in previous lessons at the need to encode categorical values in **one-hot encoding**.  That is, we might have one feature with a number of class values encoded in it.  For many models, this is either better quality—or simply required for the code to operate—than trying to use the class labels.  In some cases, integer values might work algorithmically, but will distort the training by being interpreted in a quantitative or ordinal way.

The interfaces provided by scikit-learn are serviceable, but somewhat awkward.  `sklearn.preprocessing.LabelBinarizer` does almost what you want in some cases, but doesn't expose the clearest API.  The same can be said of `sklearn.preprocessing.OneHotEncoder` and `sklearn.preprocessing.LabelEncoder` and a couple of others.  I simply recommend using `pandas.get_dummies()` in place of these others.  The result will be the same, in any case.

Let us look at a small toy example with categorical data.

In [None]:
import pandas as pd
# Small toy table with two categorical columns (species, sex).
# The path is relative to the notebook's working directory.
pets = pd.read_csv('mlrefs/data/pets.csv')
pets


Unnamed: 0,species,sex
0,cat,M
1,dog,F
2,fish,M
3,dog,M
4,dog,F
5,cat,M
6,fish,M
7,fish,F


In [None]:
# One-hot encode both columns in one call; new columns are named "<column>_<value>"
pd.get_dummies(pets)

Unnamed: 0,species_cat,species_dog,species_fish,sex_F,sex_M
0,1,0,0,0,1
1,0,1,0,1,0
2,0,0,1,0,1
3,0,1,0,0,1
4,0,1,0,1,0
5,1,0,0,0,1
6,0,0,1,0,1
7,0,0,1,1,0


In [None]:
from sklearn.preprocessing import OneHotEncoder

# Learn the categories of each column, then encode the same table
enc = OneHotEncoder().fit(pets)
one_hot_pets = enc.transform(pets)

# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer versions
# need get_feature_names_out() instead.
columns = enc.get_feature_names()

# The encoded result is converted with .toarray() before building the DataFrame
pd.DataFrame(one_hot_pets.toarray(), columns=columns)

Unnamed: 0,x0_cat,x0_dog,x0_fish,x1_F,x1_M
0,1.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,1.0
3,0.0,1.0,0.0,0.0,1.0
4,0.0,1.0,0.0,1.0,0.0
5,1.0,0.0,0.0,0.0,1.0
6,0.0,0.0,1.0,0.0,1.0
7,0.0,0.0,1.0,1.0,0.0


In [None]:
# Simpler form: fit and transform in a single call.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer versions
# need get_feature_names_out() instead.
one_hot_pets = enc.fit_transform(pets)

pd.DataFrame(one_hot_pets.toarray(), columns=enc.get_feature_names())

Unnamed: 0,x0_cat,x0_dog,x0_fish,x1_F,x1_M
0,1.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,1.0
3,0.0,1.0,0.0,0.0,1.0
4,0.0,1.0,0.0,1.0,0.0
5,1.0,0.0,0.0,0.0,1.0
6,0.0,0.0,1.0,0.0,1.0
7,0.0,0.0,1.0,1.0,0.0


## DictVectorizer

In [None]:
# Toy housing records mixing numeric fields (price, rooms) with a categorical one
data = [
    dict(price=850000, rooms=4, neighborhood='Queen Anne'),
    dict(price=700000, rooms=3, neighborhood='Fremont'),
    dict(price=650000, rooms=3, neighborhood='Wallingford'),
    dict(price=600000, rooms=2, neighborhood='Fremont'),
]

In [None]:
from sklearn.feature_extraction import DictVectorizer
# sparse=False / dtype=int give a dense integer array that is easy to read here;
# the defaults (sparse matrix of floats) are the better choice for larger datasets
vec = DictVectorizer(sparse=False, dtype=int) # defaults to compressed float, good for larger dataset
vec.fit_transform(data)

array([[     0,      1,      0, 850000,      4],
       [     1,      0,      0, 700000,      3],
       [     0,      0,      1, 650000,      3],
       [     1,      0,      0, 600000,      2]], dtype=int64)

In [None]:
# Column order of the matrix above: string values become "<key>=<value>" indicator columns.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer versions
# need get_feature_names_out() instead.
vec.get_feature_names()

['neighborhood=Fremont',
 'neighborhood=Queen Anne',
 'neighborhood=Wallingford',
 'price',
 'rooms']

In [None]:
# Same encoding, shown as a labelled DataFrame.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer versions
# need get_feature_names_out() instead.
pd.DataFrame(vec.fit_transform(data),
             columns=vec.get_feature_names())

Unnamed: 0,neighborhood=Fremont,neighborhood=Queen Anne,neighborhood=Wallingford,price,rooms
0,0,1,0,850000,4
1,1,0,0,700000,3
2,0,0,1,650000,3
3,1,0,0,600000,2


## CountVectorizer

In [None]:
# Three tiny "documents" used to demonstrate the text vectorizers
sample = ['problem of evil', 'evil queen', 'horizon problem']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Note: `vec` and `X` are rebound here (vec previously held the DictVectorizer)
vec = CountVectorizer()
# One row per document, one column per distinct word, values are word counts
X = vec.fit_transform(sample)
X

<3x5 sparse matrix of type '<class 'numpy.longlong'>'
	with 7 stored elements in Compressed Sparse Row format>

In [None]:
# Densify the sparse count matrix so it can be shown as a labelled table.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer versions
# need get_feature_names_out() instead.
pd.DataFrame(X.toarray(), columns=vec.get_feature_names()) # toarray converts sparse -> dense

Unnamed: 0,evil,horizon,of,problem,queen
0,1,0,1,1,0
1,1,0,0,0,1
2,0,1,0,1,0


In [None]:
# TF-IDF is usually a better choice than raw counts: it down-weights words
# that appear in many documents.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer versions
# need get_feature_names_out() instead.
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()
X = vec.fit_transform(sample)
pd.DataFrame(X.toarray(), columns=vec.get_feature_names())

Unnamed: 0,evil,horizon,of,problem,queen
0,0.517856,0.0,0.680919,0.517856,0.0
1,0.605349,0.0,0.0,0.0,0.795961
2,0.0,0.795961,0.0,0.605349,0.0


## LabelBinarizer 

In [None]:
# LabelBinarizer: one-vs-all binary encoding of class labels
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()

# NOTE(review): the input here is a 2-D 0/1 array; it appears to be treated as a
# label-indicator matrix with one class per column (classes_ below has three entries) - confirm
lb.fit(np.array([[0, 1, 1], [1, 0, 0]]))

LabelBinarizer()

In [None]:
# Class labels learned by fit()
lb.classes_

array([0, 1, 2])

In [None]:
# Each label becomes a row with a single 1 in the column of its class
lb.transform([0, 1, 2, 1])

array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 1, 0]])

In [None]:
## DictVectorizer

## Scaling 

## Pipelines

* Feature Selection and Engineering
* Grid search
* Model


In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Some libraries tend to be in flux for their dependency versions.
# NOTE: this silences every warning for the rest of the session, including
# deprecation warnings that would flag API changes.
import warnings
warnings.simplefilter("ignore")

In [None]:
# Load the dataset for the pipeline section
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

# From here on, features and target go by the generic names X and y
# rather than being tied to this particular dataset
X = cancer.data
y = cancer.target
X.shape

(569, 30)

In [None]:
# Synthetic features via degree-2 polynomial expansion
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
X_poly.shape

(569, 496)

In [None]:
# Rescale the expanded features with MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# The per-column minimum and maximum are computed on the full dataset:
# no train/test split has been made at this point
scaler = MinMaxScaler().fit(X_poly)

X_poly_scaled = scaler.transform(X_poly)
X_poly_scaled.shape

(569, 496)

In [None]:
# Keep the best-scoring 20% of the scaled polynomial features
# (uses SelectPercentile's default scoring function)
from sklearn.feature_selection import SelectPercentile

select = SelectPercentile(percentile=20).fit(X_poly_scaled, y)

X_selected = select.transform(X_poly_scaled)
X_selected.shape

(569, 99)

In [None]:
# Cross-validate a random forest on the engineered features.
# NOTE(review): scaling and feature selection were fit on the full dataset above,
# so every CV fold has already "seen" its own test rows during selection.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import KFold, cross_val_score

rfc = RandomForestClassifier(max_depth=7, random_state=1)
scorer = make_scorer(f1_score)  # score each fold by F1 instead of accuracy
kf = KFold(n_splits=5)          # 5 folds, no shuffling

cv_scores = cross_val_score(rfc, X_selected, y, scoring=scorer, cv=kf)

print(" CV scores:", cv_scores)
print("Mean score:", cv_scores.mean())

 CV scores: [0.92929293 0.98461538 0.98013245 0.99408284 0.98245614]
Mean score: 0.9741159489654005


In [None]:
# Same model, scorer and folds on the raw features, for comparison
cv_scores = cross_val_score(rfc, X, y, scoring=scorer, cv=kf)

print("Raw data CV scores:", cv_scores)
print("    Raw mean score:", cv_scores.mean())

Raw data CV scores: [0.89795918 0.94573643 0.98666667 0.9704142  0.98245614]
    Raw mean score: 0.9566465251965944


### pipeline one

In [None]:
# Bundle the manual steps above into one estimator. Inside cross-validation each
# step is then re-fit on the training folds only.
from sklearn.pipeline import Pipeline

# Each entry is (step name, estimator); the names are free-form labels.
# NOTE(review): this forest has no random_state, so scores vary slightly between runs.
pipe = Pipeline([
    ("Polynomial features", PolynomialFeatures(2)), # expansion 
    ("MinMax scaling", MinMaxScaler()), # scaling 
    ("Top 20% features", SelectPercentile(percentile=20)), # selection 
    ("Random Forest", RandomForestClassifier(max_depth=7)), # model
])

In [None]:
# Cross-validate the whole pipeline on the raw features (F1 score, 5 unshuffled folds)
cv_scores = cross_val_score(pipe, X, y, scoring=make_scorer(f1_score), cv=KFold(5))

print("Pipeline CV scores:", cv_scores)
print("Pipeline mean score:", cv_scores.mean())

Pipeline CV scores: [0.92929293 0.98461538 0.98666667 0.98224852 0.98245614]
Pipeline mean score: 0.9730559283271834


In [None]:
# The steps of a pipeline can be inspected (and even modified in place) via .steps.
# The KFold splitter is not listed: it was passed to cross_val_score, not to the pipeline.
pipe.steps # does not show the KFold cv

[('Polynomial features', PolynomialFeatures()),
 ('MinMax scaling', MinMaxScaler()),
 ('Top 20% features', SelectPercentile(percentile=20)),
 ('Random Forest', RandomForestClassifier(max_depth=7))]

In [None]:
# pickle for later use
from pickle import dump, load

# A context manager guarantees the file is flushed and closed once the dump finishes
with open('cancer-pipeline.pkl', 'wb') as pkl_file:
    dump(pipe, pkl_file)

In [None]:
# Re-open the pickled pipeline and check that it behaves like the original.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you created yourself.
with open('cancer-pipeline.pkl', 'rb') as pkl_file:
    newpipe = load(pkl_file)

cv_scores = cross_val_score(newpipe,
                            X, y,
                            scoring=make_scorer(f1_score),
                            cv=KFold(5))

print(" Pipeline CV scores:", cv_scores)
print("Pipeline mean score:", np.mean(cv_scores))

 Pipeline CV scores: [0.93877551 0.97709924 0.98666667 0.98224852 0.98245614]
Pipeline mean score: 0.9734492149145811


In [None]:
# Fit every step of the pipeline, in order, on the full dataset
pipe.fit(X, y)

Pipeline(steps=[('Polynomial features', PolynomialFeatures()),
                ('MinMax scaling', MinMaxScaler()),
                ('Top 20% features', SelectPercentile(percentile=20)),
                ('Random Forest', RandomForestClassifier(max_depth=7))])

In [None]:
# Use the fitted pipeline to predict. These are predictions on the same data
# the pipeline was just trained on, so they say nothing about generalization.
pipe.predict(X)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

### Pipeline factory 

In [None]:
# make_pipeline builds the same pipeline without hand-written step names:
# each step is named after its class, lower-cased (see pipef.steps below)

from sklearn.pipeline import make_pipeline

pipef = make_pipeline(
    PolynomialFeatures(2),
    MinMaxScaler(),
    SelectPercentile(percentile=20),
    RandomForestClassifier(max_depth=7))

pipef.steps

[('polynomialfeatures', PolynomialFeatures()),
 ('minmaxscaler', MinMaxScaler()),
 ('selectpercentile', SelectPercentile(percentile=20)),
 ('randomforestclassifier', RandomForestClassifier(max_depth=7))]

In [None]:
# Cross-validate the make_pipeline version (F1 score, 5 unshuffled folds)
cv_scores = cross_val_score(pipef, X, y, scoring=make_scorer(f1_score), cv=KFold(5))

print("Pipeline CV scores:", cv_scores)
print("Pipeline mean score:", cv_scores.mean())

Pipeline CV scores: [0.92929293 0.98461538 0.98666667 0.98203593 0.98245614]
Pipeline mean score: 0.9730134098139139


## Pipe w grid search 

In [None]:
%%time
# Takes about a minute for this grid search

from sklearn.model_selection import GridSearchCV

params = {'polynomialfeatures__degree': [1, 2, 3],
          'selectpercentile__percentile': [10, 15, 20, 50],
          'randomforestclassifier__max_depth': [5, 7, 9],
          'randomforestclassifier__criterion': ['entropy', 'gini']}

grid = GridSearchCV(pipe, param_grid=params, cv=5)
grid.fit(X, y)

print("best cross-validation accuracy:", grid.best_score_)
print("best dataset score: ", grid.score(X, y))   # Overfitting against entire dataset
print("best parameters: ", grid.best_params_)

best cross-validation accuracy: 0.9789318428815401
best dataset score:  1.0
best parameters:  {'polynomialfeatures__degree': 3, 'randomforestclassifier__criterion': 'entropy', 'randomforestclassifier__max_depth': 7, 'selectpercentile__percentile': 50}
CPU times: user 1min 30s, sys: 4.85 s, total: 1min 35s
Wall time: 1min 38s


In [None]:
# Take the best pipeline found by the grid search and cross-validate it
# with the same F1 / 5-fold setup used for the earlier pipelines
model = grid.best_estimator_

cv_scores = cross_val_score(model, X, y, scoring=make_scorer(f1_score), cv=KFold(5))

print("Grid CV scores:", cv_scores)
print("Grid mean score:", cv_scores.mean())

Grid CV scores: [0.95833333 0.98461538 0.98013245 0.98809524 0.99421965]
Grid mean score: 0.9810792119108547


In [None]:
# The best estimator is itself a pipeline, so its steps (with the winning
# hyperparameters) can be inspected just like pipe.steps
model.steps

[('polynomialfeatures', PolynomialFeatures(degree=3)),
 ('minmaxscaler', MinMaxScaler()),
 ('selectpercentile', SelectPercentile(percentile=50)),
 ('randomforestclassifier',
  RandomForestClassifier(criterion='entropy', max_depth=7))]

In [None]:
# Evaluate the other configurations of the grid search via .cv_results_,
# ordered from best (rank 1) to worst
df_grid = (
    pd.DataFrame(grid.cv_results_)
    .set_index('rank_test_score')
    .sort_index()
)

# Keep only the per-parameter columns (their names contain "param_")
df_params = df_grid.loc[:, df_grid.columns.str.contains('param_')]

# Shorten each column name to the text after its last underscore
cols = [name.rsplit('_', 1)[-1] for name in df_params.columns]

df_params.columns = cols
df_params.head(10)

Unnamed: 0_level_0,degree,criterion,depth,percentile
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3,entropy,7,50
2,3,gini,7,20
3,3,gini,5,15
4,3,entropy,7,20
4,3,entropy,9,50
6,3,gini,5,50
7,3,entropy,9,20
8,2,gini,7,50
8,3,gini,7,50
8,2,entropy,7,50


## Robust Train/Test Splits 

* cross_val_score
* ShuffleSplit
* KFold, RepeatedKFold, LeaveOneOut, LeavePOut, StratifiedKFold