In [None]:
%matplotlib inline


# Selecting dimensionality reduction with Pipeline and GridSearchCV


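This example constructs a pipeline that does dimensionality
reduction followed by prediction with a support vector
classifier. It demonstrates the use of GridSearchCV and
Pipeline to optimize over different classes of estimators in a
single CV run -- unsupervised PCA and NMF dimensionality
reductions are compared to univariate feature selection during
the grid search.

Note: as written, the active parameter grid compares feature-selection
strategies only (chi-squared and `f_classif` univariate selection, and
model-based selection with extra-trees and an L1-penalized linear SVC);
the PCA/NMF comparison is present only as disabled code.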


In [140]:


from __future__ import print_function, division

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.feature_selection import SelectFromModel, f_classif
from sklearn.ensemble import ExtraTreesClassifier
# Two-step pipeline: a feature-selection stage followed by a linear SVM.
# The selector given here is only the default occupant of the
# 'feature_selection' step; a parameter grid may replace the whole step by
# listing alternative selectors under the 'feature_selection' key.
#
# An earlier, now disabled, three-step variant was:
#   'feature_selection': SelectFromModel(LinearSVC(penalty="l1", dual=False))
#   'reduce_dim':        PCA()
#   'classify':          LinearSVC()
pipe = Pipeline([
    ('feature_selection', SelectFromModel(ExtraTreesClassifier())),
    ('classify', LinearSVC()),
])




# Search-space constants shared by the parameter grids.

# Regularisation strengths tried for the final classifier (1, 10, 100, 1000).
C_OPTIONS = [10 ** exponent for exponent in range(4)]

# Number of features kept by SelectKBest.
N_FEATURES_OPTIONS = [2, 4, 8]

# Percentile values tried with SelectPercentile.
N_FEATURES_chi = [10, 20, 30]

# Ensemble sizes tried for the tree-based selector.
NUMBER_OF_ESTIMATORS_RF = [50, 80, 100]

# Candidate penalty strengths for an L1-based selector.
L1_C1 = [0.01, 0.02, 0.05]





## The grid below was working. Remember: nested parameters are addressed with a double underscore (`step__param`), not a single underscore.
# --- Earlier parameter grids, kept for reference only (never executed) ------
# Both grids reference a 'reduce_dim' pipeline step. The active two-step
# pipeline ('feature_selection' -> 'classify') has no such step, so that step
# would have to be restored before either grid could be used again.
#
# Grid 1 (this grid previously appeared twice, verbatim):
# param_grid = [
#     {
#         'reduce_dim': [PCA(iterated_power=7), NMF()],
#         'reduce_dim__n_components': N_FEATURES_OPTIONS,
#         'classify__C': C_OPTIONS
#     },
#     {
#         'reduce_dim': [SelectKBest(chi2)],
#         'reduce_dim__k': N_FEATURES_OPTIONS,
#         'classify__C': C_OPTIONS
#     },
#     {
#         'reduce_dim': [SelectPercentile(chi2)],
#         'reduce_dim__percentile': N_FEATURES_chi
#     }
# ]
#
# Grid 2 (marked "best so far" by the author):
# param_grid = [
#     {
#         'reduce_dim': [SelectKBest(chi2)],
#         'reduce_dim__k': N_FEATURES_OPTIONS,
#         'classify__C': C_OPTIONS
#     },
#     {
#         'reduce_dim': [SelectPercentile(chi2)],
#         'reduce_dim__percentile': N_FEATURES_chi
#     },
#     {
#         'reduce_dim': [PCA(), NMF()],
#         'reduce_dim__n_components': N_FEATURES_OPTIONS,
#         'classify__C': C_OPTIONS
#     },
#     {
#         'feature_selection': [SelectFromModel(LinearSVC(penalty="l1",dual=False))]
#     },
#     {
#         'feature_selection': [SelectFromModel(ExtraTreesClassifier(n_estimators=50))]
#     }
# ]

# Active search space: one sub-grid per feature-selection strategy. Each
# sub-grid swaps a different selector into the pipeline's 'feature_selection'
# step; only the first sub-grid also varies the classifier's C.
param_grid = [
    # Keep the k best features as scored by chi2.
    dict(
        feature_selection=[SelectKBest(chi2)],
        feature_selection__k=N_FEATURES_OPTIONS,
        classify__C=C_OPTIONS,
    ),
    # Keep the top percentile of features as scored by chi2.
    dict(
        feature_selection=[SelectPercentile(chi2)],
        feature_selection__percentile=N_FEATURES_chi,
    ),
    # Model-based selection with extra-trees, varying the ensemble size.
    dict(
        feature_selection=[SelectFromModel(ExtraTreesClassifier())],
        feature_selection__estimator__n_estimators=NUMBER_OF_ESTIMATORS_RF,
    ),
    # Keep the top percentile of features as scored by f_classif.
    dict(
        feature_selection=[SelectPercentile(f_classif)],
        feature_selection__percentile=N_FEATURES_chi,
    ),
    # Model-based selection with an L1-penalised linear SVC at a fixed C.
    dict(
        feature_selection=[
            SelectFromModel(LinearSVC(penalty="l1", dual=False, C=0.01)),
        ],
    ),
]


# Legend labels consumed by the comparison plot.
# (An earlier label set was ['PCA', 'NMF', 'KBest(chi2)'].)
reducer_labels = [
    'KBest(chi2)',
    'SelectPercentile',
    'featureselection',
]

# Last expression of the cell, so the notebook displays the pipeline's
# list of (name, estimator) steps.
pipe.steps




 
    



[('feature_selection',
  SelectFromModel(estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
             oob_score=False, random_state=None, verbose=0, warm_start=False),
          norm_order=1, prefit=False, threshold=None)),
 ('classify',
  LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
       intercept_scaling=1, loss='squared_hinge', max_iter=1000,
       multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
       verbose=0))]

In [141]:
# Show every parameter name the pipeline exposes. Nested parameters use the
# double-underscore form (e.g. 'classify__C'), which is the same spelling
# used for the keys of a parameter grid.
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'feature_selection', 'classify', 'feature_selection__estimator__bootstrap', 'feature_selection__estimator__class_weight', 'feature_selection__estimator__criterion', 'feature_selection__estimator__max_depth', 'feature_selection__estimator__max_features', 'feature_selection__estimator__max_leaf_nodes', 'feature_selection__estimator__min_impurity_decrease', 'feature_selection__estimator__min_impurity_split', 'feature_selection__estimator__min_samples_leaf', 'feature_selection__estimator__min_samples_split', 'feature_selection__estimator__min_weight_fraction_leaf', 'feature_selection__estimator__n_estimators', 'feature_selection__estimator__n_jobs', 'feature_selection__estimator__oob_score', 'feature_selection__estimator__random_state', 'feature_selection__estimator__verbose', 'feature_selection__estimator__warm_start', 'feature_selection__estimator', 'feature_selection__norm_order', 'feature_selection__prefit', 'feature_selection__threshold', 'classify__C', '

In [142]:
# Load the digits dataset bundled with scikit-learn.
digits = load_digits()

# Search the whole parameter grid with 2-fold cross-validation, running two
# jobs in parallel.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=2, n_jobs=2)
grid.fit(digits.data, digits.target)


GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('feature_selection', SelectFromModel(estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_sam...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid=True, n_jobs=2,
       param_grid=[{'feature_selection': [SelectKBest(k=10, score_func=<function chi2 at 0x000002D16FEF3EA0>)], 'feature_selection__k': [2, 4, 8], 'classify__C': [1, 10, 100, 1000]}, {'feature_selection': [SelectPercentile(percentile=10,
         score_func=<function chi2 at 0x000002D16FEF3EA0>)], 'feature...ndom_state=None, tol=0.0001,
     verbose=0),
        norm_order=1, prefit=False, threshold=None)]}],
       pre_dispatch='2*n_jobs', refit=True, return_tra

In [143]:
# Print the winning pipeline's classifier first, then its feature selector.
model1 = grid.best_estimator_
for step_name in ('classify', 'feature_selection'):
    print(model1.named_steps[step_name])

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
SelectFromModel(estimator=LinearSVC(C=0.01, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
     verbose=0),
        norm_order=1, prefit=False, threshold=None)


In [144]:
# Parameter combination selected by the grid search as the best one.
grid.best_params_

{'feature_selection': SelectFromModel(estimator=LinearSVC(C=0.01, class_weight=None, dual=False, fit_intercept=True,
      intercept_scaling=1, loss='squared_hinge', max_iter=1000,
      multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
      verbose=0),
         norm_order=1, prefit=False, threshold=None)}

In [None]:
def _selector_label(selector):
    """Return a short, human-readable label for a feature-selection step.

    The label is the selector's class name followed, in parentheses, by the
    name of its scoring function (``score_func``) or by the class name of the
    estimator it wraps (``estimator``), whichever the selector has. A selector
    with neither attribute is labelled by its class name alone.
    """
    selector_name = type(selector).__name__
    score_func = getattr(selector, 'score_func', None)
    if score_func is not None:
        detail = getattr(score_func, '__name__', type(score_func).__name__)
        return '%s(%s)' % (selector_name, detail)
    wrapped = getattr(selector, 'estimator', None)
    if wrapped is not None:
        return '%s(%s)' % (selector_name, type(wrapped).__name__)
    return selector_name


# The sub-grids have different sizes (only one of them varies C and k), so
# the flat score vector cannot be reshaped into a regular
# (C, reducer, n_features) cube. Instead, group the results by the selector
# each candidate used and keep the best mean CV score per selector.
default_selector = grid.estimator.named_steps['feature_selection']
selector_labels = []   # labels in first-seen order
best_scores = {}       # label -> best mean test score
for params, score in zip(grid.cv_results_['params'],
                         grid.cv_results_['mean_test_score']):
    # A sub-grid that does not replace the step uses the pipeline's default.
    label = _selector_label(params.get('feature_selection', default_selector))
    if label not in best_scores:
        selector_labels.append(label)
        best_scores[label] = score
    else:
        best_scores[label] = max(best_scores[label], score)

mean_scores = np.array([best_scores[label] for label in selector_labels])
bar_offsets = np.arange(len(selector_labels))

plt.figure()
COLORS = 'bgrcmyk'
# Cycle through the palette so any number of selectors can be drawn.
bar_colors = [COLORS[i % len(COLORS)] for i in range(len(selector_labels))]
plt.bar(bar_offsets, mean_scores, color=bar_colors)

plt.title("Comparing feature reduction techniques")
plt.xlabel('Feature selection method')
plt.xticks(bar_offsets, selector_labels, rotation=30, ha='right')
plt.ylabel('Digit classification accuracy')
plt.ylim((0, 1))
plt.tight_layout()
plt.show()