#  Grid-Searching Which Model To Use

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Baseline two-step pipeline. The step names 'preprocessing' and 'classifier'
# are the keys that the parameter grid below uses to swap or tune each step.
pipe = Pipeline(
    steps=[
        ('preprocessing', StandardScaler()),
        ('classifier', SVC()),
    ]
)

In [3]:
from sklearn.ensemble import RandomForestClassifier

# Each dict is an independent sub-grid: the 'classifier' / 'preprocessing' keys
# replace whole pipeline steps, while 'classifier__<name>' keys tune that step.
param_grid = [
    # SVC: with or without standardization, over a log-spaced gamma / C grid.
    {'classifier': [SVC()],
     'preprocessing': [StandardScaler(), None],
     'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
     'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]},
    # Random forest: no scaling step. The forest is seeded so that its
    # cross-validation scores are reproducible from one run to the next.
    {'classifier': [RandomForestClassifier(n_estimators=100, random_state=0)],
     'preprocessing': [None],
     'classifier__max_features': [1, 2, 3]}]

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV

# Split the breast-cancer data into train and test parts with a fixed seed.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=0,
)

# 5-fold cross-validated search over every candidate described by param_grid.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

# Report the winning configuration, its CV score, and its held-out test score.
print(f"Best params:\n{grid.best_params_}\n")
print(f"Best cross-validation score: {grid.best_score_:.2f}")
print(f"Test-set score: {grid.score(X_test, y_test):.2f}")

Best params:
{'classifier': SVC(C=10, gamma=0.01), 'classifier__C': 10, 'classifier__gamma': 0.01, 'preprocessing': StandardScaler()}

Best cross-validation score: 0.99
Test-set score: 0.98


## Vaja (Exercise)

In [43]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [14]:
# Load Iris; only the .data and .target attributes of the returned object are used.
iris_df = load_iris()

# 80/20 split with a fixed seed. This rebinds X_train / X_test / y_train / y_test,
# which previously held the breast-cancer split.
X_train, X_test, y_train, y_test = train_test_split(
    iris_df.data,
    iris_df.target,
    test_size=0.2,
    random_state=0,
)

In [18]:
# Baseline: standardize, project onto 2 principal components, then fit a
# seeded logistic regression.
pipeline_lr = make_pipeline(StandardScaler(), PCA(n_components=2), LogisticRegression(random_state=0))
model = pipeline_lr.fit(X_train, y_train)
final_score = model.score(X_test, y_test)
# Use fixed-point ".2f" so the score always shows two decimals, like the other
# score prints in this notebook. A bare ".2" means two *significant digits*,
# which would render 1.0 as "1.0" and 0.005 as "0.005".
print(f"Final score - {model.steps}: {final_score:.2f}")

Final score - [('standardscaler', StandardScaler()), ('pca', PCA(n_components=2)), ('logisticregression', LogisticRegression(random_state=0))]: 0.87


In [78]:
def run_grid_pipeline_iris_df(test_size: float = 0.25, random_state: int = 0, cv: int = 5) -> pd.DataFrame:
    """Grid-search scaler / PCA / classifier combinations on the Iris dataset.

    Loads Iris, holds out a test split, and runs ``GridSearchCV`` over a
    three-step pipeline (scaling -> PCA -> classifier). Four classifier
    families are tried (logistic regression with the saga solver and an
    l1 or l2 penalty, decision tree, SVC, k-nearest neighbours), each combined
    with five scaling options (four scalers or none) and two PCA options
    (2 components or none).

    Prints the best parameter combination, the test-set score of the fitted
    search object, and the confusion matrix of its test-set predictions.

    Parameters
    ----------
    test_size : float, default 0.25
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int, default 0
        Seed used for the train/test split, for the placeholder logistic
        regression in the base pipeline, and for the decision tree candidate.
    cv : int, default 5
        Passed to ``GridSearchCV`` as ``cv``.

    Returns
    -------
    pandas.DataFrame
        ``grid.cv_results_`` converted to a DataFrame (one row per candidate).
    """

    def _preprocessing_options():
        # Options shared by every classifier sub-grid. A new dict with fresh
        # estimator instances is built on each call, so no sub-grid shares
        # objects with another.
        return {
            'predprocessing_pca': [PCA(n_components=2), None],
            'preprocessing_scaling': [MinMaxScaler(), StandardScaler(), RobustScaler(), Normalizer(), None],
        }

    iris_df = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        iris_df.data, iris_df.target, test_size=test_size, random_state=random_state)

    # Base pipeline. Every step is a placeholder that the grid below overrides.
    # The step name 'predprocessing_pca' is kept as-is: it is part of the
    # parameter keys and of the column names in the returned results table.
    pipe = Pipeline([('preprocessing_scaling', StandardScaler()),
                     ('predprocessing_pca', PCA()),
                     ('classifier', LogisticRegression(random_state=random_state))])

    # One sub-grid per classifier family; only logistic regression has an
    # extra classifier-level hyperparameter (the penalty).
    param_grid = [
        # NOTE(review): this estimator has no random_state although the saga
        # solver is documented as shuffling the data, so its scores may vary
        # slightly between runs -- confirm whether it should be seeded.
        {'classifier': [LogisticRegression(max_iter=10000, solver="saga")],
         'classifier__penalty': ["l1", "l2"],
         **_preprocessing_options()},
        {'classifier': [DecisionTreeClassifier(random_state=random_state)],
         **_preprocessing_options()},
        {'classifier': [SVC()],
         **_preprocessing_options()},
        {'classifier': [KNeighborsClassifier()],
         **_preprocessing_options()},
    ]

    # Run the cross-validated search on the training split only.
    grid = GridSearchCV(pipe, param_grid, cv=cv)
    grid.fit(X_train, y_train)

    print(f"Best params:\n{grid.best_params_}\n")
    print(f"Test-set score: {grid.score(X_test, y_test):.2f}")
    results = pd.DataFrame(grid.cv_results_)

    # Confusion matrix of the fitted search object's predictions on the test split.
    prediction = grid.predict(X_test)
    confusion = confusion_matrix(y_test, prediction)
    print(f"Confusion matrix:\n{confusion}\n")
    return results

In [81]:
# Run the Iris grid search: the call prints its summary (best params, test
# score, confusion matrix) and returns the cv_results_ table as a DataFrame.
grids_result = run_grid_pipeline_iris_df()

Best params:
{'classifier': DecisionTreeClassifier(random_state=0), 'predprocessing_pca': PCA(n_components=2), 'preprocessing_scaling': Normalizer()}

Test-set score: 0.97
Confusion matrix:
[[13  0  0]
 [ 0 15  1]
 [ 0  0  9]]



In [82]:
# Bare last expression so the notebook renders the per-candidate results table.
grids_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__penalty,param_predprocessing_pca,param_preprocessing_scaling,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003285,0.000565,0.000699,7.6e-05,"LogisticRegression(max_iter=10000, solver='saga')",l1,PCA(n_components=2),MinMaxScaler(),{'classifier': LogisticRegression(max_iter=100...,0.956522,0.869565,1.0,0.909091,1.0,0.947036,0.051268,25
1,0.004829,0.000347,0.000686,3.2e-05,"LogisticRegression(max_iter=10000, solver='saga')",l1,PCA(n_components=2),StandardScaler(),{'classifier': LogisticRegression(max_iter=100...,0.956522,0.782609,0.954545,0.863636,0.954545,0.902372,0.069599,37
2,0.005521,0.000315,0.000676,1.3e-05,"LogisticRegression(max_iter=10000, solver='saga')",l1,PCA(n_components=2),RobustScaler(),{'classifier': LogisticRegression(max_iter=100...,0.913043,0.695652,0.909091,0.818182,0.954545,0.858103,0.092632,45
3,0.003535,0.000384,0.000803,2.4e-05,"LogisticRegression(max_iter=10000, solver='saga')",l1,PCA(n_components=2),Normalizer(),{'classifier': LogisticRegression(max_iter=100...,0.956522,0.869565,0.863636,0.909091,0.909091,0.901581,0.033455,40
4,0.00496,0.00028,0.000551,1.9e-05,"LogisticRegression(max_iter=10000, solver='saga')",l1,PCA(n_components=2),,{'classifier': LogisticRegression(max_iter=100...,1.0,0.913043,1.0,0.863636,1.0,0.955336,0.05689,19
5,0.017072,0.005978,0.000556,2.2e-05,"LogisticRegression(max_iter=10000, solver='saga')",l1,,MinMaxScaler(),{'classifier': LogisticRegression(max_iter=100...,0.956522,0.956522,1.0,0.954545,0.954545,0.964427,0.017809,6
6,0.01987,0.005984,0.00062,0.000125,"LogisticRegression(max_iter=10000, solver='saga')",l1,,StandardScaler(),{'classifier': LogisticRegression(max_iter=100...,0.956522,0.913043,1.0,0.954545,0.954545,0.955731,0.027515,17
7,0.017269,0.003563,0.000563,2.3e-05,"LogisticRegression(max_iter=10000, solver='saga')",l1,,RobustScaler(),{'classifier': LogisticRegression(max_iter=100...,0.956522,0.869565,1.0,0.954545,0.954545,0.947036,0.042449,25
8,0.00416,0.000122,0.000613,1e-05,"LogisticRegression(max_iter=10000, solver='saga')",l1,,Normalizer(),{'classifier': LogisticRegression(max_iter=100...,0.695652,0.73913,0.727273,0.772727,0.772727,0.741502,0.02919,48
9,0.12096,0.017948,0.000422,1.3e-05,"LogisticRegression(max_iter=10000, solver='saga')",l1,,,{'classifier': LogisticRegression(max_iter=100...,1.0,0.956522,1.0,0.909091,0.954545,0.964032,0.033918,8
