<a href="https://colab.research.google.com/github/jadhav-rakesh/ML/blob/main/ds12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Selection

In [1]:
import numpy as np
import pandas as pd

# Selecting the Best Models Using Exhaustive Search

In [3]:
# Exhaustive search: evaluate every combination of the candidate
# hyperparameter values with cross-validation and keep the best one.

from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV

iris = datasets.load_iris()
features = iris.data
target = iris.target

# liblinear accepts both the "l1" and "l2" penalties searched below.
logistic = linear_model.LogisticRegression(solver="liblinear", max_iter=500)

# Candidate values: two penalty types x 10 log-spaced C values in [1e0, 1e4].
penalty = ["l1", "l2"]
C = np.logspace(0, 4, 10)
hyperparameters = {"C": C, "penalty": penalty}

# 5-fold cross-validation over the 20 candidates. fit() returns the fitted
# search object, so best_model and gridsearch refer to the same object.
gridsearch = GridSearchCV(
    estimator=logistic,
    param_grid=hyperparameters,
    cv=5,
    verbose=0,
)
best_model = gridsearch.fit(features, target)

print(best_model.best_estimator_)

LogisticRegression(C=7.742636826811269, max_iter=500, penalty='l1',
                   solver='liblinear')


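* GridSearchCV is a brute-force approach to model selection using cross-validation.
* It fits a model for every combination of the candidate hyperparameter values (here 10 values of `C` × 2 penalties = 20 candidates, each evaluated with 5-fold cross-validation) and selects the combination with the best mean cross-validation score.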

In [6]:
# Look up the winning hyperparameters on the refit best estimator.
best_params = best_model.best_estimator_.get_params()
print(best_params["penalty"])
print(best_params["C"])

l1
7.742636826811269


In [7]:
# Predictions on the same data the search was fit on (training-set
# predictions, not an estimate of generalization performance).
predicted_classes = best_model.predict(features)
predicted_classes

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

# Selecting the Best Models Using Randomized Search

In [9]:
from scipy.stats import uniform
from sklearn import linear_model, datasets
from sklearn.model_selection import RandomizedSearchCV

iris = datasets.load_iris()
features = iris.data
target = iris.target

logistic = linear_model.LogisticRegression(solver="liblinear", max_iter=500)

# Search space: penalty is sampled from a list, C from a continuous uniform
# distribution on [loc, loc + scale] = [0, 4].
penalty = ["l1", "l2"]
C = uniform(loc=0, scale=4)
hyperparameters = {"C": C, "penalty": penalty}

# Sample 100 candidates (reproducibly, via random_state) and score each with
# 5-fold cross-validation.
randomizedsearch = RandomizedSearchCV(
    estimator=logistic,
    param_distributions=hyperparameters,
    n_iter=100,
    cv=5,
    random_state=42,
    verbose=0,
)

best_model = randomizedsearch.fit(features, target)

In [10]:
# Show the estimator refit with the best sampled hyperparameters.
best_estimator = best_model.best_estimator_
print(best_estimator)

LogisticRegression(C=1.49816047538945, max_iter=500, penalty='l1',
                   solver='liblinear')


In [11]:
# Draw 10 example values from the same uniform(0, 4) distribution used for C.
c_distribution = uniform(loc=0, scale=4)
c_distribution.rvs(size=10)

array([0.04951994, 2.97332351, 1.40374119, 2.11573469, 3.21565657,
       1.17080252, 1.50736275, 3.18840785, 1.79310404, 2.31265585])

In [12]:
# Look up the winning hyperparameters on the refit best estimator.
best_params = best_model.best_estimator_.get_params()
print(best_params["penalty"])
print(best_params["C"])

l1
1.49816047538945


In [13]:
# Predictions on the same data the search was fit on (training-set
# predictions, not an estimate of generalization performance).
predicted_classes = best_model.predict(features)
predicted_classes

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

# Selecting the Best Models from Multiple Learning Algorithms

In [16]:
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Seed NumPy's global RNG before any estimator is fit.
np.random.seed(42)

iris = datasets.load_iris()
features = iris.data
target = iris.target

# The classifier placed here is only a placeholder: every search-space entry
# below supplies its own estimator for the "classifier" step.
pipe = Pipeline(steps=[("classifier", RandomForestClassifier())])

# One dictionary per learning algorithm; "classifier__<name>" addresses a
# hyperparameter of whichever estimator occupies the "classifier" step.
logistic_space = {
    "classifier": [LogisticRegression(max_iter=500, solver="liblinear")],
    "classifier__penalty": ["l1", "l2"],
    "classifier__C": np.logspace(0, 4, 10),
}
forest_space = {
    "classifier": [RandomForestClassifier()],
    "classifier__n_estimators": [10, 100, 1000],
    "classifier__max_features": [1, 2, 3],
}
search_space = [logistic_space, forest_space]

gridsearch = GridSearchCV(
    estimator=pipe,
    param_grid=search_space,
    cv=5,
    verbose=0,
)

best_model = gridsearch.fit(features, target)

print(best_model.best_estimator_)


Pipeline(steps=[('classifier',
                 LogisticRegression(C=7.742636826811269, max_iter=500,
                                    penalty='l1', solver='liblinear'))])


In [17]:
# Show which learning algorithm (and settings) won the search.
pipeline_params = best_model.best_estimator_.get_params()
print(pipeline_params["classifier"])

LogisticRegression(C=7.742636826811269, max_iter=500, penalty='l1',
                   solver='liblinear')


In [18]:
# Predictions on the same data the search was fit on (training-set
# predictions, not an estimate of generalization performance).
predicted_classes = best_model.predict(features)
predicted_classes

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

# Selecting the Best Models When Preprocessing

In [23]:
from re import L  # NOTE(review): looks like an accidental auto-import; `L` is not used in the visible cells - confirm and remove
# Include a preprocessing step in the model-selection search.

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

np.random.seed(42)

iris = datasets.load_iris()
features = iris.data
target = iris.target

# Both transformers receive the same input features; FeatureUnion
# concatenates the standardized columns with the PCA components.
preprocess = FeatureUnion(
    transformer_list=[
        ("std", StandardScaler()),
        ("pca", PCA()),
    ]
)

classifier = LogisticRegression(max_iter=1000, solver="liblinear")
pipe = Pipeline(steps=[("preprocess", preprocess), ("classifier", classifier)])

# Nested step names are joined with double underscores, so the number of PCA
# components is tuned together with the classifier's hyperparameters.
search_space = [
    {
        "preprocess__pca__n_components": [1, 2, 3],
        "classifier__penalty": ["l1", "l2"],
        "classifier__C": np.logspace(0, 4, 10),
    }
]

clf = GridSearchCV(
    estimator=pipe,
    param_grid=search_space,
    cv=5,
    verbose=0,
    n_jobs=-1,
)

best_model = clf.fit(features, target)

print(best_model.best_estimator_)

Pipeline(steps=[('preprocess',
                 FeatureUnion(transformer_list=[('std', StandardScaler()),
                                                ('pca', PCA(n_components=1))])),
                ('classifier',
                 LogisticRegression(C=7.742636826811269, max_iter=1000,
                                    penalty='l1', solver='liblinear'))])


In [24]:
# Number of PCA components chosen by the search.
pipeline_params = best_model.best_estimator_.get_params()
pipeline_params["preprocess__pca__n_components"]

1

# Speeding Up Model Selection with Parallelization

In [25]:
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV

iris = datasets.load_iris()
features = iris.data
target = iris.target

logistic = linear_model.LogisticRegression(max_iter=500, solver="liblinear")

# Defined in this cell so the grid below does not depend on a `penalty`
# variable left over from an earlier cell.
penalty = ["l1", "l2"]

# 1000 log-spaced C values x 2 penalties = 2000 candidates: a deliberately
# large grid so the effect of parallelization is visible.
C = np.logspace(0, 4, 1000)

hyperparameters = dict(C=C, penalty=penalty)

# n_jobs=-1 runs the cross-validation fits on all available cores.
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, n_jobs=-1, verbose=1)

best_model = gridsearch.fit(features, target)

print(best_model.best_estimator_)

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
LogisticRegression(C=5.926151812475554, max_iter=500, penalty='l1',
                   solver='liblinear')


In [26]:
# Repeat the same search on a single core (n_jobs=1) for comparison with the
# parallel run; `logistic` and `hyperparameters` come from the previous cell.
clf = GridSearchCV(
    estimator=logistic,
    param_grid=hyperparameters,
    cv=5,
    n_jobs=1,
    verbose=1,
)

best_model = clf.fit(features, target)

print(best_model.best_estimator_)

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
LogisticRegression(C=5.926151812475554, max_iter=500, penalty='l1',
                   solver='liblinear')


# Speeding Up Model Selection Using Algorithm-Specific Methods

In [27]:
from sklearn import linear_model, datasets

iris = datasets.load_iris()
features = iris.data
target = iris.target

# LogisticRegressionCV tunes C with its own built-in cross-validation;
# Cs=100 asks it to try 100 candidate values.
logit = linear_model.LogisticRegressionCV(
    Cs=100,
    solver="liblinear",
    max_iter=500,
)

logit.fit(features, target)

print(logit)

LogisticRegressionCV(Cs=100, max_iter=500, solver='liblinear')


# Evaluating Performance After Model Selection

In [28]:
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV, cross_val_score

iris = datasets.load_iris()
features = iris.data
target = iris.target

logistic = linear_model.LogisticRegression(solver="liblinear", max_iter=500)

# 20 log-spaced candidate values of C between 1e0 and 1e4.
C = np.logspace(0, 4, 20)
hyperparameters = {"C": C}

# Inner loop: the grid search selects C using 5-fold cross-validation.
gridsearch = GridSearchCV(
    estimator=logistic,
    param_grid=hyperparameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Outer loop: cross_val_score re-runs the whole search on each training split
# and scores it on the held-out split, so the data used to pick C is never
# also used to evaluate the selected model.
outer_scores = cross_val_score(gridsearch, features, target)
outer_scores.mean()


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits


0.9733333333333334