# Machine Learning Exercise: Finding the Optimal Model and Hyperparameters

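For the digits dataset in `sklearn.datasets`, try the following classifiers (SVM, random forest, logistic regression, Gaussian naive Bayes, multinomial naive Bayes and decision tree) and find out which one gives the best performance.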

Also find the optimal parameters for that classifier.

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the digits dataset that ships with scikit-learn.
digits = load_digits()

In [3]:
# List the attributes available on the loaded dataset object.
dir(digits)

['DESCR', 'data', 'images', 'target', 'target_names']

In [4]:
# Feature values passed to the classifiers as model inputs.
X = digits.data

In [5]:
# Labels the classifiers are trained to predict.
y = digits.target

In [7]:
# Candidate classifiers and the hyperparameter grid to search for each one.
# Every entry maps a display name to:
#   'model'  - an unfitted estimator instance
#   'params' - the parameter grid to search (an empty dict means the
#              estimator is evaluated once with its default settings)
#
# Estimators that accept a seed get a fixed random_state so that a fresh
# "Restart & Run All" reproduces the same scores. Scores for those models
# may differ slightly from results produced by earlier, unseeded runs.
RANDOM_STATE = 42

model_params = {
    'svm': {
        'model': SVC(),
        'params': {
            'C': list(range(1, 10)),
            'gamma': ['auto', 'scale'],
            'kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(random_state=RANDOM_STATE),
        'params': {
            # 1, 11, ..., 91 as plain Python ints (not NumPy scalars) so the
            # reported best_params stay clean and match the other grids.
            'n_estimators': list(range(1, 100, 10))
        }
    },
    'logistic_regression': {
        # multi_class is not passed: the parameter is deprecated in recent
        # scikit-learn releases and its default gives the same behaviour
        # with the liblinear solver.
        # NOTE(review): newer scikit-learn releases are also phasing out
        # liblinear for multiclass targets - confirm against the installed
        # version and wrap in OneVsRestClassifier if it starts to error.
        'model': LogisticRegression(solver='liblinear', random_state=RANDOM_STATE),
        'params': {
            'C': list(range(1, 10))
        }
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {}
    },
    'MultinomialNB': {
        'model': MultinomialNB(),
        'params': {}
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(random_state=RANDOM_STATE),
        'params': {
            'criterion': ['gini', 'entropy']
        }
    }
}

In [8]:
# Run a grid search (cv=10) for every candidate in model_params and keep one
# summary record per model: its name, best score and the winning parameters.
scores = []

for model_name in model_params:
    mp = model_params[model_name]
    clf = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['params'],
        cv=10,
        return_train_score=False,
    )
    clf.fit(X, y)
    scores.append(dict(
        model=model_name,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    ))

In [9]:
# Show the collected per-model search results.
scores

[{'model': 'svm',
  'best_score': 0.9821818746120423,
  'best_params': {'C': 5, 'gamma': 'scale', 'kernel': 'rbf'}},
 {'model': 'random_forest',
  'best_score': 0.9515704531346989,
  'best_params': {'n_estimators': 51}},
 {'model': 'logistic_regression',
  'best_score': 0.9259745499689634,
  'best_params': {'C': 1}},
 {'model': 'GaussianNB', 'best_score': 0.8113904407200497, 'best_params': {}},
 {'model': 'MultinomialNB',
  'best_score': 0.8797858472998138,
  'best_params': {}},
 {'model': 'DecisionTreeClassifier',
  'best_score': 0.8297330850403476,
  'best_params': {'criterion': 'entropy'}}]

In [10]:
# Tabulate the search results: one row per model, columns in a fixed order.
summary_columns = ['model', 'best_score', 'best_params']
df = pd.DataFrame(data=scores, columns=summary_columns)
df

Unnamed: 0,model,best_score,best_params
0,svm,0.982182,"{'C': 5, 'gamma': 'scale', 'kernel': 'rbf'}"
1,random_forest,0.95157,{'n_estimators': 51}
2,logistic_regression,0.925975,{'C': 1}
3,GaussianNB,0.81139,{}
4,MultinomialNB,0.879786,{}
5,DecisionTreeClassifier,0.829733,{'criterion': 'entropy'}


In [11]:
# Best model and params: keep the row(s) whose best_score equals the maximum.
top_score = df['best_score'].max()
df[df['best_score'] == top_score]

Unnamed: 0,model,best_score,best_params
0,svm,0.982182,"{'C': 5, 'gamma': 'scale', 'kernel': 'rbf'}"
