In [28]:
from sklearn import datasets
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

digits = datasets.load_digits()

In [5]:
df = pd.DataFrame(digits.data)
df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0


In [8]:
df.shape

(1797, 64)

In [46]:
scaler = StandardScaler()
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state=38)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
scoring_metrics = ["accuracy", "precision", "recall", "f1"]

In [48]:
# Params for different models
model_params = {
    'random_forest_classifier': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [100,200],
            'max_depth': [None, 10, 20],
            'class_weight': ['balanced']
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='lbfgs', class_weight='balanced'),
        'params': {
            'C': [0.1, 1, 10]
        }
    },
    'gausiann_NB': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': [1e-09, 1e-08, 1e-07]
        }
    },
    'decision_tree_classifier': {
        'model': DecisionTreeClassifier(),
        'params': {
            'max_depth': [None, 5, 10],
            'min_samples_split': [2, 5, 10],
            'criterion': ['gini', 'entropy']
        }
    }
}

In [63]:
# Grid search  with 5 cv for each modelusing single metric
scores_single = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train_scaled, y_train)
    scores_single.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

In [80]:
pd.set_option('display.max_colwidth', None)
scores_df_single = pd.DataFrame(scores_single)
scores_df_single

Unnamed: 0,model,best_score,best_params
0,random_forest_classifier,0.974956,"{'class_weight': 'balanced', 'max_depth': 20, 'n_estimators': 100}"
1,logistic_regression,0.967296,{'C': 1}
2,gausiann_NB,0.842714,{'var_smoothing': 1e-07}
3,decision_tree_classifier,0.857341,"{'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 2}"


In [65]:
# Grid search  with 5 cv for each modelusing 2 metrics : accuracy and f1
scoring = {
    'accuracy': 'accuracy',
    'f1': 'f1_macro'
}

scores_double = []
for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], scoring=scoring, refit='accuracy', cv=5, return_train_score=False)
    clf.fit(X_train_scaled, y_train)
    scores_double.append({
        'model': model_name,
        'best_accuracy': clf.cv_results_['mean_test_accuracy'][clf.best_index_],
        'best_f1': clf.cv_results_['mean_test_f1'][clf.best_index_],
        'best_params': clf.best_params_
    })

In [77]:
scores_df_double = pd.DataFrame(scores_double)
scores_df_double

Unnamed: 0,model,best_accuracy,best_f1,best_params
0,random_forest_classifier,0.975651,0.975447,"{'class_weight': 'balanced', 'max_depth': 20, 'n_estimators': 200}"
1,logistic_regression,0.967296,0.967284,{'C': 1}
2,gausiann_NB,0.842714,0.844591,{'var_smoothing': 1e-07}
3,decision_tree_classifier,0.846905,0.847301,"{'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 5}"


In [24]:
# Params for MultinomialNB model
multinomial_params = {
        'model': MultinomialNB(),
        'params': {
            'alpha': [0.1, 0.5, 1.0],
            'fit_prior': [True, False]
        }
}

In [35]:
# Scaling data is not recommended for MultinomialNB - raw data is used. Creating list of results for all hyperparameter combinations.
mn_scores=[]
clf = GridSearchCV(multinomial_params['model'], multinomial_params['params'], cv=5, return_train_score=False)
clf.fit(digits.data, digits.target)

for mean_score, params in zip(clf.cv_results_['mean_test_score'], clf.cv_results_['params']):
    mn_scores.append({
        'params': params,
        'mean_score': mean_score
    })

In [37]:
for score in mn_scores:
    print(score)

{'params': {'alpha': 0.1, 'fit_prior': True}, 'mean_score': np.float64(0.87090683998762)}
{'params': {'alpha': 0.1, 'fit_prior': False}, 'mean_score': np.float64(0.87090683998762)}
{'params': {'alpha': 0.5, 'fit_prior': True}, 'mean_score': np.float64(0.8697941813679975)}
{'params': {'alpha': 0.5, 'fit_prior': False}, 'mean_score': np.float64(0.8697941813679975)}
{'params': {'alpha': 1.0, 'fit_prior': True}, 'mean_score': np.float64(0.8703497369235531)}
{'params': {'alpha': 1.0, 'fit_prior': False}, 'mean_score': np.float64(0.8703497369235531)}
