In [55]:
from src.data_container import DataContainer
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.metrics import ConfusionMatrixDisplay
from matplotlib import pyplot as plt
import src.emnist_utils as utils
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [None]:
# Create the project's data container and run its EMNIST loader.
# Later cells read dc.X_train, dc.y_train, dc.X_test and dc.y_test, so this
# call presumably populates those attributes (loader is defined in src/ — confirm there).
dc = DataContainer()
dc.load_emnist()

In [None]:
# Wrap the training labels in a pandas Series so pandas helpers can be used on them.
y_train_series = pd.Series(data=dc.y_train)

In [None]:
# Number of training samples per label value — a quick check of class balance.
y_train_series.value_counts()

In [None]:
# Baseline k-nearest-neighbours classifier on the raw (unscaled) features:
# 5 neighbours, votes weighted by distance, ball-tree index, 4 worker jobs.
knn_clf = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    algorithm='ball_tree',
    n_jobs=4,
)

In [None]:
# 5-fold cross-validated accuracy of the baseline KNN on the training set.
cross_val_score(
    estimator=knn_clf,
    X=dc.X_train,
    y=dc.y_train,
    scoring='accuracy',
    cv=5,
)

In [17]:
# cross_val_score only fits internal clones of the estimator, so `knn_clf`
# itself has not been fitted by any earlier cell. Fit it on the full training
# set here; without this a fresh "Restart & Run All" stops with NotFittedError.
knn_clf.fit(dc.X_train, dc.y_train)

# Accuracy of the baseline KNN on the held-out test split.
knn_clf.score(dc.X_test, dc.y_test)

0.7888297872340425

In [21]:
# KNN variant with standardised inputs: the scaler and the classifier are
# chained in one pipeline so they are fitted together.
scaler = StandardScaler()
knn_est = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    algorithm='ball_tree',
    n_jobs=4,
)
knn_pipe = Pipeline(steps=[('scaler', scaler), ('knn', knn_est)])

In [22]:
# List every tunable parameter of the pipeline, including the nested
# `<step>__<param>` names that search objects expect.
knn_pipe.get_params(deep=True)

{'memory': None,
 'steps': [('scaler', StandardScaler()),
  ('knn',
   KNeighborsClassifier(algorithm='ball_tree', n_jobs=4, weights='distance'))],
 'verbose': False,
 'scaler': StandardScaler(),
 'knn': KNeighborsClassifier(algorithm='ball_tree', n_jobs=4, weights='distance'),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'knn__algorithm': 'ball_tree',
 'knn__leaf_size': 30,
 'knn__metric': 'minkowski',
 'knn__metric_params': None,
 'knn__n_jobs': 4,
 'knn__n_neighbors': 5,
 'knn__p': 2,
 'knn__weights': 'distance'}

In [23]:
# Preview what standardisation does to the first 1000 training rows.
# `scaler` is the same object that was handed to `knn_pipe`, so the preview
# uses a separate throwaway StandardScaler instead of fitting that shared
# instance on a 1000-row subset.
scaled_X = StandardScaler().fit_transform(dc.X_train[0:1000])

In [24]:
# Show the standardised preview array computed in the previous cell.
scaled_X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [25]:
# Fit scaler + KNN together on the full training split.
knn_pipe.fit(X=dc.X_train, y=dc.y_train)

In [26]:
# Test-split accuracy of the standardised KNN pipeline.
knn_pipe.score(X=dc.X_test, y=dc.y_test)

0.7567553191489361

In [29]:
# Same KNN configuration as before, now behind a min-max scaler instead of
# a standard scaler; `scaler`, `knn_est` and `knn_pipe` are rebound.
scaler = MinMaxScaler()
knn_est = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    algorithm='ball_tree',
    n_jobs=4,
)
knn_pipe = Pipeline(steps=[('scaler', scaler), ('knn', knn_est)])

In [30]:
# Preview min-max scaling on the first 1000 training rows.
# A throwaway MinMaxScaler is used so the `scaler` instance that lives inside
# `knn_pipe` is not fitted on this 1000-row subset as a side effect.
MinMaxScaler().fit_transform(dc.X_train[0:1000])

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [31]:
# Fit the min-max-scaled KNN pipeline on the full training split.
knn_pipe.fit(X=dc.X_train, y=dc.y_train)

In [32]:
# Test-split accuracy of the min-max-scaled KNN pipeline.
knn_pipe.score(X=dc.X_test, y=dc.y_test)

0.7888297872340425

In [34]:
# Random-forest pipeline: min-max scaling followed by a 100-tree forest with
# a fixed seed (42) and 4 worker jobs.
scaler = MinMaxScaler()
rf_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=4,
)
rf_pipe = Pipeline(steps=[('scaler', scaler), ('rf', rf_clf)])

In [35]:
# Fit the random-forest pipeline on the full training split.
rf_pipe.fit(X=dc.X_train, y=dc.y_train)

In [36]:
# 5-fold cross-validated accuracy of the random-forest pipeline on the training set.
cross_val_score(
    estimator=rf_pipe,
    X=dc.X_train,
    y=dc.y_train,
    scoring='accuracy',
    cv=5,
)

array([0.80531915, 0.81050532, 0.80585106, 0.81276596, 0.80802305])

In [37]:
# Accuracy on the training data itself, for comparison with the
# cross-validated accuracy above (a large gap indicates overfitting).
rf_pipe.score(X=dc.X_train, y=dc.y_train)

1.0

In [38]:
# List every tunable parameter of the forest pipeline; the `rf__*` names are
# the keys used by the hyper-parameter searches further down.
rf_pipe.get_params(deep=True)

{'memory': None,
 'steps': [('scaler', MinMaxScaler()),
  ('rf', RandomForestClassifier(n_jobs=4, random_state=42))],
 'verbose': False,
 'scaler': MinMaxScaler(),
 'rf': RandomForestClassifier(n_jobs=4, random_state=42),
 'scaler__clip': False,
 'scaler__copy': True,
 'scaler__feature_range': (0, 1),
 'rf__bootstrap': True,
 'rf__ccp_alpha': 0.0,
 'rf__class_weight': None,
 'rf__criterion': 'gini',
 'rf__max_depth': None,
 'rf__max_features': 'sqrt',
 'rf__max_leaf_nodes': None,
 'rf__max_samples': None,
 'rf__min_impurity_decrease': 0.0,
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 2,
 'rf__min_weight_fraction_leaf': 0.0,
 'rf__n_estimators': 100,
 'rf__n_jobs': 4,
 'rf__oob_score': False,
 'rf__random_state': 42,
 'rf__verbose': 0,
 'rf__warm_start': False}

In [52]:
# Feature names as they leave the preprocessing part of the pipeline
# (every step except the final classifier).
preprocessing_steps = rf_pipe[:-1]
preprocessing_steps.get_feature_names_out()

array(['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10',
       'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19',
       'x20', 'x21', 'x22', 'x23', 'x24', 'x25', 'x26', 'x27', 'x28',
       'x29', 'x30', 'x31', 'x32', 'x33', 'x34', 'x35', 'x36', 'x37',
       'x38', 'x39', 'x40', 'x41', 'x42', 'x43', 'x44', 'x45', 'x46',
       'x47', 'x48', 'x49', 'x50', 'x51', 'x52', 'x53', 'x54', 'x55',
       'x56', 'x57', 'x58', 'x59', 'x60', 'x61', 'x62', 'x63', 'x64',
       'x65', 'x66', 'x67', 'x68', 'x69', 'x70', 'x71', 'x72', 'x73',
       'x74', 'x75', 'x76', 'x77', 'x78', 'x79', 'x80', 'x81', 'x82',
       'x83', 'x84', 'x85', 'x86', 'x87', 'x88', 'x89', 'x90', 'x91',
       'x92', 'x93', 'x94', 'x95', 'x96', 'x97', 'x98', 'x99', 'x100',
       'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x107', 'x108',
       'x109', 'x110', 'x111', 'x112', 'x113', 'x114', 'x115', 'x116',
       'x117', 'x118', 'x119', 'x120', 'x121', 'x122', 'x123', 'x124',
       'x125

In [57]:
# First, broad randomized search over the forest's size, per-split feature
# count and depth: 10 sampled candidates, 3-fold CV, accuracy, fixed seed.
param_distribs = dict(
    rf__n_estimators=randint(10, 200),
    rf__max_features=randint(1, 28),
    rf__max_depth=randint(1, 10),
)

rnd_search = RandomizedSearchCV(
    estimator=rf_pipe,
    param_distributions=param_distribs,
    n_iter=10,
    scoring='accuracy',
    cv=3,
    random_state=42,
)

In [59]:
# Run the randomized search on the training split.
rnd_search.fit(X=dc.X_train, y=dc.y_train)

In [60]:
# Parameter combination that achieved the best mean cross-validated score.
rnd_search.best_params_

{'rf__max_depth': 8, 'rf__max_features': 12, 'rf__n_estimators': 167}

In [62]:
# Raw per-candidate results of the search (timings, sampled parameters, scores).
rnd_search.cv_results_

{'mean_fit_time': array([4.61755888, 5.57678564, 2.34960985, 5.69693446, 7.72029209,
        1.00773517, 5.3193206 , 1.91551892, 1.00224916, 2.87133845]),
 'std_fit_time': array([0.23148658, 0.07017355, 0.05441925, 0.06383741, 0.11894803,
        0.00414528, 0.00189012, 0.01254095, 0.00657324, 0.01320327]),
 'mean_score_time': array([0.49548864, 0.55035734, 0.39745116, 0.54559008, 0.67631221,
        0.1589574 , 0.77958655, 0.85771275, 0.35280736, 0.51424615]),
 'std_score_time': array([0.00680365, 0.00771001, 0.02562405, 0.00949737, 0.00520516,
        0.00163704, 0.01626487, 0.02798901, 0.00441384, 0.01101841]),
 'param_rf__max_depth': masked_array(data=[7, 8, 3, 8, 8, 6, 8, 6, 5, 6],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_rf__max_features': masked_array(data=[20, 21, 23, 21, 24, 21, 12, 2, 1, 12],
              mask=[False, False, False, False, False, F

In [63]:
# Mean cross-validated accuracy of the best candidate found by the search.
rnd_search.best_score_

0.661968085106383

In [64]:
# Short display names for the three per-split test scores and their mean.
score_cols = "split0 split1 split2 mean_score".split()

In [69]:
# Compact, best-first summary of the randomized search.
# The frame is rebuilt from `rnd_search.cv_results_` on every run and renamed
# through an explicit mapping, so the cell does not mutate a frame in place and
# does not depend on the notebook-level `score_cols` list (which a later cell
# rebinds to a different length, breaking a re-run of this cell).
rnd_search_labels = {
    "param_rf__n_estimators": "n_estimators",
    "param_rf__max_features": "max_features",
    "param_rf__max_depth": "max_depth",
    "split0_test_score": "split0",
    "split1_test_score": "split1",
    "split2_test_score": "split2",
    "mean_test_score": "mean_score",
}
cv_res = (
    pd.DataFrame(rnd_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
    .loc[:, list(rnd_search_labels)]  # keep only the summary columns, in this order
    .rename(columns=rnd_search_labels)
)
cv_res

Unnamed: 0,n_estimators,max_features,max_depth,split0,split1,split2,mean_score
6,167,12,8,0.659309,0.662926,0.66367,0.661968
4,140,24,8,0.657926,0.661543,0.661702,0.66039
1,112,21,8,0.655266,0.660559,0.661835,0.65922
3,109,21,8,0.654734,0.660186,0.661835,0.658918
0,102,20,7,0.616543,0.618484,0.620638,0.618555
7,197,2,6,0.584282,0.587181,0.589176,0.586879
9,98,12,6,0.576676,0.57758,0.580612,0.578289
8,67,1,5,0.521676,0.520532,0.520957,0.521055
5,11,21,6,0.502952,0.506117,0.508218,0.505762
2,84,23,3,0.371888,0.36891,0.376702,0.3725


In [70]:
# Second randomized search, narrowed around the best region of the first one
# (more trees, mid-range max_features, deeper trees). Rebinds `param_distribs`
# and `rnd_search`.
param_distribs = dict(
    rf__n_estimators=randint(150, 250),
    rf__max_features=randint(8, 16),
    rf__max_depth=randint(6, 12),
)

rnd_search = RandomizedSearchCV(
    estimator=rf_pipe,
    param_distributions=param_distribs,
    n_iter=10,
    scoring='accuracy',
    cv=3,
    random_state=42,
)

In [71]:
# Run the narrowed randomized search on the training split.
rnd_search.fit(X=dc.X_train, y=dc.y_train)

In [72]:
# Compact, best-first summary of the (second) randomized search.
# Built fresh from `rnd_search.cv_results_` with an explicit rename mapping:
# no in-place mutation and no dependence on the notebook-level `score_cols`
# list, which a later cell rebinds to a different length.
rnd_search_labels = {
    "param_rf__n_estimators": "n_estimators",
    "param_rf__max_features": "max_features",
    "param_rf__max_depth": "max_depth",
    "split0_test_score": "split0",
    "split1_test_score": "split1",
    "split2_test_score": "split2",
    "mean_test_score": "mean_score",
}
cv_res = (
    pd.DataFrame(rnd_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
    .loc[:, list(rnd_search_labels)]  # keep only the summary columns, in this order
    .rename(columns=rnd_search_labels)
)
cv_res

Unnamed: 0,n_estimators,max_features,max_depth,split0,split1,split2,mean_score
7,213,9,11,0.740186,0.740904,0.740718,0.740603
2,232,14,10,0.721569,0.723191,0.724388,0.72305
4,173,11,10,0.717926,0.720904,0.721436,0.720089
8,182,12,9,0.691649,0.695931,0.692686,0.693422
0,164,12,9,0.691436,0.69484,0.691968,0.692748
9,171,9,9,0.688511,0.691436,0.69367,0.691206
3,237,10,8,0.658856,0.664016,0.665585,0.662819
5,202,13,8,0.659282,0.662979,0.663298,0.661853
1,210,15,8,0.657899,0.663032,0.664548,0.661826
6,179,15,7,0.619309,0.619229,0.624601,0.621046


In [73]:
# Exhaustive grid around the region suggested by the randomized searches
# (3 x 2 x 3 = 18 candidates, 3-fold CV). Train scores are kept as well so the
# train/validation gap can be inspected.
param_grid = dict(
    rf__n_estimators=[200, 220, 240],
    rf__max_features=[8, 12],
    rf__max_depth=[10, 15, 20],
)

grid_search = GridSearchCV(
    estimator=rf_pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    return_train_score=True,
)

In [74]:
# Run the grid search on the training split.
grid_search.fit(X=dc.X_train, y=dc.y_train)

In [75]:
# Tabulate the raw grid-search results (timings, parameters, per-split train and test scores).
cv_res = pd.DataFrame(data=grid_search.cv_results_)
cv_res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__max_depth,param_rf__max_features,param_rf__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,5.8189,0.223031,0.959079,0.011069,10,8,200,"{'rf__max_depth': 10, 'rf__max_features': 8, '...",0.715399,0.718005,0.718537,0.717314,0.001371,18,0.775598,0.774269,0.77391,0.774592,0.000726
1,6.051052,0.018685,1.043403,0.01036,10,8,220,"{'rf__max_depth': 10, 'rf__max_features': 8, '...",0.715505,0.718378,0.718271,0.717385,0.00133,17,0.775931,0.774255,0.774295,0.774827,0.000781
2,6.749746,0.110672,1.149856,0.035325,10,8,240,"{'rf__max_depth': 10, 'rf__max_features': 8, '...",0.716037,0.719229,0.718351,0.717872,0.001346,16,0.775785,0.774747,0.774681,0.775071,0.000505
3,7.660205,0.084818,0.949422,0.004279,10,12,200,"{'rf__max_depth': 10, 'rf__max_features': 12, ...",0.719495,0.721702,0.72016,0.720452,0.000925,15,0.776436,0.77629,0.776955,0.77656,0.000285
4,8.320156,0.176406,1.030707,0.008696,10,12,220,"{'rf__max_depth': 10, 'rf__max_features': 12, ...",0.719521,0.722128,0.720957,0.720869,0.001066,14,0.776569,0.775824,0.776995,0.776463,0.000484
5,9.254023,0.28181,1.121901,0.008366,10,12,240,"{'rf__max_depth': 10, 'rf__max_features': 12, ...",0.720931,0.722181,0.720426,0.721179,0.000738,13,0.776968,0.776476,0.777952,0.777132,0.000614
6,9.120827,0.120528,1.216845,0.077755,15,8,200,"{'rf__max_depth': 15, 'rf__max_features': 8, '...",0.784495,0.788378,0.787074,0.786649,0.001614,12,0.959521,0.959229,0.959402,0.959384,0.00012
7,10.254289,0.280434,1.405691,0.086486,15,8,220,"{'rf__max_depth': 15, 'rf__max_features': 8, '...",0.784761,0.788617,0.787713,0.78703,0.001647,11,0.959375,0.959548,0.959867,0.959597,0.000204
8,10.869624,0.208776,1.386781,0.012956,15,8,240,"{'rf__max_depth': 15, 'rf__max_features': 8, '...",0.785532,0.789495,0.788059,0.787695,0.001638,10,0.959282,0.959495,0.959668,0.959481,0.000158
9,11.63699,0.091744,1.160446,0.009721,15,12,200,"{'rf__max_depth': 15, 'rf__max_features': 12, ...",0.790718,0.792473,0.791676,0.791622,0.000718,9,0.960678,0.961742,0.961809,0.96141,0.000518


In [76]:
# Short display names for the per-split and mean scores: test block first, then train block.
score_cols = [
    label
    for subset in ("test", "train")
    for label in (f"{subset}0", f"{subset}1", f"{subset}2", f"mean_{subset}")
]

In [77]:
# Compact, best-first summary of the grid search with train and test scores.
# The frame is rebuilt from `grid_search.cv_results_` rather than sorting and
# renaming the existing `cv_res` in place: the in-place version cannot be
# re-run (the second run fails with KeyError on the already-renamed
# "mean_test_score" column) and silently changes a frame that an earlier cell
# displayed. The explicit mapping also removes the dependence on `score_cols`.
grid_search_labels = {
    "param_rf__n_estimators": "n_estimators",
    "param_rf__max_features": "max_features",
    "param_rf__max_depth": "max_depth",
    "split0_test_score": "test0",
    "split1_test_score": "test1",
    "split2_test_score": "test2",
    "mean_test_score": "mean_test",
    "split0_train_score": "train0",
    "split1_train_score": "train1",
    "split2_train_score": "train2",
    "mean_train_score": "mean_train",
}
cv_res = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
    .loc[:, list(grid_search_labels)]  # keep only the summary columns, in this order
    .rename(columns=grid_search_labels)
)

In [78]:
# Show the grid-search summary, best mean test score first.
cv_res

Unnamed: 0,n_estimators,max_features,max_depth,test0,test1,test2,mean_test,train0,train1,train2,mean_train
17,240,12,20,0.804069,0.804681,0.804628,0.804459,0.990532,0.990066,0.990439,0.990346
16,220,12,20,0.80375,0.80508,0.804521,0.80445,0.990598,0.989947,0.990386,0.99031
15,200,12,20,0.803298,0.804415,0.803963,0.803892,0.990412,0.990013,0.990386,0.99027
14,240,8,20,0.800452,0.802207,0.801144,0.801268,0.989654,0.989122,0.989242,0.98934
13,220,8,20,0.799761,0.801197,0.800638,0.800532,0.989734,0.989229,0.989282,0.989415
12,200,8,20,0.799867,0.80117,0.800133,0.80039,0.989721,0.989176,0.989149,0.989348
11,240,12,15,0.791463,0.792074,0.792553,0.79203,0.96109,0.961729,0.961981,0.9616
10,220,12,15,0.790745,0.792074,0.792154,0.791658,0.960678,0.961795,0.961782,0.961418
9,200,12,15,0.790718,0.792473,0.791676,0.791622,0.960678,0.961742,0.961809,0.96141
8,240,8,15,0.785532,0.789495,0.788059,0.787695,0.959282,0.959495,0.959668,0.959481


In [79]:
# Follow-up grid pushing all three parameters higher (more trees, more
# features per split, deeper trees). Rebinds `param_grid` and `grid_search`.
param_grid = dict(
    rf__n_estimators=[240, 280, 320],
    rf__max_features=[15, 25],
    rf__max_depth=[18, 27, 36],
)

grid_search = GridSearchCV(
    estimator=rf_pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    return_train_score=True,
)

In [80]:
# Run the follow-up grid search on the training split.
grid_search.fit(X=dc.X_train, y=dc.y_train)