In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from utils import read_classification_dataset, features_histograms_mean_std
# import sklearn
from sklearnex import patch_sklearn
from sklearn.calibration import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer, KNNImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
import joblib

from kneed import KneeLocator

patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [12]:
model = Pipeline([
      ('Scaler', StandardScaler()),
      ('Imputer', KNNImputer()),
      ('PCA', PCA(n_components=24)),
      ('KNN', KNeighborsClassifier())
    ])
param_grid = [
    {
        "Imputer": [KNNImputer(weights='distance')],
        "Imputer__n_neighbors":[1,3,7,21],
        "KNN__n_neighbors":[1,3,7,21],
        "KNN": [KNeighborsClassifier(weights="distance", n_jobs=-1)]
    },
]

grid = GridSearchCV(model, param_grid, scoring="f1_weighted")
train, target, test = read_classification_dataset(1)
X, y = train.values, target.values.flatten()
grid.fit(X, y)
grid.best_estimator_

c:\Users\Morri\Documents\Notebooks\ML\Project




In [13]:
print(grid.best_score_)
print(grid.best_params_)
prediction = grid.best_estimator_.predict(test.values)
display(prediction)

0.9616363636363635
{'Imputer': KNNImputer(weights='distance'), 'Imputer__n_neighbors': 1, 'KNN': KNeighborsClassifier(n_jobs=-1, weights='distance'), 'KNN__n_neighbors': 3}


array([2, 1, 1, 1, 1, 2, 1, 1, 3, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,
       3, 3, 4, 1, 5, 4, 1, 3, 1, 1, 4, 1, 1, 1, 1, 4, 3, 5, 1, 1, 4, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [14]:
res = grid.cv_results_
param_scores = list((zip(res["rank_test_score"],res["mean_test_score"],res["params"], )))
param_scores.sort(key=lambda x: x[0])
param_scores

[(1,
  0.9616363636363635,
  {'Imputer': KNNImputer(weights='distance'),
   'Imputer__n_neighbors': 1,
   'KNN': KNeighborsClassifier(n_jobs=-1, weights='distance'),
   'KNN__n_neighbors': 3}),
 (1,
  0.9616363636363635,
  {'Imputer': KNNImputer(weights='distance'),
   'Imputer__n_neighbors': 3,
   'KNN': KNeighborsClassifier(n_jobs=-1, weights='distance'),
   'KNN__n_neighbors': 3}),
 (1,
  0.9616363636363635,
  {'Imputer': KNNImputer(weights='distance'),
   'Imputer__n_neighbors': 7,
   'KNN': KNeighborsClassifier(n_jobs=-1, weights='distance'),
   'KNN__n_neighbors': 3}),
 (4,
  0.9517138830162086,
  {'Imputer': KNNImputer(weights='distance'),
   'Imputer__n_neighbors': 21,
   'KNN': KNeighborsClassifier(n_jobs=-1, weights='distance'),
   'KNN__n_neighbors': 3}),
 (5,
  0.9454458874458874,
  {'Imputer': KNNImputer(weights='distance'),
   'Imputer__n_neighbors': 1,
   'KNN': KNeighborsClassifier(n_jobs=-1, weights='distance'),
   'KNN__n_neighbors': 1}),
 (5,
  0.9454458874458874,
  

In [16]:
joblib.dump(grid.best_estimator_, './models/c_1_Scale_KNNI(1)_PCA(24)_KNN(3).pkl')
joblib.load('./models/c_1_Scale_KNNI(1)_PCA(24)_KNN(3).pkl').score(X, y)


0.7533333333333333