In [1]:
from mylib import base, dataset
from mylib.model_classification import ModelClass
from mylib.model_base import ModelBase

from dotenv import dotenv_values
from pathlib import Path
from pprint import pprint, pformat

import pandas as pd
import pandas.api.types as pd_types

import numpy as np

import plotly.express as plotly_px

import joblib

import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
import warnings
# Install a blanket 'ignore' filter: no Python warnings are shown after this point.
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 50) # Show at most 50 columns when a DataFrame is displayed
#pd.set_option('display.max_rows', 50) # (disabled) would cap the number of displayed rows at 50
pd.options.display.float_format = '{:.5f}'.format # Render floats with five digits after the decimal point
pd.options.mode.use_inf_as_na = True # Make pandas treat infinities (inf) as missing values (NA)
# NOTE(review): `mode.use_inf_as_na` is reported as deprecated in recent pandas releases
# (2.1+) - confirm against the installed version before upgrading pandas.

# Render inline figures as vector (SVG) images
%config InlineBackend.figure_format = 'svg'

# Draw matplotlib figures inside the notebook
%matplotlib inline

# 0. Загрузка конфигов

In [3]:
# Run-level switches used by the cells below:
#   n_jobs / verbose - forwarded to the hyperparameter search call
#   load_big         - True: load the "big" dataset files, False: the "small" ones
#   use_sklearn      - True: scikit-learn KNN, False: cuML KNN
n_jobs, verbose = -1, 3
load_big, use_sklearn = True, False

# Read the key/value pairs from the dotenv-style file named "settings"
settings_dict = dict(dotenv_values("settings"))

# Wrap the raw values in the project's Settings object
settings = base.Settings(settings_dict)

# dotenv values are read as text, so the seed is converted to an int in place
settings.enviroment["RANDOM_STATE"] = int(settings.enviroment["RANDOM_STATE"])

In [4]:
# Show the loaded settings mapping as the cell output (RANDOM_STATE is already an int here)
settings.enviroment

{'DATASET_SUBFOLDER': 'dataset',
 'RESULT_SUBFOLDER': 'result',
 'CACHE_SUBFOLDER': 'cached_results',
 'RANDOM_STATE': 42,
 'DATASET_FILENAME_TEMPLATE': 'dataset_df_%s.joblib',
 'PARAMS_FILENAME_TEMPLATE': 'params_%s.joblib',
 'X_Train_FILENAME_TEMPLATE': 'X_Train_%s.joblib',
 'y_Train_FILENAME_TEMPLATE': 'y_Train_%s.joblib',
 'X_Test_FILENAME_TEMPLATE': 'X_Test_%s.joblib',
 'y_Test_FILENAME_TEMPLATE': 'y_Test_%s.joblib',
 'GRID_SEARCH_TEMPLATE_FILENAME': '03_GridSearch_%s.joblib',
 'MODEL_CLASS_TEMPLATE_FILENAME': '04_model_%s.joblib'}

# 1. Загрузка датасета

In [5]:
# Load the "common" parameters file from the result folder
params_path = Path(settings.result_folder, settings.enviroment["PARAMS_FILENAME_TEMPLATE"] % "common")
params = joblib.load(params_path)

In [6]:
def _load_cached(template_key, suffix):
    """Load one joblib artifact from the result folder.

    Args:
        template_key: key in ``settings.enviroment`` whose value is a
            ``%s``-style file-name template (e.g. ``"X_Train_FILENAME_TEMPLATE"``).
        suffix: text substituted into the template.

    Returns:
        The object that ``joblib.load`` reads from the resulting file.
    """
    artifact_path = Path(settings.result_folder, settings.enviroment[template_key] % suffix)
    # SECURITY: joblib.load unpickles the file - only load artifacts produced by this project.
    return joblib.load(artifact_path)


# A single flag selects the dataset variant; the loading code is shared by both variants.
if load_big:
    dataset_suffix = "big"
    print('Используется полный набор данных')
else:
    dataset_suffix = "small"
    print('Используется сокращенный набор данных')

# NOTE(review): the trailing comma is part of the file name this notebook has always
# requested; it looks like a typo in the step that saved the files. It is kept
# byte-for-byte here because changing it in this cell alone would break loading.
scaled_suffix = dataset_suffix + "_StandardScaler,"

# Targets first, then raw features, then standardized features.
y_train = _load_cached("y_Train_FILENAME_TEMPLATE", dataset_suffix)
y_test  = _load_cached("y_Test_FILENAME_TEMPLATE", dataset_suffix)
X_train = _load_cached("X_Train_FILENAME_TEMPLATE", dataset_suffix)
X_test  = _load_cached("X_Test_FILENAME_TEMPLATE", dataset_suffix)
X_train_StandardScaler = _load_cached("X_Train_FILENAME_TEMPLATE", scaled_suffix)
X_test_StandardScaler  = _load_cached("X_Test_FILENAME_TEMPLATE", scaled_suffix)

Используется полный набор данных


# 2. Модель KNeighborsClassifier

In [7]:
# Choose the KNN implementation: cuML unless `use_sklearn` is set.
# The library is imported inside its branch so that only the chosen one has to be installed.
if not use_sklearn:
    print('Используется cuml.neighbors.KNeighborsClassifier')
    import cuml.neighbors
    KNN_Classifier = cuml.neighbors.KNeighborsClassifier
    knn_model_name = "KNN_cuml"
else:
    print('Используется sklearn.neighbors.KNeighborsClassifier')
    import sklearn.neighbors
    KNN_Classifier = sklearn.neighbors.KNeighborsClassifier
    knn_model_name = "KNN_sklearn"

# Search grid shared by both implementations: 16 neighbour counts x 3 values of p = 48 candidates.
# NOTE(review): confirm that cuML actually honours `p` with its default metric - TODO verify.
knn_params = {
    'n_neighbors': range(5, 21),
    'p': [1, 2, 3],
}

Используется cuml.neighbors.KNeighborsClassifier


In [8]:

# Hyperparameter search for the selected KNN class on the standardized training features,
# scored by accuracy. Judging by its name, the helper reuses a previously saved search
# when one exists - TODO confirm in mylib.model_base.
knn_grid_search = ModelBase.load_or_create_and_fit_GridSearchCV(
    knn_model_name,
    KNN_Classifier,
    knn_params,
    X_train_StandardScaler,
    y_train,
    settings,
    n_jobs=n_jobs,
    verbose=verbose,
    scoring='accuracy',
)

Создается и выполняется RandomizedSearchCV для модели KNN_cuml класса KNeighborsClassifier
[2025-04-10 02:33:58.003] [CUML] [info] Unused keyword parameter: n_jobs during cuML estimator initialization
Fitting 5 folds for each of 48 candidates, totalling 240 fits
[2025-04-10 02:33:58.061] [CUML] [info] Unused keyword parameter: n_jobs during cuML estimator initialization
[2025-04-10 02:33:58.061] [CUML] [info] Unused keyword parameter: n_jobs during cuML estimator initialization
[2025-04-10 02:33:58.062] [CUML] [info] Unused keyword parameter: n_jobs during cuML estimator initialization
[2025-04-10 02:33:58.062] [CUML] [info] Unused keyword parameter: n_jobs during cuML estimator initialization
[2025-04-10 02:33:58.062] [CUML] [info] Unused keyword parameter: n_jobs during cuML estimator initialization
[2025-04-10 02:33:58.062] [CUML] [info] Unused keyword parameter: n_jobs during cuML estimator initialization
[2025-04-10 02:33:58.062] [CUML] [info] Unused keyword parameter: n_jobs duri

In [9]:
# Report the winning parameter combination and its cross-validated score
print("\nПодобранные гиперпараметры модели:")
print(pformat(knn_grid_search.best_params_))
print(knn_grid_search.best_score_)


Подобранные гиперпараметры модели:
{'n_neighbors': 20, 'p': 1}
0.7103111740272982


In [10]:
# Build the final KNN model from the best hyperparameters found by the search,
# passing both the standardized train and test splits to the project's model wrapper.
knn_model = ModelClass.load_or_create_and_fit_model(
    knn_model_name,
    KNN_Classifier,
    knn_grid_search.best_params_,
    X_train_StandardScaler,
    X_test_StandardScaler,
    y_train,
    y_test,
    settings,
)

# Print/plot the wrapper's quality report for this model
knn_model.show_quality()

Создается и тренируется модель KNN_cuml класса KNeighborsClassifier
Гиперпараметры модели: {'p': 1, 'n_neighbors': 20}

Класс-обвертка модели сохранен в /home/igel/Projects/ml/ml-inno-hw/3. Machine Learning/3.99 Промежуточная аттестация/result/04_model_KNN_cuml.joblib

Натренированная модель сохранена в /home/igel/Projects/ml/ml-inno-hw/3. Machine Learning/3.99 Промежуточная аттестация/result/04_model_KNN_cuml_trained.joblib
