# 0 Импорты и библиотеки

## import

In [1]:
from dotenv import dotenv_values
from pathlib import Path
from pprint import pprint, pformat
import zipfile

import opendatasets as od 
import pandas as pd
import pandas.api.types as pd_types

import numpy as np

import plotly.express as plotly_px
import plotly.graph_objects as plotly_go
import plotly.subplots as plotly_subplt
import seaborn as sns
import matplotlib.pyplot as plt

import copy
import joblib

from dataclasses import dataclass
import builtins
from pathlib import Path
import time
import tqdm
import abc

In [2]:
from sklearn.model_selection import (
    train_test_split,  # Функция для разделения данных на обучающую и тестовую выборки
    cross_val_score, # оценщик кросс-валидации
    GridSearchCV,  # Класс для поиска гиперпараметров с помощью сеточного поиска
    RandomizedSearchCV
)

from sklearn.preprocessing import (OneHotEncoder, 
                                   OrdinalEncoder
                                  )





## Библиотека

### BaseLib

In [3]:
class BaseLib():
    @staticmethod
    def st():
        """старт таймера"""
        return time.monotonic_ns()
    @staticmethod
    def ft(start):
        """финиш таймера и вывод времени"""
        duration = (time.monotonic_ns() - start) / 1000000000
        print(f'Затрачено времени: {duration:.2f} секунд')
        return duration

    @staticmethod
    def get_type(type_name):
        try:
            return getattr(builtins, type_name)
        except AttributeError:
            try:
                obj = globals()[type_name]
            except KeyError:
                return None
            return repr(obj) if isinstance(obj, type) else None

### DataSetLib

In [4]:
class DataSetLib():
    """Библиотека функций для работы с датасетом"""

    @staticmethod
    def columns_by_type(df, target_name, cat_treshold=2):
        """Сфромировать словарь, с разделением имен столбцов по типам:
        - target_columns - целевой столбец
        - columns_X - все столбцы-фичи
        - num_columns - числовые столбцы
        - cat_columns - категориальные. Категориальными считаютс и числовые столбцы, в которых уникальных значений меньше или равно  cat_treshold"""
        params = {}
        # columns_X - переменные датасета
        params["columns_X"] = df.columns.to_list()
        # целевой столбец
        params["target_column"] = target_name
        if target_name is not None:
            params["columns_X"].remove(params["target_column"])
        params["num_columns"] = []
        params["cat_columns"] = []
        # определить числовые и категориальные столбцы
        for col in params["columns_X"]:
            if df[col].nunique() <= cat_treshold or not pd_types.is_numeric_dtype(df[col]):
                params["cat_columns"].append(col)
            else:
                params["num_columns"].append(col)
        #print(f'target_columns={params["target_column"]}')        
        #print(f'columns_X={params["columns_X"]}')
        #print(f'cat_columns={pformat(params["cat_columns"])}')
        #print(f'num_columns={pformat(params["num_columns"])}')
        return params
    
    @staticmethod
    def remove_columns(params, column): 
        """удалить столбец и словаря параметров"""
        if column in params["columns_X"]:
            params["columns_X"].remove(column)
        if column in params["num_columns"]:
            params["num_columns"].remove(column)
        if column in params["cat_columns"]:
            params["cat_columns"].remove(column)
        return params

    @staticmethod
    def add_columns(params, column, type_column): 
        """добавить столбец в словарь параметров
        type_column = "cat" или "num" или None """
        if column not in params["columns_X"]:
            params["columns_X"].append(column)
        if type_column is None:
            pass
        elif type_column == "cat":
            params["cat_columns"].append(column)
        elif type_column == "num":
            params["num_columns"].append(column)
        else:
            raise ValueError("type_column должен быть 'cat' или 'num'")
        return params

    @staticmethod
    def describe_columns(df, params):
        """Отобразить описание содержимого столбцов"""    
        # подсчет столбцов с пропусками
        nan_in_columns = DataSetLib.nans_percents(df)
        
        print("Количество уникальных значений по столбцам, доля пропусков и уникальные значения, если их не более 10")
        nunique = df[params["columns_X"]].nunique()
        for column in nunique.index:
            if column in params["cat_columns"]:
                column_type = "[c]" # категориальные
            else:
                column_type = "[n]" # числовые
                
            if nan_in_columns[column] > 0:
                nan_str = f'({nan_in_columns[column]:4.1f}%)'
            else:
                nan_str = " "*7
            if nunique[column] <= 10:
                print(f'{column:20}{column_type}: {nunique[column]:6} {nan_str}, {df[column].unique().tolist()}')
            else:
                print(f'{column:20}{column_type}: {nunique[column]:6} {nan_str}')

        if params["target_column"] is not None:
            df_describe_num = DataSetLib.eda_df(df[params["num_columns"]+[params["target_column"]]])
        else:
            df_describe_num = DataSetLib.eda_df(df[params["num_columns"]])
        display(df_describe_num)            

        df_describe_cat = df[params["cat_columns"]].describe()
        display(df_describe_cat)

    @staticmethod
    def eda_df(df):
        """Провести EDA для датафрейма"""
        df_describe = df.describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9])
        # посчитать долю пропусков
        df_describe.loc["%nan"] = (np.round(df[df_describe.columns].isna().mean()*100, 2)).to_list()
        # посчитать дисперсию
        columns_var = []
        for column in df_describe.columns:
            columns_var.append(df[column].var())
        df_describe.loc['var'] = columns_var
        return df_describe

    @staticmethod
    def show_boxes(df, columns, ncols = 3, type="box", row_height=500, total_width=1200):
        """Показать 'ящики_с_усами' для набора df.
        Ящики будут показаны для столбцов датафрема, перечисленных в columns.
        Графики будут показаны в несколько столбцов, количество которых задается в параметре ncols."""
        nrows = int(round((len(columns) + 0.5) / ncols, 0))
        nrows = nrows if nrows > 1 else 1

        if type == "box":
            title = "Ящики с усами"
        elif type == "hist":
            title = "Гистрограммы"
        elif type == "pie":
            title = "Пирожки"
        else:
            raise f"Не реализована обработка типа графика {type}"


        fig = plotly_subplt.make_subplots(rows=nrows, cols=ncols)
        fig.update_layout(
            title_x=0.5,
            title_text=title,
            height=row_height*nrows, 
            width=total_width
        )
        i = 0
        for r in range(nrows):
            for c in range(ncols):
                if type == "box":
                    fig.add_box(y=df[columns[i]], name=columns[i], row=r+1, col=c+1)
                elif type == "hist":
                    fig.add_histogram(x=df[columns[i]], name=columns[i], row=r+1, col=c+1)
                elif type == "pie":
                    fig.add_pie(df[columns[i]].value_counts().values,
                                labels=df[columns[i]].value_counts().index, 
                                name=columns[i], row=r+1, col=c+1)
                else:
                    raise f"Не реализована обработка типа графика {type}"
                i += 1
                if i >= len(columns):
                    break
            if i >= len(columns):
                break
        fig.show()          

    @staticmethod
    def show_boxes_plt(df, columns_x, ncols = 3, type="box", row_height=500, total_width=1200, column_y=None, filename=None):
        """Показать 'ящики_с_усами' для набора df.
        Ящики будут показаны для столбцов датафрема, перечисленных в columns.
        Графики будут показаны в несколько столбцов, количество которых задается в параметре ncols."""
        nrows = int(round((len(columns_x) + 0.59) / ncols, 0))
        nrows = nrows if nrows > 1 else 1

        if type == "box":
            title = "Ящики с усами"
        elif type == "hist":
            title = "Гистрограммы"
        elif type == "pie":
            title = "Пирожки"
        else:
            raise f"Не реализована обработка типа графика {type}"
                
        plt.figure(figsize=(ncols * 5, nrows * 3))
        
        for i, column in enumerate(columns_x, start=1):
            plt.subplot(nrows, ncols, i)
            if type == "box":
                if column_y is None:
                    sns.boxplot(x=df[column])
                else:
                    sns.boxplot(x=df[column], y=df[column_y])
            elif type == "hist":
                sns.histplot(df[column], kde=True)
            elif type == "pie":
                # define Seaborn color palette to use 
                palette_color = sns.color_palette("pastel") 
                # plotting data on chart 
                plt.pie(x=df[column].value_counts().values, 
                        labels=df[column].value_counts().index, 
                        colors=palette_color, autopct='%.0f%%') 
            else:
                raise f"Не реализована обработка типа графика {type}"
            # Добавить название столбца как заголовок графика
            plt.title(column)
        plt.tight_layout()
        if filename is not None:
            plt.savefig(filename, dpi=300)
        else:
            plt.show()
            
    @staticmethod            
    def iqr_values(values):
        """Границы для ящика-с-усами
        Возвращаемые значения: Q1, Q3, IQR, lower, upper
        """
        Q3 = np.quantile(values, 0.75, axis=0)
        Q1 = np.quantile(values, 0.25, axis=0)
        IQR = Q3 - Q1
        upper = Q3 + 1.5 * IQR
        lower = Q1 - 1.5 * IQR
        return Q1, Q3, IQR, lower, upper

        
    @staticmethod    
    def nans_percents(df):
        return df.isna().sum()/len(df)*100    

    @staticmethod
    def encode_features(src_df, onehot_cols=None, onehot_drop=None, ordinal_cols=None, columns_X=None):
        df = src_df.copy()  
        new_columns_X = copy.deepcopy(columns_X)
        if onehot_cols is not None:
            encoder = OneHotEncoder(sparse_output=False, drop=onehot_drop)
            one_hot_encoded = encoder.fit_transform(df[onehot_cols])
            one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(onehot_cols))
            df = pd.concat([df, one_hot_df], axis=1)
            new_columns_X += encoder.get_feature_names_out(onehot_cols).tolist()
            for col in onehot_cols:
                if col in columns_X:
                    new_columns_X.remove(col)
            df.drop(onehot_cols, axis=1, inplace=True)
            
        if ordinal_cols is not None:
            ordinal_columns_cats = list(ordinal_cols.values())
            ordinal_columns_list = list(ordinal_cols.keys())
            encoder = OrdinalEncoder(categories = ordinal_columns_cats)
            df[ordinal_columns_list] = encoder.fit_transform(df[ordinal_columns_list])  

        return df, new_columns_X

    @staticmethod
    def fill_with_mode(data, group_col, target_col):
        """Заполнить target_col модой внутри каждой группы столбцов group_col"""
        global_mode = data[target_col].mode()[0]
        def fill_group_mode(x):
            group_mode = x.mode()
            if not group_mode.empty:
                return group_mode[0]
            else:
                return global_mode
        data[target_col] = data.groupby(group_col)[target_col].transform(fill_group_mode)

    @staticmethod   
    def fill_with_mean(data, group_col, target_col):
        """Заполнить target_col средним внутри каждой группы столбцов group_col"""
        def fill_group_mean(x):
            return x.mean()
        data[target_col] = data.groupby(group_col)[target_col].transform(fill_group_mean)    
        # заполним глобальным средним, если что-то пропустилось
        data.fillna({target_col: data[target_col].mean()}, inplace=True)

    @staticmethod
    def prepare_dataset(dataset_df, params, scaler=None, train_size=0.7):
        """Разделить датасет на тренировочную и тестовую выборки и прогнать через нормализатор, если он указан"""
        X_train, X_test, y_train, y_test = train_test_split(dataset_df[params["columns_X"]], 
                                                            dataset_df[params["target_column"]], 
                                                            train_size=train_size, 
                                                            stratify=dataset_df[params["target_column"]],
                                                            random_state=42)
        # Нормировка признаков
        if scaler is not None:
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)        
        return X_train, X_test, y_train, y_test


    @staticmethod
    def find_rows_with_nan(dataset_df, columns, debug=False):
        # сначала посмотрим на столбцы с измерениями 
        all_rows_with_nan = []
        rows_with_all_columns_nan = []
        for column in columns:
            nan_rows = dataset_df[dataset_df[column].isna()].index.to_list()
            if debug:
                print(f'Индексы строк с пустым {column}: {nan_rows}')
            all_rows_with_nan += nan_rows
            if rows_with_all_columns_nan == []:
                rows_with_all_columns_nan = nan_rows
            else:
                rows_with_all_columns_nan = list(set(rows_with_all_columns_nan) & set(nan_rows))
        # получить уникальный список индексов с пустыми столбцами
        all_rows_with_nan = list(set(all_rows_with_nan))    
        return rows_with_all_columns_nan, all_rows_with_nan        

### Settings

In [5]:
@dataclass 
class Settings():
    enviroment: object
    
    def __post_init__(self):
        self.dataset_folder = str(Path(Path.cwd(), self.enviroment["DATASET_SUBFOLDER"]))
        self.cache_folder = str(Path(Path.cwd(), self.enviroment["CACHE_SUBFOLDER"]))
        self.result_folder = str(Path(Path.cwd(), self.enviroment["RESULT_SUBFOLDER"]))
        
    def cache_gridsearch_filename(self, model_name): 
        return Path(self.cache_folder, self.enviroment["GRID_SEARCH_TEMPLATE_FILENAME"] % model_name)
    def cache_model_filename(self, model_name): 
        return Path(self.cache_folder, self.enviroment["MODEL_CLASS_TEMPLATE_FILENAME"] % model_name)
    def result_gridsearch_filename(self, model_name): 
        return Path(self.result_folder, self.enviroment["GRID_SEARCH_TEMPLATE_FILENAME"] % model_name) 
    def result_model_filename(self, model_name): 
        return Path(self.result_folder, self.enviroment["MODEL_CLASS_TEMPLATE_FILENAME"] % model_name)
    def result_trained_model_filename(self, model_name): 
        return Path(self.result_folder, self.enviroment["MODEL_CLASS_TEMPLATE_FILENAME"] % f'{model_name}_trained')             

### ModelWrapBase

In [6]:
class ModelWrapBase(abc.ABC):

    def __init__(self, name):
        self.name = name
        self.model_params = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

        self.model = None

    def create_model(self, model_class, model_params, X_train, X_test, y_train, y_test):
        self.model_params = model_params
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.model = model_class(**self.model_params)
    
    def fit(self):
        self.model.fit(self.X_train, self.y_train)    

    @abc.abstractmethod
    def calc_metrics(self):
        raise NotImplemented
    
    @abc.abstractmethod
    def show_quality(self): 
        raise NotImplemented
    
    @staticmethod
    def metrics_names():
        raise NotImplemented
    
    @staticmethod
    def metrics(self):
        raise NotImplemented

    @staticmethod
    def _load_or_create_and_fit_model(model_meta_class, 
                                      model_name, model_class, model_params, 
                                      X_train, X_test, y_train, y_test,
                                      settings, 
                                      need_save=True):
        """Загрузить ранее обученную модель из кеша.
        Если в кеше нет - обучить на переданных данных с заданными параметрами.
        """
        model_filename_cache = settings.cache_model_filename(model_name)
        model_filename = settings.result_model_filename(model_name)

        if Path.is_file(model_filename_cache):
            model = joblib.load(model_filename_cache)
            print(f"Модель {type(model.model).__name__} загружена из {model_filename_cache}")
        else:
            print(f"Создается и тренируется модель {model_name} класса {type(model_class).__name__}")
            print(f'Гиперпараметры модели: {model_params}')
            model = model_meta_class(model_name)
            model.create_model(model_class, model_params, X_train, X_test, y_train, y_test)
            model.fit()
            model.calc_metrics()
            if need_save:
                print(f"\nКласс-обвертка модели сохранен в {model_filename}")
                _ = joblib.dump(model, model_filename)
                print(f"\nНатренированная модель сохранена в {settings.result_trained_model_filename(model_name)}")
                _= joblib.dump(model.model, settings.result_trained_model_filename(model_name))
        return model
    
    @staticmethod
    def load_or_create_and_fit_GridSearchCV(model_name, model_class, param_grid, X_train, y_train,
                                             settings, 
                                             scoring='roc_auc', 
                                             need_save=True, n_jobs=None, verbose=1,
                                             use_randomize_search = True, n_iter=100):
        """Загрузить ранее обученные GridSearchCV из кеша. 
        Если в кеше нет - создать и потренировать, найдя лучшие гиперпараметры"""
        
        
        grid_search_filename_cache = settings.cache_gridsearch_filename(model_name)
        grid_search_filename = settings.result_gridsearch_filename(model_name)

        if Path.is_file(grid_search_filename_cache):
            print(f"GridSearchCV() загружен из {grid_search_filename_cache}")
            grid_search = joblib.load(grid_search_filename_cache)
        else:
            if use_randomize_search:
                print(f"Создается и выполняется RandomizedSearchCV для модели {model_name} класса {model_class.__name__}")
                grid_search = RandomizedSearchCV(model_class(), param_grid, cv=5, n_jobs=n_jobs, 
                                                 verbose=verbose, scoring=scoring,
                                                 random_state=settings.enviroment["RANDOM_STATE"],
                                                 n_iter=n_iter)
            else:
                print(f"Создается и выполняется GridSearchCV для модели {model_name} класса {model_class.__name__}")
                grid_search = GridSearchCV(model_class(), param_grid, cv=5, n_jobs=n_jobs, 
                                           verbose=verbose, scoring=scoring)
            
            # Обучаем модель на данных с использованием кросс-валидации
            grid_search.fit(X_train, y_train)
        
            if need_save:
                print(f"\nРезультаты поиска оптимальных гиперпараметров модели сохранены в {grid_search_filename}")
                _ = joblib.dump(grid_search, grid_search_filename)
        return grid_search    
    
    @staticmethod
    def compare_metrcis(model_wraps):
        """Сформировать датафрейм с метриками моделей из списка model_wraps"""
        df_metrics = []
        for model_wrap in model_wraps:
            df_metrics.append(pd.DataFrame(model_wrap.metrics()))

        df_stat = pd.concat(df_metrics)
        columns = ['model_name']
        columns = columns + model_wraps[0].metrics_names()
        df_stat2 = df_stat.pivot_table(columns = 'params',
                                        index='model_name',
                                        values='values').reset_index()[columns]
        return df_stat2


### ModelWrapRegression

In [7]:
# Наборы метрик для оценки моделей регрессии
from sklearn.metrics import (
    mean_squared_error,  # Средняя квадратичная ошибка для регрессии
    mean_absolute_error, 
    root_mean_squared_error, 
    r2_score  # Коэффициент детерминации для регрессии
)

In [8]:
class ModelWrapRegression(ModelWrapBase):
    
    def __init__(self, name):
        super().__init__(name)
        
        self.mse_train = None
        self.r2_train = None
        self.rmse_train = None
        self.mae_train = None

        self.mse_test = None
        self.r2_test = None
        self.rmse_test = None
        self.mae_test = None

    def calc_metrics(self):
        """Посчитать метрики модели"""
        self.y_train_pred = self.model.predict(self.X_train)
        self.y_test_pred = self.model.predict(self.X_test)
        
        self.mse_train = mean_squared_error(self.y_train, self.y_train_pred)
        self.r2_train = r2_score(self.y_train, self.y_train_pred)
        self.rmse_train = root_mean_squared_error(self.y_train, self.y_train_pred)
        self.mae_train = mean_absolute_error(self.y_train, self.y_train_pred)   
        self.median_train = self.y_train.median() 

        self.mse_test = mean_squared_error(self.y_test, self.y_test_pred)
        self.r2_test = r2_score(self.y_test, self.y_test_pred)
        self.rmse_test = root_mean_squared_error(self.y_test, self.y_test_pred)
        self.mae_test = mean_absolute_error(self.y_test, self.y_test_pred)    
        self.median_test = self.y_test.median() 
    
        
    def show_quality(self): 
        """Показать различные метрики"""
        print('Train data:')
        print(f"  MSE:    {round(self.mse_train,4)}")
        print(f"  RMSE:   {round(self.rmse_train,4)}")
        print(f"  MAE:    {round(self.mae_train,4)}")
        print(f"  r2:     {round(self.r2_train,4)}")
        print(f"  median: {round(self.median_train,4)}")

        print('Test data:')
        print(f"  MSE:    {round(self.mse_test,4)}")
        print(f"  RMSE:   {round(self.rmse_test,4)}")
        print(f"  MAE:    {round(self.mae_test,4)}")
        print(f"  r2:     {round(self.r2_test,4)}")    
        print(f"  median: {round(self.median_train,4)}")        
    
    @staticmethod
    def metrics_names():
        return ['Train_MSE', 'Test_MSE',
                'Train_RMSE', 'Test_RMSE',
                'Train_MAE', 'Test_MAE',
                'Train_R2', 'Test_R2',
                'Train_median', 'Test_Median'
                ]
    
    def metrics(self):
        """Сформировать словарь о сзначениями метрик модели"""
        metrics_as_dict = {
                'params': ModelWrapRegression.metrics_names(),
                'values': [
                    self.mse_train, self.mse_test,
                    self.rmse_train, self.rmse_test,
                    self.mae_train, self.mae_test,
                    self.r2_train, self.r2_test,
                    self.median_train, self.median_train
                ],
                'model_name': [self.name for i in range(len(ModelWrapRegression.metrics_names()))]
            }      
        return metrics_as_dict

    @staticmethod    
    def load_or_create_and_fit_model(model_name, model_class, model_params, 
                                    X_train, X_test, y_train, y_test,
                                    settings, 
                                    need_save=True):
        """Загрузить ранее обученную модель из кеша.
        Если в кеше нет - обучить на переданных данных с заданными параметрами.
        """
        return ModelWrapBase._load_or_create_and_fit_model(ModelWrapRegression, 
                                                       model_name, model_class, model_params, 
                                                       X_train, X_test, y_train, y_test,
                                                       settings, 
                                                       need_save)
    

### ModelWrapClassification

In [9]:
# Наборы метрик для оценки моделей классификации
from sklearn.metrics import (
    precision_score, 
    recall_score, 
    roc_auc_score, 
    roc_curve, 
    f1_score,  #f1-мера
    accuracy_score,  # Метрика точности для классификации
    classification_report,  # Отчет о классификации
    confusion_matrix
)

In [10]:
class ModelWrapClass(ModelWrapBase):
    
    def __init__(self, name):
        super().__init__(name)

        self.train_precision = None
        self.test_precision = None
        self.train_recall = None
        self.test_recall = None
        self.train_roc_auc = None
        self.test_roc_auc = None
        self.train_accuracy = None
        self.test_accuracy = None
        self.train_f1_score = None
        self.test_f1_score = None
        self.specific_data = None
        self.y_train = None
        self.y_test = None
        self.y_train_proba = None
        self.y_test_proba = None

    def calc_metrics(self):
        """Посчитать метрики модели"""
        self.y_train_pred = self.model.predict(self.X_train)
        self.y_train_prob = self.model.predict_proba(self.X_train)[:, 1]
        self.y_test_pred = self.model.predict(self.X_test)
        self.y_test_prob = self.model.predict_proba(self.X_test)[:, 1]
    
        # матрица ошибок
        #self.conf_matrix_train = confusion_matrix(self.y_train, self.y_train_pred)
        #self.conf_matrix_test = confusion_matrix(self.y_test, self.y_test_pred)
        #self.conf_matrix_norm_train = confusion_matrix(self.y_train, self.y_train_pred, normalize='all')
        #self.conf_matrix_norm_test = confusion_matrix(self.y_test, self.y_test_pred, normalize='all')
        

        # Расчет AUC-ROC
        self.train_roc_auc = roc_auc_score(self.y_train, self.y_train_prob)
        self.test_roc_auc = roc_auc_score(self.y_test, self.y_test_prob)

        # Поиск порога, максимизирующего F1-score
        thresholds = np.arange(0.0, 1.0, 0.01)
        f1_scores = [f1_score(self.y_test, self.y_test_prob >= t) for t in thresholds]
        self.optimal_threshold = thresholds[np.argmax(f1_scores)]

        # Пересчет метрик с учетом оптимального порога
        self.y_train_pred_optimal = (self.y_train_prob >= self.optimal_threshold).astype(int)
        self.y_test_pred_optimal = (self.y_test_prob >= self.optimal_threshold).astype(int)

        self.train_precision = precision_score(self.y_train, self.y_train_pred_optimal)
        self.test_precision = precision_score(self.y_test, self.y_test_pred_optimal)

        self.train_recall = recall_score(self.y_train, self.y_train_pred_optimal)
        self.test_recall = recall_score(self.y_test, self.y_test_pred_optimal)

        self.train_accuracy = accuracy_score(self.y_train, self.y_train_pred_optimal)
        self.test_accuracy = accuracy_score(self.y_test, self.y_test_pred_optimal)

        self.train_f1_score = f1_score(self.y_train, self.y_train_pred_optimal)
        self.test_f1_score = f1_score(self.y_test, self.y_test_pred_optimal)
        
    def show_quality(self): #X_train, X_test, y_train, y_test, check_result, title, grid_search, model_cl):
        """Показать различные метрики и промежуточные переменные обучения"""
        #def show_quality2(X_train, X_test, y_train, y_test, check_result, title, grid_search, model_cl):
        fig = plotly_subplt.make_subplots(rows=2, cols=2, 
                                        subplot_titles=['ROC AUC', 'Metrics', 'Confusion Matrix Train', 'Confusion Matrix Test'],
                                        vertical_spacing = 0.1,
                                        row_width=[0.4, 0.6])
        fig.update_layout(
            title_x=0.5,
            title_text=self.name,
            width = 1000,
            height = 800,
            legend = dict(yanchor="bottom", y=0.63, xanchor="right", x=0.44),
            margin = {'t':80, 'b':50, 'l':10, 'r':10}
            
        )

        # Построение ROC кривой
        fpr_test, tpr_test, thresholds = roc_curve(self.y_test, self.y_test_prob)
        fpr_train, tpr_train, thresholds = roc_curve(self.y_train, self.y_train_prob)
        roc_train_g = plotly_go.Scatter(x=fpr_train, y=tpr_train, name="ROC curve Train", line={'color':'green'})
        roc_test_g = plotly_go.Scatter(x=fpr_test, y=tpr_test, name="ROC curve Test", line={'color':'blue'})
        roc_diag_g = plotly_go.Scatter(x=[0, 1], y=[0, 1], line={'color':'gray', 'dash': 'dash'}, showlegend=False)

        fig.add_trace(roc_train_g, row=1, col=1)
        fig.add_trace(roc_test_g, row=1, col=1)
        fig.add_trace(roc_diag_g, row=1, col=1)
        
        fig.update_layout(
            xaxis1 = {'title_text': "False Positive Rate"},
            yaxis1 = {'title_text': "True Positive Rate"}
        )    
        

        # Bar с метриками
        df_metrics = pd.DataFrame([[self.test_accuracy,  self.train_accuracy],
                                   [self.test_precision, self.train_precision],
                                   [self.test_recall,    self.train_recall],
                                   [self.test_roc_auc,   self.train_roc_auc],
                                   [self.test_f1_score,  self.train_f1_score]], 
                                  columns = ["Test", "Train"], 
                                  index=["accuracy", "precision", "recall", "ROC AUC", "F1"])
        metrics_train = plotly_go.Bar(x=df_metrics.index, y=df_metrics.Train, 
                        showlegend=True, text=round(df_metrics.Train,4), textangle=0, 
                        xaxis='x2', yaxis='y2', name="Train Metrics")
        metrics_test = plotly_go.Bar(x=df_metrics.index, y=df_metrics.Test, 
                        showlegend=True, text=round(df_metrics.Test,4), textangle=0, 
                        xaxis='x2', yaxis='y2', name="Test Metrics")

        fig.add_trace(metrics_train, row=1, col=2) 
        fig.add_trace(metrics_test, row=1, col=2) 

        # Confusion Matrix 
        """
        cm_normalized_train = self.conf_matrix_train.astype('float') / self.conf_matrix_train.sum(axis=1)[:, np.newaxis]
        print(self.conf_matrix_train.astype('float'))
        print(self.conf_matrix_train.sum(axis=1).astype('float'))
        print(cm_normalized_train)
        heatmap_train = plotly_go.Heatmap(z=cm_normalized_train, x=['0', '1'], y=['0', '1'], colorscale='Blues', 
                                        text=np.round(cm_normalized_train, 3), texttemplate="%{text}", showscale=False)

        cm_normalized_test = self.conf_matrix_test.astype('float') / self.conf_matrix_test.sum(axis=1)[:, np.newaxis]
        heatmap_test = plotly_go.Heatmap(z=cm_normalized_test, x=['0', '1'], y=['0', '1'], colorscale='Blues', 
                                        text=np.round(cm_normalized_test, 3), texttemplate="%{text}", showscale=False)
        """


        train_cm = confusion_matrix(self.y_train, self.y_train_pred_optimal, normalize='all')
        heatmap_train = plotly_go.Heatmap(z=train_cm, 
                                          x=['0', '1'], y=['0', '1'], 
                                          colorscale='Blues', 
                                          text=np.round(train_cm, 3), 
                                          texttemplate="%{text}", 
                                          showscale=False)

        test_cm = confusion_matrix(self.y_test, self.y_test_pred_optimal, normalize='all')
        heatmap_test = plotly_go.Heatmap(z=test_cm, 
                                         x=['0', '1'], y=['0', '1'], 
                                         colorscale='Blues', 
                                         text=np.round(test_cm, 3), 
                                         texttemplate="%{text}", 
                                         showscale=False)


        fig.add_trace(heatmap_train, row=2, col=1)
        fig.add_trace(heatmap_test,  row=2, col=2) 

        fig.update_layout(
            xaxis1 = {'title': 'Predict'},
            xaxis2 = {'title': 'Predict'},
            yaxis1 = {'title': 'Goals'},
            yaxis2 = {'title': 'Goals'},
            xaxis3 = {'title': 'Предсказания'},
            xaxis4 = {'title': 'Предсказания'},
            yaxis3 = {'title': 'Факт'},
            yaxis4 = {'title': 'Факт'},
                        
        )    
        
        fig.show()
    
    @staticmethod
    def metrics_names():
        return ['Training_Precision', 'Test_Precision',
                'Training_Recall', 'Test_Recall',
                'ROC_AUC_Train', 'ROC_AUC_Test',
                'Accuarcy_Train', 'Accuarcy_Test',
                'F1_score_Train', 'F1_score_Test'
                ]
    
    def metrics(self):
        """Сформировать словарь о сзначениями метрик модели"""
        metrics_as_dict = {
                'params': ModelWrapClass.metrics_names(),
                'values': [
                    self.train_precision, self.test_precision,
                    self.train_recall, self.test_recall,
                    self.train_roc_auc, self.test_roc_auc,
                    self.train_accuracy, self.test_accuracy,
                    self.train_f1_score, self.test_f1_score
                ],
                'model_name': [self.name for i in range(len(ModelWrapClass.metrics_names()))]
            }      
        return metrics_as_dict

    @staticmethod    
    def load_or_create_and_fit_model(model_name, model_class, model_params, 
                                    X_train, X_test, y_train, y_test,
                                    settings, 
                                    need_save=True):
        """Загрузить ранее обученную модель из кеша.
        Если в кеше нет - обучить на переданных данных с заданными параметрами.
        """
        return ModelWrapBase._load_or_create_and_fit_model(ModelWrapClass, 
                                                       model_name, model_class, model_params, 
                                                       X_train, X_test, y_train, y_test,
                                                       settings, 
                                                       need_save)


In [11]:
class ModelWrapRegression(ModelWrapBase):
    
    def __init__(self, name):
        super().__init__(name)
        
        self.mse_train = None
        self.r2_train = None
        self.rmse_train = None
        self.mae_train = None

        self.mse_test = None
        self.r2_test = None
        self.rmse_test = None
        self.mae_test = None

    def calc_metrics(self):
        """Посчитать метрики модели"""
        self.y_train_pred = self.model.predict(self.X_train)
        self.y_test_pred = self.model.predict(self.X_test)
        
        self.mse_train = mean_squared_error(self.y_train, self.y_train_pred)
        self.r2_train = r2_score(self.y_train, self.y_train_pred)
        self.rmse_train = root_mean_squared_error(self.y_train, self.y_train_pred)
        self.mae_train = mean_absolute_error(self.y_train, self.y_train_pred)   
        self.median_train = self.y_train.median() 

        self.mse_test = mean_squared_error(self.y_test, self.y_test_pred)
        self.r2_test = r2_score(self.y_test, self.y_test_pred)
        self.rmse_test = root_mean_squared_error(self.y_test, self.y_test_pred)
        self.mae_test = mean_absolute_error(self.y_test, self.y_test_pred)    
        self.median_test = self.y_test.median() 
    
        
    def show_quality(self): 
        """Показать различные метрики"""
        print('Train data:')
        print(f"  MSE:    {round(self.mse_train,4)}")
        print(f"  RMSE:   {round(self.rmse_train,4)}")
        print(f"  MAE:    {round(self.mae_train,4)}")
        print(f"  r2:     {round(self.r2_train,4)}")
        print(f"  median: {round(self.median_train,4)}")

        print('Test data:')
        print(f"  MSE:    {round(self.mse_test,4)}")
        print(f"  RMSE:   {round(self.rmse_test,4)}")
        print(f"  MAE:    {round(self.mae_test,4)}")
        print(f"  r2:     {round(self.r2_test,4)}")    
        print(f"  median: {round(self.median_train,4)}")        
    
    @staticmethod
    def metrics_names():
        return ['Train_MSE', 'Test_MSE',
                'Train_RMSE', 'Test_RMSE',
                'Train_MAE', 'Test_MAE',
                'Train_R2', 'Test_R2',
                'Train_median', 'Test_Median'
                ]
    
    def metrics(self):
        """Сформировать словарь о сзначениями метрик модели"""
        metrics_as_dict = {
                'params': ModelWrapRegression.metrics_names(),
                'values': [
                    self.mse_train, self.mse_test,
                    self.rmse_train, self.rmse_test,
                    self.mae_train, self.mae_test,
                    self.r2_train, self.r2_test,
                    self.median_train, self.median_train
                ],
                'model_name': [self.name for i in range(len(ModelWrapRegression.metrics_names()))]
            }      
        return metrics_as_dict

    @staticmethod    
    def load_or_create_and_fit_model(model_name, model_class, model_params, 
                                    X_train, X_test, y_train, y_test,
                                    settings, 
                                    need_save=True):
        """Загрузить ранее обученную модель из кеша.
        Если в кеше нет - обучить на переданных данных с заданными параметрами.
        """
        return ModelWrapBase._load_or_create_and_fit_model(ModelWrapRegression, 
                                                       model_name, model_class, model_params, 
                                                       X_train, X_test, y_train, y_test,
                                                       settings, 
                                                       need_save)


## Конфигурирование среды и окружения

In [12]:
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 50) # Устанавливаем максимальное количество отображаемых столбцов равным 50
#pd.set_option('display.max_rows', 50) # Устанавливаем максимальное количество отображаемых строк равным 20
pd.options.display.float_format = '{:.5f}'.format # Устанавливаем формат отображения чисел с двумя знаками после запятой
pd.options.mode.use_inf_as_na = True # Настройка режима Pandas для рассмотрения бесконечностей (inf) как пропущенных значений (NA)

# Конфигурация формата отображения графиков в виде векторных изображений
%config InlineBackend.figure_format = 'svg'

# для построения графиков внутри Jupyter Notebook
%matplotlib inline

In [13]:
settings_filename = "settings"

In [14]:
if not Path(settings_filename).exists():
    with open(settings_filename, "w") as f:
        f.write("""# Каталог с датасетом
DATASET_SUBFOLDER=dataset
# Каталог для результатов и промежуточных файлов
RESULT_SUBFOLDER=result
# Каталог для кеша промежуточных результатов
CACHE_SUBFOLDER=cached_results
# Каталог для boxplot
BOXPLOT_SUBFOLDER=boxplot

RANDOM_STATE=42

DATASET_FILENAME_TEMPLATE=dataset_df_%s.joblib
PARAMS_FILENAME_TEMPLATE=params_%s.joblib

X_Train_FILENAME_TEMPLATE=X_Train_%s.joblib
y_Train_FILENAME_TEMPLATE=y_Train_%s.joblib
X_Test_FILENAME_TEMPLATE=X_Test_%s.joblib
y_Test_FILENAME_TEMPLATE=y_Test_%s.joblib


# Шаблоны для имен
GRID_SEARCH_TEMPLATE_FILENAME=03_GridSearch_%s.joblib
MODEL_CLASS_TEMPLATE_FILENAME=04_model_%s.joblib""")
        

In [15]:
# загрузить параметры
settings_dict = {
    **dotenv_values(settings_filename)
}

settings = Settings(settings_dict)
settings.enviroment["RANDOM_STATE"] = int(settings.enviroment["RANDOM_STATE"])
n_jobs = -1
verbose = 2
load_from_kaggle = False

In [16]:
settings.enviroment

{'DATASET_SUBFOLDER': 'dataset',
 'RESULT_SUBFOLDER': 'result',
 'CACHE_SUBFOLDER': 'cached_results',
 'BOXPLOT_SUBFOLDER': 'boxplot',
 'RANDOM_STATE': 42,
 'DATASET_FILENAME_TEMPLATE': 'dataset_df_%s.joblib',
 'PARAMS_FILENAME_TEMPLATE': 'params_%s.joblib',
 'X_Train_FILENAME_TEMPLATE': 'X_Train_%s.joblib',
 'y_Train_FILENAME_TEMPLATE': 'y_Train_%s.joblib',
 'X_Test_FILENAME_TEMPLATE': 'X_Test_%s.joblib',
 'y_Test_FILENAME_TEMPLATE': 'y_Test_%s.joblib',
 'GRID_SEARCH_TEMPLATE_FILENAME': '03_GridSearch_%s.joblib',
 'MODEL_CLASS_TEMPLATE_FILENAME': '04_model_%s.joblib'}

## Модель H2O AutoML

In [17]:
#!pip install h2o

In [18]:
import h2o
from h2o.automl import H2OAutoML # загрузка модели

In [19]:
h2o_model_name = "h2o"
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 hour 59 mins
H2O_cluster_timezone:,Europe/Samara
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,16 days
H2O_cluster_name:,H2O_from_python_igel_yhbs9q
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,15.60 Gb
H2O_cluster_total_cores:,20
H2O_cluster_allowed_cores:,20


In [21]:
# загрузить датасет если его нет
ts_csv_filename = Path(settings.enviroment["DATASET_SUBFOLDER"], "", 'Electric_Production.csv')
if not Path(ts_csv_filename).exists():
    if not Path(settings.enviroment["DATASET_SUBFOLDER"]).exists():
        Path.mkdir(Path(settings.enviroment["DATASET_SUBFOLDER"]))
    od.download_url("https://raw.githubusercontent.com/ejgao/Time-Series-Datasets/refs/heads/master/Electric_Production.csv", 
                                Path(settings.enviroment["DATASET_SUBFOLDER"]))  

In [25]:
# Загрузим датасет и для ДЗ отберем 5000 строк из датасета
original_dataset_df = pd.read_csv(ts_csv_filename)
dataset_df = original_dataset_df.copy()
dataset_df

Unnamed: 0,DATE,IPG2211A2N
0,1/1/1985,72.50520
1,2/1/1985,70.67200
2,3/1/1985,62.45020
3,4/1/1985,57.47140
4,5/1/1985,55.31510
...,...,...
392,9/1/2017,98.61540
393,10/1/2017,93.61370
394,11/1/2017,97.33590
395,12/1/2017,114.72120


In [26]:
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   DATE        397 non-null    object 
 1   IPG2211A2N  397 non-null    float64
dtypes: float64(1), object(1)
memory usage: 6.3+ KB


# 2. Преобразование даты в индекс

In [27]:
# Дату в индекс
dataset_df['Date'] = pd.to_datetime(dataset_df['DATE']) # Строки преобразуем в даты
dataset_df.set_index('Date', inplace=True)
dataset_df.drop('DATE', inplace=True, axis=1)
dataset_df.columns = ['EP']

In [29]:
# выделим два последних года в тестовую выборку
len_test = 24
train = dataset_df.head(len(dataset_df)-len_test)
test =  dataset_df.tail(len_test)

In [30]:
# Инициализация H2O
h2o.init()

# Конвертация в H2O Frame
train_h2o = h2o.H2OFrame(train)
test_h2o = h2o.H2OFrame(test)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,2 hours 13 mins
H2O_cluster_timezone:,Europe/Samara
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,16 days
H2O_cluster_name:,H2O_from_python_igel_yhbs9q
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,15.60 Gb
H2O_cluster_total_cores:,20
H2O_cluster_allowed_cores:,20


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [33]:
train_h2o

EP
72.5052
70.672
62.4502
57.4714
55.3151
58.0904
62.6202
63.2485
60.5846
56.3154


In [32]:
# Настройка AutoML
aml = H2OAutoML(
    max_runtime_secs=3600,  # Максимальное время обучения
    exclude_algos=["DeepLearning"],  # Исключить алгоритмы при необходимости
    seed=42,
    nfolds=0  # Для временных рядов кросс-валидация не рекомендуется!
)


In [None]:
# Запуск обучения
aml.train(x=x, y=y, training_frame=train_h2o)

In [31]:
x=train.columns

In [28]:
dataset_df

Unnamed: 0_level_0,EP
Date,Unnamed: 1_level_1
1985-01-01,72.50520
1985-02-01,70.67200
1985-03-01,62.45020
1985-04-01,57.47140
1985-05-01,55.31510
...,...
2017-09-01,98.61540
2017-10-01,93.61370
2017-11-01,97.33590
2017-12-01,114.72120


In [24]:
df = h2o.import_file(nf, header=1)

H2OServerError: HTTP 500 Server Error:
Server error water.util.DistributedException:
  Error: DistributedException from /127.0.0.1:54321: 'This H2O node couldn't read data from 'nfs://home/igel/Projects/ml/ml-inno-hw/3. Machine Learning/3.17. AutoML 0;3>@8B<K 4;O @01>BK A 40==K<8/dataset/Electric_Production.csv'. Please make sure the file is available on all H2O nodes and/or check the working directories.'
  Request: None
  Stacktrace: DistributedException from /127.0.0.1:54321: 'This H2O node couldn't read data from 'nfs://home/igel/Projects/ml/ml-inno-hw/3. Machine Learning/3.17. AutoML 0;3>@8B<K 4;O @01>BK A 40==K<8/dataset/Electric_Production.csv'. Please make sure the file is available on all H2O nodes and/or check the working directories.', caused by java.lang.RuntimeException: This H2O node couldn't read data from 'nfs://home/igel/Projects/ml/ml-inno-hw/3. Machine Learning/3.17. AutoML 0;3>@8B<K 4;O @01>BK A 40==K<8/dataset/Electric_Production.csv'. Please make sure the file is available on all H2O nodes and/or check the working directories.
      water.MRTask.getResult(MRTask.java:660)
      water.MRTask.getResult(MRTask.java:670)
      water.MRTask.doAll(MRTask.java:555)
      water.parser.ParseSetup.guessSetup(ParseSetup.java:408)
      water.api.ParseSetupHandler.guessSetup(ParseSetupHandler.java:44)
      java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103)
      java.base/java.lang.reflect.Method.invoke(Method.java:580)
      water.api.Handler.handle(Handler.java:60)
      water.api.RequestServer.serve(RequestServer.java:472)


In [None]:
df

In [None]:
h2o_ml = H2OAutoML(max_models = 10, seed = 1)
h2o_ml.train(x = params["columns_X"], y = params["target_column"], training_frame = dataset_df)

In [35]:
import pandas as pd

data = {
    'date': pd.date_range(start='2020-01-01', periods=100, freq='D'),
    'value': [i + 0.1 * i**2 for i in range(100)]
}
df = pd.DataFrame(data)
df.sort_values('date', inplace=True)  # Обязательно отсортируйте по времени!
df

Unnamed: 0,date,value
0,2020-01-01,0.00000
1,2020-01-02,1.10000
2,2020-01-03,2.40000
3,2020-01-04,3.90000
4,2020-01-05,5.60000
...,...,...
95,2020-04-05,997.50000
96,2020-04-06,1017.60000
97,2020-04-07,1037.90000
98,2020-04-08,1058.40000


In [36]:
# Лаги (значения за предыдущие периоды)
df['lag_1'] = df['value'].shift(1)
df['lag_7'] = df['value'].shift(7)

# Скользящее среднее
df['rolling_mean_7'] = df['value'].rolling(window=7).mean()

# Временные признаки
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# Удалите строки с NaN (после создания лагов)
df.dropna(inplace=True)
df

Unnamed: 0,date,value,lag_1,lag_7,rolling_mean_7,year,month,day
7,2020-01-08,11.90000,9.60000,0.00000,6.00000,2020,1,8
8,2020-01-09,14.40000,11.90000,1.10000,7.90000,2020,1,9
9,2020-01-10,17.10000,14.40000,2.40000,10.00000,2020,1,10
10,2020-01-11,20.00000,17.10000,3.90000,12.30000,2020,1,11
11,2020-01-12,23.10000,20.00000,5.60000,14.80000,2020,1,12
...,...,...,...,...,...,...,...,...
95,2020-04-05,997.50000,977.60000,862.40000,938.80000,2020,4,5
96,2020-04-06,1017.60000,997.50000,881.10000,958.30000,2020,4,6
97,2020-04-07,1037.90000,1017.60000,900.00000,978.00000,2020,4,7
98,2020-04-08,1058.40000,1037.90000,919.10000,997.90000,2020,4,8


In [37]:
train = df[df['date'] < '2020-03-01']
test = df[df['date'] >= '2020-03-01']

In [None]:
import h2o
from h2o.automl import H2OAutoML

# Инициализация H2O
h2o.init()

# Конвертация в H2O Frame
train_h2o = h2o.H2OFrame(train)
test_h2o = h2o.H2OFrame(test)

# Определение признаков и целевой переменной
x = ['lag_1', 'lag_7', 'rolling_mean_7', 'year', 'month', 'day']
y = 'value'

# Настройка AutoML
aml = H2OAutoML(
    max_runtime_secs=3600,  # Максимальное время обучения
    exclude_algos=["DeepLearning"],  # Исключить алгоритмы при необходимости
    seed=42,
    nfolds=0  # Для временных рядов кросс-валидация не рекомендуется!
)

# Запуск обучения
aml.train(x=x, y=y, training_frame=train_h2o)

# Просмотр результатов
lb = aml.leaderboard
print(lb)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,2 hours 16 mins
H2O_cluster_timezone:,Europe/Samara
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,16 days
H2O_cluster_name:,H2O_from_python_igel_yhbs9q
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,15.60 Gb
H2O_cluster_total_cores:,20
H2O_cluster_allowed_cores:,20


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
03:11:09.481: _train param, Dropping bad and constant columns: [year]


03:11:09.962: _train param, Dropping bad and constant columns: [year]
03:11:10.31: _train param, Dropping bad and constant columns: [year]
03:11:10.31: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 46.0.
03:11:10.32: _train param, Dropping bad and constant columns: [year]


03:11:10.147: _train param, Dropping bad and constant columns: [year]
03:11:10.296: _train param, Dropping bad and constant columns: [year]


03:11:10.425: _train param, Dropping bad and constant columns: [year]
03:11:10.543: _train param, Dropping bad and constant columns: [year]

█
03:11:10.640: _train param, Dropping bad and constant columns: [year]


0

In [None]:
# Прогноз на тестовых данных
preds = aml.leader.predict(test_h2o)
print(preds)

In [None]:
from sklearn.metrics import mean_absolute_error

# Конвертация предсказаний обратно в pandas
test_pred = preds.as_data_frame()['predict']
true_values = test['value'].values

mae = mean_absolute_error(true_values, test_pred)
print(f"MAE: {mae}")

In [None]:
data = InputData.from_dataframe(dataset_df_wo_outliers_minmax[params["columns_X"]],
                                dataset_df_wo_outliers_minmax[params["target_column"]],
                                task='classification')
fedot_train, fedot_test = train_test_data_setup(data)

In [None]:
fedot_model = Fedot(problem='classification', metric=['accuracy', 'roc_auc', 'precision', 'f1'], timeout=5, seed=42)

In [None]:
s = BaseLib.st()
best_pipeline = fedot_model.fit(features=fedot_train, target='target')
fedot_durarion_fit = BaseLib.ft(s)

In [None]:
print(f'Модель: {best_pipeline.primary_nodes}')
print(f'Параметры модели: {pformat(best_pipeline.primary_nodes[0].parameters)}')

In [None]:
best_pipeline.show()

### Подсчет метрик

In [None]:
# Импорт метрик для оценки качества моделей классификации
from sklearn.metrics import (
    precision_score, 
    recall_score, 
    roc_auc_score, 
    roc_curve, 
    f1_score,  #f1-мера
    accuracy_score,  # Метрика точности для классификации
    classification_report,  # Отчет о классификации
    confusion_matrix
)

"""Посчитать метрики модели"""
f_y_train_pred = fedot_model.predict(features=fedot_train)
f_y_train_prob = fedot_model.predict_proba(fedot_train) #[:, 1]
f_y_test_pred = fedot_model.predict(features=fedot_test)
f_y_test_prob = fedot_model.predict_proba(fedot_test) #[:, 1]

# Расчет AUC-ROC
f_train_roc_auc = roc_auc_score(fedot_train.target, f_y_train_prob)
f_test_roc_auc = roc_auc_score(fedot_test.target, f_y_test_prob)
                             
# Поиск порога, максимизирующего F1-score
f_thresholds = np.arange(0.0, 1.0, 0.01)
f_f1_scores = [f1_score(fedot_test.target, f_y_test_prob >= t) for t in f_thresholds]
f_optimal_threshold = f_thresholds[np.argmax(f_f1_scores)]

# Пересчет метрик с учетом оптимального порога
f_y_train_pred_optimal = (f_y_train_prob >= f_optimal_threshold).astype(int)
f_y_test_pred_optimal = (f_y_test_prob >= f_optimal_threshold).astype(int)

f_train_precision = precision_score(fedot_train.target, f_y_train_pred_optimal)
f_test_precision = precision_score(fedot_test.target, f_y_test_pred_optimal)

f_train_recall = recall_score(fedot_train.target, f_y_train_pred_optimal)
f_test_recall = recall_score(fedot_test.target, f_y_test_pred_optimal)

f_train_accuracy = accuracy_score(fedot_train.target, f_y_train_pred_optimal)
f_test_accuracy = accuracy_score(fedot_test.target, f_y_test_pred_optimal)

f_train_f1_score = f1_score(fedot_train.target, f_y_train_pred_optimal)
f_test_f1_score = f1_score(fedot_test.target, f_y_test_pred_optimal)

### Визуализация метрик

In [None]:
import plotly.express as plotly_px
import plotly.graph_objects as plotly_go
import plotly.subplots as plotly_subplt

f_fig = plotly_subplt.make_subplots(rows=2, cols=2, 
                                subplot_titles=['ROC AUC', 'Metrics', 'Confusion Matrix Train', 'Confusion Matrix Test'],
                                vertical_spacing = 0.1,
                                row_width=[0.4, 0.6])
f_fig.update_layout(
    title_x=0.5,
    title_text="FEDOT",
    width = 1000,
    height = 800,
    legend = dict(yanchor="bottom", y=0.63, xanchor="right", x=0.44),
    margin = {'t':80, 'b':50, 'l':10, 'r':10}
    
)

# Построение ROC кривой
fpr_test, tpr_test, f_thresholds = roc_curve(fedot_test.target, fedot_model.predict_proba(features=fedot_test))
fpr_train, tpr_train, f_thresholds = roc_curve(fedot_train.target, fedot_model.predict_proba(features=fedot_train))
roc_train_g = plotly_go.Scatter(x=fpr_train, y=tpr_train, name="ROC curve Train", line={'color':'green'})
roc_test_g = plotly_go.Scatter(x=fpr_test, y=tpr_test, name="ROC curve Test", line={'color':'blue'})
roc_diag_g = plotly_go.Scatter(x=[0, 1], y=[0, 1], line={'color':'gray', 'dash': 'dash'}, showlegend=False)

f_fig.add_trace(roc_train_g, row=1, col=1)
f_fig.add_trace(roc_test_g, row=1, col=1)
f_fig.add_trace(roc_diag_g, row=1, col=1)

f_fig.update_layout(
    xaxis1 = {'title_text': "False Positive Rate"},
    yaxis1 = {'title_text': "True Positive Rate"}
)    

# Bar с метриками
df_metrics = pd.DataFrame([[f_test_accuracy,  f_train_accuracy],
                            [f_test_precision, f_train_precision],
                            [f_test_recall,    f_train_recall],
                            [f_test_roc_auc,   f_train_roc_auc],
                            [f_test_f1_score,  f_train_f1_score]], 
                            columns = ["Test", "Train"], 
                            index=["accuracy", "precision", "recall", "ROC AUC", "F1"])
metrics_train = plotly_go.Bar(x=df_metrics.index, y=df_metrics.Train, 
                showlegend=True, text=round(df_metrics.Train,4), textangle=0, 
                xaxis='x2', yaxis='y2', name="Train Metrics")
metrics_test = plotly_go.Bar(x=df_metrics.index, y=df_metrics.Test, 
                showlegend=True, text=round(df_metrics.Test,4), textangle=0, 
                xaxis='x2', yaxis='y2', name="Test Metrics")

f_fig.add_trace(metrics_train, row=1, col=2) 
f_fig.add_trace(metrics_test, row=1, col=2) 

train_cm = confusion_matrix(fedot_train.target, f_y_train_pred_optimal, normalize='all')
heatmap_train = plotly_go.Heatmap(z=train_cm, 
                                    x=['0', '1'], y=['0', '1'], 
                                    colorscale='Blues', 
                                    text=np.round(train_cm, 3), 
                                    texttemplate="%{text}", 
                                    showscale=False)

test_cm = confusion_matrix(fedot_test.target, f_y_test_pred_optimal, normalize='all')
heatmap_test = plotly_go.Heatmap(z=test_cm, 
                                    x=['0', '1'], y=['0', '1'], 
                                    colorscale='Blues', 
                                    text=np.round(test_cm, 3), 
                                    texttemplate="%{text}", 
                                    showscale=False)


f_fig.add_trace(heatmap_train, row=2, col=1)
f_fig.add_trace(heatmap_test,  row=2, col=2) 

f_fig.update_layout(
    xaxis1 = {'title': 'Predict'},
    xaxis2 = {'title': 'Predict'},
    yaxis1 = {'title': 'Goals'},
    yaxis2 = {'title': 'Goals'},
    xaxis3 = {'title': 'Предсказания'},
    xaxis4 = {'title': 'Предсказания'},
    yaxis3 = {'title': 'Факт'},
    yaxis4 = {'title': 'Факт'},
                
)    
f_fig.show()


In [None]:
fedot_metrics_as_dict = {'params': ModelWrapClass.metrics_names(),
                'values': [
                    f_train_precision, f_test_precision,
                    f_train_recall,    f_test_recall,
                    f_train_roc_auc,   f_test_roc_auc,
                    f_train_accuracy,  f_test_accuracy,
                    f_train_f1_score,  f_test_f1_score
                ],
                'model_name': [fedot_model_name for i in range(len(ModelWrapClass.metrics_names()))]
            }      

# Задача 2. Прогнозирование временных рядов

## Загрузка датасета

In [None]:
# загрузить датасет если его нет
ts_csv_filename = Path(settings.enviroment["DATASET_SUBFOLDER"], "", 'shampoo-sales.csv')
if not Path(ts_csv_filename).exists():
    if not Path(settings.enviroment["DATASET_SUBFOLDER"]).exists():
        Path.mkdir(Path(settings.enviroment["DATASET_SUBFOLDER"]))
    od.download_url("https://raw.githubusercontent.com/sunilmallya/timeseries/refs/heads/master/data/shampoo-sales.csv", 
                                Path(settings.enviroment["DATASET_SUBFOLDER"]))  

In [None]:
# Загрузим датасет и для ДЗ отберем 5000 строк из датасета
original_dataset_df = pd.read_csv(ts_csv_filename, header=None, names=['date', 'sales'])
dataset_df = original_dataset_df.copy()
dataset_df

# ================================================================

# Сравнительная таблица метрик разных моделей

In [None]:
df_stat = pd.concat([pd.DataFrame(knn_model.metrics()),
                     pd.DataFrame(svc_model.metrics()),
                     pd.DataFrame(rfc_model.metrics()),
                     pd.DataFrame(logreg_model.metrics()),
                     pd.DataFrame(dtc_model.metrics()),
                     pd.DataFrame(fedot_metrics_as_dict)
                     ])
columns = ['model_name']
columns = columns + ModelClass.metrics_names()
df_stat2 = df_stat.pivot_table(columns = 'params',
                            index='model_name',
                            values='values').reset_index()[columns]
df_stat2

Наибольшая доля правильных предсказаний (accuracy) у моделей SupportVectorMachine и CatBoost(fedot) - 0.71 на тестовой выборке.
Лучше всего положительные классы предстказывает SupportVectorMachine - 0.67.
Наиболее сбалансированной получается модель Catboost, выбранная с помощью FEDOT - F1=0.75