# Линейная регрессия, полиномиальная регрессия, Lasso, Ridge и ElasticNet

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
from pprint import pprint, pformat
import copy
import zipfile
from pathlib import Path


import missingno as msno
import joblib

from my_lib import *
from my_config import *



In [None]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
from sklearn.preprocessing import QuantileTransformer, PowerTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error, f1_score

from sklearn.linear_model import LinearRegression # для построения моделей линейной регрессии
from sklearn.preprocessing import PolynomialFeatures # для преобразования исходных признаков в полиномиальные, для построения моделей полиномиальной регрессии

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [3]:
# Silence all warnings for the rest of the notebook (the first cell already does the same).
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 50) # Show at most 50 columns when a DataFrame is displayed
#pd.set_option('display.max_rows', 50) # (disabled) would cap the number of displayed rows at 50
pd.options.display.float_format = '{:.5f}'.format # Render floats with five digits after the decimal point
pd.options.mode.use_inf_as_na = True # Make pandas treat infinities (inf) as missing values (NA)
# NOTE(review): `mode.use_inf_as_na` is reported as deprecated in recent pandas releases (2.1+);
# confirm against the installed version and consider replacing inf with NaN explicitly instead.

# Render inline figures as vector (SVG) images
%config InlineBackend.figure_format = 'svg'

# Draw plots inside the Jupyter Notebook
%matplotlib inline

In [4]:
# Load the dataset and its accompanying parameters that were prepared
# in the polynomial-regression homework (both are joblib dumps).
dataset_df = joblib.load(Path(dataset_foler) / dataset_filename_after_PrepareTarget)
params = joblib.load(Path(dataset_foler) / params_filename_after_PrepareTarget)

In [5]:
# Separate the target column (its name is stored in params) from the features,
# then preview five random rows of each part.
dataset_df_Y = dataset_df[params["target_column"]]
dataset_df_X = dataset_df.drop(columns=[params["target_column"]])
display(dataset_df_X.sample(n=5))
display(dataset_df_Y.sample(n=5))

Unnamed: 0,Age,Annual Income,Number of Dependents,Education Level,Health Score,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Exercise Frequency,Policy Start Year,Policy Start Month,Policy Start Day,Policy Start Weekday,Gender_Male,Marital Status_Married,Marital Status_Single,Occupation_Self-Employed,Occupation_Unemployed,Location_Suburban,Location_Urban,Property Type_Condo,Property Type_House,Smoking Status_Yes
334088,25.0,13982.0,1.0,0.0,28.90846,0.0,0.0,1.0,744.0,4.0,1.0,1.0,2023,9,29,4,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
609030,26.0,16054.0,1.0,0.0,9.63876,1.0,0.0,4.0,431.0,4.0,0.0,2.0,2020,7,23,3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
669411,56.0,7073.0,2.0,1.0,30.92399,1.0,0.0,14.0,713.0,7.0,0.0,1.0,2023,7,31,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
979541,43.0,13982.0,3.0,0.0,14.91238,0.0,1.0,5.0,744.0,1.0,2.0,3.0,2022,1,19,2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1031808,25.0,15641.0,3.0,1.0,13.41492,2.0,0.0,14.0,788.0,3.0,0.0,2.0,2023,8,21,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0


294520    340.00000
448044    379.00000
155454   1035.00000
688078     20.00000
879949    813.00000
Name: Premium Amount, dtype: float64

In [6]:
# Normalization methods to compare (class names, instantiated by name later on)
scalers = [
    "StandardScaler",
    "QuantileTransformer",
    "MaxAbsScaler",
    "RobustScaler",
    "PowerTransformer",
]

In [7]:
# Empty results table: one row per fitted model is expected to be added later.
models_scores = pd.DataFrame(
    columns=[
        "r2_train",
        "r2_test",
        "mae_train",
        "mae_test",
        "best_params",
    ],
)

In [8]:
# Search interval for alpha: 10, 20, ..., 990 (integer values despite the name).
# Author's note: the suitable alpha depends on the sample size.
float_range = np.arange(1, 100) * 10
float_range

array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
       140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260,
       270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390,
       400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500, 510, 520,
       530, 540, 550, 560, 570, 580, 590, 600, 610, 620, 630, 640, 650,
       660, 670, 680, 690, 700, 710, 720, 730, 740, 750, 760, 770, 780,
       790, 800, 810, 820, 830, 840, 850, 860, 870, 880, 890, 900, 910,
       920, 930, 940, 950, 960, 970, 980, 990])

In [9]:
%%time
# The split is deliberately skewed toward the test set (70 %): with a larger
# training set fitting needs too much memory and runs for too long, and for
# some hyperparameter values memory runs out entirely. According to the
# author's experiments the training-set size had practically no effect on the
# final result.
test_size = 0.7
X_train, X_test, y_train, y_test = train_test_split(dataset_df_X,
                                                    dataset_df_Y,
                                                    test_size=test_size, random_state=42)
print(f'y_train.mean()={y_train.mean()}, y_test.mean()={y_test.mean()}')
# Persist the target vectors of both splits, uncompressed.
joblib.dump(y_train, Path(result_foler, y_train_template_filename_after_split % ""), compress=0)
joblib.dump(y_test,  Path(result_foler, y_test_template_filename_after_split  % ""), compress=0)


def get_class(x):
    """Return the object bound to the name ``x`` in the global namespace."""
    return globals()[x]


# Not used anywhere in this cell; kept in case other cells read it.
degrees = [2, 3]


def _tune_and_score(estimator_cls, param_grid, row_name,
                    X_tr, y_tr, X_te, y_te):
    """Grid-search ``estimator_cls`` and record a model built from the best params.

    Runs a 10-fold ``GridSearchCV`` (scoring ``'r2'``) over ``param_grid`` on the
    training data, builds a fresh ``estimator_cls(max_iter=1000, **best_params)``,
    hands it to ``fit_model`` together with both splits and the best parameters,
    and displays the last row of ``models_scores``.

    All intermediate objects are locals, so they become collectable as soon as
    the function returns (the original code reset them to ``None`` by hand).
    """
    search = GridSearchCV(estimator_cls(), param_grid, scoring='r2', cv=10)
    best_params = search.fit(X_tr, y_tr).best_params_

    # Fresh estimator configured with the regularization parameters found above.
    model = estimator_cls(max_iter=1000, **best_params)
    fit_model(models_scores, row_name,
              model,
              X_tr, y_tr, X_te, y_te,
              best_params)
    display(models_scores.tail(1))


# Regularized models to tune, in evaluation order: (row-name suffix, class, grid).
# NOTE(review): in the recorded run Lasso always selected alpha=10 (the lower
# edge of float_range) and Ridge mostly alpha=990 (the upper edge). Optima on a
# grid boundary suggest the grid does not bracket the best value; for Lasso
# some runs even ended with r2 == 0. Consider a wider, log-spaced grid.
regularized_models = [
    ('ElasticNet', ElasticNet, {
        'alpha': [0.00005, 0.0005, 0.001, 0.01, 0.05, 0.06, 0.08, 1, 2, 3],
        'l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    }),
    ('Ridge', Ridge, {'alpha': float_range}),   # L2 penalty
    ('Lasso', Lasso, {'alpha': float_range}),   # L1 penalty
]

for scaler_name in scalers:
    if scaler_name == 'NoScaler':
        # NotImplemented is a non-callable constant; the exception type is
        # NotImplementedError.
        raise NotImplementedError("Обучение моделей без нормализации не реализовано")

    # NOTE(review): the scaler is fitted on the whole training split before
    # cross-validation, so every CV fold sees scaling statistics computed from
    # its own validation part. Wrapping scaler + model in a Pipeline avoids it.
    scaler = get_class(scaler_name)()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Baseline: plain linear regression without regularization.
    fit_model(models_scores, f"{scaler_name}_LinearRegression",
              LinearRegression(),
              X_train_scaled, y_train, X_test_scaled, y_test)
    display(models_scores.tail(1))

    for model_name, estimator_cls, param_grid in regularized_models:
        _tune_and_score(estimator_cls, param_grid, f"{scaler_name}_{model_name}",
                        X_train_scaled, y_train, X_test_scaled, y_test)

    # Drop references to the scaled copies before the next scaler allocates new ones.
    X_test_scaled = None
    X_train_scaled = None
    scaler = None

y_train.mean()=996.4374802647778, y_test.mean()=997.7506847036096


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
StandardScaler_LinearRegression,0.0021,0.0019,567.607,567.6318,


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
StandardScaler_ElasticNet,0.002,0.0019,567.6468,567.6487,"{'alpha': 1, 'l1_ratio': 0.9}"


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
StandardScaler_Ridge,0.0021,0.0019,567.6074,567.6322,{'alpha': 990}


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
StandardScaler_Lasso,0.0016,0.0017,567.7751,567.7539,{'alpha': 10}


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
QuantileTransformer_LinearRegression,0.0015,0.0013,567.7896,567.8223,


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
QuantileTransformer_ElasticNet,0.0015,0.0013,567.8001,567.8253,"{'alpha': 0.08, 'l1_ratio': 0.9}"


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
QuantileTransformer_Ridge,0.0015,0.0013,567.7919,567.8235,{'alpha': 990}


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
QuantileTransformer_Lasso,0.0,-0.0,568.2512,568.2644,{'alpha': 10}


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
MaxAbsScaler_LinearRegression,0.0021,0.0019,567.607,567.6318,


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
MaxAbsScaler_ElasticNet,0.002,0.0018,567.6231,567.6556,"{'alpha': 5e-05, 'l1_ratio': 0.9}"


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
MaxAbsScaler_Ridge,0.002,0.0018,567.626,567.6591,{'alpha': 10}


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
MaxAbsScaler_Lasso,0.0,-0.0,568.2512,568.2644,{'alpha': 10}


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
RobustScaler_LinearRegression,0.0021,0.0019,567.607,567.6318,


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
RobustScaler_ElasticNet,0.0021,0.0019,567.6164,567.6336,"{'alpha': 0.08, 'l1_ratio': 0.6}"


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
RobustScaler_Ridge,0.0021,0.0019,567.6075,567.6318,{'alpha': 990}


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
RobustScaler_Lasso,0.0016,0.0016,567.7849,567.7648,{'alpha': 10}


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
PowerTransformer_LinearRegression,0.0015,0.0013,567.7849,567.8165,


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
PowerTransformer_ElasticNet,0.0015,0.0013,567.8172,567.8283,"{'alpha': 1, 'l1_ratio': 0.9}"


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
PowerTransformer_Ridge,0.0015,0.0013,567.7852,567.8167,{'alpha': 990}


Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
PowerTransformer_Lasso,0.0011,0.0011,567.9454,567.935,{'alpha': 10}


CPU times: user 10h 40min 20s, sys: 2h 9min 3s, total: 12h 49min 24s
Wall time: 59min 58s


In [10]:
display(models_scores)

Unnamed: 0,r2_train,r2_test,mae_train,mae_test,best_params
StandardScaler_LinearRegression,0.0021,0.0019,567.607,567.6318,
StandardScaler_ElasticNet,0.002,0.0019,567.6468,567.6487,"{'alpha': 1, 'l1_ratio': 0.9}"
StandardScaler_Ridge,0.0021,0.0019,567.6074,567.6322,{'alpha': 990}
StandardScaler_Lasso,0.0016,0.0017,567.7751,567.7539,{'alpha': 10}
QuantileTransformer_LinearRegression,0.0015,0.0013,567.7896,567.8223,
QuantileTransformer_ElasticNet,0.0015,0.0013,567.8001,567.8253,"{'alpha': 0.08, 'l1_ratio': 0.9}"
QuantileTransformer_Ridge,0.0015,0.0013,567.7919,567.8235,{'alpha': 990}
QuantileTransformer_Lasso,0.0,-0.0,568.2512,568.2644,{'alpha': 10}
MaxAbsScaler_LinearRegression,0.0021,0.0019,567.607,567.6318,
MaxAbsScaler_ElasticNet,0.002,0.0018,567.6231,567.6556,"{'alpha': 5e-05, 'l1_ratio': 0.9}"


Использовался набор данных, подготовленный в рамках ДЗ по полиномиальной регрессии.

Были построены модели:
  * линейная регрессия
  * ElasticNet с подбором гиперпараметров
  * Ridge с подбором гиперпараметров
  * Lasso с подбором гиперпараметров

для нескольких наборов данных, отличающихся примененным алгоритмом нормализации данных - 'StandardScaler', 'MaxAbsScaler', 'RobustScaler', 'QuantileTransformer', 'PowerTransformer'.

Результат во всех случаях был примерно одинаковым. В частности, метрика r2_score колебалась в диапазоне ~ (0.000, 0.002) (см. таблицу выше), что крайне мало и показывает, что построенные модели непригодны для прогнозирования целевой переменной.
Следовательно:
* либо методы регрессии не подходят для решения этой задачи
* либо где-то ошибка в коде
* либо в принципе неверный подход к подготовке данных и/или обучению моделей
* либо неверно интерпретирую метрики

Предполагаю, что дело в какой-то из двух последних причин. Нужна помощь/обратная связь.
