# Линейная регрессия, полиноминальная, Lasso, Ridge и ElasticNet 

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
from pprint import pprint, pformat
import copy
import zipfile
from pathlib import Path


import missingno as msno
import joblib

from my_lib import *
from my_config import *

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
from sklearn.preprocessing import QuantileTransformer, PowerTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error, f1_score

In [2]:
from sklearn.linear_model import LinearRegression # для построения моделей линейной регрессии
from sklearn.preprocessing import PolynomialFeatures # для преобразования исходных признаков в полиномиальные, для построения моделей полиномиальной регрессии

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [None]:
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 50) # Устанавливаем максимальное количество отображаемых столбцов равным 50
#pd.set_option('display.max_rows', 50) # Устанавливаем максимальное количество отображаемых строк равным 20
pd.options.display.float_format = '{:.5f}'.format # Устанавливаем формат отображения чисел с двумя знаками после запятой
pd.options.mode.use_inf_as_na = True # Настройка режима Pandas для рассмотрения бесконечностей (inf) как пропущенных значений (NA)

# Конфигурация формата отображения графиков в виде векторных изображений
%config InlineBackend.figure_format = 'svg'

# для построения графиков внутри Jupyter Notebook
%matplotlib inline

In [4]:
dataset_df = joblib.load(Path(result_foler, dataset_filename_after_PrepareTarget))
params = joblib.load(Path(result_foler, params_filename_after_PrepareTarget))

In [5]:
dataset_df_X  = dataset_df.drop([params["target_column"]], axis=1)
dataset_df_Y  = dataset_df[params["target_column"]]
display(dataset_df_X.sample(5))
display(dataset_df_Y.sample(5))

Unnamed: 0,Age,Annual Income,Number of Dependents,Education Level,Health Score,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Exercise Frequency,Policy Start Year,Policy Start Month,Policy Start Day,Policy Start Weekday,Gender_Male,Marital Status_Married,Marital Status_Single,Occupation_Self-Employed,Occupation_Unemployed,Location_Suburban,Location_Urban,Property Type_Condo,Property Type_House,Smoking Status_Yes
33245,64.0,24897.0,4.0,1.0,20.9915,1.0,2.0,19.0,431.0,8.0,1.0,0.0,2023,5,30,1,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
758274,56.0,7073.0,1.0,3.0,41.84827,2.0,0.0,18.0,434.0,4.0,1.0,2.0,2021,8,13,4,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1112843,25.0,9071.0,3.0,0.0,28.90846,1.0,0.0,7.0,725.0,8.0,0.0,1.0,2021,4,16,4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
929634,30.0,16054.0,3.0,0.0,14.7158,1.0,1.0,3.0,431.0,1.0,0.0,3.0,2021,6,26,5,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
603716,34.0,7073.0,3.0,3.0,14.99999,2.0,0.0,12.0,734.0,4.0,0.0,0.0,2023,10,3,1,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


657797    1453.00000
1013524    739.00000
685199    2191.00000
918841     991.00000
434500     916.00000
Name: Premium Amount, dtype: float64

In [6]:
# методы нормализации
scalers = ['StandardScaler',
           'QuantileTransformer',
           'MaxAbsScaler',
           'RobustScaler',
           'PowerTransformer']

In [7]:
models_scores = pd.DataFrame(columns=['r2_train', 'r2_test', 'mae_train', 'mae_test'])

In [8]:
# задаем интервал перебора для alpha
# альфа зависит от размера выборки
float_range = np.arange(10, 1000, 10)
float_range

array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
       140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260,
       270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390,
       400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500, 510, 520,
       530, 540, 550, 560, 570, 580, 590, 600, 610, 620, 630, 640, 650,
       660, 670, 680, 690, 700, 710, 720, 730, 740, 750, 760, 770, 780,
       790, 800, 810, 820, 830, 840, 850, 860, 870, 880, 890, 900, 910,
       920, 930, 940, 950, 960, 970, 980, 990])

In [9]:
%%time
# такой перекос в сторону тестовой выборки обусловлен тем, 
# что при больших объемах тренировочной выборки при обучении слишком много памяти требуется - и долго работает, 
# а при некоторых параметрах обучения вообще не хватает памяти
# но на конечный результат объем тренировочной выборки практически не оказывал влияния.
test_size = 0.7 
X_train, X_test, y_train, y_test = train_test_split(dataset_df_X, 
                                                    dataset_df_Y,
                                                    test_size=test_size, random_state=42)
print(f'y_train.mean()={y_train.mean()}, y_test.mean()={y_test.mean()}')    
joblib.dump(y_train, Path(result_foler, y_train_template_filename_after_split % ""), compress=0)
joblib.dump(y_test,  Path(result_foler, y_test_template_filename_after_split  % ""), compress=0)    
get_class = lambda x: globals()[x]

degrees = [2, 3]

for scaler_name in scalers:
    if scaler_name != 'NoScaler':
        scaler = get_class(scaler_name)()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model_LR = LinearRegression()
        fit_model(models_scores, f"{scaler_name}_LinearRegression",
                  model_LR, 
                  X_train_scaled, y_train, X_test_scaled, y_test)
        display(models_scores.tail(1))
        model_LR = None
        
        for degree in degrees:
            poly = PolynomialFeatures(degree=degree)
            X_train_poly = poly.fit_transform(X_train_scaled)
            X_test_poly = poly.transform(X_test_scaled)
            
            # Повторная нормализация, т.к. у исходных признаков очень разный масштаб
            scaler_poly = get_class(scaler_name)()
            X_train_poly_scaled = scaler_poly.fit_transform(X_train_poly)
            X_test_poly_scaled = scaler_poly.transform(X_test_poly)        

            model_poly_LR = LinearRegression()
            fit_model(models_scores, f"{scaler_name}_PolynomialFeatures(degree={degree})",
                      model_poly_LR, 
                      X_train_poly, y_train, X_test_poly, y_test)
            display(models_scores.tail(1))
            
            model_poly_LR = None
            
            X_test_poly_scaled = None
            X_train_poly_scaled = None
            
            X_test_poly_scaled = None
            X_train_poly_scaled = None
            scaler_poly = None
            X_test_poly = None
            X_train_poly = None
            poly = None

        # -------------------- ElasticNet() --------------------
        param_grid = {
            'alpha': [0.00005, 0.0005, 0.001, 0.01, 0.05, 0.06, 0.08, 1, 2, 3],
            'l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        }
        elastic_net = GridSearchCV(ElasticNet(), param_grid, scoring='r2', cv=10)        
        res_elastic_net_model = elastic_net.fit(X_train_scaled, y_train)

        model_reg_elastic = ElasticNet(max_iter=1000, **res_elastic_net_model.best_params_) # alpha — величина регуляризации

        fit_model(models_scores, f"{scaler_name}_ElasticNet",
                    model_reg_elastic, 
                    X_train_scaled, y_train, X_test_scaled, y_test)
        display(models_scores.tail(1))

        model_reg_elastic = None
        res_elastic_net_model = None
        elastic_net = None
        
        # -------------------- Ridge() --------------------
        param_grid = {
            'alpha': float_range
        }

        ridge = GridSearchCV(Ridge(), param_grid, scoring='r2', cv=10)        
        res_ridge_model = ridge.fit(X_train_scaled, y_train)

        # построим регрессию гребневую L2 с оптимальным параметром регуляризации, который мы подобрали перебором
        model_ridge = Ridge(max_iter=1000, **res_ridge_model.best_params_) # alpha — величина регуляризации

        fit_model(models_scores, f"{scaler_name}_Ridge",
                  model_ridge, 
                  X_train_scaled, y_train, X_test_scaled, y_test)
        display(models_scores.tail(1))

        model_ridge = None
        res_ridge_model = None
        ridge = None
        
        # -------------------- Lasso() --------------------
        param_grid = {
            'alpha': float_range
        }

        lasso = GridSearchCV(Lasso(), param_grid, scoring='r2', cv=10)        
        res_lasso_model = lasso.fit(X_train_scaled, y_train)

        # построим регрессию гребневую L2 с оптимальным параметром регуляризации, который мы подобрали перебором
        model_lasso = Lasso(max_iter=1000, **res_lasso_model.best_params_) # alpha — величина регуляризации

        fit_model(models_scores, f"{scaler_name}_Lasso",
                  model_lasso, 
                  X_train_scaled, y_train, X_test_scaled, y_test)
        display(models_scores.tail(1))

        model_lasso = None
        res_lasso_model = None
        lasso = None

        X_test_scaled = None
        X_train_scaled = None
        scaler = None
        
    else:
        raise NotImplemented("Обучение моделей без нормализации не реализовано")

y_train.mean()=996.4374802647778, y_test.mean()=997.7506847036096


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
StandardScaler_LinearRegression,0.0021,0.0019,567.607,567.6318


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
StandardScaler_PolynomialFeatures(degree=2),0.0034,0.0013,567.1308,567.7258


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
StandardScaler_PolynomialFeatures(degree=3),0.0116,-0.006,564.4583,569.3935


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
StandardScaler_ElasticNet,0.002,0.0019,567.6468,567.6487


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
StandardScaler_Ridge,0.0021,0.0019,567.6074,567.6322


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
StandardScaler_Lasso,0.0016,0.0017,567.7751,567.7539


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
QuantileTransformer_LinearRegression,0.0015,0.0013,567.7933,567.8265


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
QuantileTransformer_PolynomialFeatures(degree=2),0.0036,0.0014,567.0828,567.687


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
QuantileTransformer_PolynomialFeatures(degree=3),0.0116,-0.0063,564.455,569.4897


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
QuantileTransformer_ElasticNet,0.0015,0.0013,567.8037,567.8294


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
QuantileTransformer_Ridge,0.0015,0.0013,567.7956,567.8277


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
QuantileTransformer_Lasso,0.0,-0.0,568.2512,568.2644


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
MaxAbsScaler_LinearRegression,0.0021,0.0019,567.607,567.6318


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
MaxAbsScaler_PolynomialFeatures(degree=2),0.0034,0.0013,567.1328,567.7264


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
MaxAbsScaler_PolynomialFeatures(degree=3),0.0116,-0.006,564.5124,569.4498


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
MaxAbsScaler_ElasticNet,0.002,0.0018,567.6231,567.6556


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
MaxAbsScaler_Ridge,0.002,0.0018,567.626,567.6591


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
MaxAbsScaler_Lasso,0.0,-0.0,568.2512,568.2644


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
RobustScaler_LinearRegression,0.0021,0.0019,567.607,567.6318


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
RobustScaler_PolynomialFeatures(degree=2),0.0034,0.0013,567.5768,568.1599


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
RobustScaler_PolynomialFeatures(degree=3),0.0116,-0.006,564.4562,569.3997


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
RobustScaler_ElasticNet,0.0021,0.0019,567.6164,567.6336


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
RobustScaler_Ridge,0.0021,0.0019,567.6075,567.6318


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
RobustScaler_Lasso,0.0016,0.0016,567.7849,567.7648


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
PowerTransformer_LinearRegression,0.0015,0.0013,567.7849,567.8165


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
PowerTransformer_PolynomialFeatures(degree=2),0.0036,0.0016,567.084,567.652


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
PowerTransformer_PolynomialFeatures(degree=3),0.0117,-0.0059,564.4214,569.3658


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
PowerTransformer_ElasticNet,0.0015,0.0013,567.8172,567.8283


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
PowerTransformer_Ridge,0.0015,0.0013,567.7852,567.8167


Unnamed: 0,r2_train,r2_test,mae_train,mae_test
PowerTransformer_Lasso,0.0011,0.0011,567.9454,567.935


CPU times: user 9h 41min 21s, sys: 2h 33min 36s, total: 12h 14min 58s
Wall time: 3h 10min 11s


In [15]:
display(models_scores)

Unnamed: 0,r2_train,r2_test,mae_train,mae_test
StandardScaler_LinearRegression,0.0021,0.0019,567.607,567.6318
StandardScaler_PolynomialFeatures(degree=2),0.0034,0.0013,567.1308,567.7258
StandardScaler_PolynomialFeatures(degree=3),0.0116,-0.006,564.4583,569.3935
StandardScaler_ElasticNet,0.002,0.0019,567.6468,567.6487
StandardScaler_Ridge,0.0021,0.0019,567.6074,567.6322
StandardScaler_Lasso,0.0016,0.0017,567.7751,567.7539
QuantileTransformer_LinearRegression,0.0015,0.0013,567.7933,567.8265
QuantileTransformer_PolynomialFeatures(degree=2),0.0036,0.0014,567.0828,567.687
QuantileTransformer_PolynomialFeatures(degree=3),0.0116,-0.0063,564.455,569.4897
QuantileTransformer_ElasticNet,0.0015,0.0013,567.8037,567.8294


В результате были построены модели:
  * линейная регрессия
  * полиноминальная регрессия с degree=2, 3
  * ElasticNet с подбором гиперпараметров
  * Ridge с подбором гиперпараметров
  * Lasso с подбором гиперпараметров

для нескольких наборов данных, отличающихся примененным алгоритмом нормализации данных - 'StandardScaler', 'MinMaxScaler', 'MaxAbsScaler', 'RobustScaler', 'QuantileTransformer', 'PowerTransformer'.

Также проверялись разные подходы к подготовке набора данных:
  * с удалением столбцов с большой долей пропусков
  * с заполнением пропусков модой и с удалением строк, в которых есть пропуски
  * разные настройки OneHotEncoder


Результат был примерно один и тот же. В частности метрика r2_score колебалась в диапазоне ~ (0.001, 0.004), что крайне мало и показывает, что построенные модели непригодны для прогнозирования целевой функции.
Следовательно:
* либо методы регрессии не подходят к решению этой задаче
* либо где-то ошибка в коде
* либо в принципе неверный подход к подготовке данных и/или обучению моделей
* либо неверно интерпретирую метрики

Предполагаю, что какая-то из двух последних причин. Нужна помощь/обратная связь.
