In [1]:
import pandas as pd

from numpy import arange
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score, make_scorer
)

# Task
1. Побудувати моделi класифiкацiї або регресiї згiдно з варiантом.
2. Виконати прогнози на основi побудованих моделей.
3. Для кожної з моделей оцiнити, чи має мiсце перенавчання.
7. В задачах регресiї розрахувати критерiї якостi для кожної моделi окремо
на навчальнiй та перевiрочнiй множинах:
- коефiцiєнт детермiнацiї R2
- помилки RMSE, MAE та MAPE

8. Виконати решiтчастий пошук (grid search) для пiдбору гiперпараметрiв
моделей.
9. Зробити висновки про якiсть роботи моделей на дослiджених даних. На
основi критерiїв якостi вибрати найкращу модель.

10. Навчити моделi на пiдмножинах навчальних даних. Оцiнити, наскiльки
розмiр навчальної множини впливає на якiсть моделi.

# Helpers

In [28]:
FEATURES, LABELS = load_diabetes(return_X_y=True)

class Score:
    SCORING = {
        "rmse": make_scorer(lambda *args: mean_squared_error(squared=False, *args), greater_is_better=False),
        "mae": make_scorer(mean_absolute_error, greater_is_better=False),
        "mape": make_scorer(mean_absolute_percentage_error, greater_is_better=False),
        "r2": make_scorer(r2_score),
    }
    
    @classmethod
    def score(cls, regressor, X, y):
        scores = {
             score_name: score(regressor, X, y)
             for score_name, score in cls.SCORING.items()
        }
        return scores
    
    @classmethod
    def perform(cls, regressor, X_train, X_test, y_train, y_test, fit_model=True):
        if fit_model:
            regressor.fit(X_train, y_train)
        return {
            "train_score": cls.score(regressor, X_train, y_train),
            "test_score": cls.score(regressor, X_test, y_test)
        }
    

def do_grid_search(
    regressor,
    param_grid,
    features=FEATURES,
    labels=LABELS,
    cv=10,
    scoring=Score.SCORING,
    refit='rmse'
):
    grid_search = GridSearchCV(
        regressor,
        param_grid,
        cv=cv,
        scoring=scoring,
        refit=refit,
    )
    grid_search.fit(features, labels)
    print(f"Model trained with params: {grid_search.best_params_}")
    
    return grid_search.best_estimator_

# Train-test split

In [3]:
data = train_test_split(FEATURES, LABELS, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = data

# Linear Regression model

In [4]:
Score.perform(LinearRegression(), *data)

{'train_score': {'rmse': -54.30733066880185,
  'mae': -44.2421626435625,
  'mape': -0.40130200347089584,
  'r2': 0.5097554608164433},
 'test_score': {'rmse': -53.08296873921689,
  'mae': -41.96445302653266,
  'mape': -0.35510850217076595,
  'r2': 0.510395426135144}}

In [5]:
Score.perform(LinearRegression(fit_intercept=False), *data)

{'train_score': {'rmse': -159.67912970660745,
  'mae': -149.88727293258984,
  'mape': -1.176588043933387,
  'r2': -3.238304596434448},
 'test_score': {'rmse': -165.43771664042117,
  'mae': -156.68744665721707,
  'mape': -1.2208760330701383,
  'r2': -3.7555868143387086}}

# Ridge Regression model

In [6]:
Score.perform(Ridge(alpha=0.2), *data)

{'train_score': {'rmse': -55.21031850110104,
  'mae': -45.67250969301528,
  'mape': -0.4189012396356422,
  'r2': 0.4933169730597279},
 'test_score': {'rmse': -53.05718162788213,
  'mae': -42.161780462194116,
  'mape': -0.36041423608166023,
  'r2': 0.51087099942337}}

In [7]:
Score.perform(Ridge(alpha=0.5), *data)

{'train_score': {'rmse': -57.04970878986107,
  'mae': -47.67302385671916,
  'mape': -0.4417102264094104,
  'r2': 0.4589932108940735},
 'test_score': {'rmse': -54.19925056807602,
  'mae': -43.789648645841325,
  'mape': -0.3802392549514945,
  'r2': 0.4895871231078325}}

In [8]:
Score.perform(Ridge(), *data)

{'train_score': {'rmse': -59.76462054521214,
  'mae': -50.57936975619187,
  'mape': -0.47306361581535467,
  'r2': 0.4062765748571143},
 'test_score': {'rmse': -56.541826369317334,
  'mae': -46.456263192524055,
  'mape': -0.4088686982007224,
  'r2': 0.44451194464421595}}

# Linear Regression model with polynomial features pipeline

In [15]:
poly_pipeline = Pipeline([
    ("poly", PolynomialFeatures(degree=2)),
    ("linear", LinearRegression())
])
Score.perform(poly_pipeline, *data)

{'train_score': {'rmse': -49.40424962587918,
  'mae': -39.50448535948076,
  'mape': -0.3515539958065481,
  'r2': 0.5942818223229758},
 'test_score': {'rmse': -55.68127716654499,
  'mae': -44.17257984703669,
  'mape': -0.3843193811618853,
  'r2': 0.4612919865503654}}

# Ridge: grid search

In [29]:
model = do_grid_search(
    Ridge(),
    {
        'tol': [1e-2, 1e-3, 1e-4, 1e-5],
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    },
)
train_score

Model trained with params: {'solver': 'sag', 'tol': 0.01}


GridSearchCV(cv=10, estimator=Ridge(),
             param_grid={'solver': ['auto', 'svd', 'cholesky', 'lsqr',
                                    'sparse_cg', 'sag', 'saga'],
                         'tol': [0.01, 0.001, 0.0001, 1e-05]},
             refit='rmse', return_train_score=True,
             scoring={'mae': make_scorer(mean_absolute_error, greater_is_better=False),
                      'mape': make_scorer(mean_absolute_percentage_error, greater_is_better=False),
                      'r2': make_scorer(r2_score),
                      'rmse': make_scorer(<lambda>, greater_is_better=False)})

# Pipeline: grid search

In [17]:
model = do_grid_search(
    poly_pipeline,
    {'poly__degree': range(1, 5)},
    features=X_train,
    labels=y_train
)
scores = Score.perform(model, *data, fit_model=False)
scores

Model trained with params: {'poly__degree': 1}


{'train_score': {'rmse': -54.30733066880185,
  'mae': -44.242162643562494,
  'mape': -0.40130200347089584,
  'r2': 0.5097554608164433},
 'test_score': {'rmse': -53.08296873921689,
  'mae': -41.96445302653266,
  'mape': -0.35510850217076595,
  'r2': 0.510395426135144}}