In [86]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder)
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import (
    f1_score, precision_score, recall_score, accuracy_score)
from sklearn.metrics import (
    mean_absolute_error, mean_absolute_percentage_error,
    root_mean_squared_error)

import re
from typing import Literal

from itertools import product

# Датасеты


In [87]:
classification_df = pd.read_csv("data/consumer_electronics_sales_data.csv")
classification_df

Unnamed: 0,ProductID,ProductCategory,ProductBrand,ProductPrice,CustomerAge,CustomerGender,PurchaseFrequency,CustomerSatisfaction,PurchaseIntent
0,5874,Smartphones,Other Brands,312.949668,18,0,2,1,0
1,5875,Smart Watches,Samsung,980.389404,35,1,7,2,1
2,5876,Tablets,Samsung,2606.718293,63,0,1,5,1
3,5877,Smartphones,Samsung,870.395450,63,1,10,3,1
4,5878,Tablets,Sony,1798.955875,57,0,17,3,0
...,...,...,...,...,...,...,...,...,...
8995,14869,Smart Watches,Samsung,1041.149163,36,1,16,4,0
8996,14870,Smartphones,Samsung,1485.694311,57,0,5,1,1
8997,14871,Headphones,Samsung,2887.369597,28,0,18,4,0
8998,14872,Tablets,HP,1490.453964,38,0,4,2,1


In [88]:
regression_df = pd.read_csv("data/laptop_price.csv", encoding='latin1')
regression_df

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.00
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,1316,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 10,1.8kg,638.00
1299,1317,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows 10,1.3kg,1499.00
1300,1318,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5kg,229.00
1301,1319,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,764.00


# Baseline

## Классификация

### Создание baseline

Используется Логистическая Регрессия для задачи классификации

Разделение датасета на `train` и `test`

In [89]:
cX = classification_df.drop(columns=["ProductID", "PurchaseIntent"])
cy = classification_df["PurchaseIntent"]

cX_train, cX_test, cy_train, cy_test = train_test_split(
    cX, cy, test_size=0.2, random_state=42
)

Препроцессинг данных

In [90]:
cat_cols = ["ProductCategory", "ProductBrand"]
num_cols = ["ProductPrice",
            "CustomerAge",
            "PurchaseFrequency",
            "CustomerSatisfaction"]

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

cX_train_prep = preprocessor.fit_transform(cX_train)
cX_test_prep = preprocessor.transform(cX_test)

Создание и обучение модели

In [91]:
model = LogisticRegression()
model.fit(cX_train_prep, cy_train)
cy_pred = model.predict(cX_test_prep)

Оценка качества модели

In [92]:
print(f"""
F1 Score:   {f1_score(cy_test, cy_pred)}
==============================
Accuracy:   {accuracy_score(cy_test, cy_pred)}
Precision:  {precision_score(cy_test, cy_pred)}
Recall:     {recall_score(cy_test, cy_pred)}
""")


F1 Score:   0.717491984324902
Accuracy:   0.5594444444444444
Precision:  0.5594444444444444
Recall:     1.0



### Улучшение baseline

Не смотря на то, что логистическая регрессия
работает корректно без стандартизации признаков,
её использование даст более точный результат.

Среди `StandardScaler`, `RobustScaler`, `MinMaxScaler` лучший результат даёт
**`StandardScaler`**.

In [93]:
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', StandardScaler(), num_cols),
])

cX_train_scaled = preprocessor.fit_transform(cX_train)
cX_test_scaled = preprocessor.transform(cX_test)

Также для улучшения результата можно применить l1 или l2 регуляризацию.

Подбор гиперпараметров с помощью `GridSearch`

In [94]:
pipeline = Pipeline((
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
))

params = {
    'preprocessor__num': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2', 'elasticnet'],
    'classifier__solver': ['saga'],
    'classifier__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
    'classifier__class_weight': [None, 'balanced']
}

grid_search = GridSearchCV(
    pipeline,
    params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=True,
)
grid_search.fit(cX_train, cy_train)

Fitting 5 folds for each of 540 candidates, totalling 2700 fits




0,1,2
,estimator,Pipeline(step...egression())))
,param_grid,"{'classifier__C': [0.001, 0.01, ...], 'classifier__class_weight': [None, 'balanced'], 'classifier__l1_ratio': [0.1, 0.3, ...], 'classifier__penalty': ['l1', 'l2', ...], ...}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,5
,verbose,True
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.001
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'saga'
,max_iter,100


In [95]:
print("Лучшие параметры:", grid_search.best_params_)
print("Лучший F1:", grid_search.best_score_)

# Лучший F1: 0.7594332934321656

Лучшие параметры: {'classifier__C': 0.001, 'classifier__class_weight': None, 'classifier__l1_ratio': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'saga', 'preprocessor__num': StandardScaler()}
Лучший F1: 0.7595717928561199


Оценка качества модели на тестовых данных:

In [96]:
cy_pred = grid_search.best_estimator_.predict(cX_test)
print(f"""
F1 Score:   {f1_score(cy_test, cy_pred)}
==============================
Accuracy:   {accuracy_score(cy_test, cy_pred)}
Precision:  {precision_score(cy_test, cy_pred)}
Recall:     {recall_score(cy_test, cy_pred)}
""")


F1 Score:   0.7660154475238528
Accuracy:   0.7138888888888889
Precision:  0.7060301507537688
Recall:     0.8371400198609732



- F1 Score в обычном baseline: **0.717491984324902**
- F1 Score в улучшенном baseline: **0.7660154475238528**

За счёт препроцессинга данных и подбора гиперпараметров удалось повысить точность модели.

Улучшенный baseline для линейной регрессии лучше чем для KNN, максимальный F1 Score которого: *0.7362262311067772*.

## Регрессия

### Создание baseline

Используется *Линейная Регрессия* для задачи регрессии

Повторим преобразование колонок из прошлой лабораторной работы.

In [97]:
def extract_resolution(value):
    match = re.search(r'(\d{3,4})x(\d{3,4})', str(value))
    if match:
        return int(match.group(1)), int(match.group(2))
    return None, None

def parse_cpu(cpu):
    cpu = str(cpu)

    if 'Intel' in cpu:
        brand = 'Intel'
    elif 'AMD' in cpu:
        brand = 'AMD'
    elif 'Samsung' in cpu:
        brand = 'Samsung'
    else:
        brand = 'Other'

    if 'Core i3' in cpu:
        series = 'i3'
    elif 'Core i5' in cpu:
        series = 'i5'
    elif 'Core i7' in cpu:
        series = 'i7'
    elif 'Celeron' in cpu:
        series = 'Celeron'
    elif 'Pentium' in cpu:
        series = 'Pentium'
    elif 'Atom' in cpu:
        series = 'Atom'
    elif 'Xeon' in cpu:
        series = 'Xeon'
    elif 'M' in cpu and 'Core' in cpu:
        series = 'Core M'
    elif 'Ryzen' in cpu:
        series = 'Ryzen'
    elif 'A9-Series' in cpu:
        series = 'A9-Series'
    elif 'A6-Series' in cpu:
        series = 'A6-Series'
    elif 'E-Series' in cpu:
        series = 'E-Series'
    elif 'A12-Series' in cpu:
        series = 'A12-Series'
    elif 'A10-Series' in cpu:
        series = 'A10-Series'
    elif 'A8-Series' in cpu:
        series = 'A8-Series'
    elif 'FX' in cpu:
        series = 'FX'
    else:
        series = 'Other'

    freq_match = re.search(r'(\d+\.\d+)GHz', cpu)
    freq = float(freq_match.group(1)) if freq_match else None

    return pd.Series([brand, series, freq])

def parse_memory(memory_str):
    memory_str = memory_str.replace('TB', '000GB')

    parts = re.findall(r'(\d+)\s*GB\s*([A-Za-z\s]*)', memory_str)

    total_gb = 0
    types = []

    for size, mtype in parts:
        total_gb += int(size)
        mtype_clean = (
            mtype.strip()
            .replace('Storage', '')
            .replace('Flash', 'Flash')
            .replace('Hybrid', 'Hybrid')
            .replace('SSD', 'SSD')
            .replace('HDD', 'HDD')
        )
        if mtype_clean:
            types.append(mtype_clean.strip())

    main_type = types[0] if types else None

    return pd.Series([total_gb, main_type])

regression_df['Touchscreen'] = \
    regression_df['ScreenResolution'].str.contains('Touchscreen')
regression_df['IPS'] = \
    regression_df['ScreenResolution'].str.contains('IPS')
regression_df['RetinaDisplay'] = \
    regression_df['ScreenResolution'].str.contains('Retina')

regression_df['ResX'], regression_df['ResY'] = zip(
    *regression_df['ScreenResolution'].map(extract_resolution))
regression_df['TotalPixels'] = regression_df['ResX'] * regression_df['ResY']
regression_df.drop(columns=['ScreenResolution', 'ResX', 'ResY'], inplace=True)

regression_df[['CpuBrand', 'CpuSeries', 'CpuFreqGHz']] = \
    regression_df['Cpu'].apply(parse_cpu)
regression_df.drop(columns='Cpu', inplace=True)
regression_df['CpuFreqGHz'] = regression_df['CpuFreqGHz'].fillna(
    regression_df['CpuFreqGHz'].mean())

regression_df['Ram'] = regression_df['Ram'].str.replace('GB', '', regex=False).astype(int)

regression_df[['Memory_Storage', 'Memory_Type']] = \
    regression_df['Memory'].apply(parse_memory)
regression_df.drop(columns=['Memory'], inplace=True)

regression_df['Weight'] = \
    regression_df['Weight'].str.replace('kg', '', regex=False).astype(float)

regression_df['GpuBrand'] = regression_df['Gpu'].str.split().str[0]
regression_df.drop(columns=['Gpu'], inplace=True)

Разделение датасета на `train` и `test`

In [98]:
rX = regression_df.drop(columns=["laptop_ID", "Product", "Price_euros"])
ry = regression_df["Price_euros"]

rX_train, rX_test, ry_train, ry_test = train_test_split(
    rX, ry, test_size=0.2, random_state=42
)

Препроцессинг данных

In [99]:
cat_cols = ["Company", "TypeName", "OpSys", "CpuBrand",
            "CpuSeries", "Memory_Type", "GpuBrand"]
num_cols = ["Inches", "Ram", "Weight", "TotalPixels",
            "CpuFreqGHz", "Memory_Storage"]

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

rX_train_prep = preprocessor.fit_transform(rX_train)
rX_test_prep = preprocessor.transform(rX_test)

Создание и обучение модели

In [100]:
model = LinearRegression()
model.fit(rX_train_prep, ry_train)
ry_pred = model.predict(rX_test_prep)

Оценка качества модели

In [101]:
print(f"""
MAE:   {mean_absolute_error(ry_test, ry_pred)}
==============================
MAPE:   {mean_absolute_percentage_error(ry_test, ry_pred)}
RMSE:  {root_mean_squared_error(ry_test, ry_pred)}
""")


MAE:   291.2439949187024
MAPE:   0.282523270769792
RMSE:  433.09620097371675



### Улучшение baseline

Используем стандартизацию данных для улучшения результата.

`MinMaxScaler` показал себя лучше `StandardScaler` и `RobustScaler` в этой задаче регрессии.

In [102]:
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', MinMaxScaler(), num_cols),
])

rX_train_scaled = preprocessor.fit_transform(rX_train)
rX_test_scaled = preprocessor.transform(rX_test)

Для предотвращения переобучения используем регуляризацию, для этого в качестве модели выберем `ElasticNet`.

Подбор гиперпараметров с помощью `GridSearch`

In [103]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', ElasticNet())
])

params = {
    'preprocessor__num': [StandardScaler(), RobustScaler(), MinMaxScaler()],
    'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
    'regressor__l1_ratio': [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
    'regressor__fit_intercept': [True, False]
}

grid_search = GridSearchCV(
    pipeline,
    params,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=True,
)
grid_search.fit(rX_train, ry_train)

Fitting 5 folds for each of 252 candidates, totalling 1260 fits


  model = cd_fast.sparse_enet_coordinate_descent(


0,1,2
,estimator,Pipeline(step...lasticNet())])
,param_grid,"{'preprocessor__num': [StandardScaler(), RobustScaler(), ...], 'regressor__alpha': [0.001, 0.01, ...], 'regressor__fit_intercept': [True, False], 'regressor__l1_ratio': [0.0, 0.1, ...]}"
,scoring,'neg_mean_absolute_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,True
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,alpha,0.01
,l1_ratio,0.7
,fit_intercept,False
,precompute,False
,max_iter,1000
,copy_X,True
,tol,0.0001
,warm_start,False
,positive,False
,random_state,


In [104]:
print("Лучшие параметры:", grid_search.best_params_)
print("Лучший MAE:", -grid_search.best_score_)

# Лучший MAE: 237.23649251276834

Лучшие параметры: {'preprocessor__num': RobustScaler(), 'regressor__alpha': 0.01, 'regressor__fit_intercept': False, 'regressor__l1_ratio': 0.7}
Лучший MAE: 237.23649251276834


Оценка качества модели:

In [105]:
ry_pred = grid_search.best_estimator_.predict(rX_test)
print(f"""
MAE:   {mean_absolute_error(ry_test, ry_pred)}
==============================
MAPE:   {mean_absolute_percentage_error(ry_test, ry_pred)}
RMSE:  {root_mean_squared_error(ry_test, ry_pred)}
""")


MAE:   239.75457100729756
MAPE:   0.2459587317754573
RMSE:  348.70139139375084



- MAE в обычном baseline: **291.2439949187024**
- MAE в улучшенном baseline: **239.75457100729756**

За счёт препроцессинга данных и подбора гиперпараметров удалось повысить точность модели.

В задаче регрессии линейная модель проявила себя хуже, чем KNN, минимальный MAE которого: *187.48610305754687*.

# Имплементация Логистической регрессии

Линейные модели строят прогноз как линейную комбинацию признаков:
$$\hat y = w_0 + w_1x_1 + ... + w_nx_n$$

Логистическая регрессия пропускает результат через сигмоидную функцию для вычисления вероятности принадлежности к классу 1:
$$\hat p = \frac{1}{1 + e^{-(w_0 + w_1x_1 + ... + w_nx_n)}}$$
При обучении минимизируется *log loss* =
$-(y \log (\hat p) + (1 - y) \log (1 - \hat p))$

Для предотвращения переобучения используется регуляризация.
Она накладывает штраф на веса,
чтобы они не стали слишком большими.

При реализации используется регуляризация `elastic net`.

$$Loss = log\_loss + \frac{1}{C} (l1\_ratio \cdot ||w||_1 + (1 - l1\_ratio) \cdot \frac{1}{2} ||w||_2^2)$$


In [106]:
class LogReg:
    EPS = 1e-15

    def __init__(self, step = 0.01, n_iter=1000, C=1.0, l1_ratio=0.5, fit_intercept=True):
        self.step = step
        self.n_iter = n_iter
        self.C = C
        self.l1_ratio = l1_ratio
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        X = self.__add_intercept(X)
        rows, cols = X.shape
        self.weights = np.zeros(cols)
        alpha = 1.0 / self.C if self.C != 0 else 0.0
        for i in range(self.n_iter):
            z = X @ self.weights
            p = self.__sigmoid(z)
            gradient = (X.T @ (p - y)) / rows
            if alpha != 0:
                gradient[1:] += alpha * (1 - self.l1_ratio) * self.weights[1:] / rows
                l1_grad = alpha * self.l1_ratio * np.sign(self.weights)
                l1_grad[0] = 0.0
                gradient += l1_grad / rows
            self.weights -= self.step * gradient

    def predict_proba(self, X):
        X = self.__add_intercept(X)
        return self.__sigmoid(X @ self.weights)

    def predict(self, X, threshold=0.5):
        return (self.predict_proba(X) >= threshold).astype(int)

    def __add_intercept(self, X):
        if hasattr(X, "toarray"):
            X = X.toarray()
        if self.fit_intercept:
            return np.hstack([np.ones((X.shape[0], 1)), X])
        return X

    def __sigmoid(self, x):
        return 1 / (1 + np.exp(-x))


Испытание `LogReg`

In [107]:
my_classifier = LogReg()
my_classifier.fit(cX_train_prep, cy_train)
cy_pred = my_classifier.predict(cX_test_prep)
f1 = f1_score(cy_test, cy_pred)
print(f"F1 score for self-implemented LogReg: {f1}")

F1 score for self-implemented LogReg: 0.717491984324902


Воспользуемся стандартизацией и подбором гиперпараметров, как в улучшенном baseline

In [108]:
def linear_grid_search(model_name: Literal['LogReg', 'LinReg'], param_grid, X, y, scorer, cv=5):
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)

    best_score = None
    best_params = None

    keys = list(param_grid.keys())
    for values in product(*param_grid.values()):
        params = dict(zip(keys, values))
        fold_scores = []
        for train_idx, val_idx in kf.split(X):
            X_train = X.iloc[train_idx] if hasattr(X, "iloc") else X[train_idx]
            X_val   = X.iloc[val_idx]   if hasattr(X, "iloc") else X[val_idx]
            y_train = y.iloc[train_idx] if hasattr(y, "iloc") else y[train_idx]
            y_val   = y.iloc[val_idx]   if hasattr(y, "iloc") else y[val_idx]

            model = None
            if model_name == 'LogReg':
                model = LogReg(**params)
            elif model_name == 'LinReg':
                model = LinReg(**params)
            model.fit(X_train, y_train)
            preds = model.predict(X_val)

            fold_scores.append(scorer(y_val, preds))

        mean_score = np.mean(fold_scores)
        if (best_score is None) or (mean_score > best_score):
            best_score = mean_score
            best_params = params

    return {
        "best_params": best_params,
        "best_score": best_score,
    }

In [109]:
params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'l1_ratio': [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
    'fit_intercept': [False, True],
}

linear_grid_search('LogReg', params, cX_train_scaled, cy_train, f1_score)

{'best_params': {'C': 0.001, 'l1_ratio': 0.1, 'fit_intercept': True},
 'best_score': np.float64(0.7575278559802217)}

Оценка качества собственной модели:

In [110]:
my_classifier = LogReg(C=0.001, l1_ratio=0.1)
my_classifier.fit(cX_train_scaled, cy_train)
cy_pred = my_classifier.predict(cX_test_scaled)
print(f"""
F1 Score:   {f1_score(cy_test, cy_pred)}
==============================
Accuracy:   {accuracy_score(cy_test, cy_pred)}
Precision:  {precision_score(cy_test, cy_pred)}
Recall:     {recall_score(cy_test, cy_pred)}
""")


F1 Score:   0.7674101046882112
Accuracy:   0.7161111111111111
Precision:  0.7084033613445379
Recall:     0.8371400198609732



Результат почти совпадает с улучшенным baseline, получился даже немного лучше.

# Имплементация Линейной регрессии


В качестве регуляризации была выбрана `ElasticNet`,
так как она показала лучший результат в `GridSearch`. Она представляет из себя комбинацию L1 и L2 регуляризаций.
$$Loss = MSE + \alpha (l1\_ratio \cdot ||w||_1 + (1 - l1\_ratio) \cdot \frac{1}{2} ||w||_2^2)$$

In [111]:
class LinReg:
    def __init__(self, step=0.01, n_iter = 2000, alpha=1.0,
                 l1_ratio=0.5, fit_intercept=True):
        self.step = step
        self.n_iter = n_iter
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        X = self.__add_intercept(X)
        rows, cols = X.shape

        self.weights = np.zeros(cols)
        for i in range(self.n_iter):
            y_pred = X @ self.weights
            errors = y_pred - y
            gradient = (2 / rows) * (X.T @ errors)

            gradient[1:] += self.alpha * (1 - self.l1_ratio) * self.weights[1:]
            l1_grad = self.alpha * self.l1_ratio * np.sign(self.weights)
            l1_grad[0] = 0.0
            gradient += l1_grad

            self.weights -= self.step * gradient

    def predict(self, X, threshold=0.5):
        X = self.__add_intercept(X)
        return X @ self.weights

    def __add_intercept(self, X):
        X = X.toarray()
        if self.fit_intercept:
            return np.hstack([np.ones((X.shape[0], 1)), X])
        return X


Испытание `LinReg`

In [112]:
my_regressor = LinReg()
my_regressor.fit(rX_train_prep, ry_train)
ry_pred = my_regressor.predict(rX_test_prep)
mae = mean_absolute_error(ry_test, ry_pred)
print(f"MAE for self-implemented LinReg: {mae}")

MAE for self-implemented LinReg: 338.23412874208765


Применение стандартизации и подбора гиперпараметров

In [113]:
params = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
    'l1_ratio': [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
    'fit_intercept': [True, False],
}

linear_grid_search('LinReg', params, rX_train_scaled, ry_train, lambda t, p: -mean_absolute_error(t, p))

{'best_params': {'alpha': 0.001, 'l1_ratio': 1.0, 'fit_intercept': False},
 'best_score': np.float64(-270.7998564136027)}

Оценка точности собственного `LinReg`:

In [115]:
my_regressor = LinReg(alpha=0.001, l1_ratio=1.0, fit_intercept=False)
my_regressor.fit(rX_train_scaled, ry_train)
ry_pred = my_regressor.predict(rX_test_scaled)
print(f"""
MAE:   {mean_absolute_error(ry_test, ry_pred)}
==============================
MAPE:   {mean_absolute_percentage_error(ry_test, ry_pred)}
RMSE:  {root_mean_squared_error(ry_test, ry_pred)}
""")


MAE:   271.7726029692868
MAPE:   0.2664778313211179
RMSE:  426.23390823143274



В задаче регрессии самостоятельно реализованный алгоритм отстаёт от аналогичного из `sklearn`,
MAE которого: *239.75457100729756*.

# Вывод

В ходе выполнения лабораторной работы были исследованы алгоритмы логистической и линейной регрессии. Для задач классификации и регрессии были созданы обычный и улучшенный baseline.

Логистическая регрессия хорошо проявила себя в данной задаче классификации, получив метрики лучше KNN. Линейная регрессия проявила себя хуже KNN в задаче регрессии, возможно, из-за нелинейности данных.

Собственные реализации получили точность, близкую к библиотечным.