# Линейная регрессия: домашнее задание

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split

Рассмотрим датасет с ценами домов

Сразу разобьем его на обучающую и тестовую выборки

In [2]:
X, Y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

In [3]:
X_train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
5564,3.8372,41.0,5.265957,1.131915,1416.0,3.012766,33.91,-118.29
16016,8.0069,52.0,7.224033,1.030550,1276.0,2.598778,37.73,-122.46
17131,3.6712,30.0,3.659983,1.084568,2678.0,2.334786,37.46,-122.15
11752,3.1583,16.0,5.622378,1.034965,792.0,2.769231,38.76,-121.21
2308,4.9292,15.0,6.333333,0.988662,1411.0,3.199546,36.83,-119.75
...,...,...,...,...,...,...,...,...
11284,6.3700,35.0,6.129032,0.926267,658.0,3.032258,33.78,-117.96
11964,3.0500,33.0,6.868597,1.269488,1753.0,3.904232,34.02,-117.43
5390,2.9344,36.0,3.986717,1.079696,1756.0,3.332068,34.03,-118.38
860,5.7192,15.0,6.395349,1.067979,1777.0,3.178891,37.58,-121.96


In [4]:
Y_train

5564     1.564
16016    4.268
17131    3.086
11752    2.392
2308     1.094
         ...  
11284    2.292
11964    0.978
5390     2.221
860      2.835
15795    3.250
Name: MedHouseVal, Length: 18576, dtype: float64

In [5]:
Y_train.min(), Y_train.max()

(0.14999, 5.00001)

In [6]:
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler()
X_scaler.fit(X_train)
X_train = pd.DataFrame(X_scaler.transform(X_train), columns=X_scaler.get_feature_names_out())
X_test = pd.DataFrame(X_scaler.transform(X_test), columns=X_scaler.get_feature_names_out())

В качестве основной метрики будем использовать MSE

## Задание 1

Обучите модели:
- Линейной регрессии без регуляризации
- Ridge
- Lasso
- ElasticNet

В каждой из моделей с регуляризацией попробуйте 10 значений гиперпараметра `alpha`, указанных в ячейке ниже.
Выберите среди всех моделей лучшую на тестовой выборке

Для каждого типа модели напишите лучшее значение метрики на обучающей и тестовой выборки

In [7]:
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50]

Без регуляризации

In [8]:
model = linear_model.LinearRegression()
model.fit(X_train, Y_train)
print(f"Train MSE: {metrics.mean_squared_error(Y_train, model.predict(X_train)):.3}")
print(f"Test MSE: {metrics.mean_squared_error(Y_test, model.predict(X_test)):.3}")

Train MSE: 0.521
Test MSE: 0.559


Ridge

In [9]:
best_train_mse = np.inf
best_test_mse = np.inf
best_alpha = None

for alpha in alphas:
    model = linear_model.Ridge(alpha=alpha)
    model.fit(X_train, Y_train)
    train_mse = metrics.mean_squared_error(Y_train, model.predict(X_train))
    test_mse = metrics.mean_squared_error(Y_test, model.predict(X_test))
    if test_mse < best_test_mse:
        best_train_mse = train_mse
        best_test_mse = test_mse
        best_alpha = alpha

print(f"alpha: {best_alpha}")
print(f"Train MSE: {best_train_mse:.3}")
print(f"Test MSE: {best_test_mse:.3}")

alpha: 10
Train MSE: 0.521
Test MSE: 0.559


Lasso

In [10]:
best_train_mse = np.inf
best_test_mse = np.inf
best_alpha = None

for alpha in alphas:
    model = linear_model.Lasso(alpha=alpha)
    model.fit(X_train, Y_train)
    train_mse = metrics.mean_squared_error(Y_train, model.predict(X_train))
    test_mse = metrics.mean_squared_error(Y_test, model.predict(X_test))
    if test_mse < best_test_mse:
        best_train_mse = train_mse
        best_test_mse = test_mse
        best_alpha = alpha

print(f"alpha: {best_alpha}")
print(f"Train MSE: {best_train_mse:.3}")
print(f"Test MSE: {best_test_mse:.3}")

alpha: 0.001
Train MSE: 0.521
Test MSE: 0.559


ElasticNet

In [11]:
best_train_mse = np.inf
best_test_mse = np.inf
best_alpha = None

for alpha in alphas:
    model = linear_model.ElasticNet(alpha=alpha)
    model.fit(X_train, Y_train)
    train_mse = metrics.mean_squared_error(Y_train, model.predict(X_train))
    test_mse = metrics.mean_squared_error(Y_test, model.predict(X_test))
    if test_mse < best_test_mse:
        best_train_mse = train_mse
        best_test_mse = test_mse
        best_alpha = alpha

print(f"alpha: {best_alpha}")
print(f"Train MSE: {best_train_mse:.3}")
print(f"Test MSE: {best_test_mse:.3}")

alpha: 0.001
Train MSE: 0.521
Test MSE: 0.559


## Задание 2

Добавьте в датасет больше признаков. Так как с помощью линейной регрессии мы можем найти только линейные зависимости, добавьте в обучающий и тестовый датасеты
1) квадраты всех признаков
2) кубы всех признаков
3) логарифмы всех признаков (придумайте как поступить с отрицательными значениями)
4) все возможные попарные произведения исходных признаков

Повторите на новом датасете задание 1. Почему получаются такие результаты?

In [12]:
num_features = X.shape[1]
for i in range(num_features):
    column = X.columns[i]
    X_train[f"{column}_squared"] = X_train[column] ** 2
    X_test[f"{column}_squared"] = X_test[column] ** 2
    X_train[f"{column}_cubed"] = X_train[column] ** 3
    X_test[f"{column}_cubed"] = X_test[column] ** 3
    X_train[f"{column}_log"] = np.log(10 + X_train[column])
    X_test[f"{column}_log"] = np.log(10 + X_test[column])

    for j in range(i + 1, num_features):
        other_column = X.columns[j]
        X_train[f"{column}_x_{other_column}"] = X_train[column] * X_train[other_column]
        X_test[f"{column}_x_{other_column}"] = X_test[column] * X_test[other_column]

In [13]:
X_train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc_squared,MedInc_cubed,...,AveOccup_log,AveOccup_x_Latitude,AveOccup_x_Longitude,Latitude_squared,Latitude_cubed,Latitude_log,Latitude_x_Longitude,Longitude_squared,Longitude_cubed,Longitude_log
0,-0.018837,0.983513,-0.066315,0.072017,-0.009699,-0.006443,-0.808863,0.641303,0.000355,-0.000007,...,2.301941,0.005212,-0.004132,0.654259,-0.529206,2.218240,-0.518726,0.411269,0.263748,2.364743
1,2.185237,1.856793,0.712744,-0.137047,-0.132519,-0.044325,0.980369,-1.439758,4.775261,10.435077,...,2.298143,-0.043454,0.063817,0.961123,0.942254,2.396109,-1.411493,2.072902,-2.984478,2.147128
2,-0.106583,0.110233,-0.705284,-0.025634,1.097431,-0.068481,0.853905,-1.285051,0.011360,-0.001211,...,2.295713,-0.058476,0.088001,0.729153,0.622627,2.384525,-1.097311,1.651355,-2.122075,2.165040
3,-0.377698,-1.001215,0.075494,-0.127940,-0.557123,-0.028728,1.462805,-0.815939,0.142656,-0.053881,...,2.299708,-0.042023,0.023440,2.139800,3.130110,2.439107,-1.193560,0.665756,-0.543216,2.217470
4,0.558387,-1.080604,0.358362,-0.223439,-0.014086,0.010648,0.558822,-0.087318,0.311796,0.174103,...,2.303649,0.005950,-0.000930,0.312282,0.174510,2.356962,-0.048795,0.007624,-0.000666,2.293815
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18571,1.319983,0.507178,0.277077,-0.352128,-0.674679,-0.004660,-0.869753,0.805991,1.742356,2.299881,...,2.302119,0.004053,-0.003756,0.756470,-0.657942,2.211593,-0.701013,0.649621,0.523589,2.380101
18572,-0.434945,0.348400,0.571327,0.355758,0.285945,0.075129,-0.757340,1.070490,0.189177,-0.082282,...,2.310070,-0.056898,0.080424,0.573565,-0.434384,2.223830,-0.810726,1.145949,1.226727,2.404283
18573,-0.496050,0.586567,-0.575286,-0.035683,0.288577,0.022774,-0.752657,0.596388,0.246066,-0.122061,...,2.304860,-0.017141,0.013582,0.566492,-0.426374,2.224336,-0.448875,0.355678,0.212122,2.360513
18574,0.975975,-1.080604,0.383036,-0.059851,0.307000,0.008758,0.910111,-1.190230,0.952527,0.929643,...,2.303460,0.007970,-0.010424,0.828302,0.753846,2.389690,-1.083241,1.416648,-1.686137,2.175861


In [14]:
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler()
X_scaler.fit(X_train)
X_train = pd.DataFrame(X_scaler.transform(X_train), columns=X_scaler.get_feature_names_out())
X_test = pd.DataFrame(X_scaler.transform(X_test), columns=X_scaler.get_feature_names_out())

Без регуляризации

In [15]:
model = linear_model.LinearRegression()
model.fit(X_train, Y_train)
print(f"Train MSE: {metrics.mean_squared_error(Y_train, model.predict(X_train)):.3}")
print(f"Test MSE: {metrics.mean_squared_error(Y_test, model.predict(X_test)):.3}")

Train MSE: 0.387
Test MSE: 0.427


Ridge

In [16]:
best_train_mse = np.inf
best_test_mse = np.inf
best_alpha = None

for alpha in alphas:
    model = linear_model.Ridge(alpha=alpha)
    model.fit(X_train, Y_train)
    train_mse = metrics.mean_squared_error(Y_train, model.predict(X_train))
    test_mse = metrics.mean_squared_error(Y_test, model.predict(X_test))
    if test_mse < best_test_mse:
        best_train_mse = train_mse
        best_test_mse = test_mse
        best_alpha = alpha

print(f"alpha: {best_alpha}")
print(f"Train MSE: {best_train_mse:.3}")
print(f"Test MSE: {best_test_mse:.3}")

alpha: 50
Train MSE: 0.409
Test MSE: 0.429


Lasso

In [17]:
best_train_mse = np.inf
best_test_mse = np.inf
best_alpha = None

for alpha in alphas:
    model = linear_model.Lasso(alpha=alpha)
    model.fit(X_train, Y_train)
    train_mse = metrics.mean_squared_error(Y_train, model.predict(X_train))
    test_mse = metrics.mean_squared_error(Y_test, model.predict(X_test))
    if test_mse < best_test_mse:
        best_train_mse = train_mse
        best_test_mse = test_mse
        best_alpha = alpha

print(f"alpha: {best_alpha}")
print(f"Train MSE: {best_train_mse:.3}")
print(f"Test MSE: {best_test_mse:.3}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


alpha: 0.001
Train MSE: 0.41
Test MSE: 0.429


ElasticNet

In [18]:
best_train_mse = np.inf
best_test_mse = np.inf
best_alpha = None

for alpha in alphas:
    model = linear_model.ElasticNet(alpha=alpha)
    model.fit(X_train, Y_train)
    train_mse = metrics.mean_squared_error(Y_train, model.predict(X_train))
    test_mse = metrics.mean_squared_error(Y_test, model.predict(X_test))
    if test_mse < best_test_mse:
        best_train_mse = train_mse
        best_test_mse = test_mse
        best_alpha = alpha

print(f"alpha: {best_alpha}")
print(f"Train MSE: {best_train_mse:.3}")
print(f"Test MSE: {best_test_mse:.3}")

  model = cd_fast.enet_coordinate_descent(


alpha: 0.001
Train MSE: 0.409
Test MSE: 0.428


# Задание 3

Проанализируйте веса лучшей модели. По ним оцените:
1) Топ-5 наиболее важных признаков
2) Топ-5 признаков, увеличение которых **увеличивает** цену
3) Топ-5 признаков, увеличение которых **уменьшает** цену

In [19]:
model = linear_model.LinearRegression()
model.fit(X_train, Y_train)
print(f"Train MSE: {metrics.mean_squared_error(Y_train, model.predict(X_train)):.3}")
print(f"Test MSE: {metrics.mean_squared_error(Y_test, model.predict(X_test)):.3}")

Train MSE: 0.387
Test MSE: 0.427


In [20]:
model.coef_

array([ 1.39495561e+01,  1.20878710e+02,  1.92206835e+00, -2.34104658e+00,
        7.44788497e-01,  1.58237802e+01, -3.68902870e+02, -5.56595707e+01,
       -1.51327999e+00,  8.76186549e-03, -1.22885608e+01,  5.48436563e-02,
        3.87844751e-02,  6.51546792e-03,  7.99207074e-02, -1.76020440e-01,
       -4.96670273e-01, -4.68687703e-01, -6.69725395e+00,  1.09560966e+00,
       -1.21529368e+02, -2.21558212e-02,  4.91862616e-02,  4.50346569e-02,
       -6.69089058e-01, -2.62849169e-01, -2.43335962e-01,  3.43798555e-01,
        8.58018325e-01, -1.57866996e+00, -4.71078550e+00, -1.48807531e-01,
       -1.71462505e-01,  5.92455558e-01,  3.20461374e-01,  5.49686674e+00,
       -1.54275543e+00,  1.67086821e+00,  1.80767794e-01,  7.03495592e-02,
       -6.31447448e-01, -3.08000810e-01, -2.28844096e-01,  1.40480148e-01,
       -6.05015066e-01, -4.92562258e-01,  4.09138434e-02,  2.05500419e-02,
       -2.12703049e+01,  1.19063798e+01, -5.37429438e+00, -3.04947940e-01,
       -2.17353721e-01,  

Топ-5 наиболее важных

In [21]:
ind = np.argpartition(np.abs(model.coef_), -5)[-5:]
print(X_train.columns[ind].tolist())

['Longitude', 'HouseAge', 'HouseAge_log', 'Latitude_log', 'Latitude']


Топ-5 признаков, увеличение которых **увеличивает** цену

In [22]:
ind = np.argpartition(model.coef_, -5)[-5:]
print(X_train.columns[ind].tolist())

['AveOccup', 'Latitude_squared', 'Longitude_log', 'HouseAge', 'Latitude_log']


Топ-5 признаков, увеличение которых **уменьшает** цену

In [23]:
ind = np.argpartition(model.coef_, 5)[:5]
print(X_train.columns[ind].tolist())

['HouseAge_log', 'Latitude', 'Longitude', 'AveOccup_squared', 'MedInc_log']


## Задание 4*

Попробуйте добиться качества 0.4 на тестовой выборке, используя следующие техники:
- Очистка обучающего датасета от выбросов
- Преобразования признаков
- Преобразования таргета (метрику считать необходимо на оригинальном таргете!)