In [31]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

In [32]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [33]:
import warnings
warnings.filterwarnings("ignore")

In [34]:
data = pd.read_csv('../data/automobile_preprocessed.csv')

In [35]:
data = data.iloc[:, 1:]

In [36]:
enc = OrdinalEncoder()
data['make'] = enc.fit_transform(data[["make"]])

In [37]:
data

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,drive-wheels,engine-location,wheel-base,length,...,num-of-cylinders_six,num-of-cylinders_three,num-of-cylinders_twelve,fuel-system_1bbl,fuel-system_2bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi,fuel-system_spfi
0,3,122,0.0,0,0,1,2,0,88.6,168.8,...,0,0,0,0,0,0,0,1,0,0
1,3,122,0.0,0,0,1,2,0,88.6,168.8,...,0,0,0,0,0,0,0,1,0,0
2,1,122,0.0,0,0,1,2,0,94.5,171.2,...,1,0,0,0,0,0,0,1,0,0
3,2,164,1.0,0,0,0,1,0,99.8,176.6,...,0,0,0,0,0,0,0,1,0,0
4,2,164,1.0,0,0,0,0,0,99.4,176.6,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,-1,95,20.0,0,0,0,2,0,109.1,188.8,...,0,0,0,0,0,0,0,1,0,0
189,-1,95,20.0,0,1,0,2,0,109.1,188.8,...,0,0,0,0,0,0,0,1,0,0
190,-1,95,20.0,0,0,0,2,0,109.1,188.8,...,1,0,0,0,0,0,0,1,0,0
191,-1,95,20.0,1,1,0,2,0,109.1,188.8,...,1,0,0,0,0,1,0,0,0,0


In [38]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 45 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   symboling                193 non-null    int64  
 1   normalized-losses        193 non-null    int64  
 2   make                     193 non-null    float64
 3   fuel-type                193 non-null    int64  
 4   aspiration               193 non-null    int64  
 5   num-of-doors             193 non-null    int64  
 6   drive-wheels             193 non-null    int64  
 7   engine-location          193 non-null    int64  
 8   wheel-base               193 non-null    float64
 9   length                   193 non-null    float64
 10  width                    193 non-null    float64
 11  height                   193 non-null    float64
 12  curb-weight              193 non-null    int64  
 13  engine-size              193 non-null    int64  
 14  bore                     1

In [39]:
X = data.drop(columns=['price']).values # все признаки, кроме целевого
y = data['price'].values # целевой признак

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [41]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [42]:
X_train

array([[ 0.20976052,  0.26731676, -0.00218913, ..., -0.91287093,
        -0.21821789, -0.08084521],
       [ 0.20976052, -0.01529272,  0.33493649, ..., -0.91287093,
        -0.21821789, -0.08084521],
       [-1.36600145, -1.71094958,  1.00918772, ...,  1.09544512,
        -0.21821789, -0.08084521],
       ...,
       [-0.57812046,  0.26731676, -0.00218913, ...,  1.09544512,
        -0.21821789, -0.08084521],
       [-0.57812046,  0.07891044,  1.17775053, ...,  1.09544512,
        -0.21821789, -0.08084521],
       [-0.57812046,  1.3035515 ,  0.16637368, ...,  1.09544512,
        -0.21821789, -0.08084521]])

In [43]:
def print_metrics(y_test, y_pred):
    print("R2:", r2_score(y_test, y_pred))
    print("MSE:", mean_squared_error(y_test, y_pred))
    print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))
    print("MAE:", mean_absolute_error(y_test, y_pred))

**Linreg**

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression

In [44]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [45]:
print_metrics(y_test, lr.predict(X_test))

R2: 0.8767400539281057
MSE: 13851325.431686739
RMSE: 3721.7368837260296
MAE: 2477.250853793811


своя реализация LinReg

In [46]:
class linReg:
    def __init__(self, lr = 0.0001, iters = 5000):
        self.lr = lr
        self.iters = iters

    def fit(self, x, y):
        x = np.c_[np.ones(x.shape[0]), x]
        self.w = np.random.rand((x.shape[1]))

        for i in range(self.iters):
            self.w = self.w - self.lr * np.dot(x.T, (np.dot(x, self.w) - y)) * 2 / x.shape[0]

    def predict(self, x):
        x = np.c_[np.ones(x.shape[0]), x]
        return np.dot(x, self.w)

In [47]:
lr_ = linReg()
lr_.fit(X_train, y_train)
print_metrics(y_test, lr_.predict(X_test))

R2: 0.5936454608801681
MSE: 45664055.04273778
RMSE: 6757.518408612572
MAE: 5427.250776924238


Увеличим количество итераций

In [48]:
lr_ = linReg(iters=50000)
lr_.fit(X_train, y_train)

In [49]:
print_metrics(y_test, lr_.predict(X_test))

R2: 0.8799098501086123
MSE: 13495119.868991578
RMSE: 3673.5704524333787
MAE: 2343.5393152329852


Качество готовой модели и собственной могут отличаться. В собственной модели от величины шага и кол-ва итераций зависят правильность построения модели

**DecisionTreeRegressor**

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor

In [50]:
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor(random_state=42)
params = {'max_depth': np.arange(1,15,1),
          'max_features': np.arange(5,X.shape[1]-1,1)}
clf = GridSearchCV(dtr, params)
clf.fit(X_train, y_train)
clf.best_params_

{'max_depth': 6, 'max_features': 33}

In [51]:
print_metrics(y_test, clf.predict(X_test))

R2: 0.9349425786398476
MSE: 7310821.915176182
RMSE: 2703.853160801485
MAE: 1779.9327707151238


Качество данной модели немного хуже, чем в предыдущей

**Lasso**

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html#sklearn.linear_model.LassoCV

In [52]:
from sklearn.linear_model import Lasso, LassoCV

In [53]:
lasso = Lasso(random_state=42)
params = {'alpha': np.arange(0.1,2.05,0.05)}
clf = GridSearchCV(lasso, params)
clf.fit(X_train, y_train)
clf.best_params_

{'alpha': 0.1}

In [54]:
print_metrics(y_test, clf.predict(X_test))

R2: 0.8772532107057729
MSE: 13793659.484625574
RMSE: 3713.9816214711636
MAE: 2470.865533170151


Lasso с кросс-валидацией

In [55]:
lasso_cv = LassoCV(random_state=42)
lasso_cv.fit(X_train, y_train)

LassoCV(random_state=42)

In [56]:
print_metrics(y_test, lasso_cv.predict(X_test))

R2: 0.8749743675876691
MSE: 14049744.276502766
RMSE: 3748.298851012652
MAE: 2467.3398167136693


Lasso и LassoCV показывают примерно одинаковое качество.

**Ridge**

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html#sklearn.linear_model.RidgeCV

In [57]:
from sklearn.linear_model import Ridge, RidgeCV

In [58]:
ridge = Ridge(random_state=42)
params = {'alpha': np.arange(0.1,2.05,0.05)}
clf = GridSearchCV(ridge, params)
clf.fit(X_train, y_train)
clf.best_params_

{'alpha': 2.0000000000000004}

In [59]:
print_metrics(y_test, clf.predict(X_test))

R2: 0.8831635449678119
MSE: 13129486.199771138
RMSE: 3623.4632880396534
MAE: 2324.4374370099813


Ridge с кросс-валидацией

In [61]:
ridge_cv = RidgeCV(cv=5)
ridge_cv.fit(X_train, y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=5)

In [62]:
print_metrics(y_test, ridge_cv.predict(X_test))

R2: 0.8818619985825361
MSE: 13275747.358577553
RMSE: 3643.5898998896064
MAE: 2378.1821708838784


Тоже самое и с Ridge и RidgeCV

**Elastic Net**

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html#sklearn.linear_model.ElasticNet

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNetCV.html#sklearn.linear_model.ElasticNetCV

In [63]:
from sklearn.linear_model import ElasticNet, ElasticNetCV

In [64]:
params = {'alpha': np.arange(0.1,2.05,0.05),
          'l1_ratio': np.arange(0.1,1.1,0.1)}

clf = GridSearchCV(ElasticNet(random_state=42), params)
clf.fit(X_train, y_train)
clf.best_params_

{'alpha': 0.1, 'l1_ratio': 0.8}

In [65]:
print_metrics(y_test, clf.predict(X_test))

R2: 0.8831604824735886
MSE: 13129830.347285984
RMSE: 3623.510776482662
MAE: 2313.878317380594


ElasticNet без гиперпараметров

In [66]:
print_metrics(y_test, ElasticNet(random_state=42).fit(X_train, y_train).predict(X_test))

R2: 0.8367761828478278
MSE: 18342261.875225585
RMSE: 4282.786695041627
MAE: 2711.483915633681


ElasticNet с кросс-валидацией

In [67]:
en_cv = ElasticNetCV(cv=5, random_state=42)
en_cv.fit(X_train, y_train)

ElasticNetCV(cv=5, random_state=42)

In [68]:
print_metrics(y_test, en_cv.predict(X_test))

R2: 0.5896081689738716
MSE: 46117745.36014398
RMSE: 6791.004738633598
MAE: 4339.627064694045
