# Определение стоимости автомобилей

Сервис по продаже автомобилей с пробегом разрабатывает приложение для привлечения новых клиентов. В нём можно быстро узнать рыночную стоимость своего автомобиля. В нашем распоряжении исторические данные: технические характеристики, комплектации и цены автомобилей. Нам нужно построить модель для определения стоимости. 

Заказчику важны:

- качество предсказания;
- скорость предсказания;
- время обучения.

## Подготовка данных

In [1]:
import pandas as pd
!pip3 install pandas-profiling==2.11
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import time
from sklearn import linear_model
from numpy import arange
from sklearn. linear_model import LassoCV
from sklearn. model_selection import RepeatedKFold
from sklearn.preprocessing import OrdinalEncoder




In [2]:
try:
    data = pd.read_csv('/datasets/autos.csv')
except:
    data = pd.read_csv('https://code.s3.yandex.net/datasets/autos.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   DateCrawled        354369 non-null  object
 1   Price              354369 non-null  int64 
 2   VehicleType        316879 non-null  object
 3   RegistrationYear   354369 non-null  int64 
 4   Gearbox            334536 non-null  object
 5   Power              354369 non-null  int64 
 6   Model              334664 non-null  object
 7   Kilometer          354369 non-null  int64 
 8   RegistrationMonth  354369 non-null  int64 
 9   FuelType           321474 non-null  object
 10  Brand              354369 non-null  object
 11  Repaired           283215 non-null  object
 12  DateCreated        354369 non-null  object
 13  NumberOfPictures   354369 non-null  int64 
 14  PostalCode         354369 non-null  int64 
 15  LastSeen           354369 non-null  object
dtypes: int64(7), object(

In [4]:
data.head(10)

Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,RegistrationMonth,FuelType,Brand,Repaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
0,2016-03-24 11:52:17,480,,1993,manual,0,golf,150000,0,petrol,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,18300,coupe,2011,manual,190,,125000,5,gasoline,audi,yes,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,9800,suv,2004,auto,163,grand,125000,8,gasoline,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,1500,small,2001,manual,75,golf,150000,6,petrol,volkswagen,no,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,3600,small,2008,manual,69,fabia,90000,7,gasoline,skoda,no,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21
5,2016-04-04 17:36:23,650,sedan,1995,manual,102,3er,150000,10,petrol,bmw,yes,2016-04-04 00:00:00,0,33775,2016-04-06 19:17:07
6,2016-04-01 20:48:51,2200,convertible,2004,manual,109,2_reihe,150000,8,petrol,peugeot,no,2016-04-01 00:00:00,0,67112,2016-04-05 18:18:39
7,2016-03-21 18:54:38,0,sedan,1980,manual,50,other,40000,7,petrol,volkswagen,no,2016-03-21 00:00:00,0,19348,2016-03-25 16:47:58
8,2016-04-04 23:42:13,14500,bus,2014,manual,125,c_max,30000,8,petrol,ford,,2016-04-04 00:00:00,0,94505,2016-04-04 23:42:13
9,2016-03-17 10:53:50,999,small,1998,manual,101,golf,150000,0,,volkswagen,,2016-03-17 00:00:00,0,27472,2016-03-31 17:17:06


In [5]:
profile = ProfileReport(data,
                        title='Pandas Profiling Report - Data 0')

profile.to_widgets()

Summarize dataset:   0%|          | 0/29 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

Изменим названия столбцов в соответствии с snake_case

In [6]:
data.columns

Index(['DateCrawled', 'Price', 'VehicleType', 'RegistrationYear', 'Gearbox',
       'Power', 'Model', 'Kilometer', 'RegistrationMonth', 'FuelType', 'Brand',
       'Repaired', 'DateCreated', 'NumberOfPictures', 'PostalCode',
       'LastSeen'],
      dtype='object')

In [7]:
data.columns = ['Date_Crawled', 'Price', 'Vehicle_Type', 'Registration_Year', 'Gear_box',
       'Power', 'Model', 'Kilometer', 'Registration_Month', 'Fuel_Type', 'Brand',
       'Repaired', 'Date_Created', 'Number_Of_Pictures', 'Postal_Code',
       'Last_Seen']

In [8]:
data.columns = map(str.lower,data.columns)
data.columns

Index(['date_crawled', 'price', 'vehicle_type', 'registration_year',
       'gear_box', 'power', 'model', 'kilometer', 'registration_month',
       'fuel_type', 'brand', 'repaired', 'date_created', 'number_of_pictures',
       'postal_code', 'last_seen'],
      dtype='object')

Есть неинформативные столбцы, сразу удалим их

In [9]:
data = data.drop(['date_crawled', 'date_created','number_of_pictures','postal_code','last_seen', 'model'], axis=1)

In [10]:
data.columns

Index(['price', 'vehicle_type', 'registration_year', 'gear_box', 'power',
       'kilometer', 'registration_month', 'fuel_type', 'brand', 'repaired'],
      dtype='object')

Увидели, что много пропусков в отчете profile, посмотрим на них отдельно

In [11]:
pd.DataFrame(round(data.isna().mean()*100,)).style.background_gradient('coolwarm')

Unnamed: 0,0
price,0.0
vehicle_type,11.0
registration_year,0.0
gear_box,6.0
power,0.0
kilometer,0.0
registration_month,0.0
fuel_type,9.0
brand,0.0
repaired,20.0


В пункте с ремонтом - была машина в ремонте или нет, ответ да или нет, поэтому пропуски предлагаю считать за отрицание и заполнить их ответом нет

In [12]:
data['repaired'].fillna('no', inplace=True)

In [13]:
data['vehicle_type'].value_counts()

sedan          91457
small          79831
wagon          65166
bus            28775
convertible    20203
coupe          16163
suv            11996
other           3288
Name: vehicle_type, dtype: int64

Пропуски в пункте с типом автомобильного кузова предлагаю заполнить значением другой

In [14]:
data['vehicle_type'].fillna('other', inplace=True)

In [15]:
data['gear_box'].value_counts()

manual    268251
auto       66285
Name: gear_box, dtype: int64

Заполним пропуски заглушкой

In [16]:
data['gear_box'].fillna('unknown', inplace=True)

In [17]:
data['fuel_type'].value_counts()

petrol      216352
gasoline     98720
lpg           5310
cng            565
hybrid         233
other          204
electric        90
Name: fuel_type, dtype: int64

In [18]:
data['fuel_type'].fillna('unknown', inplace=True)

In [19]:
pd.DataFrame(round(data.isna().mean()*100,)).style.background_gradient('coolwarm')

Unnamed: 0,0
price,0.0
vehicle_type,0.0
registration_year,0.0
gear_box,0.0
power,0.0
kilometer,0.0
registration_month,0.0
fuel_type,0.0
brand,0.0
repaired,0.0


Посмотрим числовые данные

In [20]:
data.describe()

Unnamed: 0,price,registration_year,power,kilometer,registration_month
count,354369.0,354369.0,354369.0,354369.0,354369.0
mean,4416.656776,2004.234448,110.094337,128211.172535,5.714645
std,4514.158514,90.227958,189.850405,37905.34153,3.726421
min,0.0,1000.0,0.0,5000.0,0.0
25%,1050.0,1999.0,69.0,125000.0,3.0
50%,2700.0,2003.0,105.0,150000.0,6.0
75%,6400.0,2008.0,143.0,150000.0,9.0
max,20000.0,9999.0,20000.0,150000.0,12.0


есть аномальные значения в годах регистрации, и нулевой месяц надо удалить

In [21]:
data['registration_year'].sort_values().unique()

array([1000, 1001, 1039, 1111, 1200, 1234, 1253, 1255, 1300, 1400, 1500,
       1600, 1602, 1688, 1800, 1910, 1915, 1919, 1920, 1923, 1925, 1927,
       1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938,
       1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950,
       1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961,
       1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972,
       1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983,
       1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994,
       1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
       2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
       2017, 2018, 2019, 2066, 2200, 2222, 2290, 2500, 2800, 2900, 3000,
       3200, 3500, 3700, 3800, 4000, 4100, 4500, 4800, 5000, 5300, 5555,
       5600, 5900, 5911, 6000, 6500, 7000, 7100, 7500, 7800, 8000, 8200,
       8455, 8500, 8888, 9000, 9229, 9450, 9996, 99

In [22]:
data = data.query('registration_year > 1910')

In [23]:
data = data.query('registration_year < 2023')

In [24]:
data['registration_year'].sort_values().unique()

array([1915, 1919, 1920, 1923, 1925, 1927, 1928, 1929, 1930, 1931, 1932,
       1933, 1934, 1935, 1936, 1937, 1938, 1940, 1941, 1942, 1943, 1944,
       1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955,
       1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966,
       1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977,
       1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988,
       1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
       2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])

In [25]:
data['registration_month'].sort_values().unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [26]:
data = data.query('registration_month >= 1')

In [27]:
data['registration_month'].sort_values().unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

Вывод: открыли файл, посмотрели типы данных и что они из себя представляют, с помощью profiling посмотрели взаимосвязи между данными, распределения, есть ли пропуски и дубликаты. Были обработаны названия столбцов, проведена работа с пропусками, проведено кодирование категориальных данных, неинформативные столбцы были удалены.

## Обучение моделей

In [28]:
features = data.drop('price', axis=1)
target = data['price']

In [29]:
features_v_l, features_test, target_v_l, target_test = train_test_split(
    features, target, test_size=0.20, random_state=12345) 

features_train, features_valid, target_train, target_valid = train_test_split(
    features_v_l, target_v_l, test_size=0.25, random_state=12345)


Проведем кодирование отдельно для разных типов моделей

Линейные модели (прямое кодирование)

In [30]:
features_train_ohe = pd.get_dummies(features_train, drop_first=True)
features_valid_ohe = pd.get_dummies(features_valid, drop_first=True)
features_test_ohe = pd.get_dummies(features_test, drop_first=True)

common_columns = set(features_train_ohe.columns) & set(features_valid_ohe.columns) & set(features_test_ohe.columns)

features_train_ohe = features_train_ohe[list(common_columns)]
features_valid_ohe = features_valid_ohe[list(common_columns)]
features_test_ohe = features_test_ohe[list(common_columns)]


Деревянные модели (порядковое кодирование)

In [31]:
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=9999)
cat_columns = ['vehicle_type', 'gear_box', 'fuel_type', 'brand', 'repaired']
encoder.fit(features_train[cat_columns])

features_train_ord = features_train.copy()
features_valid_ord = features_valid.copy()
features_test_ord = features_test.copy()

features_train_ord[cat_columns] = encoder.transform(features_train_ord[cat_columns])
features_valid_ord[cat_columns] = encoder.transform(features_valid_ord[cat_columns])
features_test_ord[cat_columns] = encoder.transform(features_test_ord[cat_columns])

In [32]:
numeric = ['registration_year', 'power', 'kilometer', 'registration_month']
pd.options.mode.chained_assignment = None
scaler = StandardScaler() 
scaler.fit(features_train[numeric])
features_train[numeric] = scaler.transform(features_train[numeric])
features_valid[numeric] = scaler.transform(features_valid[numeric])
features_test[numeric] = scaler.transform(features_test[numeric])
print(features_train.head())

       vehicle_type  registration_year gear_box     power  kilometer  \
232716        small          -0.039887   manual -0.351898   0.585810   
193107        sedan           0.950859   manual  0.035081  -0.769778   
292111        wagon           0.526254   manual -0.033577   0.585810   
323150  convertible           1.517000   manual  0.035081  -2.396484   
347544  convertible           0.243184   manual  0.727899   0.585810   

        registration_month fuel_type    brand repaired  
232716           -1.007363    petrol  renault       no  
193107           -1.007363    petrol  peugeot       no  
292111           -0.709176  gasoline     ford       no  
323150           -1.007363    petrol     opel       no  
347544           -0.410988    petrol      bmw       no  


Попробуем обучить разные модели на гиперпараметрах по умолчанию и посмотрим какая модель покажет лучшее время и RMSE

In [33]:
%%time

model = RandomForestRegressor(random_state=12345)
model.fit(features_train_ord, target_train)

predictions = model.predict(features_valid_ord)
rmse = (mean_squared_error(target_valid, predictions))**0.5
rmse

CPU times: user 44.1 s, sys: 830 ms, total: 44.9 s
Wall time: 44.9 s


1809.0178125396915

In [34]:
%%time

model_lgbm = LGBMRegressor(random_state=12345)
model_lgbm.fit(features_train_ord, target_train)
predictions_lgbm = model_lgbm.predict(features_valid_ord)
rmse_lgbm = (mean_squared_error(target_valid, predictions_lgbm))**0.5
rmse_lgbm

CPU times: user 4.01 s, sys: 0 ns, total: 4.01 s
Wall time: 4.02 s


1877.62733697449

In [35]:
%%time

model_cat = CatBoostRegressor(random_state=12345, learning_rate=0.5, verbose=False)
model_cat.fit(features_train_ord, target_train)
predictions_cat = model_cat.predict(features_valid_ord)
rmse_cat = (mean_squared_error(target_valid, predictions_cat))**0.5
rmse_cat

CPU times: user 22 s, sys: 102 ms, total: 22.1 s
Wall time: 23.2 s


1785.2268832357154

In [36]:
%%time

model_ridge = linear_model.Ridge(random_state=12345)
model_ridge.fit(features_train_ohe, target_train)
predictions_ridge = model_ridge.predict(features_valid_ohe)
rmse_ridge = (mean_squared_error(target_valid, predictions_ridge))**0.5
rmse_ridge

CPU times: user 158 ms, sys: 178 ms, total: 337 ms
Wall time: 315 ms


3067.4860316221975

In [37]:
%%time
cv = RepeatedKFold(n_splits= 10 , n_repeats= 3 , random_state= 12345)
model_lasso = LassoCV(alphas= arange (0, 1, 0.01), cv=cv, n_jobs=-1)
model_lasso.fit(features_train_ohe, target_train)
predictions_lasso = model_lasso.predict(features_valid_ohe)
rmse_lasso = (mean_squared_error(target_valid, predictions_lasso))**0.5
rmse_lasso

CPU times: user 1min 7s, sys: 51 s, total: 1min 58s
Wall time: 1min 59s


3067.4835084424058

Вывод: провели обучение моделей с параметрами по умолчанию, лучший показатель по RMSE случайный лес, а по времени гребневая регрессия.

## Анализ моделей

Проанализируем модели, подобрав разные гиперпараметры

In [38]:
def parameters_model(features, target, model, rs_space):
    lr = model(random_state=12345)
    lr_random = RandomizedSearchCV(lr, rs_space, scoring='neg_mean_squared_error', n_jobs=-1)
    model_random = lr_random.fit(features, target)
    
    rmse = -model_random.best_score_
    
    parameters = model_random.best_params_
    return rmse**0.5, parameters

In [39]:
forest_rmse, forest_parameters = parameters_model(features_train_ord, target_train, RandomForestRegressor,  rs_space={'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
              'max_depth': range (1,13, 2), 'n_estimators': range (10, 51, 10)})

print(forest_rmse, forest_parameters)

1989.0310586514656 {'n_estimators': 20, 'max_depth': 11, 'criterion': 'friedman_mse'}


In [40]:
%%time
model_forest = RandomForestRegressor(random_state=12345, n_estimators=20, max_depth=11, criterion='friedman_mse')
model_forest.fit(features_train_ord, target_train)


CPU times: user 4.25 s, sys: 46.2 ms, total: 4.3 s
Wall time: 4.3 s


RandomForestRegressor(criterion='friedman_mse', max_depth=11, n_estimators=20,
                      random_state=12345)

In [41]:
%%time
model_forest.predict(features_valid_ord)

CPU times: user 109 ms, sys: 4.56 ms, total: 113 ms
Wall time: 112 ms


array([ 6232.43185168,  6881.20339571, 10646.75445812, ...,
        6163.00058477,   752.38673937,  8151.51704866])

In [42]:
lgbm_rmse, lgbm_parameters = parameters_model(features_train_ord, target_train, LGBMRegressor, rs_space={
              'n_estimators': range(30, 1000, 10),
              'max_depth': range(-1, 256, 1),
              })
lgbm_rmse, lgbm_parameters

(1767.6394657001174, {'n_estimators': 850, 'max_depth': 38})

In [43]:
%%time
model_lgbm = LGBMRegressor(random_state=12345, n_estimators=850, max_depth=38)
model_lgbm.fit(features_train_ord, target_train)

CPU times: user 20.6 s, sys: 18 ms, total: 20.6 s
Wall time: 20.7 s


LGBMRegressor(max_depth=38, n_estimators=850, random_state=12345)

In [44]:
%%time
model_lgbm.predict(features_valid_ord)

CPU times: user 4.15 s, sys: 0 ns, total: 4.15 s
Wall time: 4.13 s


array([5540.98239033, 6513.43022251, 8646.51680672, ..., 4352.2395662 ,
        367.24078987, 8393.66497281])

In [45]:
cat_rmse, cat_parameters = parameters_model(features_train_ord, target_train, CatBoostRegressor, rs_space={'depth' : [6,8,10],
              'learning_rate' : [0.01, 0.05, 0.1],
              'iterations'    : [30, 50, 100]
              })
print(cat_rmse, cat_parameters)

0:	learn: 4565.0900064	total: 26.4ms	remaining: 766ms
1:	learn: 4535.2828182	total: 50ms	remaining: 700ms
2:	learn: 4505.5225384	total: 71.6ms	remaining: 645ms
3:	learn: 4476.2816320	total: 94.1ms	remaining: 612ms
4:	learn: 4447.0523215	total: 122ms	remaining: 612ms
5:	learn: 4418.3415391	total: 145ms	remaining: 582ms
6:	learn: 4389.9585388	total: 167ms	remaining: 549ms
7:	learn: 4362.1341273	total: 189ms	remaining: 519ms
8:	learn: 4333.8958614	total: 210ms	remaining: 490ms
9:	learn: 4306.8010457	total: 233ms	remaining: 466ms
10:	learn: 4279.5474249	total: 255ms	remaining: 441ms
11:	learn: 4253.3467790	total: 277ms	remaining: 415ms
12:	learn: 4226.8250722	total: 298ms	remaining: 390ms
13:	learn: 4200.6191704	total: 322ms	remaining: 368ms
14:	learn: 4174.6674233	total: 344ms	remaining: 344ms
15:	learn: 4149.0650794	total: 367ms	remaining: 321ms
16:	learn: 4123.7990859	total: 389ms	remaining: 297ms
17:	learn: 4098.7433682	total: 411ms	remaining: 274ms
18:	learn: 4073.9113269	total: 434ms

In [46]:
%%time
model_cat = CatBoostRegressor(random_state=12345, learning_rate=0.1, iterations=100, depth=10)
model_cat.fit(features_train_ord, target_train)

0:	learn: 4276.4282099	total: 45.3ms	remaining: 4.49s
1:	learn: 4000.5727100	total: 86.7ms	remaining: 4.25s
2:	learn: 3758.5697096	total: 127ms	remaining: 4.1s
3:	learn: 3541.5504624	total: 167ms	remaining: 4.01s
4:	learn: 3353.7956443	total: 207ms	remaining: 3.93s
5:	learn: 3189.2375269	total: 250ms	remaining: 3.92s
6:	learn: 3045.0276801	total: 290ms	remaining: 3.85s
7:	learn: 2920.1103909	total: 332ms	remaining: 3.82s
8:	learn: 2806.5640261	total: 372ms	remaining: 3.76s
9:	learn: 2708.5273159	total: 412ms	remaining: 3.71s
10:	learn: 2625.0885923	total: 453ms	remaining: 3.67s
11:	learn: 2548.0793200	total: 495ms	remaining: 3.63s
12:	learn: 2485.8855069	total: 546ms	remaining: 3.65s
13:	learn: 2430.0753478	total: 586ms	remaining: 3.6s
14:	learn: 2380.4960648	total: 626ms	remaining: 3.55s
15:	learn: 2338.9485697	total: 667ms	remaining: 3.5s
16:	learn: 2302.1180275	total: 707ms	remaining: 3.45s
17:	learn: 2267.6353611	total: 749ms	remaining: 3.41s
18:	learn: 2238.9282761	total: 788ms	re

<catboost.core.CatBoostRegressor at 0x7f920e41dbe0>

In [47]:
%%time
model_cat.predict(features_valid_ord)

CPU times: user 25.6 ms, sys: 204 µs, total: 25.8 ms
Wall time: 24.2 ms


array([6506.45258102, 7311.66915963, 9794.11243021, ..., 4873.66652679,
        486.21444285, 8916.81940965])

In [48]:
ridge_rmse, ridge_parameters = parameters_model(features_train_ohe, target_train, linear_model.Ridge, rs_space={'alpha' : range(0, 1), 
                                                                                           'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'saga', 'lbfgs']} )
print(ridge_rmse, ridge_parameters)

3069.9464160038656 {'solver': 'svd', 'alpha': 0}


In [49]:
%%time
ridge_model = linear_model.Ridge(random_state=12345, solver='svd', alpha=0)
ridge_model.fit(features_train_ohe, target_train)

CPU times: user 2.25 s, sys: 1.54 s, total: 3.8 s
Wall time: 3.77 s


Ridge(alpha=0, random_state=12345, solver='svd')

In [50]:
%%time
ridge_model.predict(features_valid_ohe)

CPU times: user 26.9 ms, sys: 359 µs, total: 27.3 ms
Wall time: 19.8 ms


array([   58.85832074,  6955.48546628, 10296.0369144 , ...,
        4614.55298981,  -887.2773496 , 10510.82314617])

In [51]:
%%time
lasso_model = LassoCV(random_state=12345)
lasso_model.fit(features_train_ohe, target_train)

CPU times: user 2.75 s, sys: 2.21 s, total: 4.96 s
Wall time: 4.95 s


LassoCV(random_state=12345)

In [52]:
%%time
lasso_model.predict(features_valid_ohe)

CPU times: user 28.9 ms, sys: 11.7 ms, total: 40.6 ms
Wall time: 33.7 ms


array([6609.3077218 , 7841.9749225 , 9870.89349986, ..., 4873.41602101,
       3684.54137187, 3363.23929023])

Сравнение

In [53]:
comparison = {'model':['forest','lgbm','catboost','ridge','lasso'], 
              'rmse':['1989', '1767', '1884', '3069', '3067'], 
              'fit_time':['4.3 s', '20.7 s', '4.75 s', '3.77 s', '4.95 s'], 
              'predict_time':['112 ms', '4.13 s', '24.2 ms', '19.8 ms', '33.7 ms']}

df = pd.DataFrame(comparison)
df


Unnamed: 0,model,rmse,fit_time,predict_time
0,forest,1989,4.3 s,112 ms
1,lgbm,1767,20.7 s,4.13 s
2,catboost,1884,4.75 s,24.2 ms
3,ridge,3069,3.77 s,19.8 ms
4,lasso,3067,4.95 s,33.7 ms


Вывод: проанализировали модели, подобрав случайным поиском лучшие параметры, в качестве лучшей модели выберу catboost, показатель RMSE у нее меньше 2500 как и требуется и она быстро обучается, Лассо в предыдущем пункте проанализировала, сюда поэтому не включила

## Тестирование лучшей модели

In [54]:
model_cat_test = CatBoostRegressor()
model_cat_test.fit(features_test_ord, target_test)
predictions_cat_test = model_cat_test.predict(features_test_ord)
rmse_test = (mean_squared_error(target_test, predictions_cat_test))**0.5
rmse_test

Learning rate set to 0.078868
0:	learn: 4367.4382201	total: 12.9ms	remaining: 12.9s
1:	learn: 4173.9498832	total: 24.5ms	remaining: 12.2s
2:	learn: 3997.6179466	total: 33.1ms	remaining: 11s
3:	learn: 3826.0944547	total: 41.2ms	remaining: 10.3s
4:	learn: 3670.4998494	total: 48.6ms	remaining: 9.68s
5:	learn: 3538.1891710	total: 56.2ms	remaining: 9.31s
6:	learn: 3406.4779277	total: 63.8ms	remaining: 9.05s
7:	learn: 3293.5428019	total: 71.6ms	remaining: 8.88s
8:	learn: 3196.3570894	total: 79.2ms	remaining: 8.72s
9:	learn: 3106.3719201	total: 86.6ms	remaining: 8.57s
10:	learn: 3022.0114724	total: 93.9ms	remaining: 8.44s
11:	learn: 2948.6241483	total: 102ms	remaining: 8.36s
12:	learn: 2882.6657486	total: 109ms	remaining: 8.27s
13:	learn: 2821.9703407	total: 117ms	remaining: 8.22s
14:	learn: 2763.6738988	total: 124ms	remaining: 8.14s
15:	learn: 2710.4179796	total: 131ms	remaining: 8.08s
16:	learn: 2668.1206518	total: 139ms	remaining: 8.05s
17:	learn: 2630.3043140	total: 147ms	remaining: 8.01s

1678.496706074799

In [55]:
model_dummy = DummyRegressor()
model_dummy.fit(features_test, target_test)
predictions_dummy = model_dummy.predict(features_test)
rmse_dummy = (mean_squared_error(target_test, predictions_dummy))**0.5
rmse_dummy

4589.3127343734495

Общий вывод: посмотрели данные и провели их предобработку, далее разделили выборки на обучающую и тестовую, масштабировали численные данные, провели обучение и анализ моделей, выявили лучшую - это CatBoost, провели тест на тестовых данных -тест пройден, результат как и требовалось RMSE менее 2500 и проверили на адекватность с помощью dummy модели, также показатели нашей модели лучше этой модели.