In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# https://www.kaggle.com/datasets/abdelrahmankhalil/100000-uk-used-car-data-set

In [84]:
df = pd.read_csv('unzipped_archive/bmw.csv')

# Валидационная выборка

In [85]:
# train = 60%, validate = 20%, test = 20%

# сначала определяем train = 60%:

train, test = train_test_split(df, train_size = 0.6, random_state = 42)

In [4]:
len(train) / len(df)

0.5999443465355718

In [5]:
len(test) / len(df)

0.4000556534644282

In [86]:
# теперь определяем валидационную выборку и test - как 50% на 50% (от теста)
 
val, test = train_test_split(test, train_size = 0.5, random_state = 42)

In [7]:
len(val) / len(df)

0.19998144884519062

In [8]:
len(test) / len(df)

0.20007420461923756

# Список фичей

In [None]:
# Разновидности фичей:
    
# непрерывные (числовые) фичи = ['year', 'mileage', 'tax', 'mpg', 'engineSize']

# и 

# категориальные фичи = ['model', 'transmission', 'fuelType']

In [13]:
train.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
5133,X6,2015,25000,Semi-Auto,26100,Diesel,205,45.6,3.0
2121,X2,2019,31980,Automatic,4804,Petrol,145,34.0,2.0
6576,X2,2018,22995,Semi-Auto,24136,Petrol,145,38.7,2.0
8330,3 Series,2016,12999,Manual,52224,Diesel,125,62.8,2.0
3161,X3,2019,32980,Semi-Auto,1961,Diesel,150,54.3,2.0


In [9]:
# все наши признаки: (включая целевой признак - его надо убрать из фичей)

train.columns

Index(['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax',
       'mpg', 'engineSize'],
      dtype='object')

In [10]:
# список фичей сохраняем в переменную Х (x-большое):

X = ['model', 'year', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize']

In [11]:
# целевой признак сохраняем в переменную y (y-маленькое):

y = ['price']

In [12]:
# определяем категориальные фичи:

cat_features = ['model', 'transmission', 'fuelType']

# Запуск КэтБуста

In [19]:
# !pip install catboost

In [17]:
from catboost import CatBoostRegressor

In [None]:
# Первый запуск пробуем на тех же фичах, которые применяли в человеческом обучении:
# список фичей сохраняем в переменную Х (x-большое):

X = ['year', 'transmission', 'engineSize']

# целевой признак сохраняем в переменную y (y-маленькое):

y = ['price']


In [23]:
# передаем КэтБусту категориальные фичи и используемую метрику:

model = CatBoostRegressor(cat_features = cat_features, eval_metric = 'MAPE', verbose = 100)

In [24]:
model.fit(train[X], train[y], eval_set = (val[X], val[y]))

Learning rate set to 0.068263
0:	learn: 0.4599628	test: 0.4555782	best: 0.4555782 (0)	total: 39.9ms	remaining: 39.9s
100:	learn: 0.0899566	test: 0.0886397	best: 0.0886397 (100)	total: 4.39s	remaining: 39.1s
200:	learn: 0.0784329	test: 0.0788599	best: 0.0788599 (200)	total: 8.47s	remaining: 33.7s
300:	learn: 0.0729501	test: 0.0751211	best: 0.0751061 (299)	total: 12.7s	remaining: 29.5s
400:	learn: 0.0697490	test: 0.0735425	best: 0.0735425 (400)	total: 16.8s	remaining: 25.1s
500:	learn: 0.0673184	test: 0.0722698	best: 0.0722698 (500)	total: 22s	remaining: 22s
600:	learn: 0.0656606	test: 0.0715477	best: 0.0715477 (600)	total: 26.3s	remaining: 17.5s
700:	learn: 0.0641905	test: 0.0710945	best: 0.0710945 (700)	total: 30.6s	remaining: 13.1s
800:	learn: 0.0628803	test: 0.0707882	best: 0.0707575 (794)	total: 34.7s	remaining: 8.63s
900:	learn: 0.0617151	test: 0.0703845	best: 0.0703845 (900)	total: 38.9s	remaining: 4.27s
999:	learn: 0.0606650	test: 0.0702898	best: 0.0702730 (979)	total: 43s	remain

<catboost.core.CatBoostRegressor at 0x25e842c5060>

In [25]:
# Работаем на выборке test:

model.predict(test[X])

array([56918.76292215, 22437.01991601, 13323.39059668, ...,
       30077.53028673, 13712.85710997, 24544.95066628])

In [26]:
# Задаем новую колонку с предиктом цены:

test['price_predict'] = model.predict(test[X])

In [27]:
test

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,price_predict
4984,X6,2019,52990,Semi-Auto,3086,Diesel,145,34.9,3.0,56918.762922
4880,X1,2018,24081,Semi-Auto,13245,Diesel,150,60.1,2.0,22437.019916
9132,1 Series,2016,12999,Automatic,68949,Petrol,200,43.5,2.0,13323.390597
6451,1 Series,2020,11995,Semi-Auto,10,Petrol,150,34.5,2.0,33416.841004
7199,3 Series,2020,29875,Semi-Auto,150,Petrol,145,42.2,2.0,34999.003042
...,...,...,...,...,...,...,...,...,...,...
9627,3 Series,2015,14999,Automatic,78680,Diesel,160,52.3,3.0,14283.884697
6096,4 Series,2019,29676,Automatic,7365,Petrol,145,41.5,3.0,30842.309886
1747,5 Series,2019,30570,Automatic,3067,Hybrid,145,49.6,2.0,30077.530287
4023,3 Series,2016,14999,Automatic,70054,Hybrid,0,148.7,2.0,13712.857110


In [29]:
# расчет MAE:

test['err'] = abs(test['price_predict'] - test['price'])
test['err'].mean()

1557.829933640045

In [30]:
test

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,price_predict,err
4984,X6,2019,52990,Semi-Auto,3086,Diesel,145,34.9,3.0,56918.762922,3928.762922
4880,X1,2018,24081,Semi-Auto,13245,Diesel,150,60.1,2.0,22437.019916,1643.980084
9132,1 Series,2016,12999,Automatic,68949,Petrol,200,43.5,2.0,13323.390597,324.390597
6451,1 Series,2020,11995,Semi-Auto,10,Petrol,150,34.5,2.0,33416.841004,21421.841004
7199,3 Series,2020,29875,Semi-Auto,150,Petrol,145,42.2,2.0,34999.003042,5124.003042
...,...,...,...,...,...,...,...,...,...,...,...
9627,3 Series,2015,14999,Automatic,78680,Diesel,160,52.3,3.0,14283.884697,715.115303
6096,4 Series,2019,29676,Automatic,7365,Petrol,145,41.5,3.0,30842.309886,1166.309886
1747,5 Series,2019,30570,Automatic,3067,Hybrid,145,49.6,2.0,30077.530287,492.469713
4023,3 Series,2016,14999,Automatic,70054,Hybrid,0,148.7,2.0,13712.857110,1286.142890


In [31]:
# расчет МАРЕ:

test['err_percent'] = test['err'] / test['price']
test['err_percent'].mean()

0.07355150889681633

# Проверка CatBoost на всех имеющихся фичах

In [48]:
# список фичей сохраняем в переменную Х (x-большое):

X = ['model', 'year', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize']

In [49]:
# определяем категориальные фичи:

cat_features = ['model', 'transmission', 'fuelType']

In [50]:
# целевой признак сохраняем в переменную y (y-маленькое):

y = ['price']

In [55]:
# зададим параметры отдельно через словарь:

# Количество итераций (early_stopping_rounds - если улучшений нет за 200 итераций - останов) 


# и learning rate (скорость обучения) - если итераций сдалано 999, то результат не достигнут, надо поднять 'learning rate'

parameters = {'cat_features': cat_features, 
              'eval_metric': 'MAPE', 
              'verbose': 100, 
              'random_seed': 42, 
              'learning_rate': 0.04,
             'early_stopping_rounds': 200}

In [58]:
# передаем КэтБусту заданные параметры:

model = CatBoostRegressor(**parameters)

In [53]:
model.fit(train[X], train[y], eval_set = (val[X], val[y]))

0:	learn: 0.4711596	test: 0.4666621	best: 0.4666621 (0)	total: 51.4ms	remaining: 51.4s
100:	learn: 0.1053797	test: 0.1039319	best: 0.1039319 (100)	total: 4.78s	remaining: 42.5s
200:	learn: 0.0854406	test: 0.0856249	best: 0.0856249 (200)	total: 9.32s	remaining: 37s
300:	learn: 0.0788871	test: 0.0804969	best: 0.0804969 (300)	total: 13.7s	remaining: 31.9s
400:	learn: 0.0750965	test: 0.0777876	best: 0.0777876 (400)	total: 18.1s	remaining: 27.1s
500:	learn: 0.0721271	test: 0.0756791	best: 0.0756786 (499)	total: 22.5s	remaining: 22.4s
600:	learn: 0.0699669	test: 0.0743096	best: 0.0743096 (600)	total: 27.7s	remaining: 18.4s
700:	learn: 0.0685569	test: 0.0734598	best: 0.0734598 (700)	total: 33.3s	remaining: 14.2s
800:	learn: 0.0674260	test: 0.0729098	best: 0.0729033 (799)	total: 39s	remaining: 9.68s
900:	learn: 0.0662586	test: 0.0721544	best: 0.0721544 (900)	total: 44.3s	remaining: 4.87s
999:	learn: 0.0652414	test: 0.0716313	best: 0.0716313 (999)	total: 49.9s	remaining: 0us

bestTest = 0.07163

<catboost.core.CatBoostRegressor at 0x25e844a2e60>

In [42]:
# предикт на тесте:

test['price_predict_all'] = model.predict(test[X])

In [43]:
test

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,price_predict,err,err_percent,price_predict_all
4984,X6,2019,52990,Semi-Auto,3086,Diesel,145,34.9,3.0,56918.762922,3928.762922,0.074142,56918.762922
4880,X1,2018,24081,Semi-Auto,13245,Diesel,150,60.1,2.0,22437.019916,1643.980084,0.068269,22437.019916
9132,1 Series,2016,12999,Automatic,68949,Petrol,200,43.5,2.0,13323.390597,324.390597,0.024955,13323.390597
6451,1 Series,2020,11995,Semi-Auto,10,Petrol,150,34.5,2.0,33416.841004,21421.841004,1.785898,33416.841004
7199,3 Series,2020,29875,Semi-Auto,150,Petrol,145,42.2,2.0,34999.003042,5124.003042,0.171515,34999.003042
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9627,3 Series,2015,14999,Automatic,78680,Diesel,160,52.3,3.0,14283.884697,715.115303,0.047678,14283.884697
6096,4 Series,2019,29676,Automatic,7365,Petrol,145,41.5,3.0,30842.309886,1166.309886,0.039301,30842.309886
1747,5 Series,2019,30570,Automatic,3067,Hybrid,145,49.6,2.0,30077.530287,492.469713,0.016110,30077.530287
4023,3 Series,2016,14999,Automatic,70054,Hybrid,0,148.7,2.0,13712.857110,1286.142890,0.085749,13712.857110


In [44]:
# расчет MAE:

test['err'] = abs(test['price_predict_all'] - test['price'])
test['err'].mean()

1557.829933640045

In [45]:
# расчет МАРЕ:

test['err_percent'] = test['err'] / test['price']
test['err_percent'].mean()

0.07355150889681633

# Обучение на всех данных

In [None]:
# после того, как найдена точка переобучения - можно выборку валидации val добавить к выборке train

In [56]:
pd.concat([train, val])

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
5133,X6,2015,25000,Semi-Auto,26100,Diesel,205,45.6,3.0
2121,X2,2019,31980,Automatic,4804,Petrol,145,34.0,2.0
6576,X2,2018,22995,Semi-Auto,24136,Petrol,145,38.7,2.0
8330,3 Series,2016,12999,Manual,52224,Diesel,125,62.8,2.0
3161,X3,2019,32980,Semi-Auto,1961,Diesel,150,54.3,2.0
...,...,...,...,...,...,...,...,...,...
5152,3 Series,2019,27888,Semi-Auto,10622,Diesel,150,52.3,2.0
9910,5 Series,2012,11990,Automatic,84000,Diesel,165,51.4,3.0
535,3 Series,2016,13798,Semi-Auto,46673,Diesel,0,74.3,2.0
8310,1 Series,2017,11991,Automatic,108000,Petrol,150,47.9,2.0


In [57]:
train_full = pd.concat([train, val])

In [62]:
model.best_iteration_

999

In [63]:
# новые параметры с использованием model.best_iteration_

parameters = {'iterations': model.best_iteration_+1,
              'cat_features': cat_features, 
              'eval_metric': 'MAPE', 
              'verbose': 100, 
              'random_seed': 42, 
              'learning_rate': 0.04,
             'early_stopping_rounds': 200}

In [64]:
parameters

{'iterations': 1000,
 'cat_features': ['model', 'transmission', 'fuelType'],
 'eval_metric': 'MAPE',
 'verbose': 100,
 'random_seed': 42,
 'learning_rate': 0.04,
 'early_stopping_rounds': 200}

In [65]:
model.fit(train_full[X], train_full[y])

0:	learn: 0.4686652	total: 30.9ms	remaining: 30.9s
100:	learn: 0.1045521	total: 4.04s	remaining: 36s
200:	learn: 0.0867311	total: 8.74s	remaining: 34.8s
300:	learn: 0.0797848	total: 13.6s	remaining: 31.7s
400:	learn: 0.0755976	total: 18s	remaining: 26.9s
500:	learn: 0.0727070	total: 22.8s	remaining: 22.8s
600:	learn: 0.0708238	total: 27.4s	remaining: 18.2s
700:	learn: 0.0691556	total: 32s	remaining: 13.6s
800:	learn: 0.0677530	total: 36.2s	remaining: 8.98s
900:	learn: 0.0665484	total: 40.5s	remaining: 4.45s
999:	learn: 0.0656585	total: 44.1s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x25e844bb100>

In [71]:
test['predict_price_all_features'] = model.predict(test[X])

In [72]:
test

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,price_predict,err,err_percent,price_predict_all,predict_price_all_features
4984,X6,2019,52990,Semi-Auto,3086,Diesel,145,34.9,3.0,56918.762922,3928.762922,0.074142,56918.762922,56673.011485
4880,X1,2018,24081,Semi-Auto,13245,Diesel,150,60.1,2.0,22437.019916,1643.980084,0.068269,22437.019916,22454.951978
9132,1 Series,2016,12999,Automatic,68949,Petrol,200,43.5,2.0,13323.390597,324.390597,0.024955,13323.390597,13453.230985
6451,1 Series,2020,11995,Semi-Auto,10,Petrol,150,34.5,2.0,33416.841004,21421.841004,1.785898,33416.841004,33748.805487
7199,3 Series,2020,29875,Semi-Auto,150,Petrol,145,42.2,2.0,34999.003042,5124.003042,0.171515,34999.003042,34537.047645
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9627,3 Series,2015,14999,Automatic,78680,Diesel,160,52.3,3.0,14283.884697,715.115303,0.047678,14283.884697,14352.724254
6096,4 Series,2019,29676,Automatic,7365,Petrol,145,41.5,3.0,30842.309886,1166.309886,0.039301,30842.309886,30897.770873
1747,5 Series,2019,30570,Automatic,3067,Hybrid,145,49.6,2.0,30077.530287,492.469713,0.016110,30077.530287,30713.891506
4023,3 Series,2016,14999,Automatic,70054,Hybrid,0,148.7,2.0,13712.857110,1286.142890,0.085749,13712.857110,14048.492032


In [69]:
# Расчет ошибки МАЕ и МАРЕ в sklearn:

from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# для среднего:
print(mean_absolute_error(test['price'], test['predict_price_all_fratures']))
print(mean_absolute_percentage_error(test['price'], test['predict_price_all_fratures']))
print('*'*20)

1569.468867117051
0.07517689512107839
********************


# Метрики регрессии

In [None]:
# выше мы использовали метрики МАЕ и МАРЕ

In [None]:
# по умолчанию модель движется к снижению функции RMSE Root-mean-square deviation

# RMSE это функция оптимизации по умолчанию

# можно поменять функцию оптимизации: например на 'loss_function': 'МАЕ'
# или любую другую функцию поддерживаемую для оптимизации: 
# https://catboost.ai/en/docs/concepts/loss-functions-regression#usage-information

In [87]:
# список фичей сохраняем в переменную Х (x-большое):

X = ['model', 'year', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize']

In [88]:
# определяем категориальные фичи:

cat_features = ['model', 'transmission', 'fuelType']

In [89]:
# целевой признак сохраняем в переменную y (y-маленькое):

y = ['price']

In [100]:
# зададим параметры отдельно через словарь: (дополнительно задаем loss_function)

# Количество итераций (early_stopping_rounds - если улучшений нет за 200 итераций - останов) и learning rate (скорость обучения)

parameters = {'cat_features': cat_features,
              
              'loss_function': 'MAE',
              
              'eval_metric': 'MAPE', 
              'verbose': 100, 
              'random_seed': 42, 
              'learning_rate': 0.1,
             'early_stopping_rounds': 200}

In [101]:
# передаем КэтБусту заданные параметры:

model = CatBoostRegressor(**parameters)

In [102]:
model.fit(train[X], train[y], eval_set = (val[X], val[y]))

0:	learn: 0.3849345	test: 0.3793341	best: 0.3793341 (0)	total: 35.4ms	remaining: 35.4s
100:	learn: 0.0764009	test: 0.0774578	best: 0.0774578 (100)	total: 3.8s	remaining: 33.8s
200:	learn: 0.0666216	test: 0.0706498	best: 0.0706498 (200)	total: 8.03s	remaining: 31.9s
300:	learn: 0.0628577	test: 0.0695086	best: 0.0694924 (298)	total: 13.1s	remaining: 30.4s
400:	learn: 0.0605748	test: 0.0687890	best: 0.0687826 (399)	total: 17.8s	remaining: 26.5s
500:	learn: 0.0589764	test: 0.0683467	best: 0.0683222 (490)	total: 22.4s	remaining: 22.3s
600:	learn: 0.0579201	test: 0.0680567	best: 0.0680567 (600)	total: 27s	remaining: 17.9s
700:	learn: 0.0570454	test: 0.0678671	best: 0.0678659 (681)	total: 31.8s	remaining: 13.6s
800:	learn: 0.0563072	test: 0.0676018	best: 0.0676018 (800)	total: 36s	remaining: 8.94s
900:	learn: 0.0555798	test: 0.0674500	best: 0.0674384 (889)	total: 39.7s	remaining: 4.36s
999:	learn: 0.0548863	test: 0.0673326	best: 0.0672972 (972)	total: 43.2s	remaining: 0us

bestTest = 0.067297

<catboost.core.CatBoostRegressor at 0x25e88eb3ca0>

In [96]:
# предикт на тесте:

test['price_predict_all'] = model.predict(test[X])

In [97]:
test

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,price_predict_all
4984,X6,2019,52990,Semi-Auto,3086,Diesel,145,34.9,3.0,58459.411804
4880,X1,2018,24081,Semi-Auto,13245,Diesel,150,60.1,2.0,22281.628207
9132,1 Series,2016,12999,Automatic,68949,Petrol,200,43.5,2.0,14010.139323
6451,1 Series,2020,11995,Semi-Auto,10,Petrol,150,34.5,2.0,33475.689657
7199,3 Series,2020,29875,Semi-Auto,150,Petrol,145,42.2,2.0,32871.298896
...,...,...,...,...,...,...,...,...,...,...
9627,3 Series,2015,14999,Automatic,78680,Diesel,160,52.3,3.0,14153.645076
6096,4 Series,2019,29676,Automatic,7365,Petrol,145,41.5,3.0,31998.659982
1747,5 Series,2019,30570,Automatic,3067,Hybrid,145,49.6,2.0,31198.277053
4023,3 Series,2016,14999,Automatic,70054,Hybrid,0,148.7,2.0,13842.460815


In [98]:
# расчет MAE:

test['err'] = abs(test['price_predict_all'] - test['price'])
test['err'].mean()

1573.0615582947705

In [99]:
# расчет МАРЕ:

test['err_percent'] = test['err'] / test['price']
test['err_percent'].mean()

0.07239296660046018