In [None]:
%pip install -r requirements.txt

In [2]:
import numpy as np

import pandas as pd

from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

In [52]:
df = pd.read_csv('df_2.csv', index_col=0)

In [53]:
df = df.drop(['date', 'day', 'month', 'year', 'dayofweek', 'street', 'lat_long', 'price_sq_meter'], axis=1)


In [72]:
df

Unnamed: 0,rooms,district,floor,total_floors,total_area,living_area,kitchen_area,price,park_eko,attraction,south,eternal_fire,veter,city_center,floor_type
0,2,,5.0,9.0,49.8,30.0,9.0,4000.0,,,,,,,middle
1,2,Орджоникидзевский,13.0,14.0,66.2,40.0,10.5,3999.0,5.49,3.89,1.61,4.77,7.27,5.08,middle
2,3,Орджоникидзевский,7.0,10.0,65.0,46.0,9.0,5400.0,5.72,3.45,1.33,6.16,8.81,6.18,middle
3,3,Ленинский,5.0,5.0,62.4,43.0,8.0,4500.0,2.63,4.23,6.86,0.88,1.94,0.85,last
4,1,Ленинский,4.0,4.0,32.4,17.0,6.0,2300.0,4.14,6.30,9.34,3.38,1.50,3.16,last
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
460,3,Ленинский,5.0,5.0,74.0,56.0,9.0,5500.0,4.32,6.22,8.82,2.81,0.34,2.86,last
461,2,Ленинский,1.0,5.0,41.0,22.2,7.2,930.0,12.63,14.30,15.76,10.60,8.28,10.96,first
462,2,Правобережный,5.0,5.0,42.5,29.0,6.0,2650.0,1.74,3.41,6.40,1.03,2.77,0.31,last
463,2,Орджоникидзевский,3.0,9.0,50.0,30.0,9.0,3350.0,5.87,4.29,1.65,5.06,7.53,5.41,middle


In [57]:
categorical = ['district', 'floor', 'total_floors', 'floor_type']

df[categorical] = df[categorical].astype(str)

In [58]:
X = df.drop(['price'], axis=1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25, 
                                                    random_state=47,
                                                    shuffle=True,
                                                    #stratify=df['price']
                                                    )

In [59]:
print(f"Категориальные признаки: {categorical}")

numerical = list(set(X_train.columns) - set(categorical))
print(f"Числовые признаки: {numerical}")

Категориальные признаки: ['district', 'floor', 'total_floors', 'floor_type']
Числовые признаки: ['eternal_fire', 'total_area', 'south', 'kitchen_area', 'park_eko', 'veter', 'city_center', 'rooms', 'living_area', 'attraction']


In [60]:
pipe = Pipeline(
    [
        ("regressor", CatBoostRegressor())
    ]
)

param_grid = [
    {
        'regressor': [CatBoostRegressor(random_state=47, 
                                        eval_metric='RMSE',
                                        verbose=100,
                                        #silent=True,
                                        cat_features=categorical,
                                        early_stopping_rounds=20)],
        'regressor__depth': range(4, 12, 2),
        'regressor__learning_rate': np.linspace(0.01, 0.09, 7),
        'regressor__iterations': range(200, 1200, 200)
    }
]

grid = RandomizedSearchCV(pipe, param_grid, 
                          n_iter=10 ,
                          cv=3, 
                          scoring = 'neg_mean_squared_error', 
                          verbose=1)

In [None]:
%%time
grid.fit(X_train, y_train)

In [62]:
result = pd.DataFrame(grid.cv_results_)
result[['split0_test_score','split1_test_score','split2_test_score','mean_test_score','std_test_score']] = \
(result[['split0_test_score','split1_test_score','split2_test_score','mean_test_score','std_test_score']]*-1)**0.5

In [63]:
result.sort_values('mean_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__learning_rate,param_regressor__iterations,param_regressor__depth,param_regressor,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
6,10.902446,0.095686,0.003002,1.573482e-06,0.09,600,6,<catboost.core.CatBoostRegressor object at 0x0...,"{'regressor__learning_rate': 0.09, 'regressor_...",1101.061128,713.797674,899.876246,918.625783,,1
0,20.639137,1.844969,0.004002,0.00141383,0.09,1000,6,<catboost.core.CatBoostRegressor object at 0x0...,"{'regressor__learning_rate': 0.09, 'regressor_...",1102.427564,715.262683,900.288303,919.685865,,2
3,17.568792,0.06349,0.003333,0.0004709792,0.09,600,8,<catboost.core.CatBoostRegressor object at 0x0...,"{'regressor__learning_rate': 0.09, 'regressor_...",1112.7858,718.937108,907.960439,927.281863,,3
7,9.166378,0.074286,0.003673,0.0004766008,0.036667,800,4,<catboost.core.CatBoostRegressor object at 0x0...,{'regressor__learning_rate': 0.036666666666666...,1048.48906,707.378397,991.544296,927.878152,,4
9,31.385876,2.348036,0.003667,0.0004710895,0.023333,1000,8,<catboost.core.CatBoostRegressor object at 0x0...,{'regressor__learning_rate': 0.023333333333333...,1130.569963,711.973277,937.79807,942.436489,,5
4,22.105048,1.469353,0.003001,7.37001e-07,0.01,800,8,<catboost.core.CatBoostRegressor object at 0x0...,"{'regressor__learning_rate': 0.01, 'regressor_...",1129.355219,726.20215,949.873007,949.573852,,6
2,35.873364,1.529871,0.004665,0.0004717638,0.036667,800,10,<catboost.core.CatBoostRegressor object at 0x0...,{'regressor__learning_rate': 0.036666666666666...,1163.754171,705.740281,938.398457,954.460542,,7
8,17.303835,0.499827,0.003332,0.000471258,0.063333,400,10,<catboost.core.CatBoostRegressor object at 0x0...,{'regressor__learning_rate': 0.063333333333333...,1160.823649,718.757529,945.394857,958.80053,,8
5,2.351038,0.090777,0.001999,2.247832e-07,0.023333,200,4,<catboost.core.CatBoostRegressor object at 0x0...,{'regressor__learning_rate': 0.023333333333333...,1122.715686,737.43508,986.138189,962.0758,,9
1,8.434183,0.342995,0.193626,0.2695863,0.09,200,10,<catboost.core.CatBoostRegressor object at 0x0...,"{'regressor__learning_rate': 0.09, 'regressor_...",1161.871863,749.994083,965.858116,973.878714,,10


In [68]:
dset = pd.concat([pd.Series(X_train.columns), 
                  pd.Series(grid.best_estimator_.named_steps["regressor"].feature_importances_)], axis= 1 )
dset = dset.set_axis(['attr', 'importance'], 
                     axis=1).sort_values(by='importance', 
                                                ascending=True)
fig = go.Figure(go.Bar(
            x=dset["importance"],
            y=dset['attr'],
            orientation='h'))

fig.update_layout(width=1500, height=500)

In [69]:
%%time
y_final = grid.best_estimator_.predict(X_test)

CPU times: total: 0 ns
Wall time: 4 ms


In [70]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_final, squared=False)

840.7404000988691

In [71]:
import joblib
joblib.dump(grid, "model.pkl")

['model.pkl']