In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import tree, ensemble
import catboost as ct

In [2]:
df3 = pd.read_csv('housing.csv')
df3.head(5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


---

In [3]:
X = df3.drop('price',axis = 1)
y = df3['price']

In [4]:
print(X.shape,'\n',y.shape)

(545, 12) 
 (545,)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.3,random_state= 1)

In [6]:
class BinaryEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, binary_columns):
        self.binary_columns = binary_columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_transformed = X.copy()
        for column in self.binary_columns:
            X_transformed[column] = X_transformed[column].apply(lambda a: 1 if a == "yes" else 0)
        return X_transformed

In [7]:
binary_columns = list(X_train.select_dtypes(exclude=[np.number]).columns)
binary_columns.remove("furnishingstatus")
binary_columns

['mainroad',
 'guestroom',
 'basement',
 'hotwaterheating',
 'airconditioning',
 'prefarea']

In [8]:
num_columns = list(X_train.select_dtypes(include=[np.number]).columns)
num_columns

['area', 'bedrooms', 'bathrooms', 'stories', 'parking']

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('binary', BinaryEncoder(binary_columns), binary_columns),
        ('cat', OneHotEncoder(), ["furnishingstatus"]),
        ('num', StandardScaler(), num_columns)
    ])

In [10]:
sgd_regressor = SGDRegressor(penalty='l2', alpha=0.0005, random_state=2)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), 
                           ('regressor', sgd_regressor)])

In [11]:
param_grid = [
    {
        'preprocessor__num__with_mean': [True, False],
        'regressor': [tree.DecisionTreeRegressor(random_state=1)],
        'regressor__max_depth': [None,5,7,10,15]
    },
    {
        'preprocessor__num__with_mean': [True, False],
        'regressor': [ensemble.RandomForestRegressor(random_state=1, n_jobs=-1)],
        'regressor__n_estimators': [25,50,75,100],
        'regressor__max_depth': [None,5,7,10,15],
        'regressor__min_samples_leaf': [1,3,5],
    },
    {
        'preprocessor__num__with_mean': [True, False],
        'regressor': [ct.CatBoostRegressor(random_state=1, allow_writing_files=False, thread_count=-1, verbose=False)],
        'regressor__max_depth': [8,7,6, 5, 4, 3]
    },
    {
        'preprocessor__num__with_mean': [True]
        # здесь наш дефолт пайплайна - сгдРегрессор
    }
]

In [12]:
X_grid = pd.concat([X_train, X_test], ignore_index=True)
y_grid = pd.concat([y_train, y_test], ignore_index=True)

grid_search = GridSearchCV(pipeline, param_grid, cv=([(list(range(len(X_train))), list(range(len(X_train), len(X_grid))))]), 
                           scoring='neg_mean_absolute_error')

In [13]:
grid_search.fit(X_grid, y_grid)

In [14]:
grid_search.best_params_

{'preprocessor__num__with_mean': True,
 'regressor': <catboost.core.CatBoostRegressor at 0x1cc5e20ce50>,
 'regressor__max_depth': 6}

In [15]:
grid_search.best_score_

-826044.8231762612

In [16]:
pd.DataFrame(grid_search.cv_results_).sort_values("rank_test_score").head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessor__num__with_mean,param_regressor,param_regressor__max_depth,param_regressor__min_samples_leaf,param_regressor__n_estimators,params,split0_test_score,mean_test_score,std_test_score,rank_test_score
132,1.733828,0.0,0.015945,0.0,True,<catboost.core.CatBoostRegressor object at 0x0...,6,,,"{'preprocessor__num__with_mean': True, 'regres...",-826044.823176,-826044.823176,0.0,1
138,1.572101,0.0,0.017529,0.0,False,<catboost.core.CatBoostRegressor object at 0x0...,6,,,"{'preprocessor__num__with_mean': False, 'regre...",-826044.823176,-826044.823176,0.0,1
131,2.366719,0.0,0.016098,0.0,True,<catboost.core.CatBoostRegressor object at 0x0...,7,,,"{'preprocessor__num__with_mean': True, 'regres...",-834865.380809,-834865.380809,0.0,3
137,2.249304,0.0,0.014942,0.0,False,<catboost.core.CatBoostRegressor object at 0x0...,7,,,"{'preprocessor__num__with_mean': False, 'regre...",-834865.380809,-834865.380809,0.0,3
135,0.618308,0.0,0.01495,0.0,True,<catboost.core.CatBoostRegressor object at 0x0...,3,,,"{'preprocessor__num__with_mean': True, 'regres...",-838052.584873,-838052.584873,0.0,5
141,0.579933,0.0,0.015947,0.0,False,<catboost.core.CatBoostRegressor object at 0x0...,3,,,"{'preprocessor__num__with_mean': False, 'regre...",-838052.584873,-838052.584873,0.0,5
134,0.814726,0.0,0.015946,0.0,True,<catboost.core.CatBoostRegressor object at 0x0...,4,,,"{'preprocessor__num__with_mean': True, 'regres...",-839966.339163,-839966.339163,0.0,7
140,0.765679,0.0,0.014364,0.0,False,<catboost.core.CatBoostRegressor object at 0x0...,4,,,"{'preprocessor__num__with_mean': False, 'regre...",-839966.339163,-839966.339163,0.0,7
130,3.897926,0.0,0.01647,0.0,True,<catboost.core.CatBoostRegressor object at 0x0...,8,,,"{'preprocessor__num__with_mean': True, 'regres...",-840910.670392,-840910.670392,0.0,9
136,3.551828,0.0,0.01495,0.0,False,<catboost.core.CatBoostRegressor object at 0x0...,8,,,"{'preprocessor__num__with_mean': False, 'regre...",-840910.670392,-840910.670392,0.0,9


In [17]:
best_model = grid_search.best_estimator_
best_model

In [18]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error

In [19]:
preds = best_model.predict(X_test)

In [20]:
print('Тестовая средння абсолютная ошибка: {}'.format(mean_absolute_error(y_test,preds)))
print('Тестовая среднеквадратичная ошибка: {}'.format(mean_squared_error(y_test,preds)))
print('Тестовая средння абсолютная процентная ошибка: {}%'.format(mean_absolute_percentage_error(y_test,preds)*100))
print('Тестовое отношение mae к среднему: {}%'.format((mean_absolute_error(y_test,preds)/y_test.mean())*100))
print('Тестовое средняя цена: {}'.format(y_test.mean()))

Тестовая средння абсолютная ошибка: 341696.7728496742
Тестовая среднеквадратичная ошибка: 218746992811.38742
Тестовая средння абсолютная процентная ошибка: 8.613054865080073%
Тестовое отношение mae к среднему: 7.19809313332189%
Тестовое средняя цена: 4747045.731707317


Сохраняем

In [21]:
preprocessor_trained = best_model[0]
ct_trained = best_model[1]

In [22]:
with open("prepocess.pkl", "wb") as file:
    pickle.dump(preprocessor_trained, file)

ct_trained.save_model("ct_regr.ct")

Читаем

In [23]:
with open("prepocess.pkl", "rb") as file:
    preprocessor_loaded = pickle.load(file)

In [24]:
ct_reg_read = ct.CatBoostRegressor(random_state=1, allow_writing_files=False, thread_count=-1, verbose=False)
ct_reg_read.load_model("ct_regr.ct")

<catboost.core.CatBoostRegressor at 0x1cc5e21bf40>

Проверяем

In [25]:
preds_loaded = ct_reg_read.predict(preprocessor_loaded.transform(X_test))

In [26]:
print('Тестовая средння абсолютная ошибка: {} | {}'.format(mean_absolute_error(y_test,preds),mean_absolute_error(y_test,preds_loaded)))
print('Тестовая среднеквадратичная ошибка: {} | {}'.format(mean_squared_error(y_test,preds),mean_squared_error(y_test,preds_loaded)))
print('Тестовая средння абсолютная процентная ошибка: {} | {}%'.format(mean_absolute_percentage_error(y_test,preds)*100,mean_absolute_percentage_error(y_test,preds_loaded)*100))
print('Тестовое отношение mae к среднему: {} | {}%'.format((mean_absolute_error(y_test,preds)/y_test.mean())*100,(mean_absolute_error(y_test,preds_loaded)/y_test.mean())*100))
print('Тестовое средняя цена: {}'.format(y_test.mean()))

Тестовая средння абсолютная ошибка: 341696.7728496742 | 341696.7728496742
Тестовая среднеквадратичная ошибка: 218746992811.38742 | 218746992811.38742
Тестовая средння абсолютная процентная ошибка: 8.613054865080073 | 8.613054865080073%
Тестовое отношение mae к среднему: 7.19809313332189 | 7.19809313332189%
Тестовое средняя цена: 4747045.731707317
