In [1]:
!pip install geodist >> none
from geodist import GeoDist

!pip install catboost >> none
from catboost import CatBoostRegressor, Pool

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import pickle

# from google.colab import drive
# drive.mount('/content/gdrive')

import warnings
warnings.filterwarnings('ignore')

# Анализ и обработка данных

In [2]:
df = pd.read_csv('formatted.csv')
df.head()

Unnamed: 0,id,coords,rooms_count,year,house_floors,material,floor,flat_area,kitchen_area,balcony,metro,condition,price,url
0,2624808613,"(55.680879, 37.738863)",1,2021.0,13.0,панельный,13.0,36.7,10.3,лоджия,21–30 мин.,косметический,9500000,/moskva/kvartiry/1-k._kvartira_367m_1313et._26...
1,2532916318,"(55.796471, 37.609919)",студия,1971.0,8.0,блочный,5.0,32.0,,,6–10 мин.,дизайнерский,10490000,/moskva/kvartiry/apartamenty-studiya_32m_58et....
2,2587872760,"(55.740624, 37.66645)",2,1984.0,12.0,панельный,4.0,52.0,10.0,лоджия,6–10 мин.,евро,17500000,/moskva/kvartiry/2-k._kvartira_52m_412et._2587...
3,2543206638,"(55.829437, 37.414219)",студия,,4.0,блочный,1.0,21.1,,,16–20 мин.,евро,3258000,/moskva/kvartiry/apartamenty-studiya_211m_14et...
4,2658418970,"(55.805169, 37.540695)",3,,8.0,кирпичный,1.0,68.7,11.0,,6–10 мин.,требует ремонта,16500000,/moskva/kvartiry/3-k._kvartira_687m_18et._2658...


In [11]:
df.shape

(16250, 11)

In [4]:
df = df.drop(['id', 'url'], axis=1)


def dist(x):
        s = [float(s) for s in re.findall(r'-?\d+\.?\d*', x)]
        s[0], s[1] = s[1], s[0]
        s = tuple(s)
        res = float(format(GeoDist([s]).distance(37.617734, 55.751999)/1000, '.3f'))
        return res

df['coords'] = [dist(x) for x in df['coords']]


df = df.dropna(thresh=4)
df = df.reset_index(drop=True)

df.year = df.year.replace(0, np.nan)
df.year[932] = 2021.0
df.year[990] = 1976.0
df.year[1706] = 1956.0


monolit = df.loc[(df.material == 'монолитный') & (df.year.isna() == False)]
panel = df.loc[(df.material == 'панельный') & (df.year.isna() == False)]
brick = df.loc[(df.material == 'кирпичный') & (df.year.isna() == False)]
monolit_brick = df.loc[(df.material == 'монолитно-кирпичный') & (df.year.isna() == False)]
block = df.loc[(df.material == 'блочный') & (df.year.isna() == False)]

for i in range(df.shape[0]):
    if df.year[i] != df.year[i]:
        if df.material[i] == 'монолитный':
            df.year[i] = monolit.year.mean()
        elif df.material[i] == 'панельный':
            df.year[i] = panel.year.mean()
        elif df.material[i] == 'кирпичный':
            df.year[i] = brick.year.mean()
        elif df.material[i] == 'блочный':
            df.year[i] = block.year.mean()
        else:
            df.year[i] = monolit_brick.year.mean()

df.year = df.year.astype(int)


kitchen_coef = df.flat_area.mean() / df.kitchen_area.mean()

for i in range(df.shape[0]):
    if df.kitchen_area[i] != df.kitchen_area[i]:
        df.kitchen_area[i] = format(df.flat_area[i] / kitchen_coef, '.1f')
        if df.kitchen_area[i] < 5: df.kitchen_area[i] = 5


df.balcony = df.balcony.fillna('нет')
df.metro = df.metro.fillna('6–10 мин.')


for i in range(df.shape[0]):
    if df.condition[i] != df.condition[i]:
        if df.coords[i] < 10:
            if 2000 < df.year[i] < 2020 and df.price[i] > df.price.mean():
                df.condition[i] = 'евро'
            else:
                df.condition[i] = 'косметический'
        else:
            df.condition[i] = 'требует ремонта'

df.sample(7)

Unnamed: 0,coords,rooms_count,year,house_floors,material,floor,flat_area,kitchen_area,balcony,metro,condition,price
12663,9.842,6,1979,3.0,панельный,1.0,385.8,40.0,нет,21–30 мин.,косметический,270060000
8624,5.845,студия,1961,2.0,кирпичный,2.0,17.8,,нет,6–10 мин.,косметический,3180000
147,15.916,3,1975,9.0,панельный,2.0,77.0,10.4,нет,21–30 мин.,косметический,11000000
4351,7.353,1,1961,5.0,кирпичный,3.0,30.5,6.0,нет,11–15 мин.,евро,11945000
2958,13.849,студия,1969,9.0,панельный,1.0,18.6,,нет,11–15 мин.,требует ремонта,3700000
14543,5.266,4,2020,26.0,монолитный,21.0,176.0,15.0,нет,6–10 мин.,дизайнерский,115000000
14138,2.201,6,1961,3.0,кирпичный,2.0,160.0,11.0,балкон,6–10 мин.,требует ремонта,130000000


In [None]:
df.corr().style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,coords,year,house_floors,floor,flat_area,kitchen_area,price
coords,1.0,0.06,-0.01,-0.12,-0.43,-0.34,-0.45
year,0.06,1.0,0.39,0.33,0.23,0.27,0.19
house_floors,-0.01,0.39,1.0,0.81,0.16,0.18,0.02
floor,-0.12,0.33,0.81,1.0,0.28,0.26,0.13
flat_area,-0.43,0.23,0.16,0.28,1.0,0.65,0.75
kitchen_area,-0.34,0.27,0.18,0.26,0.65,1.0,0.51
price,-0.45,0.19,0.02,0.13,0.75,0.51,1.0


In [5]:
num_cols = ['coords',
            'year',
            'house_floors',
            'floor',
            'flat_area',
            'kitchen_area']

cat_cols = ['rooms_count',
            'material',
            'balcony',
            'metro',
            'condition']

target_col = df.price

In [6]:
df = df.iloc[:, :-1]

# Обучение

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df, target_col)

In [8]:
boosting_model = CatBoostRegressor(cat_features = cat_cols)

In [13]:
params = {
    'depth' : 9,
    'learning_rate' : 0.04,
    'cat_features': cat_cols,
    'iterations' : 1000
}

boosting_model = CatBoostRegressor(**params)
boosting_model.fit(X_train, y_train, early_stopping_rounds=100)

0:	learn: 112871740.6487230	total: 59.1ms	remaining: 59.1s
1:	learn: 110017915.8229798	total: 101ms	remaining: 50.5s
2:	learn: 107493820.2774743	total: 154ms	remaining: 51.1s
3:	learn: 105001035.8472976	total: 202ms	remaining: 50.3s
4:	learn: 102529660.4849972	total: 269ms	remaining: 53.6s
5:	learn: 100056417.8600391	total: 341ms	remaining: 56.5s
6:	learn: 97877777.3262960	total: 418ms	remaining: 59.3s
7:	learn: 95702124.0009309	total: 508ms	remaining: 1m 2s
8:	learn: 93587247.2533278	total: 582ms	remaining: 1m 4s
9:	learn: 91614142.4904040	total: 662ms	remaining: 1m 5s
10:	learn: 89714733.5940774	total: 731ms	remaining: 1m 5s
11:	learn: 87901779.3943113	total: 785ms	remaining: 1m 4s
12:	learn: 86196083.0599158	total: 865ms	remaining: 1m 5s
13:	learn: 84579825.6367348	total: 944ms	remaining: 1m 6s
14:	learn: 82941902.9869773	total: 1.02s	remaining: 1m 7s
15:	learn: 81599316.1934229	total: 1.1s	remaining: 1m 7s
16:	learn: 80104063.7037773	total: 1.19s	remaining: 1m 8s
17:	learn: 7879719

<catboost.core.CatBoostRegressor at 0x7f1e6c7ddcc0>

In [15]:
pred = boosting_model.predict(X_test)
r2 = r2_score(y_test, pred)
print('R2: {:.2f}'.format(r2))

R2: 0.84


In [17]:
filename = 'lct_model.sav'
pickle.dump(boosting_model, open(filename, 'wb'))