In [29]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
import joblib
import json

In [3]:
# Load the apartment listings from the CSV file in the current working directory.
df = pd.read_csv("./data.csv")

In [4]:
# Show the loaded frame as the cell output (pandas elides the middle rows in the rendering).
df

Unnamed: 0,Price,Apartment type,Metro station,Minutes to metro,Region,Number of rooms,Area,Living area,Kitchen area,Floor,Number of floors,Renovation
0,6300000.0,Secondary,Опалиха,6.0,Moscow region,1.0,30.60,11.1,8.5,25.0,25,Cosmetic
1,9000000.0,Secondary,Павшино,2.0,Moscow region,1.0,49.20,20.0,10.0,6.0,15,European-style renovation
2,11090000.0,Secondary,Мякинино,14.0,Moscow region,1.0,44.70,16.2,13.1,10.0,25,Cosmetic
3,8300000.0,Secondary,Строгино,8.0,Moscow region,1.0,35.10,16.0,11.0,12.0,33,European-style renovation
4,6450000.0,Secondary,Опалиха,6.0,Moscow region,1.0,37.70,15.2,4.0,5.0,5,Without renovation
...,...,...,...,...,...,...,...,...,...,...,...,...
22671,4768792.0,New building,Котельники,8.0,Moscow region,0.0,31.75,13.0,5.0,4.0,17,Cosmetic
22672,5900000.0,New building,Домодедовская,25.0,Moscow region,1.0,31.60,10.1,12.2,11.0,15,Cosmetic
22673,3700000.0,New building,Котельники,30.0,Moscow region,0.0,18.00,15.0,8.1,17.0,17,Cosmetic
22674,5768869.0,New building,Жулебино,14.0,Moscow region,2.0,36.39,22.0,6.6,12.0,14,Cosmetic


In [5]:
# Summarise column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22676 entries, 0 to 22675
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             22676 non-null  float64
 1   Apartment type    22676 non-null  object 
 2   Metro station     22676 non-null  object 
 3   Minutes to metro  22676 non-null  float64
 4   Region            22676 non-null  object 
 5   Number of rooms   22676 non-null  float64
 6   Area              22676 non-null  float64
 7   Living area       22676 non-null  float64
 8   Kitchen area      22676 non-null  float64
 9   Floor             22676 non-null  float64
 10  Number of floors  22676 non-null  int64  
 11  Renovation        22676 non-null  object 
dtypes: float64(7), int64(1), object(4)
memory usage: 2.1+ MB


In [6]:
# Count the missing values in each column and print the resulting Series.
null_counts = df.isna().sum()
print(null_counts)

Price               0
Apartment type      0
Metro station       0
Minutes to metro    0
Region              0
Number of rooms     0
Area                0
Living area         0
Kitchen area        0
Floor               0
Number of floors    0
Renovation          0
dtype: int64


In [7]:
# Remove the "Region" column from the working frame.
# Rebinding `df` (instead of `inplace=True`) together with `errors="ignore"`
# makes this cell safe to re-run: the second run no longer raises KeyError
# because the column is already gone.
df = df.drop(columns="Region", errors="ignore")

In [8]:
# List the distinct renovation categories present in the data.
df["Renovation"].unique()

array(['Cosmetic', 'European-style renovation', 'Without renovation',
       'Designer'], dtype=object)

In [9]:
# Translate the English renovation categories into the Russian labels that are
# later label-encoded and exported.
# The spelling of the "Designer" label is corrected here
# ('Дизанейрский ремон' -> 'Дизайнерский ремонт').
renovation_labels = {
    'Cosmetic': 'Косметический ремонт',
    'European-style renovation': 'Европейский стиль ремонта',
    'Without renovation': 'Без ремонта',
    'Designer': 'Дизайнерский ремонт',
}
# `replace` leaves values that are not keys untouched, whereas `map` would turn
# them into NaN; re-running this cell on already-translated data is therefore
# harmless instead of wiping the whole column.
df["Renovation"] = df["Renovation"].replace(renovation_labels)

In [10]:
# Re-check the column list, dtypes and non-null counts after the edits above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22676 entries, 0 to 22675
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             22676 non-null  float64
 1   Apartment type    22676 non-null  object 
 2   Metro station     22676 non-null  object 
 3   Minutes to metro  22676 non-null  float64
 4   Number of rooms   22676 non-null  float64
 5   Area              22676 non-null  float64
 6   Living area       22676 non-null  float64
 7   Kitchen area      22676 non-null  float64
 8   Floor             22676 non-null  float64
 9   Number of floors  22676 non-null  int64  
 10  Renovation        22676 non-null  object 
dtypes: float64(7), int64(1), object(3)
memory usage: 1.9+ MB


In [11]:
# Translate the apartment-type categories into Russian labels.
apartment_type_labels = {"Secondary": "Вторичка", "New building": "Новострой"}
# `replace` keeps values that are not keys as they are (`map` would produce NaN),
# so running this cell a second time does not destroy the column.
df["Apartment type"] = df["Apartment type"].replace(apartment_type_labels)

In [12]:
# Verify that the relabelled column still has no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22676 entries, 0 to 22675
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             22676 non-null  float64
 1   Apartment type    22676 non-null  object 
 2   Metro station     22676 non-null  object 
 3   Minutes to metro  22676 non-null  float64
 4   Number of rooms   22676 non-null  float64
 5   Area              22676 non-null  float64
 6   Living area       22676 non-null  float64
 7   Kitchen area      22676 non-null  float64
 8   Floor             22676 non-null  float64
 9   Number of floors  22676 non-null  int64  
 10  Renovation        22676 non-null  object 
dtypes: float64(7), int64(1), object(3)
memory usage: 1.9+ MB


In [15]:
# Separate the regression target (Price) from the predictor columns.
y = df["Price"]
X = df.drop(columns=["Price"])

In [16]:
# Inspect the feature frame before the categorical columns are encoded.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22676 entries, 0 to 22675
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Apartment type    22676 non-null  object 
 1   Metro station     22676 non-null  object 
 2   Minutes to metro  22676 non-null  float64
 3   Number of rooms   22676 non-null  float64
 4   Area              22676 non-null  float64
 5   Living area       22676 non-null  float64
 6   Kitchen area      22676 non-null  float64
 7   Floor             22676 non-null  float64
 8   Number of floors  22676 non-null  int64  
 9   Renovation        22676 non-null  object 
dtypes: float64(6), int64(1), object(3)
memory usage: 1.7+ MB


In [17]:
# Integer-encode the three categorical feature columns and keep each fitted
# encoder so the same label -> code mapping can be exported later.
label_encoders = {}
dicts = {}  # not used in the visible cells; kept in case another cell relies on it
for column in ['Apartment type', 'Metro station', 'Renovation']:
    le = LabelEncoder()
    # Fit on the original string column in `df`, not on `X[column]`: X is
    # overwritten with integer codes below, so re-running this cell would
    # otherwise re-fit the encoder on those integers and lose the string classes.
    X[column] = le.fit_transform(df[column])
    label_encoders[column] = le

In [18]:
# Confirm that the three categorical columns are now integer-typed.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22676 entries, 0 to 22675
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Apartment type    22676 non-null  int32  
 1   Metro station     22676 non-null  int32  
 2   Minutes to metro  22676 non-null  float64
 3   Number of rooms   22676 non-null  float64
 4   Area              22676 non-null  float64
 5   Living area       22676 non-null  float64
 6   Kitchen area      22676 non-null  float64
 7   Floor             22676 non-null  float64
 8   Number of floors  22676 non-null  int64  
 9   Renovation        22676 non-null  int32  
dtypes: float64(6), int32(3), int64(1)
memory usage: 1.5 MB


In [22]:
# For every encoded column, build a lookup from the original category label to
# the integer code assigned by its fitted encoder.
cat_dict = {
    name: dict(zip(enc.classes_, enc.transform(enc.classes_)))
    for name, enc in label_encoders.items()
}

# Print the dictionaries for the three categorical variables
for col, mapping in cat_dict.items():
    print(col, mapping, sep=": ")

Apartment type: {'Вторичка': 0, 'Новострой': 1}
Metro station: {' Авиамоторная': 0, ' Автозаводская': 1, ' Академическая': 2, ' Александровский сад': 3, ' Алексеевская': 4, ' Алма-Атинская': 5, ' Алтуфьево': 6, ' Аминьевская': 7, ' Андроновка': 8, ' Аникеевка': 9, ' Арбатская': 10, ' Аэропорт': 11, ' Аэропорт Внуково': 12, ' Бабушкинская': 13, ' Багратионовская': 14, ' Балтийская': 15, ' Баррикадная': 16, ' Бауманская': 17, ' Беговая': 18, ' Беломорская': 19, ' Белорусская': 20, ' Бескудниково': 21, ' Бибирево': 22, ' Библиотека и Ленина': 23, ' Боровицкая': 24, ' Ботанический сад': 25, ' Братиславская': 26, ' Бульвар Адмирала Ушакова': 27, ' Бульвар Дмитрия Донского': 28, ' Бульвар Рокоссовского': 29, ' Бунинская аллея': 30, ' Бутово': 31, ' Бутырская': 32, ' ВДНХ': 33, ' Варшавская': 34, ' Верхние Лихоборы': 35, ' Верхние котлы': 36, ' Вешняки': 37, ' Владыкино': 38, ' Внуково': 39, ' Водный стадион': 40, ' Войковская': 41, ' Волгоградский проспект': 42, ' Волжская': 43, ' Волоколамс

In [23]:
# Show the current dtypes of the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22676 entries, 0 to 22675
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Apartment type    22676 non-null  int32  
 1   Metro station     22676 non-null  int32  
 2   Minutes to metro  22676 non-null  float64
 3   Number of rooms   22676 non-null  float64
 4   Area              22676 non-null  float64
 5   Living area       22676 non-null  float64
 6   Kitchen area      22676 non-null  float64
 7   Floor             22676 non-null  float64
 8   Number of floors  22676 non-null  int64  
 9   Renovation        22676 non-null  int32  
dtypes: float64(6), int32(3), int64(1)
memory usage: 1.5 MB


In [31]:
# Cast the three label-encoded columns to int64 (X.info() reports them as int32
# right after encoding).
for column in ("Apartment type", "Metro station", "Renovation"):
    X[column] = X[column].astype("int64")

In [32]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate regressors. The names rf / xgb / lr stay bound at notebook level
# because later cells use them (rf is the model that gets saved).
rf = RandomForestRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)
lr = LinearRegression()

# Fit each model and report MAE / MSE on the held-out set, in the same order and
# format as before. Predictions are computed once per model and reused for both
# metrics instead of calling predict() twice.
for name, model in (("rf", rf), ("xgb", xgb), ("lr", lr)):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"mae {name}:", mean_absolute_error(y_test, y_pred), f"mse {name}:", mean_squared_error(y_test, y_pred))

mae rf: 9427320.61935388 mse rf: 1290993347254333.8
mae xgb: 9267412.095816799 mse xgb: 1391596443163870.2
mae lr: 20234717.403505757 mse lr: 2498908809018927.0


In [36]:
# Final check of the feature dtypes (the encoded columns are int64 after the cast).
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22676 entries, 0 to 22675
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Apartment type    22676 non-null  int64  
 1   Metro station     22676 non-null  int64  
 2   Minutes to metro  22676 non-null  float64
 3   Number of rooms   22676 non-null  float64
 4   Area              22676 non-null  float64
 5   Living area       22676 non-null  float64
 6   Kitchen area      22676 non-null  float64
 7   Floor             22676 non-null  float64
 8   Number of floors  22676 non-null  int64  
 9   Renovation        22676 non-null  int64  
dtypes: float64(6), int64(4)
memory usage: 1.7 MB


In [37]:
# Persist the fitted random-forest model and the per-column label encoders.
joblib.dump(rf, 'model.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')

# Build the dictionaries for the categorical features, converting labels to
# plain str and codes to plain int so that the result is JSON-serialisable.
cat_dict = {
    name: {
        str(label): int(code)
        for label, code in zip(enc.classes_, enc.transform(enc.classes_))
    }
    for name, enc in label_encoders.items()
}

# Save the dictionaries to a file (UTF-8, non-ASCII characters written as-is)
with open('cat_dict.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(cat_dict, ensure_ascii=False, indent=4))