In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [3]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

### подгтовка данных

In [4]:
ds = pd.read_csv('../data/r_1_SPB_clear_w_time_to_m.csv',index_col=0)

In [5]:
ds.columns

Index(['id', 'floor', 'floors_count', 'total_meters', 'price_per_month',
       'district', 'underground', 'Холодильник', 'Стиральная машина',
       'Телевизор', 'Посудомоечная машина', 'Кондиционер', 'Интернет',
       'Площадь кухни', 'Высота потолков', 'Санузел', 'Балкон/лоджия',
       'Вид из окон', 'Ремонт', 'Год постройки', 'Тип дома', 'Парковка',
       'adress', 'time_to_rent', 'm_lat', 'm_long', 'a_lat', 'a_long',
       'time_to_metro'],
      dtype='object')

удаляем координаты дома и ближайшего метро, округляем расстояние до метро до целых минут и удаляем стиральную машинку из-за высокой корреляции с холодильником, а так же район из-за высокой кореляции с метро в этом районе.

In [6]:
ds_reg = ds.drop(['a_lat','a_long','m_lat','m_long','adress','Стиральная машина','district','id'],axis = 1)

In [7]:
#todo смотреть на балконы в анализе и удалять там

In [8]:
ds_reg = ds_reg.drop(ds_reg[ds_reg['Балкон/лоджия'].isin(['2 лоджии','2 балкона'])].index)

In [9]:
ds_reg.reset_index(inplace=True, drop=True)

In [10]:
ds_reg.columns

Index(['floor', 'floors_count', 'total_meters', 'price_per_month',
       'underground', 'Холодильник', 'Телевизор', 'Посудомоечная машина',
       'Кондиционер', 'Интернет', 'Площадь кухни', 'Высота потолков',
       'Санузел', 'Балкон/лоджия', 'Вид из окон', 'Ремонт', 'Год постройки',
       'Тип дома', 'Парковка', 'time_to_rent', 'time_to_metro'],
      dtype='object')

In [11]:
ds_reg['time_to_metro'] = ds_reg['time_to_metro'].apply(lambda x: round(x))

In [12]:
ds_reg['first_floor'] = ds_reg['floor'].apply(lambda x: 1 if x == 1 else 0)

In [13]:
ds_reg.price_per_month = np.log(ds_reg.price_per_month.values)

In [14]:
ds_shuffled = ds_reg.sample(len(ds_reg),random_state= 66).reset_index(drop=True)
ds_train = ds_shuffled[:int(len(ds_reg)*0.85)]
ds_test = ds_shuffled[int(len(ds_reg)*0.85):]

In [15]:
Y_train = ds_train['price_per_month']
X_train = ds_train.drop(['price_per_month','time_to_rent'],axis = 1)

In [16]:
Y_test = ds_test['price_per_month']
X_test = ds_test.drop(['price_per_month','time_to_rent'],axis = 1)

In [17]:
bin_cols = list(X_train.nunique()[X_train.nunique() == 2].index)
bin_cols

['Холодильник',
 'Телевизор',
 'Посудомоечная машина',
 'Кондиционер',
 'Интернет',
 'Санузел',
 'first_floor']

In [18]:
num_cols = list(X_train.drop(bin_cols,axis =1).select_dtypes(include = 'number').columns)
num_cols

['floor',
 'floors_count',
 'total_meters',
 'Площадь кухни',
 'Высота потолков',
 'Год постройки',
 'time_to_metro']

In [19]:
non_cat_cols = num_cols.copy()
non_cat_cols.extend(bin_cols)
cat_cols = list(X_train.drop(non_cat_cols,axis =1).columns)
cat_cols

['underground',
 'Балкон/лоджия',
 'Вид из окон',
 'Ремонт',
 'Тип дома',
 'Парковка']

In [20]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat',OneHotEncoder() ,cat_cols),
        ('num', 'passthrough', num_cols),
        ('bin','passthrough',bin_cols)
    ])

In [21]:
X_train_norm = preprocessor.fit_transform(X_train)
X_test_norm = preprocessor.transform(X_test)
X_train_norm.shape,X_test_norm.shape

((690, 78), (122, 78))

In [22]:
col_names = []
for item in preprocessor.get_feature_names_out():
    col_names.append(item.split('__')[1])

In [23]:
X_train = pd.DataFrame(X_train_norm.toarray(),columns=col_names)
X_test = pd.DataFrame(X_test_norm.toarray(),columns=col_names)
display(X_train,X_test)

Unnamed: 0,underground_Автово,underground_Академическая,underground_Балтийская,underground_Беговая,underground_Василеостровская,underground_Выборгская,underground_Гражданский проспект,underground_Девяткино,underground_Дунайская,underground_Елизаровская,...,Высота потолков,Год постройки,time_to_metro,Холодильник,Телевизор,Посудомоечная машина,Кондиционер,Интернет,Санузел,first_floor
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.500000,1974.0,11.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.000000,2016.0,3.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.600000,1977.0,16.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.672774,2022.0,54.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.000000,2004.0,15.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.500000,2016.0,40.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
686,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,2.672774,2003.0,13.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.700000,2014.0,20.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
688,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,2.672774,2020.0,72.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0


Unnamed: 0,underground_Автово,underground_Академическая,underground_Балтийская,underground_Беговая,underground_Василеостровская,underground_Выборгская,underground_Гражданский проспект,underground_Девяткино,underground_Дунайская,underground_Елизаровская,...,Высота потолков,Год постройки,time_to_metro,Холодильник,Телевизор,Посудомоечная машина,Кондиционер,Интернет,Санузел,first_floor
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.672774,2006.0,13.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.672774,1914.0,21.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.700000,2015.0,90.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.600000,1974.0,22.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.672774,1987.0,10.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,2.672774,2011.0,14.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.000000,2022.0,42.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
119,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.672774,1990.0,12.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2.800000,2008.0,11.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0


---

### регрессия

In [24]:
import statsmodels.api as sm

In [25]:
X_train = sm.add_constant(X_train,has_constant='add')
results= sm.OLS(Y_train,X_train).fit(cov_type = 'HC3')

In [26]:
results.summary()



0,1,2,3
Dep. Variable:,price_per_month,R-squared:,0.774
Model:,OLS,Adj. R-squared:,0.747
Method:,Least Squares,F-statistic:,49070.0
Date:,"Mon, 04 Mar 2024",Prob (F-statistic):,0.0
Time:,02:49:02,Log-Likelihood:,337.25
No. Observations:,690,AIC:,-528.5
Df Residuals:,617,BIC:,-197.3
Df Model:,72,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.5967,0.551,2.899,0.004,0.517,2.676
underground_Автово,-0.1088,0.070,-1.548,0.122,-0.247,0.029
underground_Академическая,-0.1250,0.050,-2.503,0.012,-0.223,-0.027
underground_Балтийская,0.3398,0.132,2.575,0.010,0.081,0.598
underground_Беговая,0.0702,0.071,0.993,0.321,-0.068,0.209
underground_Василеостровская,0.2793,0.077,3.647,0.000,0.129,0.429
underground_Выборгская,0.0657,0.080,0.824,0.410,-0.090,0.222
underground_Гражданский проспект,-0.0771,0.050,-1.551,0.121,-0.175,0.020
underground_Девяткино,-0.1183,0.074,-1.598,0.110,-0.264,0.027

0,1,2,3
Omnibus:,7.477,Durbin-Watson:,2.145
Prob(Omnibus):,0.024,Jarque-Bera (JB):,9.538
Skew:,-0.113,Prob(JB):,0.00849
Kurtosis:,3.53,Cond. No.,2.98e+18


In [27]:
X_test = sm.add_constant(X_test,has_constant='add')
preds_ts = results.predict(X_test)

In [28]:
print('Тестовая средння абсолютная ошибка: {}'.format(mean_absolute_error(Y_test,preds_ts)))
print('Тестовая среднеквадратичная ошибка: {}'.format(mean_squared_error(Y_test,preds_ts)))
print('Тестовый r2: {}'.format(r2_score(Y_test,preds_ts)))
print('Тестовое отношение mae к среднему: {}%'.format(round((mean_absolute_error(Y_test,preds_ts)/Y_test.mean())*100,2)))
print('Тестовая средняя цена: {}'.format(Y_test.mean()))
print('Предсказанная средняя цена: {}'.format(preds_ts.mean()))

Тестовая средння абсолютная ошибка: 0.13280094857617583
Тестовая среднеквадратичная ошибка: 0.02852533036220662
Тестовый r2: 0.6767140835172855
Тестовое отношение mae к среднему: 1.28%
Тестовая средняя цена: 10.389780335624057
Предсказанная средняя цена: 10.403093847289952


---

Этаж получился спорным, так как либо при съеме это не влияет, либо, что равновероятно, там нелинейная зависимость, так как самые ценные этажи обычно находятся в середине дома

удалим стобец с метро, так как он выглядит незначимым

---

<!-- Удалим столбец с районами, так как они стат не значимые и несут в себе примерно такую же инфу, что и метро только более обощенную -->

In [29]:
ds_reg = ds_reg.drop(['underground'],axis = 1)

In [30]:
ds_shuffled = ds_reg.sample(len(ds_reg),random_state= 66).reset_index(drop=True)
ds_train = ds_shuffled[:int(len(ds_reg)*0.85)]
ds_test = ds_shuffled[int(len(ds_reg)*0.85):]

In [31]:
Y_train = ds_train['price_per_month']
X_train = ds_train.drop(['price_per_month','time_to_rent'],axis = 1)
Y_test = ds_test['price_per_month']
X_test = ds_test.drop(['price_per_month','time_to_rent'],axis = 1)

In [32]:
bin_cols = list(X_train.nunique()[X_train.nunique() == 2].index)
num_cols = list(X_train.drop(bin_cols,axis =1).select_dtypes(include = 'number').columns)
non_cat_cols = num_cols.copy()
non_cat_cols.extend(bin_cols)
cat_cols = list(X_train.drop(non_cat_cols,axis =1).columns)

In [33]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat',OneHotEncoder() ,cat_cols),
        ('num', 'passthrough', num_cols),
        ('bin','passthrough',bin_cols)
    ])

In [34]:
X_train_norm = preprocessor.fit_transform(X_train)
X_test_norm = preprocessor.transform(X_test)
X_train_norm.shape,X_test_norm.shape

((690, 33), (122, 33))

In [35]:
col_names = []
for item in preprocessor.get_feature_names_out():
    col_names.append(item.split('__')[1])
X_train = pd.DataFrame(X_train_norm,columns=col_names)
X_test = pd.DataFrame(X_test_norm,columns=col_names)

In [36]:
X_train = sm.add_constant(X_train,has_constant='add')
results= sm.OLS(Y_train,X_train).fit(cov_type = 'HC3')

In [37]:
results.summary()



0,1,2,3
Dep. Variable:,price_per_month,R-squared:,0.665
Model:,OLS,Adj. R-squared:,0.651
Method:,Least Squares,F-statistic:,81950.0
Date:,"Mon, 04 Mar 2024",Prob (F-statistic):,0.0
Time:,02:49:03,Log-Likelihood:,202.52
No. Observations:,690,AIC:,-347.0
Df Residuals:,661,BIC:,-215.5
Df Model:,28,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.6498,0.730,2.260,0.024,0.219,3.080
Балкон/лоджия_1 балкон,0.4248,0.185,2.301,0.021,0.063,0.787
Балкон/лоджия_1 лоджия,0.4302,0.186,2.307,0.021,0.065,0.796
"Балкон/лоджия_1 лоджия, 1 балкон",0.3667,0.192,1.912,0.056,-0.009,0.743
Балкон/лоджия_нет балкона,0.4282,0.182,2.349,0.019,0.071,0.786
Вид из окон_Во двор,0.5158,0.244,2.118,0.034,0.038,0.993
Вид из окон_На улицу,0.5683,0.244,2.325,0.020,0.089,1.047
Вид из окон_На улицу и двор,0.5657,0.244,2.315,0.021,0.087,1.045
Ремонт_Дизайнерский,0.6754,0.247,2.736,0.006,0.192,1.159

0,1,2,3
Omnibus:,5.351,Durbin-Watson:,2.121
Prob(Omnibus):,0.069,Jarque-Bera (JB):,6.881
Skew:,-0.032,Prob(JB):,0.0321
Kurtosis:,3.485,Cond. No.,1e+16


F-статистика,Log-Likelihood говорят, что мы все сделали правильно, хотя информационные критерии говорят иначе, но так как потом мы все равно будем сравнить с ГВР, где не будет метро, то на все равно нужна эта модель

<!-- #### Как и предполагалось r^2 слегка упал, зато большинство станций метро стали стат. значимыми -->

удалим этаж и санузел

In [38]:
ds_reg = ds_reg.drop(['Санузел','floor'],axis = 1)

In [39]:
ds_shuffled = ds_reg.sample(len(ds_reg),random_state= 66).reset_index(drop=True)
ds_train = ds_shuffled[:int(len(ds_reg)*0.85)]
ds_test = ds_shuffled[int(len(ds_reg)*0.85):]

In [40]:
Y_train = ds_train['price_per_month']
X_train = ds_train.drop(['price_per_month','time_to_rent'],axis = 1)
Y_test = ds_test['price_per_month']
X_test = ds_test.drop(['price_per_month','time_to_rent'],axis = 1)

In [41]:
bin_cols = list(X_train.nunique()[X_train.nunique() == 2].index)
num_cols = list(X_train.drop(bin_cols,axis =1).select_dtypes(include = 'number').columns)
non_cat_cols = num_cols.copy()
non_cat_cols.extend(bin_cols)
cat_cols = list(X_train.drop(non_cat_cols,axis =1).columns)

In [42]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat',OneHotEncoder() ,cat_cols),
        ('num', 'passthrough', num_cols),
        ('bin','passthrough',bin_cols)
    ])

In [43]:
X_train_norm = preprocessor.fit_transform(X_train)
X_test_norm = preprocessor.transform(X_test)
X_train_norm.shape,X_test_norm.shape

((690, 31), (122, 31))

In [44]:
col_names = []
for item in preprocessor.get_feature_names_out():
    col_names.append(item.split('__')[1])
X_train = pd.DataFrame(X_train_norm,columns=col_names)
X_test = pd.DataFrame(X_test_norm,columns=col_names)

In [45]:
X_train = sm.add_constant(X_train,has_constant='add')
results= sm.OLS(Y_train,X_train).fit(cov_type = 'HC3')

In [46]:
results.summary()



0,1,2,3
Dep. Variable:,price_per_month,R-squared:,0.665
Model:,OLS,Adj. R-squared:,0.652
Method:,Least Squares,F-statistic:,88620.0
Date:,"Mon, 04 Mar 2024",Prob (F-statistic):,0.0
Time:,02:49:03,Log-Likelihood:,202.45
No. Observations:,690,AIC:,-350.9
Df Residuals:,663,BIC:,-228.4
Df Model:,26,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.6599,0.728,2.279,0.023,0.233,3.087
Балкон/лоджия_1 балкон,0.4274,0.184,2.322,0.020,0.067,0.788
Балкон/лоджия_1 лоджия,0.4323,0.186,2.325,0.020,0.068,0.797
"Балкон/лоджия_1 лоджия, 1 балкон",0.3697,0.192,1.928,0.054,-0.006,0.745
Балкон/лоджия_нет балкона,0.4305,0.182,2.368,0.018,0.074,0.787
Вид из окон_Во двор,0.5195,0.243,2.137,0.033,0.043,0.996
Вид из окон_На улицу,0.5715,0.244,2.345,0.019,0.094,1.049
Вид из окон_На улицу и двор,0.5689,0.244,2.333,0.020,0.091,1.047
Ремонт_Дизайнерский,0.6788,0.246,2.756,0.006,0.196,1.162

0,1,2,3
Omnibus:,5.295,Durbin-Watson:,2.12
Prob(Omnibus):,0.071,Jarque-Bera (JB):,6.788
Skew:,-0.031,Prob(JB):,0.0336
Kurtosis:,3.482,Cond. No.,1e+16


Остальные коэффициенты стали еще чуть более значимыми, F-статистика и информациооные коэфы подросли

Тот факт, что наличие интернета отрицательно влияет на стоимость не несет в себе никакого экономеческого смысла, поэтому его тоже удалим

In [47]:
ds_reg = ds_reg.drop(['Интернет'],axis = 1)

In [48]:
ds_shuffled = ds_reg.sample(len(ds_reg),random_state= 66).reset_index(drop=True)
ds_train = ds_shuffled[:int(len(ds_reg)*0.85)]
ds_test = ds_shuffled[int(len(ds_reg)*0.85):]

In [49]:
Y_train = ds_train['price_per_month']
X_train = ds_train.drop(['price_per_month','time_to_rent'],axis = 1)
Y_test = ds_test['price_per_month']
X_test = ds_test.drop(['price_per_month','time_to_rent'],axis = 1)

In [50]:
bin_cols = list(X_train.nunique()[X_train.nunique() == 2].index)
num_cols = list(X_train.drop(bin_cols,axis =1).select_dtypes(include = 'number').columns)
non_cat_cols = num_cols.copy()
non_cat_cols.extend(bin_cols)
cat_cols = list(X_train.drop(non_cat_cols,axis =1).columns)

In [51]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat',OneHotEncoder() ,cat_cols),
        ('num', 'passthrough', num_cols),
        ('bin','passthrough',bin_cols)
    ])

In [52]:
X_train_norm = preprocessor.fit_transform(X_train)
X_test_norm = preprocessor.transform(X_test)
X_train_norm.shape,X_test_norm.shape

((690, 30), (122, 30))

In [53]:
col_names = []
for item in preprocessor.get_feature_names_out():
    col_names.append(item.split('__')[1])
X_train = pd.DataFrame(X_train_norm,columns=col_names)
X_test = pd.DataFrame(X_test_norm,columns=col_names)

In [54]:
X_train = sm.add_constant(X_train,has_constant='add')
results= sm.OLS(Y_train,X_train).fit(cov_type = 'HC3')

In [55]:
results.summary()



0,1,2,3
Dep. Variable:,price_per_month,R-squared:,0.665
Model:,OLS,Adj. R-squared:,0.652
Method:,Least Squares,F-statistic:,91520.0
Date:,"Mon, 04 Mar 2024",Prob (F-statistic):,0.0
Time:,02:49:03,Log-Likelihood:,202.02
No. Observations:,690,AIC:,-352.0
Df Residuals:,664,BIC:,-234.1
Df Model:,25,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.6549,0.732,2.260,0.024,0.220,3.090
Балкон/лоджия_1 балкон,0.4250,0.185,2.299,0.022,0.063,0.787
Балкон/лоджия_1 лоджия,0.4299,0.187,2.301,0.021,0.064,0.796
"Балкон/лоджия_1 лоджия, 1 балкон",0.3720,0.192,1.937,0.053,-0.004,0.748
Балкон/лоджия_нет балкона,0.4279,0.183,2.345,0.019,0.070,0.786
Вид из окон_Во двор,0.5180,0.244,2.120,0.034,0.039,0.997
Вид из окон_На улицу,0.5702,0.245,2.327,0.020,0.090,1.051
Вид из окон_На улицу и двор,0.5667,0.245,2.312,0.021,0.086,1.047
Ремонт_Дизайнерский,0.6762,0.248,2.731,0.006,0.191,1.161

0,1,2,3
Omnibus:,5.243,Durbin-Watson:,2.124
Prob(Omnibus):,0.073,Jarque-Bera (JB):,6.679
Skew:,-0.036,Prob(JB):,0.0354
Kurtosis:,3.477,Cond. No.,1e+16


---

In [56]:
X_test = sm.add_constant(X_test,has_constant='add')
preds_ts = results.predict(X_test)

In [57]:
print('Тестовая средння абсолютная ошибка: {}'.format(mean_absolute_error(Y_test,preds_ts)))
print('Тестовая среднеквадратичная ошибка: {}'.format(mean_squared_error(Y_test,preds_ts)))
print('Тестовый r2: {}'.format(r2_score(Y_test,preds_ts)))
print('Тестовое отношение mae к среднему: {}%'.format(round((mean_absolute_error(Y_test,preds_ts)/Y_test.mean())*100,2)))
print('Тестовая средняя цена: {}'.format(Y_test.mean()))
print('Предсказанная средняя цена: {}'.format(preds_ts.mean()))

Тестовая средння абсолютная ошибка: 0.14724257609847943
Тестовая среднеквадратичная ошибка: 0.03690821398478084
Тестовый r2: 0.5817084103040339
Тестовое отношение mae к среднему: 1.42%
Тестовая средняя цена: 10.389780335624057
Предсказанная средняя цена: 10.402436572260665


Модель обладает обощающей способностью

к удалению: 'Интернет', 'Санузел','floor','underground','Стиральная машина','district'

---

посмотрим, может быть районы значимы

In [None]:
ds = pd.read_csv('../data/r_1_SPB_clear_w_time_to_m.csv',index_col=0)

In [None]:
ds_reg = ds.drop(['a_lat','a_long','m_lat','m_long','adress','Стиральная машина','underground','id'],axis = 1)
ds_reg = ds_reg.drop(ds_reg[ds_reg['Балкон/лоджия'].isin(['2 лоджии','2 балкона'])].index)
ds_reg.reset_index(inplace=True, drop=True)
ds_reg['time_to_metro'] = ds_reg['time_to_metro'].apply(lambda x: round(x))
ds_reg['first_floor'] = ds_reg['floor'].apply(lambda x: 1 if x == 1 else 0)
ds_reg.price_per_month = np.log(ds_reg.price_per_month.values)
ds_reg = ds_reg.drop(['Санузел','floor'],axis = 1)


In [None]:
ds_shuffled = ds_reg.sample(len(ds_reg),random_state= 66).reset_index(drop=True)
ds_train = ds_shuffled[:int(len(ds_reg)*0.85)]
ds_test = ds_shuffled[int(len(ds_reg)*0.85):]

In [None]:
Y_train = ds_train['price_per_month']
X_train = ds_train.drop(['price_per_month','time_to_rent'],axis = 1)
Y_test = ds_test['price_per_month']
X_test = ds_test.drop(['price_per_month','time_to_rent'],axis = 1)

In [None]:
bin_cols = list(X_train.nunique()[X_train.nunique() == 2].index)
num_cols = list(X_train.drop(bin_cols,axis =1).select_dtypes(include = 'number').columns)
non_cat_cols = num_cols.copy()
non_cat_cols.extend(bin_cols)
cat_cols = list(X_train.drop(non_cat_cols,axis =1).columns)

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat',OneHotEncoder() ,cat_cols),
        ('num', 'passthrough', num_cols),
        ('bin','passthrough',bin_cols)
    ])

In [None]:
X_train_norm = preprocessor.fit_transform(X_train)
X_test_norm = preprocessor.transform(X_test)
X_train_norm.shape,X_test_norm.shape

((690, 45), (122, 45))

In [None]:
col_names = []
for item in preprocessor.get_feature_names_out():
    col_names.append(item.split('__')[1])
X_train = pd.DataFrame(X_train_norm,columns=col_names)
X_test = pd.DataFrame(X_test_norm,columns=col_names)

In [None]:
X_train = sm.add_constant(X_train,has_constant='add')
results= sm.OLS(Y_train,X_train).fit(cov_type = 'HC3')

In [None]:
results.summary()



0,1,2,3
Dep. Variable:,price_per_month,R-squared:,0.716
Model:,OLS,Adj. R-squared:,0.699
Method:,Least Squares,F-statistic:,70650.0
Date:,"Mon, 04 Mar 2024",Prob (F-statistic):,0.0
Time:,02:39:41,Log-Likelihood:,259.44
No. Observations:,690,AIC:,-438.9
Df Residuals:,650,BIC:,-257.4
Df Model:,39,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.6165,0.722,2.240,0.025,0.202,3.031
district_Адмиралтейский,0.3211,1.117,0.287,0.774,-1.868,2.510
district_Василеостровский,0.2825,1.116,0.253,0.800,-1.906,2.471
district_Выборгский,0.0943,1.116,0.084,0.933,-2.093,2.282
district_Калининский,0.0932,1.116,0.083,0.933,-2.094,2.280
district_Кировский,0.0344,1.116,0.031,0.975,-2.154,2.223
district_Колпинский,-0.2656,14.968,-0.018,0.986,-29.602,29.071
district_Красногвардейский,0.1607,1.116,0.144,0.886,-2.027,2.348
district_Красносельский,0.1128,1.116,0.101,0.920,-2.075,2.300

0,1,2,3
Omnibus:,4.482,Durbin-Watson:,2.137
Prob(Omnibus):,0.106,Jarque-Bera (JB):,5.541
Skew:,0.001,Prob(JB):,0.0626
Kurtosis:,3.439,Cond. No.,3.15e+16


районы тоже незначимые