In [1]:
import pandas as pd

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [3]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [4]:
ds = pd.read_csv('r_1_SPB_clear.csv',index_col=0)

In [5]:
ds_shuffled = ds.sample(len(ds),random_state= 66).reset_index(drop=True)
ds_train = ds_shuffled[:int(len(ds)*0.75)]
ds_test = ds_shuffled[int(len(ds)*0.75):]

In [6]:
Y_train = ds_train['time_to_rent']
X_train = ds_train.drop('time_to_rent',axis = 1)

In [7]:
Y_test = ds_test['time_to_rent']
X_test = ds_test.drop('time_to_rent',axis = 1)

In [8]:
bin_cols = list(ds.nunique()[ds.nunique() == 2].index)
bin_cols

['Холодильник',
 'Стиральная машина',
 'Телевизор',
 'Посудомоечная машина',
 'Кондиционер',
 'Интернет',
 'Санузел']

In [9]:
num_cols = list(X_train.drop(bin_cols,axis =1).select_dtypes(include = 'number').columns)
num_cols

['floor',
 'floors_count',
 'total_meters',
 'price_per_month',
 'Площадь кухни',
 'Высота потолков',
 'Год постройки']

In [10]:
non_cat_cols = num_cols.copy()
non_cat_cols.extend(bin_cols)
cat_cols = list(X_train.drop(non_cat_cols,axis =1).columns)
cat_cols

['district',
 'underground',
 'Балкон/лоджия',
 'Вид из окон',
 'Ремонт',
 'Тип дома',
 'Парковка']

In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat',OneHotEncoder() ,cat_cols),
        ('num', 'passthrough', num_cols),
        ('bin','passthrough',bin_cols)
    ])

In [12]:
X_train_norm = preprocessor.fit_transform(X_train)
X_test_norm = preprocessor.transform(X_test)
X_train_norm.shape,X_test_norm.shape

((941, 102), (314, 102))

In [13]:
col_names = []
for item in preprocessor.get_feature_names_out():
    col_names.append(item.split('__')[1])

In [14]:
X_train = pd.DataFrame(X_train_norm.toarray(),columns=col_names)
X_test = pd.DataFrame(X_test_norm.toarray(),columns=col_names)
display(X_train,X_test)

Unnamed: 0,district_Адмиралтейский,district_Василеостровский,district_Выборгский,district_Калининский,district_Кировский,district_Колпинский,district_Красногвардейский,district_Красносельский,district_Кронштадтский,district_Курортный,...,Площадь кухни,Высота потолков,Год постройки,Холодильник,Стиральная машина,Телевизор,Посудомоечная машина,Кондиционер,Интернет,Санузел
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,12.0,2.7000,2019.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,2.6678,2013.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,2.7000,2010.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.0,2.5000,2019.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,2.8000,2012.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,11.0,2.6000,2006.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
937,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,15.0,2.8000,2020.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20.5,2.7000,2009.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,3.0000,2020.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0


Unnamed: 0,district_Адмиралтейский,district_Василеостровский,district_Выборгский,district_Калининский,district_Кировский,district_Колпинский,district_Красногвардейский,district_Красносельский,district_Кронштадтский,district_Курортный,...,Площадь кухни,Высота потолков,Год постройки,Холодильник,Стиральная машина,Телевизор,Посудомоечная машина,Кондиционер,Интернет,Санузел
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,3.2000,1899.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,2.7000,2011.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,5.0,2.5000,1959.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,17.0,2.7000,2022.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,2.6678,1972.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.5,2.6678,1963.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,2.6000,1977.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
311,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,15.0,2.6000,2015.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
312,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13.5,2.6000,2014.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0


---

In [15]:
import statsmodels.api as sm

In [16]:
X_train = sm.add_constant(X_train)
results= sm.OLS(Y_train,X_train).fit(cov_type = 'HC3')

In [17]:
results.params

const                        1.201630
district_Адмиралтейский      0.228802
district_Василеостровский    0.801593
district_Выборгский         -0.233490
district_Калининский        -0.330185
                               ...   
Телевизор                   -0.318517
Посудомоечная машина        -0.289861
Кондиционер                  0.531071
Интернет                     0.041832
Санузел                      1.878872
Length: 103, dtype: float64

In [18]:
results.summary()



0,1,2,3
Dep. Variable:,time_to_rent,R-squared:,0.133
Model:,OLS,Adj. R-squared:,0.036
Method:,Least Squares,F-statistic:,15.35
Date:,"Mon, 26 Feb 2024",Prob (F-statistic):,2.5500000000000003e-129
Time:,06:46:32,Log-Likelihood:,-2071.5
No. Observations:,941,AIC:,4335.0
Df Residuals:,845,BIC:,4800.0
Df Model:,95,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.2016,5.087,0.236,0.813,-8.768,11.171
district_Адмиралтейский,0.2288,1.433,0.160,0.873,-2.579,3.037
district_Василеостровский,0.8016,1.873,0.428,0.669,-2.870,4.473
district_Выборгский,-0.2335,1.392,-0.168,0.867,-2.961,2.494
district_Калининский,-0.3302,1.368,-0.241,0.809,-3.012,2.352
district_Кировский,-0.2119,1.414,-0.150,0.881,-2.984,2.560
district_Колпинский,0.9554,1.534,0.623,0.533,-2.052,3.963
district_Красногвардейский,-0.2312,1.335,-0.173,0.863,-2.848,2.385
district_Красносельский,0.1619,1.419,0.114,0.909,-2.619,2.943

0,1,2,3
Omnibus:,86.339,Durbin-Watson:,2.089
Prob(Omnibus):,0.0,Jarque-Bera (JB):,108.363
Skew:,0.804,Prob(JB):,2.9499999999999997e-24
Kurtosis:,3.421,Cond. No.,9.7e+16


In [19]:
X_test = sm.add_constant(X_test)
preds_ts = results.predict(X_test)

In [20]:
print('Тестовая средння абсолютная ошибка: {}'.format(mean_absolute_error(Y_test,preds_ts)))
print('Тестовая среднеквадратичная ошибка: {}'.format(mean_squared_error(Y_test,preds_ts)))
print('Тестовый r2: {}'.format(r2_score(Y_test,preds_ts)))
print('Тестовое отношение mae к среднему: {}%'.format(round((mean_absolute_error(Y_test,preds_ts)/Y_test.mean())*100,2)))
print('Тестовая средняя цена: {}'.format(Y_test.mean()))
print('Предсказанная средняя цена: {}'.format(preds_ts.mean()))

Тестовая средння абсолютная ошибка: 1.8905314334211432
Тестовая среднеквадратичная ошибка: 5.600309572209118
Тестовый r2: -0.08001432261770947
Тестовое отношение mae к среднему: 71.01%
Тестовая средняя цена: 2.662420382165605
Предсказанная средняя цена: 2.7012679575123983


---

### Результат ужасный, так как данные не отражают действительности, так как прошло слишком мало дней


In [21]:
ds = ds.drop(['district'],axis = 1)

In [22]:
ds_shuffled = ds.sample(len(ds),random_state= 66).reset_index(drop=True)
ds_train = ds_shuffled[:int(len(ds)*0.75)]
ds_test = ds_shuffled[int(len(ds)*0.75):]

In [23]:
Y_train = ds_train['time_to_rent']
X_train = ds_train.drop('time_to_rent',axis = 1)
Y_test = ds_test['time_to_rent']
X_test = ds_test.drop('time_to_rent',axis = 1)

In [24]:
bin_cols = list(ds.nunique()[ds.nunique() == 2].index)
num_cols = list(X_train.drop(bin_cols,axis =1).select_dtypes(include = 'number').columns)
non_cat_cols = num_cols.copy()
non_cat_cols.extend(bin_cols)
cat_cols = list(X_train.drop(non_cat_cols,axis =1).columns)

In [25]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat',OneHotEncoder() ,cat_cols),
        ('num', 'passthrough', num_cols),
        ('bin','passthrough',bin_cols)
    ])

In [26]:
X_train_norm = preprocessor.fit_transform(X_train)
X_test_norm = preprocessor.transform(X_test)
X_train_norm.shape,X_test_norm.shape

((941, 84), (314, 84))

In [27]:
col_names = []
for item in preprocessor.get_feature_names_out():
    col_names.append(item.split('__')[1])
X_train = pd.DataFrame(X_train_norm.toarray(),columns=col_names)
X_test = pd.DataFrame(X_test_norm.toarray(),columns=col_names)

In [28]:
X_train = sm.add_constant(X_train)
results= sm.OLS(Y_train,X_train).fit(cov_type = 'HC3')

In [29]:
results.summary()



0,1,2,3
Dep. Variable:,time_to_rent,R-squared:,0.114
Model:,OLS,Adj. R-squared:,0.034
Method:,Least Squares,F-statistic:,18.0
Date:,"Mon, 26 Feb 2024",Prob (F-statistic):,2.21e-134
Time:,06:53:15,Log-Likelihood:,-2081.9
No. Observations:,941,AIC:,4322.0
Df Residuals:,862,BIC:,4705.0
Df Model:,78,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1376,4.965,-0.028,0.978,-9.869,9.593
underground_Автово,-0.4313,0.642,-0.672,0.502,-1.690,0.827
underground_Академическая,-0.0696,0.458,-0.152,0.879,-0.968,0.829
underground_Балтийская,0.9132,1.245,0.733,0.463,-1.527,3.354
underground_Беговая,0.3690,0.399,0.924,0.356,-0.414,1.152
underground_Василеостровская,0.6879,0.875,0.786,0.432,-1.028,2.403
underground_Выборгская,1.3900,0.932,1.491,0.136,-0.438,3.218
underground_Гостиный двор,1.4006,1.572,0.891,0.373,-1.680,4.481
underground_Гражданский проспект,0.1747,0.400,0.437,0.662,-0.609,0.958

0,1,2,3
Omnibus:,88.829,Durbin-Watson:,2.1
Prob(Omnibus):,0.0,Jarque-Bera (JB):,112.323
Skew:,0.828,Prob(JB):,4.07e-25
Kurtosis:,3.348,Cond. No.,8.84e+18


In [None]:
X_test = sm.add_constant(X_test)
preds_ts = results.predict(X_test)

In [30]:
print('Тестовая средння абсолютная ошибка: {}'.format(mean_absolute_error(Y_test,preds_ts)))
print('Тестовая среднеквадратичная ошибка: {}'.format(mean_squared_error(Y_test,preds_ts)))
print('Тестовый r2: {}'.format(r2_score(Y_test,preds_ts)))
print('Тестовое отношение mae к среднему: {}%'.format(round((mean_absolute_error(Y_test,preds_ts)/Y_test.mean())*100,2)))
print('Тестовая средняя цена: {}'.format(Y_test.mean()))
print('Предсказанная средняя цена: {}'.format(preds_ts.mean()))

Тестовая средння абсолютная ошибка: 1.8905314334211432
Тестовая среднеквадратичная ошибка: 5.600309572209118
Тестовый r2: -0.08001432261770947
Тестовое отношение mae к среднему: 71.01%
Тестовая средняя цена: 2.662420382165605
Предсказанная средняя цена: 2.7012679575123983


#### Удаление района ситуацию не сильно изменило, хотя по идее это примерно тоже самое, что и метро