In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
# Load the dataset from CSV; the file's first column becomes the row index.
ds = pd.read_csv(
    'r_1_SPB_clear.csv',
    index_col=0,
)

In [3]:
# Shuffle every row with a fixed seed, then cut the shuffled frame at the
# 75% mark: the head is the training part, the tail is the test part.
n_train = int(len(ds) * 0.75)
ds_shuffled = ds.sample(n=len(ds), random_state=66).reset_index(drop=True)
ds_train = ds_shuffled.iloc[:n_train]
ds_test = ds_shuffled.iloc[n_train:]

In [4]:
# Training target is the monthly price; the features are every other column
# except 'time_to_rent', which is removed together with the target.
X_train = ds_train.drop(columns=['price_per_month', 'time_to_rent'])
Y_train = ds_train['price_per_month']

In [5]:
# Same target/feature separation as for the training part, applied to the
# held-out rows.
X_test = ds_test.drop(columns=['price_per_month', 'time_to_rent'])
Y_test = ds_test['price_per_month']

In [6]:
# Columns holding exactly two distinct values are treated as binary flags.
# NOTE(review): the count is taken on the full `ds` (target columns and test
# rows included), not on X_train — confirm this is intended.
unique_counts = ds.nunique()
bin_cols = unique_counts[unique_counts == 2].index.tolist()
bin_cols

['Холодильник',
 'Стиральная машина',
 'Телевизор',
 'Посудомоечная машина',
 'Кондиционер',
 'Интернет',
 'Санузел']

In [7]:
# Numeric feature columns that are not binary flags; these get standardized.
non_binary_features = X_train.drop(columns=bin_cols)
num_cols = non_binary_features.select_dtypes(include='number').columns.tolist()
num_cols

['floor',
 'floors_count',
 'total_meters',
 'Площадь кухни',
 'Высота потолков',
 'Год постройки']

In [8]:
# Whatever is neither numeric nor binary is handled as a categorical column.
non_cat_cols = num_cols + bin_cols
cat_cols = X_train.drop(columns=non_cat_cols).columns.tolist()
cat_cols

['district',
 'underground',
 'Балкон/лоджия',
 'Вид из окон',
 'Ремонт',
 'Тип дома',
 'Парковка']

In [9]:
# Per-group preprocessing:
#   * categorical columns -> one-hot encoding,
#   * numeric columns     -> zero mean / unit variance scaling,
#   * binary columns      -> passed through unchanged.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category
# that was not present when fitting, instead of raising during transform().
# Without it, any category that occurs only in the test split would crash the
# transform of X_test.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', StandardScaler(), num_cols),
        ('bin', 'passthrough', bin_cols),
    ])

In [10]:
# Fit the preprocessing on the training features only, then reuse the fitted
# transformer for the test features so no test statistics leak into the fit.
# NOTE(review): a later cell rebinds X_train / X_test to the transformed
# frames, so this cell cannot be re-run without re-running the split cells.
X_train_norm = preprocessor.fit_transform(X=X_train)
X_test_norm = preprocessor.transform(X=X_test)
(X_train_norm.shape, X_test_norm.shape)

((941, 101), (314, 101))

In [11]:
# ColumnTransformer prefixes every output feature with '<transformer>__'.
# Strip only that prefix: splitting once (maxsplit=1) keeps feature names that
# themselves contain '__' intact, whereas an unbounded split would cut them
# off at their first inner '__'.
col_names = [
    feature_name.split('__', 1)[1]
    for feature_name in preprocessor.get_feature_names_out()
]

In [12]:
# Convert the sparse preprocessing output to dense, labelled DataFrames.
# NOTE(review): this rebinds X_train / X_test from the raw features to the
# transformed ones; the new frames get a fresh 0..n-1 index.
X_train = pd.DataFrame(data=X_train_norm.toarray(), columns=col_names)
X_test = pd.DataFrame(data=X_test_norm.toarray(), columns=col_names)
display(X_train, X_test)

Unnamed: 0,district_Адмиралтейский,district_Василеостровский,district_Выборгский,district_Калининский,district_Кировский,district_Колпинский,district_Красногвардейский,district_Красносельский,district_Кронштадтский,district_Курортный,...,Площадь кухни,Высота потолков,Год постройки,Холодильник,Стиральная машина,Телевизор,Посудомоечная машина,Кондиционер,Интернет,Санузел
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.307610,0.124209,0.673101,1.0,1.0,1.0,0.0,0.0,1.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.738320,-0.061678,0.460031,1.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.215355,0.124209,0.353495,1.0,1.0,1.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.476837,-1.030355,0.673101,1.0,1.0,0.0,1.0,0.0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.215355,0.701491,0.424519,1.0,1.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.046128,-0.453073,0.211448,1.0,1.0,1.0,1.0,1.0,1.0,1.0
937,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.092057,0.701491,0.708613,1.0,1.0,1.0,1.0,0.0,1.0,1.0
938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.530210,0.124209,0.317983,1.0,1.0,1.0,1.0,0.0,1.0,1.0
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.215355,1.856054,0.708613,1.0,1.0,1.0,1.0,0.0,1.0,1.0


Unnamed: 0,district_Адмиралтейский,district_Василеостровский,district_Выборгский,district_Калининский,district_Кировский,district_Колпинский,district_Красногвардейский,district_Красносельский,district_Кронштадтский,district_Курортный,...,Площадь кухни,Высота потолков,Год постройки,Холодильник,Стиральная машина,Телевизор,Посудомоечная машина,Кондиционер,Интернет,Санузел
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.215355,3.010618,-3.588315,1.0,1.0,0.0,0.0,0.0,1.0,1.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.215355,0.124209,0.389007,1.0,1.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-1.522767,-1.030355,-1.457607,1.0,1.0,1.0,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.615022,0.124209,0.779637,1.0,1.0,1.0,1.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.999802,-0.061678,-0.995953,1.0,1.0,1.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.392025,-0.061678,-1.315559,1.0,1.0,0.0,0.0,0.0,1.0,1.0
310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.999802,-0.453073,-0.818394,1.0,1.0,1.0,0.0,0.0,1.0,1.0
311,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.092057,-0.453073,0.531054,1.0,1.0,0.0,0.0,0.0,1.0,1.0
312,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.699834,-0.453073,0.495542,1.0,1.0,0.0,0.0,0.0,1.0,1.0


In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor

In [15]:
# Ordinary least-squares baseline fitted on the preprocessed training features.
lin_reg = LinearRegression()
lin_reg.fit(X=X_train, y=Y_train)

In [16]:
# Linear-regression predictions for both splits (train first, then test).
preds_tr, preds_ts = lin_reg.predict(X_train), lin_reg.predict(X_test)

In [17]:
# Training-split quality report for the predictions currently in `preds_tr`.
# Each metric is computed once and then formatted.
mae_tr = mean_absolute_error(Y_train, preds_tr)
mse_tr = mean_squared_error(Y_train, preds_tr)
r2_tr = r2_score(Y_train, preds_tr)
mean_price_tr = Y_train.mean()

print('Трейн средння абсолютная ошибка: {}'.format(mae_tr))
print('Трейн среднеквадратичная ошибка: {}'.format(mse_tr))
print('Трейн r2: {}'.format(r2_tr))
# MAE as a percentage of the mean actual price.
print('Трейн отношение mae к среднему: {}%'.format(round((mae_tr / mean_price_tr) * 100, 2)))
print('Трейн средняя цена: {}'.format(round(mean_price_tr, 2)))
print('Предсказанная средняя цена: {}'.format(round(preds_tr.mean(), 2)))

Трейн средння абсолютная ошибка: 4384.221041445271
Трейн среднеквадратичная ошибка: 35830746.34431456
Трейн r2: 0.7483633457156467
Трейн отношение mae к среднему: 13.05%
Трейн средняя цена: 33595.52
Предсказанная средняя цена: 33612.72


In [18]:
# Test-split quality report for the predictions currently in `preds_ts`.
# MAE is computed once and reused for the relative-error line. The mean prices
# are rounded to 2 decimals so this report is formatted the same way as the
# training-split report.
mae_ts = mean_absolute_error(Y_test, preds_ts)
mse_ts = mean_squared_error(Y_test, preds_ts)
r2_ts = r2_score(Y_test, preds_ts)
mean_price_ts = Y_test.mean()

print('Тестовая средння абсолютная ошибка: {}'.format(mae_ts))
print('Тестовая среднеквадратичная ошибка: {}'.format(mse_ts))
print('Тестовый r2: {}'.format(r2_ts))
# MAE as a percentage of the mean actual price.
print('Тестовое отношение mae к среднему: {}%'.format(round((mae_ts / mean_price_ts) * 100, 2)))
print('Тестовая средняя цена: {}'.format(round(mean_price_ts, 2)))
print('Предсказанная средняя цена: {}'.format(round(preds_ts.mean(), 2)))

Тестовая средння абсолютная ошибка: 4714.6878980891715
Тестовая среднеквадратичная ошибка: 42487033.63057325
Тестовый r2: 0.6646423704210462
Тестовое отношение mae к среднему: 14.36%
Тестовая средняя цена: 32833.43949044586
Предсказанная средняя цена: 32664.91719745223


In [19]:
# Linear model trained by stochastic gradient descent with an elastic-net
# (combined L1/L2) penalty.
# SGD shuffles the training data between epochs, so without a fixed seed the
# fitted coefficients and all reported metrics change on every run.
# random_state pins that shuffling; 66 is the same seed the train/test shuffle
# uses.
sgd_reg = SGDRegressor(penalty='elasticnet', random_state=66)
sgd_reg.fit(X_train, Y_train)

In [20]:
# SGD-regressor predictions for both splits; these rebind the names that
# previously held the linear-regression predictions.
preds_tr, preds_ts = sgd_reg.predict(X_train), sgd_reg.predict(X_test)

In [21]:
# Training-split quality report for the predictions currently in `preds_tr`.
# Each metric is computed once and then formatted.
mae_tr = mean_absolute_error(Y_train, preds_tr)
mse_tr = mean_squared_error(Y_train, preds_tr)
r2_tr = r2_score(Y_train, preds_tr)
mean_price_tr = Y_train.mean()

print('Трейн средння абсолютная ошибка: {}'.format(mae_tr))
print('Трейн среднеквадратичная ошибка: {}'.format(mse_tr))
print('Трейн r2: {}'.format(r2_tr))
# MAE as a percentage of the mean actual price.
print('Трейн отношение mae к среднему: {}%'.format(round((mae_tr / mean_price_tr) * 100, 2)))
print('Трейн средняя цена: {}'.format(round(mean_price_tr, 2)))
print('Предсказанная средняя цена: {}'.format(round(preds_tr.mean(), 2)))

Трейн средння абсолютная ошибка: 4510.8699422080135
Трейн среднеквадратичная ошибка: 38015868.88121315
Трейн r2: 0.733017393412486
Трейн отношение mae к среднему: 13.43%
Трейн средняя цена: 33595.52
Предсказанная средняя цена: 33466.8


In [22]:
# Test-split quality report for the predictions currently in `preds_ts`.
# MAE is computed once and reused for the relative-error line. The mean prices
# are rounded to 2 decimals so this report is formatted the same way as the
# training-split report.
mae_ts = mean_absolute_error(Y_test, preds_ts)
mse_ts = mean_squared_error(Y_test, preds_ts)
r2_ts = r2_score(Y_test, preds_ts)
mean_price_ts = Y_test.mean()

print('Тестовая средння абсолютная ошибка: {}'.format(mae_ts))
print('Тестовая среднеквадратичная ошибка: {}'.format(mse_ts))
print('Тестовый r2: {}'.format(r2_ts))
# MAE as a percentage of the mean actual price.
print('Тестовое отношение mae к среднему: {}%'.format(round((mae_ts / mean_price_ts) * 100, 2)))
print('Тестовая средняя цена: {}'.format(round(mean_price_ts, 2)))
print('Предсказанная средняя цена: {}'.format(round(preds_ts.mean(), 2)))

Тестовая средння абсолютная ошибка: 4831.106011127384
Тестовая среднеквадратичная ошибка: 41460745.051689714
Тестовый r2: 0.6727430466902098
Тестовое отношение mae к среднему: 14.71%
Тестовая средняя цена: 32833.43949044586
Предсказанная средняя цена: 32635.42378877133


## Прирост в качестве не стоит того, чтобы считать робастные ошибки вручную для тестов коэффициентов