In [1211]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_squared_error as mse

In [1212]:
from sklearn.model_selection import train_test_split

In [1213]:
# Load the training set from the CSV file.
data = pd.read_csv(filepath_or_buffer='input/train.csv')

In [1214]:
# Keep only rows whose room count, price, build year, total area and
# kitchen area all fall inside the accepted ranges (bounds are inclusive
# for `between`); everything else is discarded.
rows_in_range = (
    (data['Rooms'] < 10)
    & data['Price'].between(30000, 600000)
    & data['HouseYear'].between(1800, 2025)
    & data['Square'].between(1, 1000)
    & data['KitchenSquare'].between(1, 1000)
)
data = data[rows_in_range]

In [1215]:
# Remove the Healthcare_1 column. The axis is given by keyword (`columns=`):
# the positional form `drop(label, 1)` is deprecated and is rejected by
# pandas 2.x.
data = data.drop(columns='Healthcare_1')

In [1216]:
# Cut-off year for the OldHouse flag: houses built in or before this year get 1.
x_year = 1917
data['OldHouse'] = data['HouseYear'].le(x_year).astype(int)

In [1217]:
# One-hot encode the remaining categorical columns into indicator columns.
data = pd.get_dummies(data=data)

### Меняем местами Square и LifeSquare, если Square < LifeSquare

In [1218]:
# Snapshot of the original total area; needed for the second half of the swap.
data['_Square'] = data['Square'].copy()

In [1219]:
# Where the living area exceeds the total area, take the living area as the total.
life_over_total = data['LifeSquare'].gt(data['Square'])
data.loc[life_over_total, 'Square'] = data.loc[life_over_total, 'LifeSquare']

In [1220]:
# Second half of the swap: compare against the saved original total area and
# move that original value into LifeSquare.
life_over_saved_total = data['LifeSquare'].gt(data['_Square'])
data.loc[life_over_saved_total, 'LifeSquare'] = data.loc[life_over_saved_total, '_Square']

In [1221]:
# The helper column is no longer needed once the swap is done. The axis is
# given by keyword because the positional form `drop(label, 1)` is deprecated
# and rejected by pandas 2.x.
data = data.drop(columns='_Square')

### Чиним площади

In [1222]:
# Flats with at most one room and a total area below 15 get the mean total
# area of flats with at most one room whose area is above 15.
mean_square_upto_1_room = data.loc[(data['Rooms'] <= 1) & (data['Square'] > 15), 'Square'].mean()
tiny_upto_1_room = (data['Square'] < 15) & (data['Rooms'] <= 1)
data.loc[tiny_upto_1_room, 'Square'] = mean_square_upto_1_room

In [1223]:
# Same repair for three-room flats: areas below 15 are replaced by the mean
# area of three-room flats whose area is above 15.
mean_square_3_rooms = data.loc[(data['Rooms'] == 3) & (data['Square'] > 15), 'Square'].mean()
tiny_3_rooms = (data['Square'] < 15) & (data['Rooms'] == 3)
data.loc[tiny_3_rooms, 'Square'] = mean_square_3_rooms

In [1224]:
# Squared total area as an additional feature.
data['Square_2'] = data['Square'].pow(2)

### Чиним LifeSquare

In [1225]:
# Flats with at most one room and total area above 15: living areas <= 6 are
# replaced by the median living area of the same group taken over rows with
# LifeSquare >= 6 (a value of exactly 6 belongs to both sets).
upto_1_room_ok_square = (data['Rooms'] <= 1) & (data['Square'] > 15)
life_median_upto_1_room = data.loc[upto_1_room_ok_square & (data['LifeSquare'] >= 6), 'LifeSquare'].median()
data.loc[upto_1_room_ok_square & (data['LifeSquare'] <= 6), 'LifeSquare'] = life_median_upto_1_room

In [1226]:
# Two-room flats with total area above 15: living areas <= 6 are replaced by
# the group's median living area computed over rows with LifeSquare >= 6.
two_rooms_ok_square = (data['Rooms'] == 2) & (data['Square'] > 15)
life_median_2_rooms = data.loc[two_rooms_ok_square & (data['LifeSquare'] >= 6), 'LifeSquare'].median()
data.loc[two_rooms_ok_square & (data['LifeSquare'] <= 6), 'LifeSquare'] = life_median_2_rooms

In [1227]:
# Three-room flats with total area above 15: living areas <= 6 are replaced by
# the group's median living area computed over rows with LifeSquare >= 6.
three_rooms_ok_square = (data['Rooms'] == 3) & (data['Square'] > 15)
life_median_3_rooms = data.loc[three_rooms_ok_square & (data['LifeSquare'] >= 6), 'LifeSquare'].median()
data.loc[three_rooms_ok_square & (data['LifeSquare'] <= 6), 'LifeSquare'] = life_median_3_rooms

In [1228]:
# Four-room flats with total area above 15: living areas <= 6 are replaced by
# the group's median living area computed over rows with LifeSquare >= 6.
four_rooms_ok_square = (data['Rooms'] == 4) & (data['Square'] > 15)
life_median_4_rooms = data.loc[four_rooms_ok_square & (data['LifeSquare'] >= 6), 'LifeSquare'].median()
data.loc[four_rooms_ok_square & (data['LifeSquare'] <= 6), 'LifeSquare'] = life_median_4_rooms

### Заполняем NaN LifeSquare

In [1229]:
# Missing living area, at most one room: use 0.625 of the total area.
no_life_upto_1_room = data['LifeSquare'].isnull() & (data['Rooms'] <= 1)
data.loc[no_life_upto_1_room, 'LifeSquare'] = data.loc[no_life_upto_1_room, 'Square'] * 0.625

In [1230]:
# Missing living area, two rooms: use 0.648 of the total area.
no_life_2_rooms = data['LifeSquare'].isnull() & (data['Rooms'] == 2)
data.loc[no_life_2_rooms, 'LifeSquare'] = data.loc[no_life_2_rooms, 'Square'] * 0.648

In [1231]:
# Missing living area, three rooms: use 0.641 of the total area.
no_life_3_rooms = data['LifeSquare'].isnull() & (data['Rooms'] == 3)
data.loc[no_life_3_rooms, 'LifeSquare'] = data.loc[no_life_3_rooms, 'Square'] * 0.641

In [1232]:
# Missing living area, four rooms: use 0.684 of the total area.
no_life_4_rooms = data['LifeSquare'].isnull() & (data['Rooms'] == 4)
data.loc[no_life_4_rooms, 'LifeSquare'] = data.loc[no_life_4_rooms, 'Square'] * 0.684

In [1233]:
data.loc[(data['LifeSquare']).isnull()]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Shops_1,Price,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2


### Делим на обучающий и проверочный наборы

In [1234]:
# Hold out 30% of the rows for validation; the seed is fixed so the split is repeatable.
train, valid = train_test_split(data, random_state=42, test_size=0.3)

### Вычисляем средние цены по районам и числу комнат

In [1235]:
# Mean price per (district, room count) pair, computed on the training part
# only and stored in a `mean_price` column.
district_stat = (
    train
    .groupby(['DistrictId', 'Rooms'])[['Price']]
    .mean()
    .reset_index()
    .rename(columns={'Price': 'mean_price'})
)

In [1236]:
# Attach the district/rooms mean price to every training row (left join keeps all rows).
train = train.merge(right=district_stat, how='left', on=['DistrictId', 'Rooms'])

In [1237]:
# Attach the same training-derived statistic to the validation rows; pairs
# absent from district_stat end up with NaN in mean_price.
valid = valid.merge(right=district_stat, how='left', on=['DistrictId', 'Rooms'])

In [1238]:
valid['mean_price'].isnull().sum()

55

In [1239]:
# Mean training price per room count, stored in a `mean_price2` column.
room_stat = (
    train
    .groupby(['Rooms'])[['Price']]
    .mean()
    .reset_index()
    .rename(columns={'Price': 'mean_price2'})
)

In [1240]:
# Attach the per-room-count mean price to the training rows.
train = train.merge(right=room_stat, how='left', on=['Rooms'])

In [1241]:
# Attach the per-room-count mean price to the validation rows.
valid = valid.merge(right=room_stat, how='left', on='Rooms')

In [1242]:
train.loc[train['Rooms'] == 5, 'Price'].mean()

417397.0835718896

In [1243]:
valid.loc[valid['mean_price2'].isnull(), :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,mean_price,mean_price2


In [1244]:
# Six-room validation flats get the mean training price of five-room flats
# as their mean_price2 value.
mean_price_5_rooms = train.loc[train['Rooms'] == 5, 'Price'].mean()
valid.loc[valid['Rooms'] == 6, 'mean_price2'] = mean_price_5_rooms

In [1245]:
# Where the district-level mean price is missing, fall back to the
# room-count-level mean price.
valid_fallback_price = valid['mean_price2']
valid['mean_price'] = valid['mean_price'].fillna(value=valid_fallback_price)

In [1246]:
valid.loc[valid['Rooms'] == 6, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,mean_price,mean_price2


In [1247]:
valid['mean_price'].isnull().sum()

0

### Приступаем к построению модели. Линейная регрессия

In [1248]:
data.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Helthcare_2', 'Shops_1', 'Price', 'OldHouse',
       'Ecology_2_A', 'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B', 'Shops_2_A',
       'Shops_2_B', 'Square_2'],
      dtype='object')

In [1249]:
# Feature columns fed to the models (Id, Price, the Shops_2 dummies and
# mean_price2 are left out).
fts = [
    'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
    'Floor', 'HouseFloor', 'HouseYear',
    'Ecology_1', 'Social_1', 'Social_2', 'Social_3',
    'Helthcare_2', 'Shops_1',
    'Ecology_2_A', 'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B',
    'mean_price', 'Square_2', 'OldHouse',
]

In [1250]:
# Linear regression model with default settings.
lr = LinearRegression()

In [1251]:
# Fit the linear model on the training features against the price.
lr.fit(X=train[fts], y=train['Price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [1252]:
valid.shape

(2784, 26)

In [1253]:
# Linear-model predictions on the training rows.
pred = lr.predict(train[fts])

In [1254]:
# R2 of the linear model on the training rows.
r2(y_true=train['Price'], y_pred=pred)

0.6984408543964573

In [1255]:
# Linear-model predictions on the validation rows.
pred_valid = lr.predict(valid[fts])

In [1256]:
# R2 of the linear model on the validation rows.
r2(y_true=valid['Price'], y_pred=pred_valid)

0.5719101705722744

### Случайный лес

In [1257]:
from sklearn.ensemble import RandomForestRegressor as RF

# Random forest: 20 trees, depth capped at 10, at least 2 samples per leaf,
# fixed seed for repeatable results.
rf = RF(
    n_estimators=20,
    max_depth=10,
    min_samples_leaf=2,
    random_state=42,
)

In [1258]:
# Fit the forest on the same training features and target as the linear model.
rf.fit(X=train[fts], y=train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [1259]:
# Forest predictions on the training rows (overwrites the linear model's `pred`).
pred = rf.predict(train[fts])

In [1260]:
# R2 of the forest on the training rows.
r2(y_true=train['Price'], y_pred=pred)

0.8753738995727274

In [1261]:
# Forest predictions on the validation rows (overwrites the linear model's `pred_valid`).
pred_valid = rf.predict(valid[fts])

In [1262]:
# R2 of the forest on the validation rows.
r2(y_true=valid['Price'], y_pred=pred_valid)

0.6573317395957068

## Предсказание на тесте

In [1263]:
# Load the test set from the CSV file.
test = pd.read_csv(filepath_or_buffer='input/test.csv')

In [1264]:
# Remove the Healthcare_1 column, as was done for the training data. The axis
# is given by keyword because the positional form `drop(label, 1)` is
# deprecated and rejected by pandas 2.x.
test = test.drop(columns='Healthcare_1')

In [1265]:
# Same cut-off year as for the training data: houses built in or before it get OldHouse = 1.
x_year = 1917
test['OldHouse'] = test['HouseYear'].le(x_year).astype(int)

In [1266]:
# One-hot encode the categorical columns of the test set.
test = pd.get_dummies(data=test)

In [1267]:
# Snapshot of the original total area; needed for the second half of the swap.
test['_Square'] = test['Square'].copy()

In [1268]:
# Where the living area exceeds the total area, take the living area as the total.
test_life_over_total = test['LifeSquare'].gt(test['Square'])
test.loc[test_life_over_total, 'Square'] = test.loc[test_life_over_total, 'LifeSquare']

In [1269]:
# Second half of the swap: compare against the saved original total area and
# move that original value into LifeSquare.
test_life_over_saved_total = test['LifeSquare'].gt(test['_Square'])
test.loc[test_life_over_saved_total, 'LifeSquare'] = test.loc[test_life_over_saved_total, '_Square']

In [1270]:
# The helper column is no longer needed once the swap is done. The axis is
# given by keyword because the positional form `drop(label, 1)` is deprecated
# and rejected by pandas 2.x.
test = test.drop(columns='_Square')

In [1271]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_3,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,...,1,0,0,0,0,1,0,1,0,1
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,...,3,0,2,0,0,1,0,1,0,1
2,5480,190,1.0,15.948246,13.597819,12.0,2,5.0,1909,0.0,...,87,5,5,1,0,1,0,1,0,1
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,...,3,3,3,0,0,1,0,1,0,1
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,...,1,0,0,0,0,1,0,1,1,0


In [1272]:
test.loc[(test['HouseYear'] == test['Floor'])]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_3,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B


### Чиним площади

In [1273]:
# Flats with at most one room and a total area below 15 get the mean total
# area of test flats with at most one room whose area is above 15.
test_mean_square_upto_1_room = test.loc[(test['Rooms'] <= 1) & (test['Square'] > 15), 'Square'].mean()
test_tiny_upto_1_room = (test['Square'] < 15) & (test['Rooms'] <= 1)
test.loc[test_tiny_upto_1_room, 'Square'] = test_mean_square_upto_1_room

In [1274]:
# Same repair for three-room flats: areas below 15 are replaced by the mean
# area of three-room test flats whose area is above 15.
test_mean_square_3_rooms = test.loc[(test['Rooms'] == 3) & (test['Square'] > 15), 'Square'].mean()
test_tiny_3_rooms = (test['Square'] < 15) & (test['Rooms'] == 3)
test.loc[test_tiny_3_rooms, 'Square'] = test_mean_square_3_rooms

In [1275]:
test.loc[(test['Square'] < 15)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_3,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B


In [1276]:
test.loc[(test['HouseYear'] == test['Floor'])]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_3,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B


In [1277]:
# Squared total area, matching the feature built for the training data.
test['Square_2'] = test['Square'].pow(2)

In [1278]:
test.loc[(test['HouseYear'] == test['Floor'])]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2


In [1279]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,...,0,0,0,0,1,0,1,0,1,2488.278112
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,...,0,2,0,0,1,0,1,0,1,4797.388585
2,5480,190,1.0,15.948246,13.597819,12.0,2,5.0,1909,0.0,...,5,5,1,0,1,0,1,0,1,254.346563
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,...,3,3,0,0,1,0,1,0,1,5335.807118
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,...,0,0,0,0,1,0,1,1,0,2258.826252


### Чиним LifeSquare

In [1280]:
# Flats with at most one room and total area above 15: living areas <= 6 are
# replaced by the median living area of the same group taken over rows with
# LifeSquare >= 6 (a value of exactly 6 belongs to both sets).
test_upto_1_room_ok_square = (test['Rooms'] <= 1) & (test['Square'] > 15)
test_life_median_upto_1_room = test.loc[test_upto_1_room_ok_square & (test['LifeSquare'] >= 6), 'LifeSquare'].median()
test.loc[test_upto_1_room_ok_square & (test['LifeSquare'] <= 6), 'LifeSquare'] = test_life_median_upto_1_room

In [1281]:
# Two-room flats with total area above 15: living areas <= 6 are replaced by
# the group's median living area computed over rows with LifeSquare >= 6.
test_2_rooms_ok_square = (test['Rooms'] == 2) & (test['Square'] > 15)
test_life_median_2_rooms = test.loc[test_2_rooms_ok_square & (test['LifeSquare'] >= 6), 'LifeSquare'].median()
test.loc[test_2_rooms_ok_square & (test['LifeSquare'] <= 6), 'LifeSquare'] = test_life_median_2_rooms

In [1282]:
# Three-room flats with total area above 15: living areas <= 6 are replaced by
# the group's median living area computed over rows with LifeSquare >= 6.
test_3_rooms_ok_square = (test['Rooms'] == 3) & (test['Square'] > 15)
test_life_median_3_rooms = test.loc[test_3_rooms_ok_square & (test['LifeSquare'] >= 6), 'LifeSquare'].median()
test.loc[test_3_rooms_ok_square & (test['LifeSquare'] <= 6), 'LifeSquare'] = test_life_median_3_rooms

In [1283]:
# Four-room flats with total area above 15: living areas <= 6 are replaced by
# the group's median living area computed over rows with LifeSquare >= 6.
test_4_rooms_ok_square = (test['Rooms'] == 4) & (test['Square'] > 15)
test_life_median_4_rooms = test.loc[test_4_rooms_ok_square & (test['LifeSquare'] >= 6), 'LifeSquare'].median()
test.loc[test_4_rooms_ok_square & (test['LifeSquare'] <= 6), 'LifeSquare'] = test_life_median_4_rooms

In [1284]:
# Five-room flats with total area above 15: living areas <= 6 are replaced by
# the group's median living area computed over rows with LifeSquare >= 6.
test_5_rooms_ok_square = (test['Rooms'] == 5) & (test['Square'] > 15)
test_life_median_5_rooms = test.loc[test_5_rooms_ok_square & (test['LifeSquare'] >= 6), 'LifeSquare'].median()
test.loc[test_5_rooms_ok_square & (test['LifeSquare'] <= 6), 'LifeSquare'] = test_life_median_5_rooms

In [1285]:
test.loc[(test['LifeSquare'] <= 6) & (test['Square'] > 15)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2


In [1286]:
test.loc[(test['HouseYear'] == test['Floor'])]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2


In [1287]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,...,0,0,0,0,1,0,1,0,1,2488.278112
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,...,0,2,0,0,1,0,1,0,1,4797.388585
2,5480,190,1.0,15.948246,13.597819,12.0,2,5.0,1909,0.0,...,5,5,1,0,1,0,1,0,1,254.346563
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,...,3,3,0,0,1,0,1,0,1,5335.807118
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,...,0,0,0,0,1,0,1,1,0,2258.826252


### Заполняем NaN LifeSquare

In [1288]:
# Missing living area, at most one room: use 0.625 of the total area.
test_no_life_upto_1_room = test['LifeSquare'].isnull() & (test['Rooms'] <= 1)
test.loc[test_no_life_upto_1_room, 'LifeSquare'] = test.loc[test_no_life_upto_1_room, 'Square'] * 0.625

In [1289]:
# Missing living area, two rooms: use 0.648 of the total area.
test_no_life_2_rooms = test['LifeSquare'].isnull() & (test['Rooms'] == 2)
test.loc[test_no_life_2_rooms, 'LifeSquare'] = test.loc[test_no_life_2_rooms, 'Square'] * 0.648

In [1290]:
# Missing living area, three rooms: use 0.641 of the total area.
test_no_life_3_rooms = test['LifeSquare'].isnull() & (test['Rooms'] == 3)
test.loc[test_no_life_3_rooms, 'LifeSquare'] = test.loc[test_no_life_3_rooms, 'Square'] * 0.641

In [1291]:
# Missing living area, four rooms: use 0.684 of the total area.
test_no_life_4_rooms = test['LifeSquare'].isnull() & (test['Rooms'] == 4)
test.loc[test_no_life_4_rooms, 'LifeSquare'] = test.loc[test_no_life_4_rooms, 'Square'] * 0.684

In [1292]:
# Missing living area, six rooms: use 0.69 of the total area.
test_no_life_6_rooms = test['LifeSquare'].isnull() & (test['Rooms'] == 6)
test.loc[test_no_life_6_rooms, 'LifeSquare'] = test.loc[test_no_life_6_rooms, 'Square'] * 0.69

In [1293]:
test.loc[(test['LifeSquare']).isnull()]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2


In [1294]:
test.loc[(test['HouseYear'] == test['Floor'])]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2


In [1295]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,...,0,0,0,0,1,0,1,0,1,2488.278112
1,15856,74,2.0,69.263183,44.882543,1.0,6,1.0,1977,0.075779,...,0,2,0,0,1,0,1,0,1,4797.388585
2,5480,190,1.0,15.948246,13.597819,12.0,2,5.0,1909,0.0,...,5,5,1,0,1,0,1,0,1,254.346563
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,...,3,3,0,0,1,0,1,0,1,5335.807118
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,...,0,0,0,0,1,0,1,1,0,2258.826252


In [1296]:
test[test.isnull().any(axis=1)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2


In [1297]:
district_stat.head()

Unnamed: 0,DistrictId,Rooms,mean_price
0,0,1.0,148526.379217
1,0,2.0,201970.893363
2,0,3.0,304666.831553
3,1,1.0,146569.563192
4,1,2.0,197744.90968


In [1298]:
# Attach the training-derived district/rooms mean price to the test rows;
# pairs absent from district_stat end up with NaN in mean_price.
test = test.merge(right=district_stat, how='left', on=['DistrictId', 'Rooms'])

In [1299]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,mean_price
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,...,0,0,0,1,0,1,0,1,2488.278112,171809.963617
1,15856,74,2.0,69.263183,44.882543,1.0,6,1.0,1977,0.075779,...,2,0,0,1,0,1,0,1,4797.388585,244431.660687
2,5480,190,1.0,15.948246,13.597819,12.0,2,5.0,1909,0.0,...,5,1,0,1,0,1,0,1,254.346563,
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,...,3,0,0,1,0,1,0,1,5335.807118,212673.196383
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,...,0,0,0,1,0,1,1,0,2258.826252,123879.648723


In [1300]:
test['mean_price'].isnull().sum()

100

In [1301]:
test[test.isnull().any(axis=1)].head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,mean_price
2,5480,190,1.0,15.948246,13.597819,12.0,2,5.0,1909,0.0,...,5,1,0,1,0,1,0,1,254.346563,
51,15748,212,2.0,75.655182,36.877626,15.0,2,7.0,1929,0.0,...,8,0,0,1,0,1,0,1,5723.706607,
60,10131,141,3.0,53.368865,28.356335,6.0,11,16.0,1970,0.0,...,4,0,0,1,0,1,0,1,2848.235785,
78,6553,34,5.0,122.375273,89.790274,0.0,9,0.0,1977,0.069753,...,11,0,0,1,0,1,0,1,14975.707492,
149,3764,193,4.0,97.931642,72.386905,8.0,5,5.0,1957,0.319809,...,8,0,0,1,0,1,0,1,9590.6065,


In [1302]:
# Attach the training-derived per-room-count mean price to the test rows.
test = test.merge(right=room_stat, how='left', on=['Rooms'])

In [1303]:
test.loc[test['mean_price2'].isnull(), :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,mean_price,mean_price2
3398,1435,111,17.0,52.866107,32.528342,8.0,15,17.0,1987,0.093443,...,0,0,1,0,1,0,1,2794.82523,,


In [1304]:
test[test.isnull().any(axis=1)].head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,mean_price,mean_price2
2,5480,190,1.0,15.948246,13.597819,12.0,2,5.0,1909,0.0,...,1,0,1,0,1,0,1,254.346563,,162476.595636
51,15748,212,2.0,75.655182,36.877626,15.0,2,7.0,1929,0.0,...,0,0,1,0,1,0,1,5723.706607,,216635.040749
60,10131,141,3.0,53.368865,28.356335,6.0,11,16.0,1970,0.0,...,0,0,1,0,1,0,1,2848.235785,,290657.276078
78,6553,34,5.0,122.375273,89.790274,0.0,9,0.0,1977,0.069753,...,0,0,1,0,1,0,1,14975.707492,,417397.083572
149,3764,193,4.0,97.931642,72.386905,8.0,5,5.0,1957,0.319809,...,0,0,1,0,1,0,1,9590.6065,,385586.312375


In [1305]:
# Where the district-level mean price is missing, fall back to the
# room-count-level mean price.
test_fallback_price = test['mean_price2']
test['mean_price'] = test['mean_price'].fillna(value=test_fallback_price)

In [1306]:
test.loc[test['Rooms'] == 6, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,mean_price,mean_price2
2071,10793,23,6.0,110.750226,76.417656,0.0,2,2.0,2015,0.014073,...,0,0,1,0,1,0,1,12265.612492,229661.964416,229661.964416
3217,4058,27,6.0,223.453689,104.113552,16.0,2,2.0,2017,0.041116,...,0,0,1,0,1,0,1,49931.551043,229661.964416,229661.964416


In [1307]:
test['mean_price'].isnull().sum()

1

In [1308]:
# Fill every remaining NaN with the mean of its own column, computed on the
# test frame itself.
test_column_means = test.mean()
test = test.fillna(value=test_column_means)

In [1309]:
test[test.isnull().any(axis=1)].head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,mean_price,mean_price2


In [1310]:
# A kitchen larger than the whole flat is replaced by 0.2 of the total area.
kitchen_over_total = test['KitchenSquare'].gt(test['Square'])
test.loc[kitchen_over_total, 'KitchenSquare'] = test.loc[kitchen_over_total, 'Square'] * 0.2

In [1311]:
test.loc[test['KitchenSquare'] > test['Square']]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,mean_price,mean_price2


In [1312]:
test.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Helthcare_2', 'Shops_1', 'OldHouse', 'Ecology_2_A',
       'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B', 'Shops_2_A', 'Shops_2_B',
       'Square_2', 'mean_price', 'mean_price2'],
      dtype='object')

In [1313]:
# Feature columns used for prediction; the same names, in the same order,
# as the list the models were fitted with.
fts = [
    'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
    'Floor', 'HouseFloor', 'HouseYear',
    'Ecology_1', 'Social_1', 'Social_2', 'Social_3',
    'Helthcare_2', 'Shops_1',
    'Ecology_2_A', 'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B',
    'mean_price', 'Square_2', 'OldHouse',
]

In [1314]:
# Predict test prices with the linear regression model fitted earlier.
# NOTE(review): in the recorded outputs the random forest `rf` scored a higher
# validation R2 (0.657) than `lr` (0.572) — confirm that `lr` is the intended
# model for the submission.
pred_test = lr.predict(test.loc[:, fts])

In [1315]:
pred_test

array([157223.48559337, 234022.77795108, 149803.21037998, ...,
       346855.71749382, 191105.37703385, 181962.52037028])

In [1316]:
pred_test.shape

(5000,)

In [1317]:
# Store the predicted prices next to the test rows.
test = test.assign(Price=pred_test)

In [1318]:
test.max()

Id                16795.000000
DistrictId          212.000000
Rooms                17.000000
Square              303.071094
LifeSquare          168.729035
KitchenSquare       112.000000
Floor                78.000000
HouseFloor           99.000000
HouseYear          2020.000000
Ecology_1             0.521867
Social_1             74.000000
Social_2          19083.000000
Social_3            141.000000
Helthcare_2           6.000000
Shops_1              23.000000
OldHouse              1.000000
Ecology_2_A           1.000000
Ecology_2_B           1.000000
Ecology_3_A           1.000000
Ecology_3_B           1.000000
Shops_2_A             1.000000
Shops_2_B             1.000000
Square_2          91852.088257
mean_price       593618.746096
mean_price2      417397.083572
Price            633077.937215
dtype: float64

In [1319]:
pred

array([165939.21997976, 277785.62233736, 238444.659062  , ...,
       166723.23643236,  91210.48436232, 183341.41405557])

In [1321]:
# Write the submission file: one row per test flat with its Id and predicted
# Price, without the index column.
submission = test[['Id', 'Price']]
submission.to_csv('output/aguzenko.csv', index=None)