In [504]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2

In [505]:
from sklearn.model_selection import train_test_split

In [506]:
# Load the training table and keep only the rows whose room count, price,
# build year, total area and kitchen area all fall inside the accepted ranges.
data = pd.read_csv('input/train.csv')

within_limits = (
    (data['Rooms'] < 10)
    & data['Price'].between(30000, 600000)
    & data['HouseYear'].between(1800, 2025)
    & data['Square'].between(1, 1000)
    & data['KitchenSquare'].between(1, 1000)
)
data = data.loc[within_limits, :]

In [508]:
# Remove the Healthcare_1 column. The axis is named through `columns=` because
# passing it positionally (`drop(label, 1)`) was deprecated in pandas 1.x and
# is rejected by pandas 2.0.
data = data.drop(columns='Healthcare_1')

In [509]:
# Flag houses built in or before x_year, then let pandas one-hot encode the
# frame (this is what produces the *_A / *_B indicator columns).
x_year = 1917
built_by_cutoff = data['HouseYear'] <= x_year
data['OldHouse'] = built_by_cutoff.astype(int)

data = pd.get_dummies(data)

### Меняем местами Square и LifeSquare, если Square < LifeSquare

In [511]:
# Where the living area exceeds the total area, swap the two values.
# Rows with a missing LifeSquare never satisfy the comparison and are left
# untouched. The pre-swap totals are held in a local Series, so no scratch
# column has to be added to the frame and dropped afterwards.
transposed = data['LifeSquare'] > data['Square']
total_before_swap = data.loc[transposed, 'Square'].copy()
data.loc[transposed, 'Square'] = data.loc[transposed, 'LifeSquare']
data.loc[transposed, 'LifeSquare'] = total_before_swap

### Чиним площади

In [515]:
# For the "one room or fewer" and the three-room groups, replace total areas
# below 15 with the mean total area of the same group's rows above 15.
for same_rooms in (data['Rooms'] <= 1, data['Rooms'] == 3):
    too_small = (data['Square'] < 15) & same_rooms
    typical_square = data.loc[same_rooms & (data['Square'] > 15), 'Square'].mean()
    data.loc[too_small, 'Square'] = typical_square

In [517]:
# Squared total area, used later as an extra model feature.
total_area = data['Square']
data['Square_2'] = total_area ** 2

### Чиним LifeSquare

In [518]:
# Per room-count group (<=1, 2, 3, 4): among rows whose total area is above 15,
# living areas of 6 or less are replaced by the median living area of the
# group's rows that report 6 or more.
room_groups = (
    data['Rooms'] <= 1,
    data['Rooms'] == 2,
    data['Rooms'] == 3,
    data['Rooms'] == 4,
)
for same_rooms in room_groups:
    eligible = same_rooms & (data['Square'] > 15)
    group_median = data.loc[
        same_rooms & (data['LifeSquare'] >= 6) & (data['Square'] > 15), 'LifeSquare'
    ].median()
    data.loc[same_rooms & (data['LifeSquare'] <= 6) & eligible, 'LifeSquare'] = group_median

### Заполняем NaN LifeSquare

In [522]:
# Missing living area is estimated as a fixed share of the total area; the
# share depends on the room count.
life_share_by_rooms = (
    (data['Rooms'] <= 1, 0.625),
    (data['Rooms'] == 2, 0.648),
    (data['Rooms'] == 3, 0.641),
    (data['Rooms'] == 4, 0.684),
)
for same_rooms, life_share in life_share_by_rooms:
    missing_life = data['LifeSquare'].isnull() & same_rooms
    data.loc[missing_life, 'LifeSquare'] = data['Square'] * life_share

In [526]:
data.loc[(data['LifeSquare']).isnull()]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Shops_1,Price,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2


### Делим на обучающую и валидационную выборки

In [527]:
# Hold out 30% of the cleaned rows for validation; the fixed random_state
# keeps the partition the same on every run.
split_frames = train_test_split(data, random_state=42, test_size=0.3)
train, valid = split_frames

### Вычисляем районы

In [528]:
# Mean training price for every (DistrictId, Rooms) pair, attached to both
# splits as `mean_price`. Pairs that do not occur in `train` come out as NaN
# in `valid`.
# NOTE(review): the statistic is built from the same rows it is merged back
# onto in `train`, so the feature carries target information there -- confirm
# this is intended.
district_keys = ['DistrictId', 'Rooms']
mean_by_district = train.groupby(district_keys)[['Price']].mean()
district_stat = mean_by_district.reset_index().rename(columns={'Price': 'mean_price'})

train = train.merge(district_stat, on=district_keys, how='left')
valid = valid.merge(district_stat, on=district_keys, how='left')

In [531]:
valid['mean_price'].isnull().sum()

55

In [532]:
# Mean training price per room count, attached to both splits as `mean_price2`.
mean_by_rooms = train.groupby(['Rooms'])[['Price']].mean()
room_stat = mean_by_rooms.reset_index().rename(columns={'Price': 'mean_price2'})

train = train.merge(room_stat, on=['Rooms'], how='left')
valid = valid.merge(room_stat, on='Rooms', how='left')

In [535]:
train.loc[train['Rooms'] == 5, 'Price'].mean()

417397.0835718896

In [536]:
valid.loc[valid['mean_price2'].isnull(), :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,mean_price,mean_price2


In [537]:
# Six-room validation rows take the training mean price of five-room flats as
# their per-room-count value.
five_room_mean = train.loc[train['Rooms'] == 5, 'Price'].mean()
valid.loc[valid['Rooms'] == 6, 'mean_price2'] = five_room_mean

# Where the district-level mean is missing, fall back to the per-room-count mean.
room_level_price = valid['mean_price2']
valid['mean_price'] = valid['mean_price'].fillna(room_level_price)

In [539]:
valid.loc[valid['Rooms'] == 6, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,mean_price,mean_price2


In [540]:
valid['mean_price'].isnull().sum()

0

### Приступаем к построению модели. Линейная регрессия

In [541]:
data.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Helthcare_2', 'Shops_1', 'Price', 'OldHouse',
       'Ecology_2_A', 'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B', 'Shops_2_A',
       'Shops_2_B', 'Square_2'],
      dtype='object')

In [542]:
# Feature columns fed to both models; the order is part of the contract
# because the fitted estimators receive columns positionally.
fts = [
    'DistrictId', 'Rooms',
    'Square', 'LifeSquare', 'KitchenSquare',
    'Floor', 'HouseFloor', 'HouseYear',
    'Ecology_1',
    'Social_1', 'Social_2', 'Social_3',
    'Helthcare_2', 'Shops_1',
    'Ecology_2_A', 'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B',
    'mean_price', 'Square_2', 'OldHouse',
]

In [543]:
# Ordinary least squares on the selected features; the fit call stays last so
# the cell still displays the estimator.
lr = LinearRegression()
lr.fit(train[fts], train['Price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [545]:
valid.shape

(2784, 26)

In [546]:
# R^2 of the linear model on the rows it was fitted on.
pred = lr.predict(train[fts])
r2(train['Price'], pred)

0.6984408543964573

In [548]:
# R^2 of the linear model on the held-out validation rows.
pred_valid = lr.predict(valid[fts])
r2(valid['Price'], pred_valid)

0.5719101705722744

### Случайный лес

In [550]:
# Random forest regressor with a fixed seed so repeated runs build the same
# trees. RF is the RandomForestRegressor alias imported alongside the other
# sklearn names in the notebook's first cell, which keeps every dependency
# visible in one place.
rf = RF(n_estimators=30, max_depth=20, min_samples_leaf=3, random_state=42)

In [551]:
# Fit the forest on the same feature columns used for the linear model.
rf.fit(train[fts], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [552]:
# R^2 of the forest on its own training rows.
pred = rf.predict(train[fts])
r2(train['Price'], pred)

0.9168491763777323

In [554]:
# R^2 of the forest on the held-out validation rows.
pred_valid = rf.predict(valid[fts])
r2(valid['Price'], pred_valid)

0.671768708249747

## Предсказание на тесте

In [556]:
# Load the test table and remove Healthcare_1, mirroring the training frame.
# The axis is named through `columns=` because passing it positionally
# (`drop(label, 1)`) was deprecated in pandas 1.x and is rejected by pandas 2.0.
test = pd.read_csv('input/test.csv')
test = test.drop(columns='Healthcare_1')

In [558]:
# Same OldHouse flag and one-hot encoding as applied to the training frame.
# NOTE(review): get_dummies is run on the test frame independently, so the
# indicator columns only line up with training if both files contain the same
# category values -- confirm.
x_year = 1917
built_by_cutoff = test['HouseYear'] <= x_year
test['OldHouse'] = built_by_cutoff.astype(int)

test = pd.get_dummies(test)

In [560]:
# Where the living area exceeds the total area, swap the two values.
# Rows with a missing LifeSquare never satisfy the comparison and are left
# untouched. The pre-swap totals are held in a local Series, so no scratch
# column has to be added to the frame and dropped afterwards.
transposed = test['LifeSquare'] > test['Square']
total_before_swap = test.loc[transposed, 'Square'].copy()
test.loc[transposed, 'Square'] = test.loc[transposed, 'LifeSquare']
test.loc[transposed, 'LifeSquare'] = total_before_swap

In [564]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_3,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,...,1,0,0,0,0,1,0,1,0,1
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,...,3,0,2,0,0,1,0,1,0,1
2,5480,190,1.0,15.948246,13.597819,12.0,2,5.0,1909,0.0,...,87,5,5,1,0,1,0,1,0,1
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,...,3,3,3,0,0,1,0,1,0,1
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,...,1,0,0,0,0,1,0,1,1,0


In [565]:
test.loc[(test['HouseYear'] == test['Floor'])]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_3,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B


### Чиним площади

In [566]:
# For the "one room or fewer" and the three-room groups, replace total areas
# below 15 with the mean total area of the same group's rows above 15.
# The means are taken from the test frame itself.
for same_rooms in (test['Rooms'] <= 1, test['Rooms'] == 3):
    too_small = (test['Square'] < 15) & same_rooms
    typical_square = test.loc[same_rooms & (test['Square'] > 15), 'Square'].mean()
    test.loc[too_small, 'Square'] = typical_square

In [568]:
test.loc[(test['Square'] < 15)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_3,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B


In [569]:
test.loc[(test['HouseYear'] == test['Floor'])]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_3,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B


In [570]:
# Squared total area, matching the feature built for the training frame.
total_area = test['Square']
test['Square_2'] = total_area ** 2

In [571]:
test.loc[(test['HouseYear'] == test['Floor'])]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2


In [572]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,...,0,0,0,0,1,0,1,0,1,2488.278112
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,...,0,2,0,0,1,0,1,0,1,4797.388585
2,5480,190,1.0,15.948246,13.597819,12.0,2,5.0,1909,0.0,...,5,5,1,0,1,0,1,0,1,254.346563
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,...,3,3,0,0,1,0,1,0,1,5335.807118
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,...,0,0,0,0,1,0,1,1,0,2258.826252


### Чиним LifeSquare

In [573]:
# Per room-count group (<=1, 2, 3, 4, 5): among rows whose total area is above
# 15, living areas of 6 or less are replaced by the median living area of the
# group's rows that report 6 or more. Medians come from the test frame itself.
room_groups = (
    test['Rooms'] <= 1,
    test['Rooms'] == 2,
    test['Rooms'] == 3,
    test['Rooms'] == 4,
    test['Rooms'] == 5,
)
for same_rooms in room_groups:
    eligible = same_rooms & (test['Square'] > 15)
    group_median = test.loc[
        same_rooms & (test['LifeSquare'] >= 6) & (test['Square'] > 15), 'LifeSquare'
    ].median()
    test.loc[same_rooms & (test['LifeSquare'] <= 6) & eligible, 'LifeSquare'] = group_median

In [578]:
test.loc[(test['LifeSquare'] <= 6) & (test['Square'] > 15)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2


In [579]:
test.loc[(test['HouseYear'] == test['Floor'])]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2


In [580]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,...,0,0,0,0,1,0,1,0,1,2488.278112
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,...,0,2,0,0,1,0,1,0,1,4797.388585
2,5480,190,1.0,15.948246,13.597819,12.0,2,5.0,1909,0.0,...,5,5,1,0,1,0,1,0,1,254.346563
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,...,3,3,0,0,1,0,1,0,1,5335.807118
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,...,0,0,0,0,1,0,1,1,0,2258.826252


### Заполняем NaN LifeSquare

In [581]:
# Missing living area is estimated as a fixed share of the total area; the
# share depends on the room count.
life_share_by_rooms = (
    (test['Rooms'] <= 1, 0.625),
    (test['Rooms'] == 2, 0.648),
    (test['Rooms'] == 3, 0.641),
    (test['Rooms'] == 4, 0.684),
    (test['Rooms'] == 6, 0.69),
)
for same_rooms, life_share in life_share_by_rooms:
    missing_life = test['LifeSquare'].isnull() & same_rooms
    test.loc[missing_life, 'LifeSquare'] = test['Square'] * life_share

In [586]:
test.loc[(test['LifeSquare']).isnull()]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2


In [587]:
test.loc[(test['HouseYear'] == test['Floor'])]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2


In [588]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,...,0,0,0,0,1,0,1,0,1,2488.278112
1,15856,74,2.0,69.263183,44.882543,1.0,6,1.0,1977,0.075779,...,0,2,0,0,1,0,1,0,1,4797.388585
2,5480,190,1.0,15.948246,13.597819,12.0,2,5.0,1909,0.0,...,5,5,1,0,1,0,1,0,1,254.346563
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,...,3,3,0,0,1,0,1,0,1,5335.807118
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,...,0,0,0,0,1,0,1,1,0,2258.826252


In [589]:
test[test.isnull().any(axis=1)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2


In [590]:
district_stat.head()

Unnamed: 0,DistrictId,Rooms,mean_price
0,0,1.0,148526.379217
1,0,2.0,201970.893363
2,0,3.0,304666.831553
3,1,1.0,146569.563192
4,1,2.0,197744.90968


In [591]:
# Attach the training-set mean price per (DistrictId, Rooms); pairs that never
# occurred in training stay NaN.
test = test.merge(district_stat, how='left', on=['DistrictId', 'Rooms'])

In [592]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,mean_price
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,...,0,0,0,1,0,1,0,1,2488.278112,171809.963617
1,15856,74,2.0,69.263183,44.882543,1.0,6,1.0,1977,0.075779,...,2,0,0,1,0,1,0,1,4797.388585,244431.660687
2,5480,190,1.0,15.948246,13.597819,12.0,2,5.0,1909,0.0,...,5,1,0,1,0,1,0,1,254.346563,
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,...,3,0,0,1,0,1,0,1,5335.807118,212673.196383
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,...,0,0,0,1,0,1,1,0,2258.826252,123879.648723


In [593]:
test['mean_price'].isnull().sum()

100

In [594]:
test[test.isnull().any(axis=1)].head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Shops_1,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,mean_price
2,5480,190,1.0,15.948246,13.597819,12.0,2,5.0,1909,0.0,...,5,1,0,1,0,1,0,1,254.346563,
51,15748,212,2.0,75.655182,36.877626,15.0,2,7.0,1929,0.0,...,8,0,0,1,0,1,0,1,5723.706607,
60,10131,141,3.0,53.368865,28.356335,6.0,11,16.0,1970,0.0,...,4,0,0,1,0,1,0,1,2848.235785,
78,6553,34,5.0,122.375273,89.790274,0.0,9,0.0,1977,0.069753,...,11,0,0,1,0,1,0,1,14975.707492,
149,3764,193,4.0,97.931642,72.386905,8.0,5,5.0,1957,0.319809,...,8,0,0,1,0,1,0,1,9590.6065,


In [595]:
# Attach the training-set mean price per room count as `mean_price2`.
test = test.merge(room_stat, how='left', on=['Rooms'])

In [596]:
test.loc[test['mean_price2'].isnull(), :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,mean_price,mean_price2
3398,1435,111,17.0,52.866107,32.528342,8.0,15,17.0,1987,0.093443,...,0,0,1,0,1,0,1,2794.82523,,


In [597]:
test[test.isnull().any(axis=1)].head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,mean_price,mean_price2
2,5480,190,1.0,15.948246,13.597819,12.0,2,5.0,1909,0.0,...,1,0,1,0,1,0,1,254.346563,,162476.595636
51,15748,212,2.0,75.655182,36.877626,15.0,2,7.0,1929,0.0,...,0,0,1,0,1,0,1,5723.706607,,216635.040749
60,10131,141,3.0,53.368865,28.356335,6.0,11,16.0,1970,0.0,...,0,0,1,0,1,0,1,2848.235785,,290657.276078
78,6553,34,5.0,122.375273,89.790274,0.0,9,0.0,1977,0.069753,...,0,0,1,0,1,0,1,14975.707492,,417397.083572
149,3764,193,4.0,97.931642,72.386905,8.0,5,5.0,1957,0.319809,...,0,0,1,0,1,0,1,9590.6065,,385586.312375


In [598]:
# Where the district-level mean is missing, fall back to the per-room-count mean.
room_level_price = test['mean_price2']
test['mean_price'] = test['mean_price'].fillna(room_level_price)

In [599]:
test.loc[test['Rooms'] == 6, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,mean_price,mean_price2
2071,10793,23,6.0,110.750226,76.417656,0.0,2,2.0,2015,0.014073,...,0,0,1,0,1,0,1,12265.612492,229661.964416,229661.964416
3217,4058,27,6.0,223.453689,104.113552,16.0,2,2.0,2017,0.041116,...,0,0,1,0,1,0,1,49931.551043,229661.964416,229661.964416


In [600]:
test['mean_price'].isnull().sum()

1

In [601]:
# Anything still missing is filled with that column's mean over the test frame.
column_means = test.mean()
test = test.fillna(column_means)

In [602]:
test[test.isnull().any(axis=1)].head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,mean_price,mean_price2


In [603]:
# A kitchen larger than the whole flat is replaced by 20% of the total area.
# NOTE(review): no matching correction for the training frame is visible in
# this notebook -- confirm the asymmetry is intended.
kitchen_exceeds_total = test['KitchenSquare'] > test['Square']
test.loc[kitchen_exceeds_total, 'KitchenSquare'] = test['Square'] * 0.2

In [604]:
test.loc[test['KitchenSquare'] > test['Square']]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,OldHouse,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,mean_price,mean_price2


In [606]:
test.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Helthcare_2', 'Shops_1', 'OldHouse', 'Ecology_2_A',
       'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B', 'Shops_2_A', 'Shops_2_B',
       'Square_2', 'mean_price', 'mean_price2'],
      dtype='object')

In [607]:
# Feature columns passed to the fitted models for the test frame. This repeats
# the list the models were trained with; the names and their order must stay
# identical to that earlier definition.
fts = ['DistrictId','Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Helthcare_2', 'Shops_1',
       'Ecology_2_A', 'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B', 'mean_price','Square_2', 'OldHouse']

In [438]:
# Linear-regression predictions for the test frame.
test_features = test[fts]
pred_test = lr.predict(test_features)

In [439]:
pred_test

array([157223.48559337, 234022.77795108, 149803.21037998, ...,
       346855.71749382, 191105.37703385, 181962.52037028])

In [440]:
pred_test.shape

(5000,)

In [441]:
# Store the linear-regression predictions as the Price column (replaced by the
# random-forest predictions further down).
test['Price'] = pred_test

In [442]:
test.max()

Id                16795.000000
DistrictId          212.000000
Rooms                17.000000
Square              303.071094
LifeSquare          168.729035
KitchenSquare       112.000000
Floor                78.000000
HouseFloor           99.000000
HouseYear          2020.000000
Ecology_1             0.521867
Social_1             74.000000
Social_2          19083.000000
Social_3            141.000000
Helthcare_2           6.000000
Shops_1              23.000000
OldHouse              1.000000
Ecology_2_A           1.000000
Ecology_2_B           1.000000
Ecology_3_A           1.000000
Ecology_3_B           1.000000
Shops_2_A             1.000000
Shops_2_B             1.000000
Square_2          91852.088257
mean_price       593618.746096
mean_price2      417397.083572
Price            633077.937215
dtype: float64

In [608]:
pred_test

array([157223.48559337, 234022.77795108, 149803.21037998, ...,
       346855.71749382, 191105.37703385, 181962.52037028])

### Предсказание на RandomForest

In [611]:
# Random-forest predictions for the test frame.
test_features = test[fts]
pred_test_rf = rf.predict(test_features)

In [612]:
pred_test_rf

array([159479.93854882, 247534.38119038, 136186.82834538, ...,
       315740.66556204, 217795.05336901, 168426.59063252])

In [614]:
# Overwrite Price with the random-forest predictions; these are the values
# written to the submission file at the end of the notebook.
test['Price'] = pred_test_rf

In [615]:
test.max()

Id                16795.000000
DistrictId          212.000000
Rooms                17.000000
Square              303.071094
LifeSquare          168.729035
KitchenSquare       112.000000
Floor                78.000000
HouseFloor           99.000000
HouseYear          2020.000000
Ecology_1             0.521867
Social_1             74.000000
Social_2          19083.000000
Social_3            141.000000
Helthcare_2           6.000000
Shops_1              23.000000
OldHouse              1.000000
Ecology_2_A           1.000000
Ecology_2_B           1.000000
Ecology_3_A           1.000000
Ecology_3_B           1.000000
Shops_2_A             1.000000
Shops_2_B             1.000000
Square_2          91852.088257
mean_price       593618.746096
mean_price2      417397.083572
Price            581484.447291
dtype: float64

In [616]:
# Inspect the size of the random-forest prediction vector -- the one assigned
# to Price in this section -- rather than the earlier linear-regression vector.
pred_test_rf.shape

(5000,)

In [617]:
test.loc[:, ['Id', 'Price']].head()

Unnamed: 0,Id,Price
0,725,159479.938549
1,15856,247534.38119
2,5480,136186.828345
3,15664,292926.921372
4,14275,149969.006865


In [618]:
# Write the submission: one row per test Id with its predicted Price.
# `index=False` is the documented way to omit the row index; the earlier
# `index=None` only worked because None is falsy.
submission = test.loc[:, ['Id', 'Price']]
submission.to_csv('output/aguzenko_rf.csv', index=False)