In [578]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.metrics import mean_squared_error as mse, r2_score
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [579]:
train_data=pd.read_csv('train.csv')
test_data=pd.read_csv('test.csv')

In [580]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
Id               10000 non-null int64
DistrictId       10000 non-null int64
Rooms            10000 non-null float64
Square           10000 non-null float64
LifeSquare       7887 non-null float64
KitchenSquare    10000 non-null float64
Floor            10000 non-null int64
HouseFloor       10000 non-null float64
HouseYear        10000 non-null int64
Ecology_1        10000 non-null float64
Ecology_2        10000 non-null object
Ecology_3        10000 non-null object
Social_1         10000 non-null int64
Social_2         10000 non-null int64
Social_3         10000 non-null int64
Healthcare_1     5202 non-null float64
Helthcare_2      10000 non-null int64
Shops_1          10000 non-null int64
Shops_2          10000 non-null object
Price            10000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 1.5+ MB


In [581]:
train_data.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,10000.0,10000.0,10000.0,10000.0,7887.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,5202.0,10000.0,10000.0,10000.0
mean,8383.4077,50.4008,1.8905,56.315775,37.199645,6.2733,8.5267,12.6094,3990.166,0.118858,24.687,5352.1574,8.0392,1142.90446,1.3195,4.2313,214138.857399
std,4859.01902,43.587592,0.839512,21.058732,86.241209,28.560917,5.241148,6.775974,200500.3,0.119025,17.532614,4006.799803,23.831875,1021.517264,1.493601,4.806341,92872.293865
min,0.0,0.0,0.0,1.136859,0.370619,0.0,1.0,0.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,59174.778028
25%,4169.5,20.0,1.0,41.774881,22.769832,1.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,350.0,0.0,1.0,153872.633942
50%,8394.5,36.0,2.0,52.51331,32.78126,6.0,7.0,13.0,1977.0,0.075424,25.0,5285.0,2.0,900.0,1.0,3.0,192269.644879
75%,12592.5,75.0,2.0,65.900625,45.128803,9.0,12.0,17.0,2001.0,0.195781,36.0,7227.0,5.0,1548.0,2.0,6.0,249135.462171
max,16798.0,209.0,19.0,641.065193,7480.592129,2014.0,42.0,117.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,633233.46657


In [582]:
def prep_squares(data):
    data.loc[data.Square > 1000, "Square"] = data.Square / 100
    data.loc[data.LifeSquare > 1000, "LifeSquare"] = data.LifeSquare / 100
    data.loc[data.KitchenSquare > 1000, "KitchenSquare"] = data.KitchenSquare / 100

    data.loc[data.Square > 250, "Square"] = data.Square / 10
    data.loc[data.LifeSquare > 250, "LifeSquare"] = data.LifeSquare / 10
    data.loc[data.KitchenSquare > 250, "KitchenSquare"] = data.KitchenSquare / 10
    
    ratio_lifesquare=(data.Square/data.LifeSquare).mean()
    data.LifeSquare.fillna(data.Square/ratio_lifesquare, inplace=True)
    
    tmp_data = data.loc[data.Square < (data.LifeSquare + data.KitchenSquare)]
    mean_square_diff = (tmp_data.LifeSquare + tmp_data.KitchenSquare - tmp_data.Square).mean()
    data.loc[data.Square < (data.LifeSquare + data.KitchenSquare), "Square"] = data.LifeSquare + data.KitchenSquare + mean_square_diff

    return 

def prep_floors(data):
    data.loc[(data.Floor > data.HouseFloor) & (data.HouseFloor < 2), "HouseFloor"] = data.Floor
    data.loc[(data.Floor > data.HouseFloor) & (data.HouseFloor > 2), "Floor"] = data.HouseFloor

    return

def prep_rooms(data):
    data.loc[data.Rooms ==10, "Rooms"] = 2
    data.loc[data.Rooms > 10, "Rooms"] = 1

    data.loc[(data.Rooms == 0) & (data.LifeSquare > 100), "Rooms"] = round(data.loc[data.LifeSquare > 100 , "Rooms"].mean())
    data.loc[(data.Rooms == 0) & (data.LifeSquare > 50) & (data.LifeSquare <= 100), "Rooms"] = round(data.loc[(data.LifeSquare > 50) & (data.LifeSquare <= 100), "Rooms"].mean())
    data.loc[(data.Rooms == 0) & (data.LifeSquare <= 50), "Rooms"] = round(data.loc[data.LifeSquare <= 50, "Rooms"].mean())

    return

def set_mean_price(data, stat):
    data = data.merge(stat, on=['DistrictId','Rooms'],how='left')
    return data

def prepare_data(data):
    prep_squares(data)
    prep_floors(data)
    prep_rooms(data)

In [583]:
train_data.drop("Healthcare_1",axis=1, inplace=True)
train_data.loc[train_data.Id == 11607, "HouseYear"] = 1968
train_data.loc[train_data.Id == 10814, "HouseYear"] = 2005
prepare_data(train_data)
stats=train_data.groupby(['DistrictId','Rooms'],as_index=False)[['Price']].mean().rename(columns={'Price':'MeanPrice'})
train_data = set_mean_price(train_data, stats)

In [584]:
feature_names = ["Square", "LifeSquare", "Rooms", "Floor", "HouseYear", "MeanPrice"]
target_name = ["Price"]

train_features = train_data[feature_names]
train_target = train_data[target_name]

In [585]:
X_train, X_train_test, y_train, y_train_test=train_test_split(train_features,train_target,test_size=0.33,random_state=13)

In [586]:
forest = rfr()
forest.fit(X_train,y_train)
pred=forest.predict(X_train_test)

In [587]:
r2_score(y_train_test, pred)

0.7059752941141885

In [588]:
mse(y_train_test, pred)

2506409129.8625546

In [589]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
dtypes: float64(7), int64(9), object(3)
memory usage: 742.3+ KB


In [590]:
test_data.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1
count,5000.0,5000.0,5000.0,5000.0,3959.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,2623.0,5000.0,5000.0
mean,8412.5954,51.2792,1.91,56.4495,36.15881,5.9768,8.632,12.601,1984.3926,0.119874,24.9338,5406.9,8.2626,1146.657263,1.3194,4.2428
std,4832.674037,44.179466,0.838594,19.092787,17.825287,9.950018,5.483228,6.789213,18.573149,0.12007,17.532202,4026.614773,23.863762,1044.744231,1.47994,4.777365
min,1.0,0.0,0.0,1.378543,0.33349,0.0,1.0,0.0,1908.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0
25%,4221.75,21.0,1.0,41.906231,23.092026,1.0,4.0,9.0,1973.0,0.019509,6.0,1564.0,0.0,325.0,0.0,1.0
50%,8320.5,37.0,2.0,52.92134,32.925087,6.0,7.0,12.0,1977.0,0.072158,25.0,5285.0,2.0,900.0,1.0,3.0
75%,12598.25,77.0,2.0,66.285129,45.174091,9.0,12.0,17.0,2000.0,0.195781,36.0,7287.0,5.0,1548.0,2.0,6.0
max,16795.0,212.0,17.0,223.453689,303.071094,620.0,78.0,99.0,2020.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0


In [591]:
prepare_data(test_data)
test_data = set_mean_price(test_data, stats)
test_data.MeanPrice=test_data.MeanPrice.fillna(train_data.MeanPrice.mean())

In [592]:
test_features = test_data[feature_names]

In [593]:
test_data['Price']=forest.predict(test_features)

In [594]:
test_data[['Id','Price']].to_csv('DLobanov_predictions.csv',index=None)