In [1]:
import numpy as np
import pandas as pd
#pd.set_option('display.width', 800)

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
from pylab import rcParams
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [2]:
data = pd.read_csv('input/train.csv')

In [3]:
data = pd.get_dummies(data)
#data.describe()

### train_test_split

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
train, valid = train_test_split(data, test_size=0.3, random_state=100)

In [6]:
train.shape, valid.shape

((7000, 23), (3000, 23))

### Prepare data

In [7]:
def prepare_house_year(df):
    df.loc[df['HouseYear'] < 1900, 'HouseYear'] = 1900
    df.loc[df['HouseYear'] > 2020, 'HouseYear'] = 2020
    return df

In [8]:
def prepare_floor(df, source_df):
    mean_Floor = source_df.groupby(['HouseFloor','HouseYear'], as_index=False)[['Floor']].mean().rename(columns={'Floor':'mean_Floor'})
#     mean_Square.to_csv('output/mean_Square.csv')
    df = pd.merge(df, mean_Floor, on=['HouseFloor','HouseYear'], how = 'left')
    df.loc[(df['Floor'] > df['HouseFloor']) ,'Floor'] = df['HouseFloor']
    return df

In [9]:
def prepare_house_floor(df, source_df):
    mean_HouseFloor = source_df.groupby(['HouseYear'], as_index=False)[['HouseFloor']].mean().rename(columns={'HouseFloor':'mean_HouseFloor'})
#     mean_Square.to_csv('output/mean_Square.csv')
    df = pd.merge(df, mean_HouseFloor, on=['HouseYear'], how = 'left')
    df.loc[(df['HouseFloor'] > 37) ,'HouseFloor'] = df['mean_HouseFloor']
    return df

In [10]:
def fillna_life_square(df, source_df):
    df['LifeSquare'] = df['LifeSquare'].fillna(source_df['LifeSquare'].mean())
    return df

In [11]:
def prepare_square(df, source_df, col):
#     df.loc[data['Square'] < 15,:] 
#     small_Square_idx = df.loc[df['Square'] < 15,'Id'].values
    mean_Square = source_df.groupby(['Rooms', 'HouseFloor', 'HouseYear'], 
                                    as_index=False)[['Square']].mean().rename(columns={'Square':'mean_Square'})
    #mean_Square.to_csv('output/mean_Square.csv')
    df = pd.merge(df, mean_Square, on=['Rooms', 'HouseFloor', 'HouseYear'], how = 'left')
    df.loc[(df['Square'] < 15) | (df['Square'] > 300) ,'Square'] = df['mean_Square']
    df['Square'] = df['Square'].fillna(df['mean_Square'])
    df['Square'] = df['Square'].fillna(15)
    return df

In [12]:
def prepare_life_square(df, col):
    df.loc[df[col] < 15, col] = 15
    df.loc[df[col] > 300, col] = 300
    return df

In [13]:
def prepare_kitchen_square(df, col):
    df.loc[df[col] > df['Square'] , col] = df['Square']
    return df

In [14]:
def prepare_rooms(df):
    df.loc[df['Rooms'] > 5, 'Rooms'] = 5
    return df

In [15]:
def fillna_healthcare_1(df, source_df):
    mean_Healthcare_1 = source_df.groupby(['DistrictId'], as_index=False)[['Healthcare_1']].mean().rename(columns={'Healthcare_1':'mean_Healthcare_1'})
    mean_Healthcare_1['mean_Healthcare_1'] = mean_Healthcare_1['mean_Healthcare_1'].fillna(mean_Healthcare_1['mean_Healthcare_1'].mean())
    df = pd.merge(df, mean_Healthcare_1, on=['DistrictId'], how = 'left')
    df['Healthcare_1'] = df['Healthcare_1'].fillna(df['mean_Healthcare_1'])
    #df.loc[df['Healthcare_1'].isnull] = 1500
    df.to_csv('output/df_Healthcare_1.csv')
    return df


In [16]:
def prepare_df(df, source_df):
    df = prepare_house_year(df)
    df = prepare_floor(df, source_df)
    df = prepare_house_floor(df, source_df)
    df = fillna_life_square(df, source_df)
    df = prepare_square(df, source_df, 'Square')
    df = prepare_life_square(df, 'LifeSquare')
    df = prepare_kitchen_square(df, 'KitchenSquare')
    df = prepare_rooms(df)
    df = fillna_healthcare_1(df, source_df) 
    df['Square_2'] = df['Square']**2
    return df

In [17]:
train = prepare_df(train, train)

In [18]:
valid = prepare_df(valid, train)

In [19]:
#train.loc[train['Rooms'] > 4,:]
#train.loc[train['HouseYear'] >2020,:]

pd.set_option('display.max_columns', 30)
#valid.describe()

### Model

In [20]:
train.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1', 'Price',
       'Ecology_2_A', 'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B', 'Shops_2_A',
       'Shops_2_B', 'mean_Floor', 'mean_HouseFloor', 'mean_Square',
       'mean_Healthcare_1', 'Square_2'],
      dtype='object')

In [21]:
# data

In [22]:
# feats = ['Rooms', 'Square', 'HouseFloor', 'DistrictId']
#feats = ['Rooms','Square','HouseFloor','DistrictId','Ecology_1','Social_1','Social_2','Social_3','Healthcare_1']
feats = ['Rooms','Square','HouseFloor', 'Floor', 'DistrictId','Ecology_1','Social_1','Social_2','Social_3',
         'Helthcare_2', 'Ecology_2_A', 'Ecology_2_B', 'KitchenSquare']

In [23]:
from sklearn.linear_model import LinearRegression as LR
from sklearn.ensemble import RandomForestRegressor as RF

In [24]:
model = RF(n_estimators=120, max_depth=14, random_state=42)

In [25]:
model.fit(train.loc[:, feats], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=14,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [26]:
pred_train = model.predict(train.loc[:, feats])

In [27]:
pred_train.shape

(7000,)

In [28]:
pred_train

array([242023.33227735, 144494.70408748, 238661.74914299, ...,
       238276.26413748, 209228.30998001, 150419.27229226])

In [29]:
pred_valid = model.predict(valid.loc[:, feats])

In [30]:
pred_valid.shape

(3000,)

In [31]:
pred_valid

array([189601.84419511, 211691.64930421, 186947.2011938 , ...,
       366272.59845762, 261659.49169257, 195585.34408681])

### Evaluate

In [32]:
from sklearn.metrics import r2_score as r2

In [33]:
r2(train['Price'], pred_train)
#v.0 0.7534258166190457
# 0.7419914607508353
# 0.7473962548116684
# 0.8799925450308224
# 0.881246321936553
# 0.8880570430818824
# 0.8866563462076222
# 0.888025126317067
# 0.8891998359886355
# 0.9270000556247298

0.9270000556247298

In [34]:
r2(valid['Price'], pred_valid)
#v.0 0.4017893521183312
# 0.2436328294060932
# 0.26032498876209764
# 0.6529312408511583
# 0.6562683914894374
# 0.7138012652225872
# 0.7124606708315415
# 0.7189507960567822
# 0.7188148318501051
# 0.7200623761200757 #300 #14
# 0.721335011008816 #150 #14
# 0.7219290709496045 #125 #14
# 0.7275685457266974 #125 #14
# 0.7277370883452794 #125 #14
# 0.7283252037541543 #120 #14

0.7283252037541543

In [35]:
n_estimators_arr = np.arange(110, 145, 1)
# n_estimators_arr = np.arange(100, 100, 100)
accuracy_valid = []
accuracy_train = []
for val in n_estimators_arr:
    model = RF(n_estimators=val, max_depth=14, random_state=42)
    model.fit(train.loc[:, feats], train['Price'])
    pred_train = model.predict(train.loc[:, feats])
    pred_valid = model.predict(valid.loc[:, feats])
    acc_train = r2(train['Price'], pred_train)
    acc_valid = r2(valid['Price'], pred_valid)
    accuracy_train.append(acc_train)
    accuracy_valid.append(acc_valid)
    print('n_estimators = {} \n\tacc_valid = {} \n\tacc_train = {}\n'.format(val, acc_valid, acc_train))

n_estimators = 110 
	acc_valid = 0.7276678724564771 
	acc_train = 0.9268695002905497

n_estimators = 111 
	acc_valid = 0.7276401358220569 
	acc_train = 0.9268938459191974

n_estimators = 112 
	acc_valid = 0.7276401709626955 
	acc_train = 0.9269023609537443

n_estimators = 113 
	acc_valid = 0.7277830140178638 
	acc_train = 0.9268281681242091

n_estimators = 114 
	acc_valid = 0.728074632520011 
	acc_train = 0.9268386946163589

n_estimators = 115 
	acc_valid = 0.7281534740273483 
	acc_train = 0.9269100454067694

n_estimators = 116 
	acc_valid = 0.7283100054522511 
	acc_train = 0.9269381143937153

n_estimators = 117 
	acc_valid = 0.7281678497758057 
	acc_train = 0.9269459701555954

n_estimators = 118 
	acc_valid = 0.7282262936030142 
	acc_train = 0.9268713021611824

n_estimators = 119 
	acc_valid = 0.7280848216788818 
	acc_train = 0.9269248368962473

n_estimators = 120 
	acc_valid = 0.7283252037541543 
	acc_train = 0.9270000556247298

n_estimators = 121 
	acc_valid = 0.7281822219090839 
	a

In [36]:
# rcParams['figure.figsize'] = 8, 5
# plt.plot(n_estimators_arr, accuracy_valid)
# plt.plot(n_estimators_arr, accuracy_train)
# plt.xlabel('n_estimators')
# plt.ylabel('r2')
# plt.legend(['valid', 'train'])
# plt.xlim(100, 600)

In [37]:
max_depth_arr = np.arange(7, 16)
# max_depth_arr = np.arange(7, 7)
accuracy_valid = []
accuracy_train = []
for val in max_depth_arr:
    model = RF(n_estimators=120, max_depth=val, random_state=42)
    model.fit(train.loc[:, feats], train['Price'])
    pred_train = model.predict(train.loc[:, feats])
    pred_valid = model.predict(valid.loc[:, feats])
    acc_train = r2(train['Price'], pred_train)
    acc_valid = r2(valid['Price'], pred_valid)
    accuracy_train.append(acc_train)
    accuracy_valid.append(acc_valid)
    print('max_depth = {} \n\tacc_valid = {} \n\tacc_train = {}\n'.format(val, acc_valid, acc_train))

max_depth = 7 
	acc_valid = 0.6874740910721923 
	acc_train = 0.7486823568528003

max_depth = 8 
	acc_valid = 0.704183753533143 
	acc_train = 0.7855888328879517

max_depth = 9 
	acc_valid = 0.7132616595319833 
	acc_train = 0.8191450591922189

max_depth = 10 
	acc_valid = 0.7187053864480959 
	acc_train = 0.8487581006922638

max_depth = 11 
	acc_valid = 0.7221011043563341 
	acc_train = 0.8743200758158435

max_depth = 12 
	acc_valid = 0.7256503430854492 
	acc_train = 0.8956664439526872

max_depth = 13 
	acc_valid = 0.7277948532512043 
	acc_train = 0.9130756140122441

max_depth = 14 
	acc_valid = 0.7283252037541543 
	acc_train = 0.9270000556247298

max_depth = 15 
	acc_valid = 0.726304567975514 
	acc_train = 0.9374129117051692



### Test

In [38]:
test = pd.read_csv('input/test.csv')

In [39]:
test.shape

(5000, 19)

In [40]:
test = pd.get_dummies(test)

In [41]:
# test.to_numeric(test['Square'])
# test['Square'] = test['Square'].astype(np.float16)


# values = {'Square': 13, 'Healthcare_1': 0}
# test.fillna(value=values)

In [42]:
test = prepare_df(df=test, source_df=train)

# test.to_csv('output/test_prepare.csv')

In [43]:
# test

In [44]:
# test['LifeSquare']

In [45]:
# test.describe()

In [46]:
test[test['Square']>140]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,mean_Floor,mean_HouseFloor,mean_Square,mean_Healthcare_1,Square_2
870,13736,27,1.0,141.540215,91.704806,42.0,3.0,3.0,2013,0.014073,2,475,0,1589.994813,0,0,0,1,0,1,0,1,1.0,16.039551,,1589.994813,20033.6326
1264,13589,27,3.0,173.97448,76.289475,9.0,3.0,3.0,2017,0.041116,53,14892,4,1589.994813,1,4,0,1,0,1,0,1,2.9,14.235849,85.952306,1589.994813,30267.119566
1499,7769,124,4.0,189.679576,114.708673,4.0,19.0,16.039551,2013,0.174739,17,3379,9,100.0,0,3,0,1,0,1,0,1,,16.039551,,1788.470588,35978.341697
2039,3357,27,3.0,170.476326,75.973612,8.0,2.0,2.0,2017,0.041116,53,14892,4,1589.994813,1,4,0,1,0,1,0,1,1.75,14.235849,80.773237,1589.994813,29062.177673
2371,1929,62,3.0,148.783492,36.278379,0.0,5.0,5.0,1977,0.072158,2,629,1,2300.0,0,0,0,1,0,1,1,0,4.461538,12.718788,96.328756,2300.0,22136.527377
2557,16053,17,4.0,168.729035,169.901701,0.0,0.0,0.0,2013,0.093443,23,4635,5,3300.0,2,4,0,1,0,1,0,1,,16.039551,,1145.282051,28469.487087
3217,4058,27,5.0,223.453689,104.113552,16.0,2.0,2.0,2017,0.041116,53,14892,4,1589.994813,1,4,0,1,0,1,0,1,1.75,14.235849,,1589.994813,49931.551043
3253,12316,183,5.0,157.976101,101.040485,12.0,7.0,8.0,1908,0.246624,14,2940,45,562.0,3,6,0,1,0,1,0,1,,,,562.0,24956.448565
3909,1397,76,3.0,221.138768,118.055342,4.0,32.0,19.020202,2011,0.0,7,1660,39,1786.0,1,5,0,1,0,1,0,1,,19.020202,,1786.0,48902.354763
4384,14498,23,1.0,163.325901,36.278379,1.0,1.0,1.0,1977,0.014073,2,475,0,1589.994813,0,0,0,1,0,1,0,1,1.0,12.718788,46.363811,1589.994813,26675.349937


In [47]:
test.to_csv('output/test_prepare.csv')

In [48]:
model = RF(n_estimators=120, max_depth=14, random_state=42)
model.fit(train.loc[:, feats], train['Price'])

test['Price'] = model.predict(test.loc[:, feats])

In [49]:
test.loc[:, ['Id', 'Price']].to_csv('output/RRZagidullin_predictions.csv', index=None)

In [50]:
VALIDator = pd.merge(test, train, on=['DistrictId', 'Rooms', 'HouseYear'], how = 'left')

In [51]:
# VALIDator

In [52]:
VALIDator[['DistrictId', 'Rooms', 'HouseYear', 'Price_x', 'Price_y']]

Unnamed: 0,DistrictId,Rooms,HouseYear,Price_x,Price_y
0,58,2.0,1972,166927.516553,
1,74,2.0,1977,208615.727887,245923.130060
2,74,2.0,1977,208615.727887,249093.779691
3,74,2.0,1977,208615.727887,244700.814865
4,74,2.0,1977,208615.727887,284488.564051
5,74,2.0,1977,208615.727887,226808.747168
6,74,2.0,1977,208615.727887,238317.681683
7,74,2.0,1977,208615.727887,195389.936837
8,74,2.0,1977,208615.727887,212801.122972
9,74,2.0,1977,208615.727887,254005.381863
