In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('data/train.csv')
data.shape

(10000, 20)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
Id               10000 non-null int64
DistrictId       10000 non-null int64
Rooms            10000 non-null float64
Square           10000 non-null float64
LifeSquare       7887 non-null float64
KitchenSquare    10000 non-null float64
Floor            10000 non-null int64
HouseFloor       10000 non-null float64
HouseYear        10000 non-null int64
Ecology_1        10000 non-null float64
Ecology_2        10000 non-null object
Ecology_3        10000 non-null object
Social_1         10000 non-null int64
Social_2         10000 non-null int64
Social_3         10000 non-null int64
Healthcare_1     5202 non-null float64
Helthcare_2      10000 non-null int64
Shops_1          10000 non-null int64
Shops_2          10000 non-null object
Price            10000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 1.5+ MB


In [4]:
data['LifeSquare'] = data['LifeSquare'].fillna(0)
data = pd.get_dummies(data)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
Id               10000 non-null int64
DistrictId       10000 non-null int64
Rooms            10000 non-null float64
Square           10000 non-null float64
LifeSquare       10000 non-null float64
KitchenSquare    10000 non-null float64
Floor            10000 non-null int64
HouseFloor       10000 non-null float64
HouseYear        10000 non-null int64
Ecology_1        10000 non-null float64
Social_1         10000 non-null int64
Social_2         10000 non-null int64
Social_3         10000 non-null int64
Healthcare_1     5202 non-null float64
Helthcare_2      10000 non-null int64
Shops_1          10000 non-null int64
Price            10000 non-null float64
Ecology_2_A      10000 non-null uint8
Ecology_2_B      10000 non-null uint8
Ecology_3_A      10000 non-null uint8
Ecology_3_B      10000 non-null uint8
Shops_2_A        10000 non-null uint8
Shops_2_B        10000 non-null uint8
dty

In [5]:
x_year = 1930
data['OldHouse'] = (data['HouseYear'] <= x_year).astype(int)

In [6]:
train, valid = train_test_split(data, test_size=0.3, random_state=42)
train.shape, valid.shape

((7000, 24), (3000, 24))

## train

In [7]:
k = train.loc[(train.LifeSquare > 10), 'LifeSquare'].mean() / train.loc[(train.LifeSquare > 10), 'Square'].mean()

In [8]:
train.loc[(train.LifeSquare < 10), 'LifeSquare'] = \
train.loc[(train.LifeSquare < 10), 'Square'] * k

In [9]:
train = train.loc[(train.LifeSquare > 10), :]

In [10]:
train.loc[(train.Square - train.LifeSquare) < 10, 'LifeSquare'] = \
train.loc[(train.Square - train.LifeSquare) < 10, 'Square'] * k

In [11]:
train.loc[train['Rooms'] == 0, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,Price,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,OldHouse
4366,456,6,0.0,81.491446,56.702503,0.0,4,0.0,1977,0.243205,...,0,0,212864.799112,0,1,0,1,0,1,0
1397,12638,27,0.0,138.427694,96.319271,0.0,4,3.0,2016,0.075424,...,0,0,268394.744389,0,1,0,1,0,1,0
2269,7317,27,0.0,41.790881,29.078482,0.0,13,0.0,1977,0.211401,...,0,1,98129.976788,0,1,0,1,0,1,0
3911,770,28,0.0,49.483501,34.431078,0.0,16,0.0,2015,0.118537,...,1,0,217009.338463,0,1,0,1,0,1,0


In [12]:
train.loc[train['Id'] == 456, 'Rooms'] = 3
train.loc[train['Id'] == 12638, 'Rooms'] = 3
train.loc[train['Id'] == 7317, 'Rooms'] = 2
train.loc[train['Id'] == 770, 'Rooms'] = 2

In [13]:
train['MeanRoomSquare'] = train.LifeSquare / train.Rooms

In [14]:
train = train.loc[train['Rooms'] < 10, :]

In [15]:
train = train.loc[train['Price'].between(30000, 600000), :]

In [16]:
train.loc[(train['KitchenSquare'] > 30), 'KitchenSquare'] = train.loc[(train['KitchenSquare'] > 30), 'Square'] * 0.11

In [17]:
train.loc[(train['KitchenSquare'] < 4), 'KitchenSquare'] = train.loc[(train['KitchenSquare'] < 4), 'Square'] * 0.11

In [18]:
train.loc[(train['KitchenSquare'] > (train['Square'] - train['LifeSquare'])), 'LifeSquare'] = \
train.loc[(train['KitchenSquare'] > (train['Square'] - train['LifeSquare'])), 'Square'] * 0.11

In [19]:
train = train.loc[(train['HouseYear'] < 2021), :]

## valid

In [20]:
valid.loc[(valid.LifeSquare < 10), 'LifeSquare'] = \
valid.loc[(valid.LifeSquare < 10), 'Square'] * k

valid.loc[(valid.Square - valid.LifeSquare) < 10, 'LifeSquare'] = \
valid.loc[(valid.Square - valid.LifeSquare) < 10, 'Square'] * k

In [21]:
valid.loc[valid['Rooms'] == 0, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,Price,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,OldHouse
6149,3159,88,0.0,38.697117,19.345131,9.0,9,16.0,1982,0.127376,...,3,9,158998.110646,0,1,0,1,0,1,0
8834,9443,27,0.0,87.762616,61.06604,0.0,5,15.0,1977,0.211401,...,0,1,219281.918007,0,1,0,1,0,1,0
1981,7917,27,0.0,212.932361,148.160307,0.0,2,3.0,2008,0.211401,...,0,1,302211.260887,0,1,0,1,0,1,0


In [22]:
valid.loc[valid['Id'] == 3159, 'Rooms'] = 2
valid.loc[valid['Id'] == 9443, 'Rooms'] = 3
valid.loc[valid['Id'] == 7917, 'Rooms'] = 3

In [23]:
valid.loc[valid['Rooms'] > 6, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,Price,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,OldHouse
8849,14865,9,10.0,60.871266,38.420681,10.0,3,2.0,1994,0.161532,...,2,4,172329.270863,0,1,0,1,0,1,0


In [24]:
valid.loc[valid['Id'] == 14865, 'Rooms'] = 2

In [25]:
valid['MeanRoomSquare'] = valid.LifeSquare / valid.Rooms

In [26]:
valid.loc[(valid['KitchenSquare'] > 30), 'KitchenSquare'] = \
valid.loc[(valid['KitchenSquare'] > 30), 'Square'] * 0.11
valid.loc[(valid['KitchenSquare'] < 4), 'KitchenSquare'] = \
valid.loc[(valid['KitchenSquare'] < 4), 'Square'] * 0.11

valid.loc[(valid['KitchenSquare'] > (valid['Square'] - valid['LifeSquare'])), 'LifeSquare'] = \
valid.loc[(valid['KitchenSquare'] > (valid['Square'] - valid['LifeSquare'])), 'Square'] * 0.11

## Receive features

In [27]:
features = list(train.columns.values)
ex_col = ['Id', 'DistrictId', 'Price', 'Healthcare_1']
features = [i for i in features if not(i in ex_col)]
features

['Rooms',
 'Square',
 'LifeSquare',
 'KitchenSquare',
 'Floor',
 'HouseFloor',
 'HouseYear',
 'Ecology_1',
 'Social_1',
 'Social_2',
 'Social_3',
 'Helthcare_2',
 'Shops_1',
 'Ecology_2_A',
 'Ecology_2_B',
 'Ecology_3_A',
 'Ecology_3_B',
 'Shops_2_A',
 'Shops_2_B',
 'OldHouse',
 'MeanRoomSquare']

## GridSearchCV

In [28]:
param_grid = {
    'bootstrap': [True, False],
    'max_depth': range(13, 15, 1),
    'max_features': range(2, 4, 1),
    'min_samples_leaf': range(2, 3, 1),
    'min_samples_split': range(2, 4, 2),
    'n_estimators': range(80, 90, 5)
}

In [29]:
rf=RandomForestRegressor(random_state=42)
grid_search=GridSearchCV(estimator=rf, param_grid=param_grid, 
                          cv=3, n_jobs=-1, verbose=2, scoring='r2')

In [30]:
grid_search.fit(train.loc[:, features], train['Price'])

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    5.9s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'bootstrap': [True, False], 'max_depth': range(13, 15), 'max_features': range(2, 4), 'min_samples_leaf': range(2, 3), 'min_samples_split': range(2, 4, 2), 'n_estimators': range(80, 90, 5)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='r2', verbose=2)

In [31]:
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 14,
 'max_features': 3,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 85}

In [32]:
grid_pred = grid_search.predict(train.loc[:, features])
r2_score(train['Price'], grid_pred)

0.9159276718934741

In [33]:
grid_pred_valid = grid_search.predict(valid.loc[:, features])
r2_score(valid['Price'], grid_pred_valid)

0.7190712287529892

## RandomForestRegressor

In [34]:
rf = grid_search.best_estimator_
rf

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=14,
           max_features=3, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=2,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=85, n_jobs=None, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [35]:
rf.fit(train.loc[:, features], train['Price'])

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=14,
           max_features=3, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=2,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=85, n_jobs=None, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [36]:
rf_pred = rf.predict(train.loc[:, features])
r2_score(train['Price'], rf_pred)

0.9159276718934741

In [37]:
rf_pred_valid = rf.predict(valid.loc[:, features])
r2_score(valid['Price'], rf_pred_valid)

0.7190712287529892

In [38]:
feat_importances = pd.Series(rf.feature_importances_)

In [39]:
r2_max = 0
z_max = 0
for z in range(10, 26):
    bp = feat_importances.nlargest(z).index.values.tolist()

    best_features = []
    for i in bp:
        best_features.append(features[i])
    
    rf.fit(train.loc[:, best_features], train['Price'])
    
    rf_pred_valid = rf.predict(valid.loc[:, best_features])
    r2 = r2_score(valid['Price'], rf_pred_valid)
    if r2 > r2_max:
        r2_max = r2
        z_max = z
print(z_max, r2_max)

13 0.7240340481805096


In [40]:
bp = feat_importances.nlargest(z_max).index.values.tolist()

best_features = []
for i in bp:
    best_features.append(features[i])
best_features

['Square',
 'LifeSquare',
 'Rooms',
 'Social_2',
 'Social_1',
 'Social_3',
 'KitchenSquare',
 'MeanRoomSquare',
 'HouseYear',
 'Ecology_1',
 'Shops_1',
 'Helthcare_2',
 'HouseFloor']

In [41]:
rf.fit(train.loc[:, best_features], train['Price'])

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=14,
           max_features=3, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=2,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=85, n_jobs=None, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

## test

In [42]:
test = pd.read_csv('data/test.csv')

In [43]:
test['LifeSquare'] = test['LifeSquare'].fillna(0)
test = pd.get_dummies(test)

In [44]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 24 columns):
Id               10000 non-null int64
DistrictId       10000 non-null int64
Rooms            10000 non-null float64
Square           10000 non-null float64
LifeSquare       10000 non-null float64
KitchenSquare    10000 non-null float64
Floor            10000 non-null int64
HouseFloor       10000 non-null float64
HouseYear        10000 non-null int64
Ecology_1        10000 non-null float64
Social_1         10000 non-null int64
Social_2         10000 non-null int64
Social_3         10000 non-null int64
Healthcare_1     5202 non-null float64
Helthcare_2      10000 non-null int64
Shops_1          10000 non-null int64
Price            10000 non-null float64
Ecology_2_A      10000 non-null uint8
Ecology_2_B      10000 non-null uint8
Ecology_3_A      10000 non-null uint8
Ecology_3_B      10000 non-null uint8
Shops_2_A        10000 non-null uint8
Shops_2_B        10000 non-null uint8
Old

In [45]:
test['OldHouse'] = (test['HouseYear'] <= x_year).astype(int)

In [46]:
test.loc[(test.LifeSquare < 10), 'LifeSquare'] = \
test.loc[(test.LifeSquare < 10), 'Square'] * k

test.loc[(test.Square - test.LifeSquare) < 10, 'LifeSquare'] = \
test.loc[(test.Square - test.LifeSquare) < 10, 'Square'] * k

In [47]:
test['Rooms'].unique()

array([ 2.,  1.,  3.,  4.,  5.,  6.,  0., 17.])

In [48]:
test.loc[test['Rooms'] == 0, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Healthcare_1,Helthcare_2,Shops_1,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,OldHouse
2406,3343,58,0.0,116.824201,81.28736,0.0,3,3.0,1977,0.437885,...,1084.0,0,5,0,1,0,1,0,1,0
2524,10729,27,0.0,76.345154,42.820796,12.0,14,0.0,1977,0.017647,...,,0,0,0,1,0,1,0,1,0


In [49]:
test.loc[test['Id'] == 3343, 'Rooms'] = 3
test.loc[test['Id'] == 10729, 'Rooms'] = 2

In [50]:
test.loc[test['Rooms'] > 6, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Healthcare_1,Helthcare_2,Shops_1,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,OldHouse
3398,1435,111,17.0,52.866107,32.528342,8.0,15,17.0,1987,0.093443,...,3300.0,2,4,0,1,0,1,0,1,0


In [51]:
test.loc[test['Id'] == 1435, 'Rooms'] = 2

In [52]:
test['MeanRoomSquare'] = test.LifeSquare / test.Rooms

test.loc[(test['KitchenSquare'] > 30), 'KitchenSquare'] = test.loc[(test['KitchenSquare'] > 30), 'Square'] * 0.11

test.loc[(test['KitchenSquare'] < 4), 'KitchenSquare'] = test.loc[(test['KitchenSquare'] < 4), 'Square'] * 0.11

test.loc[(test['KitchenSquare'] > (test['Square'] - test['LifeSquare'])), 'LifeSquare'] = \
test.loc[(test['KitchenSquare'] > (test['Square'] - test['LifeSquare'])), 'Square'] * 0.11

In [53]:
pred_test = rf.predict(test.loc[:, best_features])
pred_test

array([159339.00038684, 217423.99133177, 199592.18934559, ...,
       320790.38231425, 205662.30381012, 175253.61833374])

In [54]:
test['Price'] = pred_test
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Shops_1,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,OldHouse,MeanRoomSquare,Price
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,...,0,0,1,0,1,0,1,0,16.716391,159339.000387
1,15856,74,2.0,69.263183,48.193964,7.61895,6,1.0,1977,0.075779,...,2,0,1,0,1,0,1,0,24.096982,217423.991332
2,5480,190,1.0,13.597819,1.49576,12.0,2,5.0,1909,0.0,...,5,0,1,0,1,0,1,1,9.461488,199592.189346
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,...,3,0,1,0,1,0,1,0,25.970421,357391.522295
4,14275,27,1.0,47.527111,33.069803,5.227982,17,17.0,2017,0.072158,...,0,0,1,0,1,1,0,0,33.069803,131411.761233


In [55]:
test.shape

(5000, 25)

In [56]:
test.loc[:, ['Id', 'Price']].to_csv('IPashkov_predictions.csv', index=None)