In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
# Load the training data; the trailing expression displays (rows, columns).
data = pd.read_csv('data/train.csv')
data.shape

(10000, 20)

In [3]:
# Column dtypes and non-null counts, to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
Id               10000 non-null int64
DistrictId       10000 non-null int64
Rooms            10000 non-null float64
Square           10000 non-null float64
LifeSquare       7887 non-null float64
KitchenSquare    10000 non-null float64
Floor            10000 non-null int64
HouseFloor       10000 non-null float64
HouseYear        10000 non-null int64
Ecology_1        10000 non-null float64
Ecology_2        10000 non-null object
Ecology_3        10000 non-null object
Social_1         10000 non-null int64
Social_2         10000 non-null int64
Social_3         10000 non-null int64
Healthcare_1     5202 non-null float64
Helthcare_2      10000 non-null int64
Shops_1          10000 non-null int64
Shops_2          10000 non-null object
Price            10000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 1.5+ MB


In [4]:
# Replace missing LifeSquare values with 0. These zeros fall under the `< 15`
# rule of the cleaning loop that follows, which overwrites them with the mean
# LifeSquare of flats with the same room count.
# NOTE(review): that mean is computed while the zeros are still in the column,
# so it is pulled down by them.
data['LifeSquare'] = data['LifeSquare'].fillna(0)

In [5]:
# Repair implausible area values, one room count (0..8) at a time.
# Every replacement is the current mean of the column over all flats with that
# room count, so later steps see the values written by earlier ones; the five
# steps therefore have to run in exactly this order.
for n_rooms in range(0, 9):
    same_rooms = data['Rooms'] == n_rooms

    # 1. Total area below 15 -> group mean of Square.
    tiny_square = same_rooms & (data['Square'] < 15)
    data.loc[tiny_square, 'Square'] = data.loc[same_rooms, 'Square'].mean()

    # 2. Living area below 15 (includes the zeros used for missing values)
    #    -> group mean of LifeSquare.
    tiny_life = same_rooms & (data['LifeSquare'] < 15)
    data.loc[tiny_life, 'LifeSquare'] = data.loc[same_rooms, 'LifeSquare'].mean()

    # 3. Living area larger than total area -> group mean of LifeSquare.
    life_over_total = same_rooms & (data['Square'] < data['LifeSquare'])
    data.loc[life_over_total, 'LifeSquare'] = data.loc[same_rooms, 'LifeSquare'].mean()

    # 4. Still larger after step 3 -> group mean of Square.
    still_over_total = same_rooms & (data['Square'] < data['LifeSquare'])
    data.loc[still_over_total, 'Square'] = data.loc[same_rooms, 'Square'].mean()

    # 5. Kitchen larger than the non-living remainder -> group mean of KitchenSquare.
    remainder = data['Square'] - data['LifeSquare']
    kitchen_too_big = same_rooms & (data['KitchenSquare'] > remainder)
    data.loc[kitchen_too_big, 'KitchenSquare'] = data.loc[same_rooms, 'KitchenSquare'].mean()

In [6]:
# Keep only flats with 1..9 rooms and a price of 30000..600000 (bounds inclusive).
rooms_ok = data['Rooms'].between(1, 9)
price_ok = data['Price'].between(30000, 600000)
data = data.loc[rooms_ok & price_ok, :]

# One-hot encode the string-valued columns (Ecology_2, Ecology_3, Shops_2).
data = pd.get_dummies(data)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9969 entries, 0 to 9999
Data columns (total 23 columns):
Id               9969 non-null int64
DistrictId       9969 non-null int64
Rooms            9969 non-null float64
Square           9969 non-null float64
LifeSquare       9969 non-null float64
KitchenSquare    9969 non-null float64
Floor            9969 non-null int64
HouseFloor       9969 non-null float64
HouseYear        9969 non-null int64
Ecology_1        9969 non-null float64
Social_1         9969 non-null int64
Social_2         9969 non-null int64
Social_3         9969 non-null int64
Healthcare_1     5183 non-null float64
Helthcare_2      9969 non-null int64
Shops_1          9969 non-null int64
Price            9969 non-null float64
Ecology_2_A      9969 non-null uint8
Ecology_2_B      9969 non-null uint8
Ecology_3_A      9969 non-null uint8
Ecology_3_B      9969 non-null uint8
Shops_2_A        9969 non-null uint8
Shops_2_B        9969 non-null uint8
dtypes: float64(8), int64(

In [7]:
# Hold out 15% of the rows for validation; random_state pins the shuffle so the
# split is the same on every run.
train, valid = train_test_split(data, test_size=0.15, random_state=42)
train.shape, valid.shape

((8473, 23), (1496, 23))

In [8]:
# Model inputs: every column except the identifiers, the target (Price) and
# Healthcare_1 (presumably dropped because it still has missing values).
ex_col = ['Id', 'DistrictId', 'Price', 'Healthcare_1']
features = [col for col in data.columns if col not in ex_col]
features

['Rooms',
 'Square',
 'LifeSquare',
 'KitchenSquare',
 'Floor',
 'HouseFloor',
 'HouseYear',
 'Ecology_1',
 'Social_1',
 'Social_2',
 'Social_3',
 'Helthcare_2',
 'Shops_1',
 'Ecology_2_A',
 'Ecology_2_B',
 'Ecology_3_A',
 'Ecology_3_B',
 'Shops_2_A',
 'Shops_2_B']

In [9]:
# Baseline model: linear regression fitted on the training split and scored
# (R^2) on that same split.
X_train, y_train = train.loc[:, features], train['Price']
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_train)
r2_score(y_train, lr_pred)

0.476961525029605

In [10]:
# R^2 of the linear baseline on the held-out validation split.
lr_pred_valid = lr.predict(valid.loc[:, features])
r2_score(valid['Price'], lr_pred_valid)

0.4936648454890692

In [11]:
# Hyper-parameter grid for the random forest search: 2 * 2 * 2 * 1 * 1 * 4 = 32
# combinations.
param_grid = dict(
    bootstrap=[True, False],
    max_depth=range(13, 15),           # 13, 14
    max_features=range(2, 4),          # 2, 3
    min_samples_leaf=range(2, 3),      # single value: 2
    min_samples_split=range(2, 4, 2),  # single value: 2
    n_estimators=range(70, 90, 5),     # 70, 75, 80, 85
)

In [12]:
# Exhaustive search over `param_grid` with 3-fold cross-validation, ranked by
# R^2; n_jobs=-1 runs the fits in parallel on all available cores.
rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [13]:
# Evaluate every parameter combination on the training split.
grid_search.fit(train.loc[:, features], train['Price'])

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:   11.5s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'bootstrap': [True, False], 'max_depth': range(13, 15), 'max_features': range(2, 4), 'min_samples_leaf': range(2, 3), 'min_samples_split': range(2, 4, 2), 'n_estimators': range(70, 90, 5)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='r2', verbose=2)

In [14]:
# Parameter combination that achieved the best cross-validated score.
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 13,
 'max_features': 3,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 85}

In [15]:
# R^2 of the tuned random forest on the training split.
grid_pred = grid_search.predict(train.loc[:, features])
r2_score(train['Price'], grid_pred)

0.8903869123361913

In [16]:
# R^2 of the tuned random forest on the held-out validation split.
grid_pred_valid = grid_search.predict(valid.loc[:, features])
r2_score(valid['Price'], grid_pred_valid)

0.7483527054249779

In [17]:
# True when the tuned random forest scores a higher validation R^2 than the
# linear baseline.
r2_score(valid['Price'], lr_pred_valid) < r2_score(valid['Price'], grid_pred_valid)

True

# Предсказание на тесте

In [18]:
# Load the test set that predictions will be made for.
test = pd.read_csv('data/test.csv')
test.shape

(5000, 19)

In [19]:
# One-hot encode the string-valued columns of the test set.
raw_test_columns = set(test.columns)
test = pd.get_dummies(test)

# get_dummies only creates a column for a category that actually occurs in the
# frame it is given. A level present in the training data but absent here would
# leave a model feature missing and make `test.loc[:, features]` fail later, so
# add any such indicator column as all zeros. Columns that already existed in
# the raw test file are never touched.
for col in features:
    if col not in raw_test_columns and col not in test.columns:
        test[col] = 0

test.shape

(5000, 22)

In [20]:
# Column dtypes and non-null counts of the encoded test set.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 22 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Ecology_2_A      5000 non-null uint8
Ecology_2_B      5000 non-null uint8
Ecology_3_A      5000 non-null uint8
Ecology_3_B      5000 non-null uint8
Shops_2_A        5000 non-null uint8
Shops_2_B        5000 non-null uint8
dtypes: float64(7), int64(9), uint8(6)
memory usage: 654.4 KB


In [21]:
# Same treatment as for the training data: missing LifeSquare becomes 0.
test['LifeSquare'] = test['LifeSquare'].fillna(0)

In [22]:
# Distinct room counts in the test set, to check for implausible values.
test['Rooms'].unique()

array([ 2.,  1.,  3.,  4.,  5.,  6.,  0., 17.])

In [23]:
# Inspect the rows whose room count is 17.
test.loc[test['Rooms'] == 17, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_3,Healthcare_1,Helthcare_2,Shops_1,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B
3398,1435,111,17.0,52.866107,32.528342,8.0,15,17.0,1987,0.093443,...,5,3300.0,2,4,0,1,0,1,0,1


In [24]:
# Average room count of comparable flats: total area between 52 and 53, kitchen
# area exactly 8, and a room count below 17.
similar = (
    (test['Rooms'] < 17)
    & test['Square'].between(52, 53)
    & (test['KitchenSquare'] == 8)
)
test.loc[similar, 'Rooms'].mean()

2.04

In [25]:
# Overwrite the room count of the listing with Id 1435 (the row inspected above
# with Rooms == 17) with 2, then re-check the distinct room counts.
test.loc[test['Id'] == 1435, 'Rooms'] = 2
test['Rooms'].unique()

array([2., 1., 3., 4., 5., 6., 0.])

In [26]:
# Apply the area repairs to the test set, one room count (0..8) at a time.
# Replacement values are the test set's own per-room-count means, read at the
# moment each step runs, so the order of the five steps matters.
for n_rooms in range(0, 9):
    group = test['Rooms'] == n_rooms

    # Total area below 15 -> group mean of Square.
    bad_rows = group & (test['Square'] < 15)
    test.loc[bad_rows, 'Square'] = test.loc[group, 'Square'].mean()

    # Living area below 15 (includes the zeros used for missing values)
    # -> group mean of LifeSquare.
    bad_rows = group & (test['LifeSquare'] < 15)
    test.loc[bad_rows, 'LifeSquare'] = test.loc[group, 'LifeSquare'].mean()

    # Living area exceeds total area -> group mean of LifeSquare.
    bad_rows = group & (test['Square'] < test['LifeSquare'])
    test.loc[bad_rows, 'LifeSquare'] = test.loc[group, 'LifeSquare'].mean()

    # Still exceeds it after the previous step -> group mean of Square.
    bad_rows = group & (test['Square'] < test['LifeSquare'])
    test.loc[bad_rows, 'Square'] = test.loc[group, 'Square'].mean()

    # Kitchen larger than the non-living remainder -> group mean of KitchenSquare.
    bad_rows = group & (test['KitchenSquare'] > (test['Square'] - test['LifeSquare']))
    test.loc[bad_rows, 'KitchenSquare'] = test.loc[group, 'KitchenSquare'].mean()

In [27]:
# Predict prices for the test set with the fitted grid search, using the same
# feature columns the models were trained on.
pred_test = grid_search.predict(test.loc[:, features])
pred_test

array([163613.28852848, 208113.50512937, 256417.71989867, ...,
       335119.35557824, 191130.12935577, 184738.50356243])

In [28]:
# Store the predictions in a new Price column and preview the first rows.
test['Price'] = pred_test
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Healthcare_1,Helthcare_2,Shops_1,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Price
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,...,,0,0,0,1,0,1,0,1,163613.288528
1,15856,74,2.0,69.263183,28.319601,1.0,6,1.0,1977,0.075779,...,,0,2,0,1,0,1,0,1,208113.505129
2,5480,190,1.0,40.943143,15.948246,12.0,2,5.0,1909,0.0,...,4702.0,5,5,0,1,0,1,0,1,256417.719899
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,...,,3,3,0,1,0,1,0,1,336799.563507
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,...,,0,0,0,1,0,1,1,0,136816.653935


In [29]:
# Shape of the test frame after adding the Price column.
test.shape

(5000, 23)

In [30]:
# Write the submission file: one row per test listing with its Id and predicted
# Price. `index` is documented as a boolean; index=False keeps the DataFrame
# index out of the CSV (the previous index=None only worked by being falsy).
test.loc[:, ['Id', 'Price']].to_csv('IPashkov_predictions.csv', index=False)