In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer


from sklearn import ensemble

In [2]:
data = pd.read_csv('data/train.csv')
data.shape

(10000, 20)

In [3]:
dif_mean = (data['Square'] - data['LifeSquare']).mean()

In [4]:
data.loc[data['Square'] < data['LifeSquare'], 'LifeSquare'] = data['Square'] - dif_mean
data = data.loc[data['Rooms'].between(1, 9), :]
data = data.loc[(data['Square'] > 15) & (data['LifeSquare'] > 15), :]
data = data.loc[data['Price'].between(30000, 600000), :]
data = data.loc[data['HouseYear'] < 2020, :]

data = pd.get_dummies(data)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7576 entries, 0 to 9999
Data columns (total 23 columns):
Id               7576 non-null int64
DistrictId       7576 non-null int64
Rooms            7576 non-null float64
Square           7576 non-null float64
LifeSquare       7576 non-null float64
KitchenSquare    7576 non-null float64
Floor            7576 non-null int64
HouseFloor       7576 non-null float64
HouseYear        7576 non-null int64
Ecology_1        7576 non-null float64
Social_1         7576 non-null int64
Social_2         7576 non-null int64
Social_3         7576 non-null int64
Healthcare_1     4301 non-null float64
Helthcare_2      7576 non-null int64
Shops_1          7576 non-null int64
Price            7576 non-null float64
Ecology_2_A      7576 non-null uint8
Ecology_2_B      7576 non-null uint8
Ecology_3_A      7576 non-null uint8
Ecology_3_B      7576 non-null uint8
Shops_2_A        7576 non-null uint8
Shops_2_B        7576 non-null uint8
dtypes: float64(8), int64(

In [5]:
train, valid = train_test_split(data, test_size=0.2, random_state=42)
train.shape, valid.shape

((6060, 23), (1516, 23))

In [6]:
features = list(data.columns.values)
ex_col = ['Id', 'DistrictId', 'Price', 'Healthcare_1']
features = [i for i in features if not(i in ex_col)]
features

['Rooms',
 'Square',
 'LifeSquare',
 'KitchenSquare',
 'Floor',
 'HouseFloor',
 'HouseYear',
 'Ecology_1',
 'Social_1',
 'Social_2',
 'Social_3',
 'Helthcare_2',
 'Shops_1',
 'Ecology_2_A',
 'Ecology_2_B',
 'Ecology_3_A',
 'Ecology_3_B',
 'Shops_2_A',
 'Shops_2_B']

In [7]:
lr = LinearRegression()
lr.fit(train.loc[:, features], train['Price'])
lr_pred = lr.predict(train.loc[:, features])
r2_score(train['Price'], lr_pred)

0.47282999359032896

In [8]:
lr_pred_valid = lr.predict(valid.loc[:, features])
r2_score(valid['Price'], lr_pred_valid)

0.4756060325551753

In [9]:
param_grid = {
    'bootstrap': [True, False],
    'max_depth': range(13, 15, 1),
    'max_features': range(3, 5, 1),
    'min_samples_leaf': range(2, 3, 1),
    'min_samples_split': range(2, 4, 2),
    'n_estimators': range(100, 120, 10)
}

In [10]:
rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [11]:
grid_search.fit(train.loc[:, features], train['Price'])

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    8.2s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'bootstrap': [True, False], 'max_depth': range(13, 15), 'max_features': range(3, 5), 'min_samples_leaf': range(2, 3), 'min_samples_split': range(2, 4, 2), 'n_estimators': range(100, 120, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [12]:
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 14,
 'max_features': 3,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 110}

In [13]:
grid_pred = grid_search.predict(train.loc[:, features])
r2_score(train['Price'], grid_pred)

0.9102105145054539

In [14]:
grid_pred_valid = grid_search.predict(valid.loc[:, features])
r2_score(valid['Price'], grid_pred_valid)

0.6995622271545112

In [15]:
r2_score(valid['Price'], lr_pred_valid) < r2_score(valid['Price'], grid_pred_valid)

True

# Предсказание на тесте

In [16]:
test = pd.read_csv('data/test.csv')
test.shape

(5000, 19)

In [17]:
test = pd.get_dummies(test)
test.shape

(5000, 22)

In [18]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 22 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Ecology_2_A      5000 non-null uint8
Ecology_2_B      5000 non-null uint8
Ecology_3_A      5000 non-null uint8
Ecology_3_B      5000 non-null uint8
Shops_2_A        5000 non-null uint8
Shops_2_B        5000 non-null uint8
dtypes: float64(7), int64(9), uint8(6)
memory usage: 654.4 KB


In [19]:
test = test.fillna(0)
test.loc[test['LifeSquare'] == 0, 'LifeSquare'] = test['Square'] - dif_mean
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 22 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       5000 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     5000 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Ecology_2_A      5000 non-null uint8
Ecology_2_B      5000 non-null uint8
Ecology_3_A      5000 non-null uint8
Ecology_3_B      5000 non-null uint8
Shops_2_A        5000 non-null uint8
Shops_2_B        5000 non-null uint8
dtypes: float64(7), int64(9), uint8(6)
memory usage: 654.4 KB


In [20]:
pred_test = grid_search.predict(test.loc[:, features])
pred_test

array([158010.57140945, 195118.88050963, 254426.94634735, ...,
       330553.62093178, 209027.27176872, 192772.38887635])

In [21]:
test['Price'] = pred_test
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Healthcare_1,Helthcare_2,Shops_1,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Price
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,...,0.0,0,0,0,1,0,1,0,1,158010.571409
1,15856,74,2.0,69.263183,51.228568,1.0,6,1.0,1977,0.075779,...,0.0,0,2,0,1,0,1,0,1,195118.88051
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.0,...,4702.0,5,5,0,1,0,1,0,1,254426.946347
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,...,0.0,3,3,0,1,0,1,0,1,359564.565668
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,...,0.0,0,0,0,1,0,1,1,0,139788.6963


In [22]:
test.loc[:, ['Id', 'Price']].to_csv('IPashkov_predictions.csv', index=None)