In [None]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, train_test_split

In [None]:
# Read the training set from disk; the trailing expression displays (rows, columns).
data = pd.read_csv(filepath_or_buffer='data/train.csv')
data.shape

In [None]:
# Average gap between total area and living area. pandas skips rows where the
# difference is NaN, so only rows with both values present contribute.
dif_mean = data['Square'].sub(data['LifeSquare']).mean()

In [None]:
# A living area larger than the total area is treated as invalid and replaced by
# the total area minus the average Square/LifeSquare gap.
life_exceeds_total = data['LifeSquare'] > data['Square']
data.loc[life_exceeds_total, 'LifeSquare'] = data['Square'] - dif_mean

# Keep only rows that pass every sanity filter at once.
# NOTE(review): a missing LifeSquare fails the `> 15` comparison, so such rows are
# dropped here, whereas the test set later imputes them - confirm this is intended.
rows_to_keep = (
    data['Rooms'].between(1, 9)
    & (data['Square'] > 15)
    & (data['LifeSquare'] > 15)
    & data['Price'].between(30000, 600000)
    & (data['HouseYear'] < 2020)
)

# One-hot encode the remaining non-numeric columns and show the resulting schema.
data = pd.get_dummies(data.loc[rows_to_keep, :])
data.info()

In [None]:
# Hold out 20% of the cleaned rows for validation; the fixed seed keeps the split repeatable.
train, valid = train_test_split(data, random_state=42, test_size=0.2)
(train.shape, valid.shape)

In [None]:
# Model inputs: every column except the id columns, the Price target and Healthcare_1.
ex_col = ['Id', 'DistrictId', 'Price', 'Healthcare_1']
features = [column for column in data.columns if column not in ex_col]
features

In [None]:
# Baseline model: linear regression fitted on the training split,
# then scored with R^2 on that same split.
lr = LinearRegression()
lr.fit(train[features], train['Price'])
lr_pred = lr.predict(train[features])
r2_score(y_true=train['Price'], y_pred=lr_pred)

In [None]:
# R^2 of the linear baseline on the held-out validation split.
lr_pred_valid = lr.predict(valid[features])
r2_score(y_true=valid['Price'], y_pred=lr_pred_valid)

In [None]:
# Random-forest hyperparameter values explored by the grid search.
# Each entry is an iterable of candidate values for that parameter.
param_grid = dict(
    bootstrap=[True, False],
    max_depth=range(13, 15),
    max_features=range(3, 5),
    min_samples_leaf=range(2, 3),
    min_samples_split=range(2, 4, 2),
    n_estimators=range(100, 120, 10),
)

In [None]:
# Exhaustive search over param_grid with 3-fold cross-validation;
# n_jobs=-1 asks scikit-learn to use every available processor.
rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the cross-validated search on the training split.
grid_search.fit(train[features], train['Price'])

In [None]:
# Display the hyperparameter combination that scored best in the cross-validated search.
grid_search.best_params_

In [None]:
# R^2 of the tuned forest on the training split (the data it was fitted on).
grid_pred = grid_search.predict(train[features])
r2_score(y_true=train['Price'], y_pred=grid_pred)

In [None]:
# R^2 of the tuned forest on the held-out validation split.
grid_pred_valid = grid_search.predict(valid[features])
r2_score(y_true=valid['Price'], y_pred=grid_pred_valid)

In [None]:
# True when the tuned forest beats the linear baseline on validation R^2.
r2_score(valid['Price'], grid_pred_valid) > r2_score(valid['Price'], lr_pred_valid)

# Предсказание на тесте

In [None]:
# Read the test set from disk; the trailing expression displays (rows, columns).
test = pd.read_csv(filepath_or_buffer='data/test.csv')
test.shape

In [None]:
# One-hot encode the non-numeric test columns, as was done for the training data.
test = pd.get_dummies(data=test)
test.shape

In [None]:
# Show column dtypes and non-null counts so that missing values in the test set are visible.
test.info()

In [None]:
# Decide which living areas need replacing BEFORE NaNs are zero-filled, so the
# rule is stated explicitly rather than through 0 as a sentinel value.
#   * missing (or recorded as 0) living area
#   * living area larger than the total area - the training data received this
#     same correction, so the model never saw such rows and the test set must
#     be cleaned the same way.
life_missing = test['LifeSquare'].fillna(0) == 0
life_exceeds_total = test['LifeSquare'] > test['Square']

# Remaining NaNs in every column are filled with 0, as before.
test = test.fillna(0)

# Replacement value: total area minus the average Square/LifeSquare gap
# computed on the training data.
test.loc[life_missing | life_exceeds_total, 'LifeSquare'] = test['Square'] - dif_mean
test.info()

In [None]:
# Train and test were one-hot encoded independently, so a category level that is
# absent from the test file would leave a training feature column missing and make
# a plain column selection raise KeyError. Reindexing to the training feature list
# keeps the column order the model was fitted with and creates any missing
# column filled with 0.
pred_test = grid_search.predict(test.reindex(columns=features, fill_value=0))
pred_test

In [None]:
# Attach the predicted prices to the test frame and preview the first five rows.
test['Price'] = pred_test
test.head(n=5)

In [None]:
# Write the submission file with only the id and predicted price columns.
# `index` is documented as a bool; False states explicitly that row labels
# are not written, instead of relying on None being falsy.
test[['Id', 'Price']].to_csv('IPashkov_predictions.csv', index=False)