In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# import seaborn as sns
import warnings

# Suppress every warning for the rest of the session. This also hides
# deprecation warnings, so drop this line while debugging.
warnings.filterwarnings('ignore')

# Show up to 100 columns when a DataFrame is displayed.
pd.options.display.max_columns = 100
%matplotlib inline

## Prepare data

In [None]:
# Load the labelled data (it carries the 'Price' target used below); the path
# is relative to the notebook's working directory.
data = pd.read_csv('input/train.csv')

In [None]:
# Hold out 30% of the rows for validation; the fixed random_state keeps the
# split reproducible between runs.
train, valid = train_test_split(data, test_size=0.3, random_state=42)

In [None]:
# Preview the first 20 rows of the training split.
train.head(20)

In [None]:
# Descriptive statistics of the training split.
train.describe()

In [None]:
# Column dtypes and non-null counts of the training split.
train.info()

In [None]:
def add_mean_price(df, train_df=None):
    """Attach price-level features computed from the training split.

    Two columns are added:

    * ``Mean_price_by_rooms`` - mean ``Price`` of the training rows with the
      same ``Rooms`` value; falls back to the overall training mean when that
      room count does not occur in the training split.
    * ``mean_price`` - mean ``Price`` of the training rows with the same
      ``(DistrictId, Rooms)`` pair; falls back to ``Mean_price_by_rooms`` when
      the pair does not occur in the training split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to enrich; must contain ``DistrictId`` and ``Rooms``. It is not
        modified.
    train_df : pandas.DataFrame, optional
        Frame the statistics are computed from; must contain ``DistrictId``,
        ``Rooms`` and ``Price``. Defaults to the notebook-level ``train``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of ``df`` in their original order. The
        index is rebuilt by ``pd.merge``, so the index of ``df`` is not kept.
    """
    if train_df is None:
        # Backward-compatible default: statistics always come from the
        # training split, even when ``df`` is the validation or test frame.
        train_df = train

    overall_mean = train_df['Price'].mean()
    by_rooms = (
        train_df.groupby('Rooms')['Price'].mean()
        .rename('Mean_price_by_rooms')
        .reset_index()
    )
    by_district_rooms = (
        train_df.groupby(['DistrictId', 'Rooms'])['Price'].mean()
        .rename('mean_price')
        .reset_index()
    )

    df = pd.merge(df, by_rooms, on='Rooms', how='left')
    df['Mean_price_by_rooms'] = df['Mean_price_by_rooms'].fillna(overall_mean)

    df = pd.merge(df, by_district_rooms, on=['DistrictId', 'Rooms'], how='left')
    # Pairs unseen in the training split would otherwise stay NaN; use the
    # coarser per-room mean (already backed by the overall mean) instead.
    df['mean_price'] = df['mean_price'].fillna(df['Mean_price_by_rooms'])

    return df
   

In [None]:
def clear_Healthcare_1(df, train_df=None):
    """Fill missing ``Healthcare_1`` values with training-split averages.

    A helper column ``Healthcare_1_mean`` is added that holds the mean
    ``Healthcare_1`` of the row's ``DistrictId`` in the training split, or the
    overall training mean when the district has no usable value there (all
    values missing, or the district is absent). Missing ``Healthcare_1``
    entries are then replaced by that helper value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; must contain ``DistrictId`` and ``Healthcare_1``. It
        is not modified.
    train_df : pandas.DataFrame, optional
        Frame the averages are computed from; must contain ``DistrictId`` and
        ``Healthcare_1``. Defaults to the notebook-level ``train``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Healthcare_1`` imputed and the extra
        ``Healthcare_1_mean`` column. The index is rebuilt by ``pd.merge``.
    """
    if train_df is None:
        # Backward-compatible default: averages always come from the training
        # split, whichever frame is being imputed.
        train_df = train

    overall_mean = train_df['Healthcare_1'].mean()
    district_mean = (
        train_df.groupby('DistrictId')['Healthcare_1'].mean()
        .fillna(overall_mean)  # districts whose training values are all missing
        .rename('Healthcare_1_mean')
        .reset_index()
    )

    filled = pd.merge(df, district_mean, on='DistrictId', how='left')
    # Districts absent from the training split get no match in the merge, so
    # back the helper column with the overall mean before using it.
    filled['Healthcare_1_mean'] = filled['Healthcare_1_mean'].fillna(overall_mean)
    filled['Healthcare_1'] = filled['Healthcare_1'].fillna(filled['Healthcare_1_mean'])
    return filled
    

In [None]:
def add_floor_rate(df):
    """Return a copy of ``df`` with the binary ``Is_first_last`` column added.

    Encoding (note that it reads inverted relative to the column name):

    * ``0`` when ``Floor == HouseFloor`` (presumably the top floor) or
      ``Floor < 2``;
    * ``1`` for every other row, including rows where the comparisons are
      False because a value is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Floor`` and ``HouseFloor``. It is left untouched;
        the flag is written to a copy so that calling this function directly
        on ``train``/``valid`` does not alter them as a side effect.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra integer column ``Is_first_last``.
    """
    flagged = df.copy()
    edge_floor = (flagged['Floor'] == flagged['HouseFloor']) | (flagged['Floor'] < 2)
    flagged['Is_first_last'] = (~edge_floor).astype('int64')
    return flagged

In [None]:
def prepare_data(data):
    """Run the feature-engineering steps on ``data`` and return the result.

    The steps are applied in this order: ``add_mean_price``,
    ``add_floor_rate``, ``clear_Healthcare_1``. Each step receives the frame
    returned by the previous one.
    """
    result = data
    for step in (add_mean_price, add_floor_rate, clear_Healthcare_1):
        result = step(result)
    return result
    

In [None]:
# Apply the same feature pipeline to the training and the validation split.
clear_train=prepare_data(train)
clear_valid=prepare_data(valid)

In [None]:
# Check dtypes and non-null counts after feature engineering.
clear_train.info()

## Model

In [None]:
# Columns passed to the model. 'Mean_price_by_rooms', 'Is_first_last' and the
# imputed 'Healthcare_1' are produced by prepare_data(); 'HouseFloor' is
# currently commented out of the list.
feats = ['Rooms', 'Square', 
         'KitchenSquare', 
         'Floor', 
#          'HouseFloor', 
         'HouseYear',
         'Ecology_1',
         'Social_1',  'Social_2',  'Social_3',
         'Shops_1','Mean_price_by_rooms',
         'Is_first_last','Healthcare_1' ]

# ---------------------------------------------------------

In [None]:
from sklearn.metrics import r2_score as r2

In [None]:
def evaluate_model(model, train, valid, metric, feats, target, max_depth):
    """Fit one estimator at the given depth and score it on both splits.

    Parameters
    ----------
    model : callable
        Estimator factory called as ``model(max_depth=..., random_state=42)``;
        the result must offer ``fit(X, y)`` and ``predict(X)``.
    train, valid : pandas.DataFrame
        Frames containing the ``feats`` columns and the ``target`` column.
    metric : callable
        Called as ``metric(y_true, y_pred)``.
    feats : list
        Feature column labels.
    target : str
        Target column label.
    max_depth : int
        Depth forwarded to the estimator factory.

    Returns
    -------
    tuple
        ``(metric on train, metric on valid)``.
    """
    estimator = model(max_depth=max_depth, random_state=42)
    estimator.fit(train.loc[:, feats], train[target])

    splits = (train, valid)
    # Predict on both splits first, then score them, train before valid.
    predictions = [estimator.predict(split.loc[:, feats]) for split in splits]
    return tuple(
        metric(split[target], predicted)
        for split, predicted in zip(splits, predictions)
    )

In [None]:
def check_param(min_value, max_value, model, train=None, valid=None, metric=None, feats=None,
                target='Price'):
    """Score ``model`` for every ``max_depth`` in ``[min_value, max_value]``.

    Parameters
    ----------
    min_value, max_value : int
        Inclusive bounds of the depth sweep. An empty range yields three
        empty lists.
    model : callable
        Estimator factory forwarded to ``evaluate_model``.
    train, valid : pandas.DataFrame, optional
        Default to the notebook-level ``clear_train`` / ``clear_valid``.
    metric : callable, optional
        Defaults to the notebook-level ``r2``.
    feats : list, optional
        Defaults to the notebook-level ``feats``.
    target : str
        Target column label.

    Returns
    -------
    tuple of lists
        ``(depths, train scores, valid scores)``, aligned by position.
    """
    # The defaults are resolved at call time. Binding the frames in the
    # signature would freeze the objects that existed when this cell ran, so
    # re-running the data-preparation cell would silently leave stale frames
    # in the defaults.
    if train is None:
        train = clear_train
    if valid is None:
        valid = clear_valid
    if metric is None:
        metric = r2
    if feats is None:
        # The parameter shadows the module-level name, hence the explicit lookup.
        feats = globals()['feats']

    max_depth_arr = list(range(min_value, max_value + 1))
    metric_train_arr = []
    metric_valid_arr = []

    for depth in max_depth_arr:
        train_score, valid_score = evaluate_model(model, train, valid, metric, feats,
                                                  target, max_depth=depth)
        metric_train_arr.append(train_score)
        metric_valid_arr.append(valid_score)
    return max_depth_arr, metric_train_arr, metric_valid_arr

In [None]:
from sklearn.ensemble import RandomForestRegressor as RFR

In [None]:
# Sweep max_depth from 1 to 13 for a random forest, using check_param's
# default frames, metric and feature list.
max_depth_arr, metric_train_arr, metric_valid_arr = check_param(min_value=1, max_value=13, model=RFR)

In [None]:
# Depth sweep curves: label both lines and the axes so the figure can be read
# on its own (the two curves were otherwise indistinguishable).
plt.plot(max_depth_arr, metric_train_arr, label='train')
plt.plot(max_depth_arr, metric_valid_arr, label='valid')
plt.xlabel('max_depth')
plt.ylabel('R2')
plt.legend();

In [None]:
# Best validation score reached in the sweep.
np.max(metric_valid_arr)

In [None]:
# NOTE(review): this cell repeats the previous one and shows the same value;
# it can be removed.
np.max(metric_valid_arr)

In [None]:
# Validation scores of the sweep, one per depth, in sweep order.
metric_valid_arr

In [None]:
# Final model, fitted on the prepared training split.
# NOTE(review): max_depth=11 is hard-coded - presumably picked from the sweep
# above; confirm it still matches the best validation depth after any change.
model = RFR(max_depth=11, random_state=42)
model.fit(clear_train.loc[:, feats], clear_train['Price'])

# R2 on the validation split

In [None]:
from sklearn.metrics import r2_score as r2

In [None]:
# Predict prices for the prepared validation split with the final model.
y_pred_valid = model.predict(clear_valid.loc[:, feats])

In [None]:
# R2 of the final model on the validation split (true values first).
r2(clear_valid['Price'], y_pred_valid)

# Predictions for the test set

In [None]:
# Load the test data; the path is relative to the notebook's working directory.
test = pd.read_csv('input/test.csv')

In [None]:
# Same feature pipeline as for train/valid; its statistics are taken from the
# notebook-level `train` frame inside the helper functions.
clear_test = prepare_data(test)


In [None]:
# Check dtypes and non-null counts of the prepared test frame.
clear_test.info()

In [None]:
# Store the predicted prices in a new 'Price' column of the prepared test frame.
clear_test['Price'] = model.predict(clear_test.loc[:, feats])

In [None]:
# Preview the test rows together with their predicted 'Price'.
clear_test.head()

In [None]:
# Write the submission file with the 'Id' and predicted 'Price' columns only.
# `index` is a boolean flag: pass False explicitly instead of relying on None
# being falsy, so the row index is not written.
clear_test.loc[:, ['Id', 'Price']].to_csv('IMasloed_predictions.csv', index=False)