In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from scipy import stats

pd.options.display.max_columns = 100

In [2]:
# Load the full labelled dataset; it is split into train/validation parts further down.
datafr = pd.read_csv('train.csv')

In [3]:
datafr.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


### Split

In [4]:
from sklearn.model_selection import train_test_split 

In [5]:
# Hold out 30% of the rows for validation; the fixed random_state keeps the split reproducible.
train, valid = train_test_split(datafr, test_size=0.3, random_state=42)

### Prepare

In [6]:
# Imputation values are computed from the training split only; health_care() and
# life_square() reuse them for every frame they are given (train, valid, test).
health_mean = train['Healthcare_1'].mean()
life_square_mean = train['LifeSquare'].mean()

def health_care(data, fill_value=None):
    """Fill missing 'Healthcare_1' values and return the result as a new frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'Healthcare_1' column. It is not modified.
    fill_value : scalar, optional
        Value used for missing entries. When omitted, the module-level
        ``health_mean`` (computed from the training split) is used.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with 'Healthcare_1' free of NaN.
    """
    if fill_value is None:
        # Resolved at call time so a recomputed health_mean is picked up.
        fill_value = health_mean
    # assign() builds a new frame, so the caller's DataFrame stays untouched.
    return data.assign(Healthcare_1=data['Healthcare_1'].fillna(fill_value))

def life_square(data, fill_value=None):
    """Fill missing 'LifeSquare' values and return the result as a new frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'LifeSquare' column. It is not modified.
    fill_value : scalar, optional
        Value used for missing entries. When omitted, the module-level
        ``life_square_mean`` (computed from the training split) is used.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with 'LifeSquare' free of NaN.
    """
    if fill_value is None:
        # Resolved at call time so a recomputed life_square_mean is picked up.
        fill_value = life_square_mean
    # assign() builds a new frame, so the caller's DataFrame stays untouched.
    return data.assign(LifeSquare=data['LifeSquare'].fillna(fill_value))

In [7]:
def clean_rooms(data, max_rooms=5, min_rooms=1):
    """Clamp implausible 'Rooms' values and return the result as a new frame.

    Values above ``max_rooms`` are capped at ``max_rooms``; values equal to 0
    are replaced with ``min_rooms``. All other values (including NaN) are left
    as they are.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'Rooms' column. It is not modified.
    max_rooms : number, default 5
        Upper cap for the room count.
    min_rooms : number, default 1
        Replacement for rows whose room count is exactly 0.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the cleaned 'Rooms' column.
    """
    # Work on a copy: .loc assignment would otherwise write into the caller's
    # frame (and warn when that frame is a slice of another one).
    data = data.copy()
    data.loc[data['Rooms'] > max_rooms, 'Rooms'] = max_rooms
    data.loc[data['Rooms'] == 0, 'Rooms'] = min_rooms
    return data

In [8]:
def clean_square(data, min_square=15):
    """Raise too-small 'Square' values to a floor and return a new frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'Square' column. It is not modified.
    min_square : number, default 15
        Rows with 'Square' strictly below this value are set to it.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the cleaned 'Square' column.
    """
    # Work on a copy so the caller's frame is not changed by the .loc write.
    data = data.copy()
    data.loc[data['Square'] < min_square, 'Square'] = min_square
    return data

In [9]:
def clean_year(data, max_year=2020):
    """Cap 'HouseYear' values at ``max_year`` and return a new frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'HouseYear' column. It is not modified.
    max_year : int, default 2020
        Rows with 'HouseYear' strictly above this value are set to it.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the cleaned 'HouseYear' column.
    """
    # Work on a copy so the caller's frame is not changed by the .loc write.
    data = data.copy()
    data.loc[data['HouseYear'] > max_year, 'HouseYear'] = max_year
    return data

In [10]:
# Price statistics used by add_smth(): a global mean plus two lookup tables.
# NOTE(review): these are derived from `train` as it is at this point, i.e. before
# prepare_data() has cleaned 'Rooms' - confirm the group keys match the cleaned data.
mean_price = train['Price'].mean()

# Mean price per (district, room count) pair.
mean_price_for_dsr = (
    train
    .groupby(['DistrictId', 'Rooms'], as_index=False)[['Price']]
    .mean()
    .rename(columns={'Price': 'mean_price_for_dsr'})
)

# Mean price per room count.
mean_price_for_rooms = (
    train
    .groupby('Rooms', as_index=False)[['Price']]
    .mean()
    .rename(columns={'Price': 'mean_price_for_rooms'})
)

In [11]:
#col = ['Social_1', 'Social_2', 'Social_3']
#sum_soc = train[col].sum(axis=1)
#mean_social_sum = sum_soc.mean()

In [12]:
def add_smth(
    data,
    mean_price=mean_price,
    mean_price_for_rooms=mean_price_for_rooms,
    mean_price_for_dsr=mean_price_for_dsr,
):
    """Attach mean-price features to ``data``.

    Left-joins the per-(DistrictId, Rooms) and per-Rooms mean price tables.
    Rows without a per-Rooms match fall back to ``mean_price``; rows without a
    per-(DistrictId, Rooms) match fall back to their (already filled) per-Rooms
    value.

    Returns the merged frame with the added 'mean_price_for_dsr' and
    'mean_price_for_rooms' columns.
    """
    enriched = data.merge(mean_price_for_dsr, on=['DistrictId', 'Rooms'], how='left')
    enriched = enriched.merge(mean_price_for_rooms, on='Rooms', how='left')

    # Fill the coarser feature first: it is the fallback for the finer one.
    rooms_price = enriched['mean_price_for_rooms'].fillna(mean_price)
    enriched['mean_price_for_rooms'] = rooms_price
    enriched['mean_price_for_dsr'] = enriched['mean_price_for_dsr'].fillna(rooms_price)
    return enriched


In [13]:
# Number of training rows per district, as a two-column frame:
# 'DistrictId' and 'large_district' (the raw count at this point).
# Naming the axis and the value column explicitly keeps this independent of the
# pandas version: since pandas 2.0 value_counts() labels its result 'count' and
# its index 'DistrictId', so renaming 'index'/'DistrictId' after reset_index()
# no longer yields the expected columns.
district_size = (
    train['DistrictId']
    .value_counts()
    .rename_axis('DistrictId')
    .reset_index(name='large_district')
)

In [14]:
# Turn the per-district row count into a 0/1 flag: 1 when the district has more
# rows than the (integer-truncated) average district, 0 otherwise.
district_size_mean = int(district_size['large_district'].mean())
is_large = district_size['large_district'] > district_size_mean
district_size['large_district'] = is_large.astype('int64')

In [15]:
def large_district(data, district_size=district_size):
    """Attach the 'large_district' flag to ``data``.

    Left-joins ``district_size`` on 'DistrictId'. Districts that are missing
    from the lookup get the flag 0. Returns the merged frame.
    """
    merged = data.merge(district_size, on='DistrictId', how='left')
    # A district absent from the lookup table is treated as "not large".
    flag = merged['large_district'].fillna(0)
    merged['large_district'] = flag
    return merged

In [16]:
def prepare_data(data):
    """Run the full preparation pipeline on ``data`` and return the result.

    The frame is first passed through ``pd.get_dummies`` and then through each
    cleaning / feature step in order: rooms, square, year, Healthcare_1 fill,
    LifeSquare fill, large-district flag.
    """
    # add_smth is intentionally left out of the pipeline (it was disabled).
    steps = (
        clean_rooms,
        clean_square,
        clean_year,
        health_care,
        life_square,
        large_district,
    )

    prepared = pd.get_dummies(data)
    for step in steps:
        prepared = step(prepared)

    return prepared

In [17]:
# NOTE(review): this rebinds `train` to its prepared version, so re-running the cell
# feeds an already prepared frame back into prepare_data(). That frame already has a
# 'large_district' column, so the merge in large_district() will likely produce
# suffixed columns and fail - restart and run all instead of re-executing this cell.
train = prepare_data(train)

In [18]:
# Same pipeline for the validation split; the fill values and the district lookup
# it uses were computed from the training split.
valid = prepare_data(valid)

### Model

In [19]:
from sklearn.ensemble import RandomForestRegressor as RF
# Random forest regressor: 30 trees, depth capped at 12, fixed seed for reproducibility.
model = RF(n_estimators=30, max_depth=12, random_state=42)

In [20]:
from sklearn.metrics import r2_score as r2
# Earlier, smaller feature set (relies on add_smth, which is disabled in prepare_data):
#feats = ['Rooms','Square', 'HouseYear', 'mean_price_for_dsr']
# Columns fed to the model. 'Helthcare_2' is spelled exactly as in the dataset header.
feats = ['Rooms', 'Square', 'HouseYear', 'Social_1', 'Social_2', 'Social_3', 'Helthcare_2', 'Shops_1', 'large_district', 'Floor']

In [21]:
model.fit(train.loc[:, feats], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [22]:
def evaluate_model(df, model=model, feats=feats, calculate_r2='Yes'):
    """Predict with ``model`` on ``df`` and optionally print the R2 score.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed in ``feats``; it must also contain
        'Price' when the R2 score is requested.
    model : fitted estimator
        Object exposing ``predict``; defaults to the notebook's model.
    feats : list of str
        Feature columns passed to ``model.predict``.
    calculate_r2 : {'Yes', 'No'} or bool, default 'Yes'
        'Yes' or ``True`` prints the R2 of the predictions against
        ``df['Price']``. Any other value skips the score.

    Returns
    -------
    Predictions as returned by ``model.predict``.
    """
    pred = model.predict(df.loc[:, feats])
    # Accept a real boolean as well as the original 'Yes' string flag, so that
    # calculate_r2=True no longer silently skips the score.
    if calculate_r2 is True or calculate_r2 == 'Yes':
        r2_ = r2(df['Price'], pred)
        print('R2: {}'.format(r2_))
    return pred

In [23]:
pred_train = evaluate_model(train)

R2: 0.8978403473898974


In [24]:
pred_valid = evaluate_model(valid)

R2: 0.6975807973057828


### Test

In [25]:
test = pd.read_csv('test.csv')

In [26]:
test = prepare_data(test)

In [27]:
# Store the predictions as the 'Price' column. R2 is skipped because it needs true
# prices in df['Price'] (presumably absent from test.csv - confirm).
test['Price'] = evaluate_model(test, calculate_r2='No')

In [28]:
test['Price'].head(10)

0    155218.383595
1    210958.289189
2    280394.638577
3    380965.882757
4    136296.483426
5    197616.406304
6    173820.351307
7    215178.204033
8    293830.865658
9    202925.084999
Name: Price, dtype: float64

In [29]:
# Write the submission file with only the id and the predicted price.
# index=False is the documented way to omit the row index (None only worked
# because it is falsy).
test.loc[:, ['Id', 'Price']].to_csv('VSvirova_predictions.csv', index=False)