In [157]:
%matplotlib inline
import pandas as pd
import numpy as np
import xgboost as xgb

In [158]:
# Read the train and test tables from the working directory.
train, test = [pd.read_csv(csv_path) for csv_path in ('train_clean.csv', 'test_clean.csv')]

## Add additional features

In [159]:
def _period_counts(period_key):
    """Return, for every row, how many rows of the same Series share its key."""
    return period_key.map(period_key.value_counts().to_dict())


def _week_of_year(timestamps):
    """Return the week-of-year number for a datetime Series.

    ``Series.dt.weekofyear`` no longer exists in pandas 2.x, so use
    ``dt.isocalendar().week`` whenever the installed pandas offers it and
    fall back to the legacy accessor otherwise.
    """
    accessor = timestamps.dt
    if hasattr(accessor, 'isocalendar'):
        # isocalendar() yields a nullable UInt32 column; cast to float so the
        # key arithmetic below stays plain numeric (missing values become NaN).
        return accessor.isocalendar().week.astype('float64')
    return accessor.weekofyear


def _add_basic_features(df):
    """Add calendar-count, calendar and size-ratio features to ``df`` in place.

    Counts are computed within ``df`` only, so the train and test frames each
    get counts relative to their own rows. Columns are added in a fixed order
    because the feature list is later derived from the column order.

    Returns the same (mutated) DataFrame.
    """
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    stamp = df['timestamp']

    # Add month-year count: rows of this frame in the same calendar month.
    df['month_year_cnt'] = _period_counts(stamp.dt.month + stamp.dt.year * 100)
    # Add week-year count: rows of this frame in the same week of the year.
    # NOTE(review): the key pairs the week number with the *calendar* year, so
    # late-December days in week 1 share a key with early January of the same
    # calendar year. Behaviour kept as in the original; confirm if intended.
    df['week_year_cnt'] = _period_counts(_week_of_year(stamp) + stamp.dt.year * 100)

    # Add month and day-of-week
    df['month'] = stamp.dt.month
    df['dow'] = stamp.dt.dayofweek

    # Other feature engineering
    df['rel_floor'] = df['floor'] / df['max_floor'].astype(float)
    df['rel_kitch_sq'] = df['kitch_sq'] / df['full_sq'].astype(float)

    # NOTE: assigning through attribute access stores a plain attribute on the
    # DataFrame object; it does NOT add an `apartment_name` column, so this
    # value never reaches the feature list. Kept as an attribute on purpose:
    # turning it into a string column would feed a non-numeric column to the
    # model. The test frame previously combined its own `sub_area` with the
    # *train* frame's `metro_km_avto`; each frame now uses its own columns.
    df.apartment_name = df.sub_area.astype(str) + df['metro_km_avto'].astype(str)

    df['room_size'] = df['life_sq'] / df['num_room'].astype(float)
    return df


train = _add_basic_features(train)
test = _add_basic_features(test)

In [160]:
# Macro-economic columns (keyed by date) to attach to every transaction.
macro_cols = [
    'timestamp',
    'balance_trade',
    'balance_trade_growth',
    'eurrub',
    'average_provision_of_build_contract',
    'micex_rgbi_tr',
    'micex_cbi_tr',
    'deposits_rate',
    'mortgage_value',
    'mortgage_rate',
    'income_per_cap',
    'rent_price_4+room_bus',
    'museum_visitis_per_100_cap',
    'apartment_build',
]
# Parse the join key as datetime: the train/test `timestamp` columns are
# converted with pd.to_datetime before the merge, and current pandas refuses
# to merge a datetime64 key against unparsed CSV strings.
macro = pd.read_csv('macro_c.csv', parse_dates=['timestamp'])[macro_cols]

In [161]:
# Left-join the macro indicators onto both frames by transaction date.
train = pd.merge(train, macro, on='timestamp', how='left')
test = pd.merge(test, macro, on='timestamp', how='left')

In [162]:
def get_ratio_preschool(df):
    """Add a `ratio_preschool` column to `df` in place and return `df`.

    The value is `children_preschool` divided by `children_school + 1`;
    the +1 keeps the denominator non-zero when there are no school children.
    """
    school_plus_one = df['children_school'] + 1
    df['ratio_preschool'] = df['children_preschool'] / school_plus_one
    return df

# Add the preschool ratio to both frames.
train, test = get_ratio_preschool(train), get_ratio_preschool(test)

def get_extra_area(df):
    """Add an `extra_area` column (`full_sq` minus `life_sq`) to `df` in place.

    Returns the same DataFrame object.
    """
    total_area, living_area = df['full_sq'], df['life_sq']
    df['extra_area'] = total_area - living_area
    return df

# Add the non-living area to both frames.
train, test = get_extra_area(train), get_extra_area(test)

def get_floor_ratio(df):
    """Add a `floor_ratio` column to `df` in place and return `df`.

    Despite the column name, the stored value is a difference, not a ratio:
    `max_floor - floor`.
    """
    top_floor, own_floor = df['max_floor'], df['floor']
    df['floor_ratio'] = top_floor - own_floor
    return df

# Add the floors-to-top difference to both frames.
train, test = get_floor_ratio(train), get_floor_ratio(test)

def get_room_avg_size(df):
    """Add a `room_avg_size` column to `df` in place and return `df`.

    The value is `(life_sq - kitch_sq) / (num_room + 1)`.
    """
    non_kitchen_area = df['life_sq'] - df['kitch_sq']
    rooms_plus_one = df['num_room'] + 1
    df['room_avg_size'] = non_kitchen_area / rooms_plus_one
    return df

# Add the average room size to both frames.
train, test = get_room_avg_size(train), get_room_avg_size(test)

In [163]:
def quarter_cal(Month):
    """Map a month number to its quarter label ('Q1'..'Q4').

    Months 1-3 give 'Q1', values above 3 up to 6 give 'Q2', values above 6
    up to 9 give 'Q3'. Every other value (including anything outside 1-12)
    falls through to 'Q4'.
    """
    if 1 <= Month <= 3:
        return 'Q1'
    if 3 < Month <= 6:
        return 'Q2'
    if 6 < Month <= 9:
        return 'Q3'
    return 'Q4'

In [164]:
# Build a "<year><quarter>" key (e.g. "2014Q3") for every row of each frame.
train_quarter = [
    str(year) + quarter_cal(month)
    for year, month in zip(train.timestamp.dt.year, train.month)
]
test_quarter = [
    str(year) + quarter_cal(month)
    for year, month in zip(test.timestamp.dt.year, test.month)
]

In [165]:
# Attach the year-quarter keys as the join column on each frame.
for frame, quarter_keys in ((train, train_quarter), (test, test_quarter)):
    frame['Quarter'] = quarter_keys

Join External Data

In [166]:
# External quarterly table: keep only the join key and the 'Primary Average' column.
quarter_cols = ['Quarter', 'Primary Average']
quarter_data = pd.read_csv('Quaterly Data.csv', sep=';')[quarter_cols]

In [167]:
# Left-join the quarterly table onto both frames via the 'Quarter' key.
train = pd.merge(train, quarter_data, how='left', on='Quarter')
test = pd.merge(test, quarter_data, how='left', on='Quarter')

In [168]:
# The quarter key has served its purpose as a join column; remove it again.
train = train.drop('Quarter', axis=1)
test = test.drop('Quarter', axis=1)

## Idea: Predict Gap Between Prices instead of Price Directly

In [169]:
# Impute missing `full_sq` with the mean of the observed training values;
# the same training mean is applied to the test frame.
observed_full_sq = train.loc[train['full_sq'].notnull(), 'full_sq']
fullsq_fill = observed_full_sq.mean()
for frame in (train, test):
    frame['full_sq'] = frame['full_sq'].fillna(fullsq_fill)

In [170]:
# Price per unit of `full_sq`, computed column-wise instead of through a
# Python-level loop over two lists. `Price_sqm` is now a Series aligned with
# `train` rather than a plain list.
Price_sqm = train['price_doc'] / train['full_sq']
# Gap between each transaction's unit price and the 'Primary Average' value
# joined from the quarterly table.
Price_gap = Price_sqm - train['Primary Average']

In [171]:
# Store the price gap on the training frame; it is used as the model label below.
train['Gap'] = Price_gap

**Create New Area-Ratio Features and Drop Redundant Ones**

In [172]:
# Express living and kitchen area as fractions of the total area.
for frame in (train, test):
    for part in ('life_sq', 'kitch_sq'):
        frame[part + '_ratio'] = frame[part] / frame['full_sq']

In [173]:
# Feature list: every test column after the first two, minus the raw area
# columns and the 'Primary Average' column.
excluded_cols = ['full_sq', 'life_sq', 'kitch_sq', 'Primary Average']
col = [name for name in test.columns[2:] if name not in excluded_cols]
# Training target: the price gap rather than the price itself.
label = train['Gap']

In [175]:
def get_feature_importance(model):
    """Return the model's f-scores as a DataFrame sorted by descending score.

    `model` must provide `get_fscore()` returning a mapping of feature name
    to score. The result has the columns 'Feature' and 'Score'.
    """
    fscores = model.get_fscore()
    table = pd.DataFrame({
        'Feature': [name for name, _ in fscores.items()],
        'Score': [score for _, score in fscores.items()],
    })
    return table.sort_values(by=['Score'], ascending=[0])

In [140]:
# Full training matrix: selected features with the gap as label.
dtrain = xgb.DMatrix(data=train[col], label=label)

In [145]:
# Booster settings for the gap-regression model.
params = {
    'eta': 0.05,  # author's note: "Try 0.01,3,5"
    'max_depth': 5,  # author's note: "Try 4,5,6"
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1,
    'lambda': 5,
    'min_child_weight': 5,
}
# Cross-validate with early stopping. The length of the returned history is
# printed because it is the boosting-round count used by the training runs
# further down.
cv_settings = dict(
    num_boost_round=1000,
    early_stopping_rounds=20,
    verbose_eval=50,
    show_stdv=True,
    seed=42,
)
xgb_cvalid = xgb.cv(params, dtrain, **cv_settings)
print(len(xgb_cvalid))

[0]	train-rmse:76820.8+796.256	test-rmse:76820.5+1597.85
[50]	train-rmse:36399.8+921.416	test-rmse:38080.7+1966.77
[100]	train-rmse:33444.8+798.356	test-rmse:36762.8+1971.74
[150]	train-rmse:31870.2+761.324	test-rmse:36426.2+1911.36
[200]	train-rmse:30659.7+710.495	test-rmse:36220.5+1937.27
[250]	train-rmse:29648.8+684.414	test-rmse:36113.6+1947.95
[300]	train-rmse:28775.5+745.087	test-rmse:36077.2+1951.24
[350]	train-rmse:27933.2+740.11	test-rmse:36034.8+1947.67
375


**Create Five-Fold CV**

In [176]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse

In [177]:
# Five folds taken without shuffling, i.e. in the existing row order of the data.
kf = KFold(n_splits=5,shuffle=False)

In [178]:
# Give `train` a fresh 0..n-1 index (the old index is discarded).
train = train.reset_index(drop=True)

In [191]:
Accuracy = []
# Manual five-fold evaluation in price space: train on the gap target, turn
# the predicted gap back into a price, and score each held-out fold.
# NOTE: despite its name, `Accuracy` collects one RMSLE per fold (lower is
# better).
# The folds are split over `train` itself and the true prices are read from
# the held-out rows, so the cell no longer depends on `Features` / `price_doc`
# variables that are not defined anywhere in the notebook.
Accuracy = []
for train_index, test_index in kf.split(train):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    # KFold yields positional indices, so select the labels positionally too.
    y_train = label.iloc[train_index]
    dtrain = xgb.DMatrix(X_train[col], y_train)
    dtest = xgb.DMatrix(X_test[col])
    # 375 is the round count printed by the xgb.cv early-stopping run above.
    model = xgb.train(params, dtrain, num_boost_round=375)
    print('training done')
    pred = model.predict(dtest)
    # Undo the target transform: (reference value + predicted gap) * area.
    actual_pred = (X_test['Primary Average'] + pred) * X_test['full_sq']
    RMSLE = mse(np.log(X_test['price_doc'] + 1), np.log(actual_pred + 1)) ** 0.5
    print('RMSLE: ' + str(RMSLE))
    Accuracy.append(RMSLE)

TRAIN: [ 5853  5854  5855 ..., 29259 29260 29261] TEST: [   0    1    2 ..., 5850 5851 5852]
training done
RMSLE: 0.403819306815
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [ 5853  5854  5855 ..., 11703 11704 11705]
training done
RMSLE: 0.320027073813
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [11706 11707 11708 ..., 17555 17556 17557]
training done
RMSLE: 0.334029444864
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [17558 17559 17560 ..., 23407 23408 23409]
training done
RMSLE: 0.324464626265
TRAIN: [    0     1     2 ..., 23407 23408 23409] TEST: [23410 23411 23412 ..., 29259 29260 29261]
training done
RMSLE: 0.295966289126


In [192]:
# Average of the per-fold RMSLE values collected in `Accuracy`.
np.mean(Accuracy)

0.33566134817671123

**Test New Method**

In [193]:
# Refit on the full training set with the same settings and round count
# used in the fold loop.
dtrain = xgb.DMatrix(data=train[col], label=label)
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=375)

In [194]:
# Test matrix built from the same feature columns as the training matrix.
dtest = xgb.DMatrix(data=test[col])

In [195]:
# Tabulate the fitted model's f-scores, highest first.
feature_importance = get_feature_importance(model)

In [196]:
# Display the 50 highest-scoring features.
feature_importance.head(50)

Unnamed: 0,Feature,Score
153,extra_area,384
160,build_year,286
80,life_sq_ratio,226
178,week_year_cnt,168
165,floor,167
19,max_floor,139
14,rel_kitch_sq,132
203,num_room,125
37,radiation_km,124
217,month_year_cnt,123


In [197]:
# Predicted gap for every test row (the model was trained on the gap target).
pred = model.predict(dtest)

In [198]:
# Undo the target transform: add the reference value back to the predicted
# gap, then scale by the area to obtain a price.
predicted_unit_price = pred + test['Primary Average']
Actual_predict = predicted_unit_price * test['full_sq']

In [201]:
# Two-column submission table: row id and predicted price.
sub = test[['id']].assign(price_doc=Actual_predict)

In [202]:
# Write the submission file without the DataFrame index column.
sub.to_csv('Test_UseGapAsTarget.csv',index=False)