In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import xgboost as xgb



# Single Model Fitting

## Original Feature + 0.95 Label + Average Price

In [2]:
train = pd.read_csv('train_c.csv')
test = pd.read_csv('test_c.csv')

In [5]:
price = pd.read_csv('moscow_avg_price.csv')[['timestamp','moscow_avg_price_avg_rub']]

In [6]:
price

Unnamed: 0,timestamp,moscow_avg_price_avg_rub
0,2011-01-01,137590.544872
1,2011-01-02,137499.198718
2,2011-01-03,137407.852564
3,2011-01-04,137316.506410
4,2011-01-05,137225.160256
5,2011-01-06,137133.814103
6,2011-01-07,137042.467949
7,2011-01-08,136951.121795
8,2011-01-09,136859.775641
9,2011-01-10,136768.429487


In [187]:
price = pd.read_csv('moscow_avg_price.csv')

In [16]:
train = train.merge(price,on='timestamp',how = 'left')
test = test.merge(price,on = 'timestamp', how = 'left')

In [19]:
train['moscow_avg_price_rub'] = train['moscow_avg_price_rub']/train['full_sq']
test['moscow_avg_price_rub'] = test['moscow_avg_price_rub']/test['full_sq']

In [23]:
train = train.replace(np.inf, np.nan)
test = test.replace(np.inf,np.nan)

In [33]:
def get_feature_importance(model):
    Importance = model.get_fscore()
    Importance = list(Importance.items())
    Feature= []
    Score = []
    for each in Importance:
        Feature.append(each[0])
        Score.append(each[1])
    df = pd.DataFrame({'Feature':Feature,'Score':Score}).sort_values(by=['Score'],ascending=[0])
    return df    

**Fit the model**

In [27]:
label = train['price_doc']*0.95 + 10
col = list(test.columns)[2:]

In [28]:
dtrain = xgb.DMatrix(train[col],label)
dtest = xgb.DMatrix(test[col])

In [39]:
params = {
   'eta': 0.05, ## Try 0.01,3,5
   'max_depth': 5,## Try 4,5,6
   'subsample': 0.7,
   'colsample_bytree': 0.7,
   'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}
xgb_cvalid = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
     verbose_eval=50, show_stdv=True,seed=42)
print('Performance doesnt improve from: '+str(len(xgb_cvalid)))

[0]	train-rmse:7.7947e+06+45478	test-rmse:7.79967e+06+95275.6
[50]	train-rmse:2.37387e+06+40432.6	test-rmse:2.73536e+06+82409.3
[100]	train-rmse:2.07026e+06+26538.8	test-rmse:2.58058e+06+103695
[150]	train-rmse:1.95881e+06+25371.1	test-rmse:2.54896e+06+104488
[200]	train-rmse:1.87585e+06+18308.2	test-rmse:2.53077e+06+103705
[250]	train-rmse:1.80625e+06+20571	test-rmse:2.51933e+06+102223
[300]	train-rmse:1.74894e+06+18545.7	test-rmse:2.51221e+06+100952
[350]	train-rmse:1.69543e+06+16569.4	test-rmse:2.50903e+06+98330.5
[400]	train-rmse:1.64934e+06+14140.5	test-rmse:2.50469e+06+99086
Performance doesnt improve from: 412


Create a submission for this

In [43]:
model = xgb.train(params,dtrain,num_boost_round=412) ## Overfitting with all iterations?
feature_importance = get_feature_importance(model)
feature_importance.iloc[0:20,:]

Unnamed: 0,Feature,Score
15,full_sq,1053
188,moscow_avg_price_rub,535
98,life_sq,444
127,floor,358
37,max_floor,301
192,build_year,296
196,kitch_sq,147
187,state,127
82,num_room,121
41,radiation_km,111


In [41]:
pred = model.predict(dtest)
sub = pd.DataFrame({'id':test['id'],'price_doc':pred})

In [42]:
sub.to_csv('xgb+original+average_priceNoLambda.csv',index=False)

**LB: 0.32441 All Iterations**

## Subsampled Data set with Gap as the target

In [60]:
train = pd.read_csv('train_clean.csv')
test = pd.read_csv('test_clean.csv')
train = train.merge(price,on='timestamp',how = 'left')
test = test.merge(price,on = 'timestamp', how = 'left')

In [61]:
fullsq_fill = np.mean(train[train['full_sq'].notnull()]['full_sq'])
train['full_sq'] = train['full_sq'].fillna(fullsq_fill)
fullsq_fill = np.mean(test[test['full_sq'].notnull()]['full_sq'])
test['full_sq'] = test['full_sq'].fillna(fullsq_fill)

In [62]:
#train['moscow_avg_price_low_rub'] = train['moscow_avg_price_low_rub']/train['full_sq']
#test['moscow_avg_price_low_rub'] = test['moscow_avg_price_low_rub']/test['full_sq']

In [63]:
train = train.replace(np.inf, np.nan)
test = test.replace(np.inf,np.nan)

In [64]:
# Add month-year
train['timestamp'] = pd.to_datetime(train['timestamp'])
month_year = (train.timestamp.dt.month + train.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
train['month_year_cnt'] = month_year.map(month_year_cnt_map)
test['timestamp'] = pd.to_datetime(test['timestamp'])
month_year = (test.timestamp.dt.month + test.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
test['month_year_cnt'] = month_year.map(month_year_cnt_map)
# Add week-year count
week_year = (train.timestamp.dt.weekofyear + train.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
train['week_year_cnt'] = week_year.map(week_year_cnt_map)
week_year = (test.timestamp.dt.weekofyear + test.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
test['week_year_cnt'] = week_year.map(week_year_cnt_map)
# Add month and day-of-week
train['month'] = train.timestamp.dt.month
train['dow'] = train.timestamp.dt.dayofweek
test['month'] = test.timestamp.dt.month
test['dow'] = test.timestamp.dt.dayofweek
# Other feature engineering
train['rel_floor'] = train['floor'] / train['max_floor'].astype(float)
train['rel_kitch_sq'] = train['kitch_sq'] / train['full_sq'].astype(float)
test['rel_floor'] = test['floor'] / test['max_floor'].astype(float)
test['rel_kitch_sq'] = test['kitch_sq'] / test['full_sq'].astype(float)
train.apartment_name=train.sub_area.astype(str) + train['metro_km_avto'].astype(str)
test.apartment_name=test.sub_area.astype(str) + train['metro_km_avto'].astype(str)
train['room_size'] = train['life_sq'] / train['num_room'].astype(float)
test['room_size'] = test['life_sq'] / test['num_room'].astype(float)

In [65]:
macro_cols = ['timestamp',"balance_trade", "balance_trade_growth", "eurrub", "average_provision_of_build_contract",
"micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
"income_per_cap", "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build"]
macro = pd.read_csv('macro_c.csv')[macro_cols]

In [66]:
train = train.merge(macro,how='left',on='timestamp')
test = test.merge(macro,how='left',on='timestamp')

In [67]:
def get_ratio_preschool(df):
    df['ratio_preschool'] = df['children_preschool'] / (df['children_school'] + 1)
    return df

train = get_ratio_preschool(train)
test = get_ratio_preschool(test)

def get_extra_area(df):
    df['extra_area'] = df['full_sq'] - df['life_sq']
    return df

train = get_extra_area(train)
test = get_extra_area(test)

def get_floor_ratio(df):
    df['floor_ratio'] = df['max_floor'] - df['floor']
    return df

train = get_floor_ratio(train)
test = get_floor_ratio(test)

def get_room_avg_size(df):
    df['room_avg_size'] = (df['life_sq'] - df['kitch_sq']) / (df['num_room'] + 1)
    return df

train = get_room_avg_size(train)
test = get_room_avg_size(test)

**Bold Move**

In [68]:
train = train[train.moscow_avg_price_avg_rub.notnull()]
train.shape

(29262, 317)

In [69]:
Price_sqm = [x/y for x,y in zip(list(train['price_doc']*0.95+10),list(train['full_sq']))]
Price_gap = Price_sqm - train['moscow_avg_price_avg_rub']

In [70]:
train['Price_sqm'] = Price_sqm

**Create New Features of SQ and Drop Redundant Ones**

In [71]:
train['life_sq_ratio'] = train['life_sq']/train['full_sq']
train['kitch_sq_ratio'] = train['kitch_sq']/train['full_sq']
test['life_sq_ratio'] = test['life_sq']/test['full_sq']
test['kitch_sq_ratio'] = test['kitch_sq']/test['full_sq']

In [72]:
col = list(test.columns)[2:]
col = [each for each in col if each not in ['full_sq','life_sq','kitch_sq']]
label = train['Price_sqm']

In [76]:
dtrain = xgb.DMatrix(train[col],label)

In [77]:
params = {
   'eta': 0.05, ## Try 0.01,3,5
   'max_depth': 5,## Try 4,5,6
   'subsample': 0.7,
   'colsample_bytree': 0.7,
   'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}
xgb_cvalid = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
     verbose_eval=50, show_stdv=True,seed=42)
print(len(xgb_cvalid))

[0]	train-rmse:133513+614.99	test-rmse:133515+1251.11
[50]	train-rmse:34585+457.082	test-rmse:37293.6+1771.24
[100]	train-rmse:30287.3+321.5	test-rmse:35066+1807.64
[150]	train-rmse:28702.1+330.564	test-rmse:34718.2+1825.5
[200]	train-rmse:27513.2+270.886	test-rmse:34574.5+1841.89
[250]	train-rmse:26443.2+286.621	test-rmse:34450.9+1844.59
[300]	train-rmse:25481.9+297.655	test-rmse:34371.8+1847.45
[350]	train-rmse:24685.8+343.849	test-rmse:34321.7+1860.16
353


In [None]:
[0]	train-rmse:133513+614.837	test-rmse:133514+1251.8
[50]	train-rmse:34749.9+362.496	test-rmse:37396.1+1828.55
[100]	train-rmse:30665.1+342.071	test-rmse:35201.6+1791.8
[150]	train-rmse:29086+269.026	test-rmse:34881.4+1834.53
[200]	train-rmse:27856.7+264.026	test-rmse:34727.2+1822.09
[250]	train-rmse:26773+252.794	test-rmse:34665.8+1827.22
[300]	train-rmse:25856.5+275.036	test-rmse:34597.8+1823.6
[350]	train-rmse:25017.1+275.499	test-rmse:34568.2+1875.36
[400]	train-rmse:24257.9+281.502	test-rmse:34547.2+1871.69
400

In [78]:
dtrain = xgb.DMatrix(train[col],label)
model = xgb.train(params,dtrain,num_boost_round=353)

In [79]:
feature_importance = get_feature_importance(model).iloc[0:20,:]

In [80]:
feature_importance

Unnamed: 0,Feature,Score
258,floor,376
264,extra_area,371
168,build_year,293
92,max_floor,282
47,moscow_avg_price_avg_rub,250
113,life_sq_ratio,188
237,num_room,135
202,rel_kitch_sq,134
108,room_avg_size,106
222,week_year_cnt,101


In [81]:
dtest = xgb.DMatrix(test[col])

In [82]:
pred = model.predict(dtest)
Actual_predict = (pred)*test['full_sq']

In [83]:
sub = pd.DataFrame({'id':test['id'],'price_doc':Actual_predict})
sub.to_csv('Retesting3+xgb+additional+subsampled+average_price+0.95label.csv',index=False)

**0.31364**

## Subsampled Data set with Feature Average Price using gb:linear

In [257]:
train = pd.read_csv('train_clean.csv')
test = pd.read_csv('test_clean.csv')

In [258]:
# Add month-year
train['timestamp'] = pd.to_datetime(train['timestamp'])
month_year = (train.timestamp.dt.month + train.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
train['month_year_cnt'] = month_year.map(month_year_cnt_map)
test['timestamp'] = pd.to_datetime(test['timestamp'])
month_year = (test.timestamp.dt.month + test.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
test['month_year_cnt'] = month_year.map(month_year_cnt_map)
# Add week-year count
week_year = (train.timestamp.dt.weekofyear + train.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
train['week_year_cnt'] = week_year.map(week_year_cnt_map)
week_year = (test.timestamp.dt.weekofyear + test.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
test['week_year_cnt'] = week_year.map(week_year_cnt_map)
# Add month and day-of-week
train['month'] = train.timestamp.dt.month
train['dow'] = train.timestamp.dt.dayofweek
test['month'] = test.timestamp.dt.month
test['dow'] = test.timestamp.dt.dayofweek
# Other feature engineering
train['rel_floor'] = train['floor'] / train['max_floor'].astype(float)
train['rel_kitch_sq'] = train['kitch_sq'] / train['full_sq'].astype(float)
test['rel_floor'] = test['floor'] / test['max_floor'].astype(float)
test['rel_kitch_sq'] = test['kitch_sq'] / test['full_sq'].astype(float)
train.apartment_name=train.sub_area.astype(str) + train['metro_km_avto'].astype(str)
test.apartment_name=test.sub_area.astype(str) + train['metro_km_avto'].astype(str)
train['room_size'] = train['life_sq'] / train['num_room'].astype(float)
test['room_size'] = test['life_sq'] / test['num_room'].astype(float)

In [259]:
macro_cols = ['timestamp',"balance_trade", "balance_trade_growth", "eurrub", "average_provision_of_build_contract",
"micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
"income_per_cap", "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build"]
macro = pd.read_csv('macro_c.csv')[macro_cols]

In [260]:
train = train.merge(macro,how='left',on='timestamp')
test = test.merge(macro,how='left',on='timestamp')

In [261]:
def get_ratio_preschool(df):
    df['ratio_preschool'] = df['children_preschool'] / (df['children_school'] + 1)
    return df

train = get_ratio_preschool(train)
test = get_ratio_preschool(test)

def get_extra_area(df):
    df['extra_area'] = df['full_sq'] - df['life_sq']
    return df

train = get_extra_area(train)
test = get_extra_area(test)

def get_floor_ratio(df):
    df['floor_ratio'] = df['max_floor'] - df['floor']
    return df

train = get_floor_ratio(train)
test = get_floor_ratio(test)

def get_room_avg_size(df):
    df['room_avg_size'] = (df['life_sq'] - df['kitch_sq']) / (df['num_room'] + 1)
    return df

train = get_room_avg_size(train)
test = get_room_avg_size(test)

In [262]:
fullsq_fill = np.mean(train[train['full_sq'].notnull()]['full_sq'])
train['full_sq'] = train['full_sq'].fillna(fullsq_fill)
fullsq_fill = np.mean(test[test['full_sq'].notnull()]['full_sq'])
test['full_sq'] = test['full_sq'].fillna(fullsq_fill)

In [263]:
train['avg_price'] = train['price_doc']/train['full_sq']

**Calculate the average price per sqrm for each sub area**

In [284]:
price = train.groupby(['sub_area'])['avg_price'].mean()
sub_area = price.index

In [289]:
subarea_avg = pd.DataFrame({'sub_area':sub_area,'sub_avg_price':price}).reset_index(drop=True)

In [290]:
train = train.merge(subarea_avg,how='left',on='sub_area')
test = test.merge(subarea_avg,how='left',on='sub_area')

In [291]:
del train['avg_price']

In [294]:
col = list(test.columns)[2:]
label = train['price_doc']

In [295]:
dtrain = xgb.DMatrix(train[col],label)

In [296]:
params = {
   'eta': 0.05, ## Try 0.01,3,5
   'max_depth': 5,## Try 4,5,6
   'subsample': 0.7,
   'colsample_bytree': 0.7,
   'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}
xgb_cvalid = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
     verbose_eval=50, show_stdv=True,seed=42)
print(len(xgb_cvalid))

[0]	train-rmse:8.3586e+06+16274.1	test-rmse:8.36429e+06+26572.4
[50]	train-rmse:2.21799e+06+26375.4	test-rmse:2.66704e+06+108087
[100]	train-rmse:1.88487e+06+11029	test-rmse:2.49606e+06+117576
[150]	train-rmse:1.76283e+06+8308.07	test-rmse:2.46068e+06+115387
[200]	train-rmse:1.66958e+06+3675.63	test-rmse:2.43904e+06+115952
[250]	train-rmse:1.59469e+06+7365.5	test-rmse:2.42396e+06+115038
[300]	train-rmse:1.52901e+06+6984.82	test-rmse:2.41548e+06+112460
[350]	train-rmse:1.47478e+06+9523.39	test-rmse:2.40856e+06+112232
374


In [297]:
model = xgb.train(params,dtrain,num_boost_round=len(xgb_cvalid))

In [298]:
dtest = xgb.DMatrix(test[col])

In [299]:
pred = model.predict(dtest)

In [300]:
sub_avg_price = pd.DataFrame({'id':test['id'],'price_doc':pred})

In [301]:
sub_avg_price.to_csv('Average_Sub_Area_Price+original_label.csv',index=False)