In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection, preprocessing
import xgboost as xgb
import datetime

In [28]:
# Load the three input tables, parsing the 'timestamp' column as datetimes.
train = pd.read_csv(filepath_or_buffer='train.csv', parse_dates=['timestamp'])
test = pd.read_csv(filepath_or_buffer='test.csv', parse_dates=['timestamp'])
macro = pd.read_csv(filepath_or_buffer='macro.csv', parse_dates=['timestamp'])

# Keep the test ids aside before the feature columns are selected.
id_test = test['id']

## Clean up Data 

In [29]:
# Clean data: replace implausible values with NaN so they are treated as missing.
#
# `.loc` is used instead of `.ix` (removed in pandas 1.0) and `np.nan` instead
# of `np.NaN` (removed in NumPy 2.0). The hard-coded integer lists are row
# labels; they assume the default integer index produced by `read_csv`.

# --- living area (life_sq) and total area (full_sq) ---
train.loc[train.life_sq > train.full_sq, "life_sq"] = np.nan
equal_index = [601, 1896, 2791]
test.loc[equal_index, "life_sq"] = test.loc[equal_index, "full_sq"]
test.loc[test.life_sq > test.full_sq, "life_sq"] = np.nan
train.loc[train.life_sq < 5, "life_sq"] = np.nan
test.loc[test.life_sq < 5, "life_sq"] = np.nan
train.loc[train.full_sq < 5, "full_sq"] = np.nan
test.loc[test.full_sq < 5, "full_sq"] = np.nan

# --- kitchen area (kitch_sq) ---
# For this row the kitch_sq value is copied into build_year.
kitch_is_build_year = [13117]
train.loc[kitch_is_build_year, "build_year"] = train.loc[kitch_is_build_year, "kitch_sq"]
train.loc[train.kitch_sq >= train.life_sq, "kitch_sq"] = np.nan
test.loc[test.kitch_sq >= test.life_sq, "kitch_sq"] = np.nan
train.loc[(train.kitch_sq == 0) | (train.kitch_sq == 1), "kitch_sq"] = np.nan
test.loc[(test.kitch_sq == 0) | (test.kitch_sq == 1), "kitch_sq"] = np.nan

# --- very large flats with a small living-area share (thresholds differ for train/test) ---
train.loc[(train.full_sq > 210) & (train.life_sq / train.full_sq < 0.3), "full_sq"] = np.nan
test.loc[(test.full_sq > 150) & (test.life_sq / test.full_sq < 0.3), "full_sq"] = np.nan
train.loc[train.life_sq > 300, ["life_sq", "full_sq"]] = np.nan
test.loc[test.life_sq > 200, ["life_sq", "full_sq"]] = np.nan

# --- build year ---
train.loc[train.build_year < 1500, "build_year"] = np.nan
test.loc[test.build_year < 1500, "build_year"] = np.nan

# --- number of rooms ---
train.loc[train.num_room == 0, "num_room"] = np.nan
test.loc[test.num_room == 0, "num_room"] = np.nan
train.loc[[10076, 11621, 17764, 19390, 24007, 26713, 29172], "num_room"] = np.nan
test.loc[[3174, 7313], "num_room"] = np.nan

# --- floor / max_floor ---
train.loc[(train.floor == 0) & (train.max_floor == 0), ["max_floor", "floor"]] = np.nan
train.loc[train.floor == 0, "floor"] = np.nan
train.loc[train.max_floor == 0, "max_floor"] = np.nan
test.loc[test.max_floor == 0, "max_floor"] = np.nan
train.loc[train.floor > train.max_floor, "max_floor"] = np.nan
test.loc[test.floor > test.max_floor, "max_floor"] = np.nan
train.loc[[23584], "floor"] = np.nan

# --- state ---
train.loc[train.state == 33, "state"] = np.nan

# Removing extreme prices per square metre brings the error down a lot.
# Rows whose full_sq is NaN fail both comparisons and are dropped as well.
train.loc[train.full_sq == 0, "full_sq"] = 50
price_per_sqm = train.price_doc / train.full_sq
train = train[(price_per_sqm <= 600000) & (price_per_sqm >= 10000)]

In [30]:
# Target: log(1 + price_doc).
y_train = np.log(1 + train["price_doc"])

# Features: every test column after the first two
# (presumably id and timestamp - verify against the CSV header).
col = list(test.columns)[2:]
x_train, x_test = train[col], test[col]

# Stack train on top of test so both share the same encodings.
# Note: the original row labels are kept, so labels may repeat in x_all.
num_train = len(x_train)
x_all = pd.concat([x_train, x_test])

# Integer-encode every object-typed column and remember it as categorical.
cat = [name for name in x_all.columns if x_all[name].dtype == 'object']
for name in cat:
    x_all[name] = pd.factorize(x_all[name], sort=True)[0]

In [31]:
### Additional categorical features: every column whose name contains 'ID', plus build_year
add_cat = [name for name in x_all.columns if 'ID' in name] + ['build_year']
cat = cat + add_cat

In [32]:
# Replace each additional categorical column by its sorted integer codes.
for cat_name in add_cat:
    x_all[cat_name] = pd.factorize(x_all[cat_name], sort=True)[0]

# Normalize all continuous (non-categorical) features in the dataset

In [33]:
import warnings

In [34]:
# Silence every Python warning from here on (this also hides deprecation notices).
warnings.filterwarnings('ignore')

In [35]:
def normalize(col, frame=None):
    """Return the z-scores of one column as a list, keeping NaNs as NaN.

    Parameters
    ----------
    col : hashable
        Label of the column to standardise.
    frame : pandas.DataFrame, optional
        Table to read the column from. Defaults to the notebook-level
        ``x_all`` so existing ``normalize(col)`` calls keep working.

    Returns
    -------
    list of float
        ``(value - mean) / std`` for every row, in row order, where ``mean``
        and ``std`` are computed with ``np.nanmean`` / ``np.nanstd`` (NaNs
        ignored). Missing inputs stay NaN. A constant column (std == 0)
        yields NaN/inf, as division by zero is not special-cased.
    """
    if frame is None:
        frame = x_all
    values = np.asarray(frame[col], dtype=float)
    mean = np.nanmean(values)
    std = np.nanstd(values)
    # NaN propagates through the arithmetic, so no explicit isnan branch is needed.
    # A plain list is returned (not a Series) so that assigning the result back to
    # a frame is positional and does not depend on the frame's index labels.
    return ((values - mean) / std).tolist()

In [36]:
# Continuous variables: every column that was not recorded as categorical.
con_var = [name for name in x_all.columns if name not in cat]

In [37]:
# Standardise each continuous column in place.
for con_name in con_var:
    x_all[con_name] = normalize(con_name)

In [40]:
## Split the stacked frame back into train and test rows (optionally save them)
x_train = x_all.iloc[:num_train]
x_test = x_all.iloc[num_train:]
# Optional export of the normalised tables (left disabled):
#x_train['id'] = train['id']
#x_train['timestamp'] = train['timestamp']
#x_train['price_doc'] = train['price_doc']
#x_test['timestamp'] = test['timestamp']
#x_test['id'] = test['id']
#x_train.to_csv('train_normalized.csv',index=False)
#x_test.to_csv('test_normalized.csv',index=False)

# Lasso (L1-regularized) Model Fitting

In [67]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score

In [79]:
def cv_Lasso(lso):
    """Print and return the 5-fold cross-validated RMSE of an estimator.

    The estimator is evaluated on the notebook-level ``x_train`` (with NaNs
    replaced by -999, since Lasso cannot handle missing values) against
    ``y_train``.

    Parameters
    ----------
    lso : estimator
        Unfitted scikit-learn style regressor, e.g. ``Lasso(alpha=0.1)``.

    Returns
    -------
    float
        Square root of the mean of the per-fold mean squared errors.
    """
    print(lso)
    x_train_lasso = x_train.fillna(-999)
    # 'neg_mean_squared_error' is the supported scorer name; the old
    # 'mean_squared_error' alias was removed from scikit-learn. Scores are
    # negated MSEs, hence the sign flip below.
    cv = cross_val_score(lso, x_train_lasso, y_train, cv=5,
                         scoring='neg_mean_squared_error')
    mean_mse = -1 * cv.mean()
    rmse = mean_mse ** 0.5
    print('RMSE: ' + str(rmse))
    return rmse

In [80]:
# Candidate regularisation strengths; report the CV RMSE for each.
alpha = [0.1, 0.5, 1, 5, 10, 20]
for penalty in alpha:
    cv_Lasso(Lasso(alpha=penalty))

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
RMSE: 0.501600983504
Lasso(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
RMSE: 0.576741888574
Lasso(alpha=1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
RMSE: 0.578438118054
Lasso(alpha=5, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
RMSE: 0.5827268704
Lasso(alpha=10, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic'

In [82]:
# Fit a Lasso with alpha=0.1 on the whole training set (NaNs filled with -999).
lasso = Lasso(alpha=0.1)
x_train_lasso = x_train.fillna(-999)
lasso.fit(x_train_lasso, y_train)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [83]:
# Pair every training column with its fitted Lasso coefficient.
feature = list(x_train_lasso.columns)
coeff = lasso.coef_
coeff_tbl = pd.DataFrame({'feature': feature, 'coeff': coeff})

In [87]:
# Keep only the features whose coefficient is non-zero.
coeff_final = coeff_tbl[coeff_tbl['coeff'].ne(0)]

In [89]:
# Display the feature-name column of coeff_final.
coeff_final['feature']

0                                   full_sq
1                                   life_sq
2                                     floor
3                                 max_floor
4                                  material
5                                build_year
6                                  num_room
7                                  kitch_sq
8                                     state
10                                 sub_area
16                          preschool_quota
22                      hospital_beds_raion
66     raion_build_count_with_material_info
68                         build_count_wood
82                                 ID_metro
97                 ID_railroad_station_walk
100                ID_railroad_station_avto
111                            ID_big_road1
114                            ID_big_road2
158              cafe_sum_500_min_price_avg
159              cafe_sum_500_max_price_avg
181             cafe_sum_1000_min_price_avg
182             cafe_sum_1000_ma

# Base XGBoost Model

In [65]:
# Booster hyper-parameters.
# NOTE(review): newer xgboost releases deprecate 'reg:linear' (in favour of
# 'reg:squarederror') and 'silent' (in favour of 'verbosity') - confirm against
# the installed version before changing them.
xgb_params = dict(
    #booster='gblinear',
    eta=0.05,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:linear',
    eval_metric='rmse',
    silent=1,
)

dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

# Cross-validate with early stopping; the number of rows in cv_output is the
# number of boosting rounds that were kept.
cv_output = xgb.cv(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=4000,
    early_stopping_rounds=20,
    verbose_eval=50,
    show_stdv=False,
)
print(len(cv_output))

[0]	train-rmse:0.483103	test-rmse:0.484864
[50]	train-rmse:0.462595	test-rmse:0.468099
[100]	train-rmse:0.46185	test-rmse:0.467795
[150]	train-rmse:0.461458	test-rmse:0.467632
[200]	train-rmse:0.461208	test-rmse:0.467539
[250]	train-rmse:0.461041	test-rmse:0.4675
281


In [None]:
## With Linear Booster eta 5 tree 8
#[2850]	train-rmse:0.460279	test-rmse:0.467247
#[2900]	train-rmse:0.460273	test-rmse:0.467245
#2927
## With Lambda as 5
#[0]	train-rmse:14.3692	test-rmse:14.3692
#[50]	train-rmse:1.20731	test-rmse:1.21133
#[100]	train-rmse:0.442692	test-rmse:0.467986
#[150]	train-rmse:0.417063	test-rmse:0.455566
#[200]	train-rmse:0.403856	test-rmse:0.454054
#[250]	train-rmse:0.392721	test-rmse:0.453433
#Base Model
#[0]	train-rmse:14.3689	test-rmse:14.3689
#[50]	train-rmse:1.19822	test-rmse:1.20359
#[100]	train-rmse:0.435903	test-rmse:0.466432
#[150]	train-rmse:0.410839	test-rmse:0.455221
#[200]	train-rmse:0.397977	test-rmse:0.45433



In [42]:
# Train the final booster on dtrain, using one boosting round per row of cv_output.
model = xgb.train(xgb_params,dtrain,num_boost_round=len(cv_output))

In [43]:
def get_feature_importance(model):
    """Tabulate a model's f-scores, highest score first.

    Parameters
    ----------
    model : object
        Anything exposing ``get_fscore()`` that returns a mapping from
        feature name to score.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Score``, sorted by ``Score`` descending.
        Row labels are the positions of the entries in the mapping.
    """
    fscore = model.get_fscore()
    names = list(fscore.keys())
    scores = list(fscore.values())
    table = pd.DataFrame({'Feature': names, 'Score': scores})
    return table.sort_values(by='Score', ascending=False)


In [53]:
# Feature/Score table for the trained booster, sorted by score (highest first).
feature_importance_puremodel = get_feature_importance(model)

In [54]:
# Invert the log(1 + price_doc) target transform to get predictions on the price scale.
pred_puremodel = np.exp(model.predict(dtest))-1