## Fire Up

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import xgboost as xgb



In [2]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train_adjusted.csv', 'test_c.csv'))

## Convert Categorical Into Numerical

In [3]:
# Feature columns: every test column from position 2 onward
# (presumably this skips an id and a timestamp column — confirm against the CSV).
col = list(test.columns)[2:]
# Names of the columns that were integer-encoded below.
cat = []
for each in col:
    if train[each].dtype == 'object' or 'ID' in each:
        # Factorize train and test together so that a given category value
        # receives the same integer code in both frames. Encoding each frame
        # separately assigns codes by rank within that frame only, which
        # silently disagrees whenever the two frames hold different value sets.
        n_train = len(train)
        combined = pd.concat([train[each], test[each]], ignore_index=True)
        codes = pd.factorize(combined, sort=True)[0]
        train[each] = codes[:n_train]
        test[each] = codes[n_train:]
        cat.append(each)

## Cleansing

In [4]:
# Replace implausible values with NaN.
# The rules run in sequence on purpose: later masks read columns that earlier
# rules may already have blanked. `.loc` with boolean masks replaces the removed
# `.ix` indexer, and `np.nan` replaces the removed `np.NaN` alias.

# Store the cleaned columns as float up front so NaN can be assigned without
# depending on an implicit integer -> float dtype change at assignment time.
_cleaned_columns = ["life_sq", "full_sq", "kitch_sq", "build_year",
                    "num_room", "floor", "max_floor", "state"]
for _frame in (train, test):
    _present = [name for name in _cleaned_columns if name in _frame.columns]
    _frame[_present] = _frame[_present].astype(float)

# Living area larger than total area.
train.loc[train.life_sq > train.full_sq, "life_sq"] = np.nan
test.loc[test.life_sq > test.full_sq, "life_sq"] = np.nan
# Living / total area below 5.
train.loc[train.life_sq < 5, "life_sq"] = np.nan
test.loc[test.life_sq < 5, "life_sq"] = np.nan
train.loc[train.full_sq < 5, "full_sq"] = np.nan
test.loc[test.full_sq < 5, "full_sq"] = np.nan
# Kitchen area not smaller than living area, or equal to 0 / 1.
train.loc[train.kitch_sq >= train.life_sq, "kitch_sq"] = np.nan
test.loc[test.kitch_sq >= test.life_sq, "kitch_sq"] = np.nan
train.loc[(train.kitch_sq == 0) | (train.kitch_sq == 1), "kitch_sq"] = np.nan
test.loc[(test.kitch_sq == 0) | (test.kitch_sq == 1), "kitch_sq"] = np.nan
# Very large total area with a small living-area share
# (thresholds differ between train and test, as in the original rules).
train.loc[(train.full_sq > 210) & (train.life_sq / train.full_sq < 0.3), "full_sq"] = np.nan
test.loc[(test.full_sq > 150) & (test.life_sq / test.full_sq < 0.3), "full_sq"] = np.nan
# Very large living area: blank both area columns.
train.loc[train.life_sq > 300, ["life_sq", "full_sq"]] = np.nan
test.loc[test.life_sq > 200, ["life_sq", "full_sq"]] = np.nan
# Build year before 1500.
train.loc[train.build_year < 1500, "build_year"] = np.nan
test.loc[test.build_year < 1500, "build_year"] = np.nan
# Zero rooms.
train.loc[train.num_room == 0, "num_room"] = np.nan
test.loc[test.num_room == 0, "num_room"] = np.nan
# Zero floor / max_floor (the floor rules are applied to train only, as before).
train.loc[(train.floor == 0) & (train.max_floor == 0), ["max_floor", "floor"]] = np.nan
train.loc[train.floor == 0, "floor"] = np.nan
train.loc[train.max_floor == 0, "max_floor"] = np.nan
test.loc[test.max_floor == 0, "max_floor"] = np.nan
# Floor above the building's top floor: distrust max_floor.
train.loc[train.floor > train.max_floor, "max_floor"] = np.nan
test.loc[test.floor > test.max_floor, "max_floor"] = np.nan
# State code 33 (train only).
train.loc[train.state == 33, "state"] = np.nan

## Extra Feature Addition

In [5]:
def _iso_week(timestamps):
    """Return the ISO week number of each value in a datetime Series.

    Uses ``Series.dt.isocalendar()`` when the installed pandas provides it and
    falls back to the older ``Series.dt.weekofyear`` accessor otherwise, so the
    cell runs on both old and current pandas releases.
    """
    accessor = timestamps.dt
    if hasattr(accessor, 'isocalendar'):
        # Cast to float so the result behaves like a plain NumPy-backed column.
        return accessor.isocalendar().week.astype(float)
    return accessor.weekofyear


def add_date_and_ratio_features(frame):
    """Add the date-count and ratio features to ``frame`` in place and return it.

    Columns are added in this order: month_year_cnt, week_year_cnt, month, dow,
    rel_floor, rel_kitch_sq, room_size. The counts are computed within ``frame``
    only (train and test are counted independently).
    """
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])
    stamp = frame['timestamp'].dt

    # Number of rows sharing the same year-month key.
    month_year = stamp.month + stamp.year * 100
    frame['month_year_cnt'] = month_year.map(month_year.value_counts().to_dict())

    # Number of rows sharing the same year-week key.
    # NOTE(review): the key pairs the ISO week with the *calendar* year, so days
    # at a year boundary can share a key with weeks from the other end of that
    # year. Kept as-is to preserve the feature definition — confirm intent.
    week_year = _iso_week(frame['timestamp']) + stamp.year * 100
    frame['week_year_cnt'] = week_year.map(week_year.value_counts().to_dict())

    # Month and day-of-week.
    frame['month'] = stamp.month
    frame['dow'] = stamp.dayofweek

    # Ratio features.
    frame['rel_floor'] = frame['floor'] / frame['max_floor'].astype(float)
    frame['rel_kitch_sq'] = frame['kitch_sq'] / frame['full_sq'].astype(float)
    frame['room_size'] = frame['life_sq'] / frame['num_room'].astype(float)
    return frame


train = add_date_and_ratio_features(train)
test = add_date_and_ratio_features(test)

## Involve Macro Features

In [6]:
# Macro-economic columns to attach to every row, keyed by timestamp.
macro_cols = ['timestamp',"balance_trade", "balance_trade_growth", "eurrub", "average_provision_of_build_contract",
"micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
"income_per_cap", "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build"]
# Parse the key as datetime: train/test 'timestamp' was converted with
# pd.to_datetime in the feature-engineering cell, and a string-typed key on the
# macro side would not match it in the merge.
macro = pd.read_csv('macro_c.csv', parse_dates=['timestamp'])[macro_cols]
train = train.merge(macro, how='left', on='timestamp')
test = test.merge(macro, how='left', on='timestamp')

## Create PCA Features

In [7]:
from sklearn.decomposition import PCA

# Missing values are replaced by the sentinel -999 before decomposition.
train_fill, test_fill = train.fillna(-999), test.fillna(-999)

# Fit the projection on the training columns in `col`, then apply it to test.
n_comp = 20
pca = PCA(random_state=42, n_components=n_comp)
pca_results_train = pca.fit_transform(train_fill[col])
pca_results_test = pca.transform(test_fill[col])

## Create ICA Features

In [9]:
from sklearn.decomposition import FastICA

# Same recipe as the PCA cell: fit on the filled training columns, transform test.
ica = FastICA(random_state=42, n_components=n_comp)
ica_result_train = ica.fit_transform(train_fill[col])
ica_result_test = ica.transform(test_fill[col])

## Put features in original dataset

In [10]:
# Append the components as columns pca_1..pca_n and ica_1..ica_n.
# Within each frame the columns are interleaved (pca_k, then ica_k).
for position in range(n_comp):
    suffix = str(position + 1)
    for frame, pca_block, ica_block in ((train, pca_results_train, ica_result_train),
                                        (test, pca_results_test, ica_result_test)):
        frame['pca_' + suffix] = pca_block[:, position]
        frame['ica_' + suffix] = ica_block[:, position]

In [11]:
# Names of the decomposition features, in test-column order.
# Match on the exact 'pca_' / 'ica_' prefixes used when the columns were
# created; a bare substring test would also select any unrelated column whose
# name merely contains "pca" or "ica".
ReducedVar = [name for name in test.columns if name.startswith(('pca_', 'ica_'))]

## Prepare a train/test set for Validating Stacking

In [12]:
from sklearn.model_selection import train_test_split

# Sentinel-filled copies of both frames (they now include the added features).
train_, test_ = train.fillna(-999), test.fillna(-999)

# Hold out 20% of the training rows to compare models and the stacked blend.
training, testing = train_test_split(train_, random_state=42, test_size=0.2)

## Model 1: ElasticNet Regression

In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import ElasticNet

In [14]:
# All current test columns from position 2 onward (original plus engineered ones).
col_new = [column_name for column_name in test.columns][2:]

In [15]:
# With l1_ratio=1 the penalty is pure L1.
enet = ElasticNet(alpha=200, l1_ratio=1)
# 5-fold CV on the log(price + 1) target. The scorer is named
# 'neg_mean_squared_error' (the bare 'mean_squared_error' name is no longer
# accepted by scikit-learn); scores are negated MSE, which the np.abs(...)
# calls in the following cells already account for.
cv = cross_val_score(enet, train_[ReducedVar], np.log(train_['price_doc'] + 1),
                     scoring='neg_mean_squared_error', cv=5)

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


In [18]:
# Per-fold RMSE on the log(price + 1) target: square root of each fold's absolute MSE score in `cv`.
np.abs(cv)**0.5

array([ 0.50443312,  0.45055124,  0.47046905,  0.4819424 ,  0.48129275])

In [15]:
# Square root of the mean absolute fold score (mean MSE across folds, then root).
print(np.mean(np.abs(cv))**0.5)

0.478058888877


**Fit the model and check the squared error**

In [16]:
# Fit on the 80% split and predict the held-out 20%.
log_price_training = np.log(training['price_doc'] + 1)
enet.fit(training[ReducedVar], log_price_training)
pred_enet = enet.predict(testing[ReducedVar])

In [17]:
# One row per held-out sample; further models are added as extra columns below.
n_holdout = len(pred_enet)
ResultSingleModel = pd.DataFrame({'id': range(n_holdout), 'ElasticNet': pred_enet})

## Model 2: Random Forest Regression

In [14]:
from sklearn.ensemble import RandomForestRegressor

In [15]:
rf = RandomForestRegressor(n_estimators=150,random_state=42)
# Cross-validation for the random forest is left disabled here; uncomment the
# line below to recompute `cv` for this model before reading its score.
# (Scorer name given in the 'neg_' form that current scikit-learn expects.)
#cv = cross_val_score(rf,train_[ReducedVar],np.log(train_['price_doc']+1),scoring='neg_mean_squared_error',cv=5)

In [95]:
# Mean of the per-fold RMSE values held in `cv`.
# NOTE(review): the cross_val_score call for the random forest is commented out
# in the cell above, so on a fresh top-to-bottom run `cv` still holds the
# scores of the previous model, not the random forest's.
np.mean(np.abs(cv)**0.5)

0.42918294450179062

**Fit the model and check the error**

In [20]:
# Fit on the 80% split and predict the held-out 20%.
log_price_training = np.log(training['price_doc'] + 1)
rf.fit(training[ReducedVar], log_price_training)
pred_rf = rf.predict(testing[ReducedVar])

In [21]:
# Record the random-forest hold-out predictions next to the other models.
ResultSingleModel = ResultSingleModel.assign(RandomForest=pred_rf)

## Model 3: Gradient Boosting Regressor

In [16]:
from sklearn.ensemble import GradientBoostingRegressor
# max_features=None considers every feature at each split, which is what the
# former 'auto' option meant for this estimator; 'auto' is no longer accepted
# by recent scikit-learn. random_state is fixed so refits are reproducible,
# matching the seed used for the other models in this notebook.
gbt = GradientBoostingRegressor(learning_rate=0.05, n_estimators=300, max_depth=6,
                                max_features=None, random_state=42)
# Cross-validation is left disabled here; uncomment to recompute `cv` for this model.
#cv = cross_val_score(gbt,train_[ReducedVar],np.log(train_['price_doc']+1),scoring='neg_mean_squared_error',cv=5)

In [27]:
# Mean of the per-fold RMSE values held in `cv`.
# NOTE(review): the cross_val_score call for gradient boosting is commented out
# in the cell above, so on a fresh top-to-bottom run `cv` was last assigned by
# an earlier model's cross-validation.
np.mean(np.abs(cv)**0.5)

0.43029469446315727

**Fit the model and check the error**

In [28]:
# Fit on the 80% split and predict the held-out 20%.
log_price_training = np.log(training['price_doc'] + 1)
gbt.fit(training[ReducedVar], log_price_training)
pred_gbt = gbt.predict(testing[ReducedVar])

In [29]:
# Record the gradient-boosting hold-out predictions next to the other models.
ResultSingleModel = ResultSingleModel.assign(GradientBoosting=pred_gbt)

In [45]:
# True log(price + 1) of the held-out rows. Raw values are used (not the Series)
# so they are assigned by position rather than aligned on the shuffled index.
ResultSingleModel['Actual'] = np.log(testing['price_doc'] + 1).values

## Model 4: xgboost, For Comparison only

In [66]:
# Full training set as an xgboost matrix: decomposition features, log(price + 1) label.
dtrain = xgb.DMatrix(data=train[ReducedVar], label=np.log(train['price_doc'] + 1))

In [67]:
# Booster settings shared by the xgb.cv and xgb.train calls in this notebook.
params = dict([
    ('eta', 0.05),               # author's note: also try 0.01, 3, 5 (presumably 0.01 / 0.03 / 0.05 — confirm)
    ('max_depth', 3),            # author's note: also try 4, 5, 6
    ('subsample', 0.7),
    ('colsample_bytree', 0.7),
    ('objective', 'reg:linear'),
    ('eval_metric', 'rmse'),
    ('silent', 1),
    ('lambda', 0.5),
    ('min_child_weight', 3),
])

In [68]:
# Cross-validate the booster for up to 1000 rounds, stopping once the
# evaluation metric has not improved for 20 rounds; progress every 50 rounds.
xgbcv = xgb.cv(
    params,
    dtrain,
    num_boost_round=1000,
    early_stopping_rounds=20,
    seed=42,
    show_stdv=False,
    verbose_eval=50,
)

[0]	train-rmse:14.427	test-rmse:14.427
[50]	train-rmse:1.19881	test-rmse:1.20022
[100]	train-rmse:0.447184	test-rmse:0.453489
[150]	train-rmse:0.430291	test-rmse:0.439573
[200]	train-rmse:0.423986	test-rmse:0.436189
[250]	train-rmse:0.418447	test-rmse:0.433741
[300]	train-rmse:0.413627	test-rmse:0.431501
[350]	train-rmse:0.409287	test-rmse:0.430029
[400]	train-rmse:0.405461	test-rmse:0.428672
[450]	train-rmse:0.401767	test-rmse:0.427654
[500]	train-rmse:0.398329	test-rmse:0.426668
[550]	train-rmse:0.395125	test-rmse:0.425952
[600]	train-rmse:0.391921	test-rmse:0.425195
[650]	train-rmse:0.389302	test-rmse:0.424639
[700]	train-rmse:0.386791	test-rmse:0.424383
[750]	train-rmse:0.384164	test-rmse:0.424009
[800]	train-rmse:0.381629	test-rmse:0.42362
[850]	train-rmse:0.379199	test-rmse:0.423274
[900]	train-rmse:0.376868	test-rmse:0.422954


In [69]:
# Number of entries in the CV result; it is reused below as the boosting-round
# count (presumably the rounds kept by early stopping — confirm).
print(len(xgbcv))

919


In [81]:
# xgboost matrices for the 80/20 validation split; the hold-out matrix has no label.
dtraining = xgb.DMatrix(data=training[ReducedVar], label=np.log(training['price_doc'] + 1))
dtesting = xgb.DMatrix(data=testing[ReducedVar])

In [38]:
# Train on the 80% split for as many rounds as the CV result has entries.
n_rounds = len(xgbcv)
xgb_model = xgb.train(params=params, dtrain=dtraining, num_boost_round=n_rounds)

In [39]:
# Hold-out predictions of the booster (log-price scale, like the other models).
xgb_predict = xgb_model.predict(data=dtesting)

In [40]:
# Record the xgboost hold-out predictions next to the other models.
ResultSingleModel = ResultSingleModel.assign(xgb=xgb_predict)

## See whether this one will be good for stacking

In [39]:
# Reload previously saved per-model hold-out predictions when the file exists.
# No visible cell in this notebook writes 'BeforeStacking.csv', so on a fresh
# run the file may be absent; in that case keep the ResultSingleModel frame
# built in the cells above instead of failing.
try:
    ResultSingleModel = pd.read_csv('BeforeStacking.csv')
except (IOError, OSError):
    print('BeforeStacking.csv not found; using the in-memory ResultSingleModel')

In [40]:
# Overwrite each model's prediction column with its absolute error against 'Actual'.
# (Not idempotent: re-running this cell would difference the errors again.)
for model_name in ('ElasticNet', 'RandomForest', 'GradientBoosting', 'xgb'):
    ResultSingleModel[model_name] = np.abs(ResultSingleModel['Actual'] - ResultSingleModel[model_name])

In [41]:
# Keep only the four per-model error columns, in this fixed order.
model_columns = ['ElasticNet', 'RandomForest', 'GradientBoosting', 'xgb']
ResultSingleModel = ResultSingleModel.loc[:, model_columns]

In [42]:
# For every held-out row, the column position of the model with the smallest
# absolute error. `.values` replaces DataFrame.as_matrix(), which has been
# removed from pandas.
Best_index = list(np.argmin(ResultSingleModel.values, axis=1))

In [37]:
from collections import Counter
# How often each model has the smallest error. Keys are column positions in
# ResultSingleModel: 0 = ElasticNet, 1 = RandomForest, 2 = GradientBoosting, 3 = xgb.
Counter(Best_index)

Counter({0: 1390, 1: 827, 2: 931, 3: 2705})

**We can, in fact, stack all four models, as each of them is the best performer on a sizeable share of the testing set.**

## Stacking Preparation

In [71]:
# Refit every base model on the full training frame and keep its predictions
# on those same rows as candidate stacking features.
# NOTE(review): these are in-sample predictions (each model predicts the rows
# it was fit on); out-of-fold predictions are the usual input for a
# meta-learner — confirm this is intended.
log_price_full = np.log(train_['price_doc'] + 1)
enet_feature = enet.fit(train_[ReducedVar], log_price_full).predict(train_[ReducedVar])
rf_feature = rf.fit(train_[ReducedVar], log_price_full).predict(train_[ReducedVar])
gbdt_feature = gbt.fit(train_[ReducedVar], log_price_full).predict(train_[ReducedVar])
# Take the round count from the CV result (as the earlier xgb_model training
# does) rather than repeating it as a hard-coded number.
xgb_feature = xgb.train(params, dtrain, num_boost_round=len(xgbcv)).predict(dtrain)

In [72]:
# One column of base-model predictions per model.
stack_frame = pd.DataFrame(dict(ENet=enet_feature, RF=rf_feature, GBDT=gbdt_feature, xgb=xgb_feature))

**Average stacking data frame**

In [79]:
# Unweighted mean of the four base-model predictions, summed in the fixed
# order ENet, RF, GBDT, xgb.
base_models = ['ENet', 'RF', 'GBDT', 'xgb']
running_total = stack_frame[base_models[0]]
for model_name in base_models[1:]:
    running_total = running_total + stack_frame[model_name]
stack_frame['average_value'] = running_total / len(base_models)

**Do a small validation on the model**

In [82]:
# Base-model predictions on the 80% split, used as inputs for the meta-learner.
# NOTE(review): each model predicts the same rows it was fit on, so these are
# in-sample predictions — confirm this is intended for stacking.
log_price_training = np.log(training['price_doc'] + 1)
enet_feature = enet.fit(training[ReducedVar], log_price_training).predict(training[ReducedVar])
rf_feature = rf.fit(training[ReducedVar], log_price_training).predict(training[ReducedVar])
gbdt_feature = gbt.fit(training[ReducedVar], log_price_training).predict(training[ReducedVar])
# Round count comes from the CV result instead of a hard-coded number.
xgb_feature = xgb.train(params, dtraining, num_boost_round=len(xgbcv)).predict(dtraining)

In [83]:
# Base-model predictions on the held-out 20%, from models fit on the 80% split.
log_price_training = np.log(training['price_doc'] + 1)
enet_predictor = enet.fit(training[ReducedVar], log_price_training).predict(testing[ReducedVar])
rf_predictor = rf.fit(training[ReducedVar], log_price_training).predict(testing[ReducedVar])
gbdt_predictor = gbt.fit(training[ReducedVar], log_price_training).predict(testing[ReducedVar])
# Round count comes from the CV result instead of a hard-coded number.
# (The variable name keeps its original spelling because a later cell reads it.)
xgb_prdictor = xgb.train(params, dtraining, num_boost_round=len(xgbcv)).predict(dtesting)

In [85]:
# Meta-learner inputs: base-model predictions for the 80% split and for the hold-out.
stack_train = pd.DataFrame(dict(ENet=enet_feature, RF=rf_feature, GBDT=gbdt_feature, xgb=xgb_feature))
stack_test = pd.DataFrame(dict(ENet=enet_predictor, RF=rf_predictor, GBDT=gbdt_predictor, xgb=xgb_prdictor))

In [87]:
# Unweighted mean of the four hold-out predictions, summed in the fixed
# order ENet, RF, GBDT, xgb.
base_models = ['ENet', 'RF', 'GBDT', 'xgb']
running_total = stack_test[base_models[0]]
for model_name in base_models[1:]:
    running_total = running_total + stack_test[model_name]
stack_test['Averaged'] = running_total / len(base_models)

In [90]:
from sklearn.linear_model import LinearRegression

# Linear meta-learner over the base-model predictions. The 'Averaged' column is
# dropped so the hold-out inputs have the same columns as stack_train.
lr = LinearRegression()
lr.fit(stack_train, np.log(training['price_doc'] + 1))
pred = lr.predict(stack_test.drop('Averaged', axis=1))

In [88]:
from sklearn.metrics import mean_squared_error

In [89]:
# Hold-out RMSE (log-price scale) for every column of stack_test.
actual_log_price = np.log(testing['price_doc'] + 1)
for column_name in stack_test.columns:
    rmse = mean_squared_error(actual_log_price, stack_test[column_name]) ** 0.5
    print(column_name + ' ' + str(rmse))

ENet 0.476690677399
GBDT 0.417996143386
RF 0.413094700027
xgb 0.420923889073
Averaged 0.411393674364
