In [91]:
%run Functions.py

In [53]:
import numpy as np
from ggplot import *
import warnings
warnings.filterwarnings('ignore')

## Stacking Different SKlearn Models

In [70]:
# Read the three input tables (train, test, macro) from CSV files in the working directory.
train, test, macro = (
    pd.read_csv(csv_name)
    for csv_name in ('train_clean.csv', 'test_clean.csv', 'macro_c.csv')
)

**Get additional features for train and test datasets**

In [71]:
# Run the feature helper (not defined in this notebook; presumably from Functions.py),
# then replace every missing value in both returned frames with the sentinel -999.
train, test = (
    frame.fillna(-999)
    for frame in get_additional_features(train, test, macro)
)

## Input 1: RandomForest Regression

In [73]:
from sklearn.ensemble import RandomForestRegressor

In [74]:
# Level-1 random forest regressor.
# `criterion='mse'` and `max_features='auto'` used to be passed explicitly here. Both were
# simply the regressor's defaults, and both spellings have been removed from current
# scikit-learn releases, so they are left implicit: the fitted model is the same and the
# cell runs on old and new versions alike.
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, min_samples_leaf=8)

In [75]:
# Feature list: every column of `test` after its first two.
# NOTE(review): the two skipped columns are presumably id/timestamp — confirm.
col = test.columns[2:].tolist()

In [76]:
# Run the random forest through the stacking helper, using the rescaled target
# 0.95 * price_doc + 10 as the label.
# NOTE(review): get_sklearn_stack_data is not defined in this notebook (presumably it comes
# from Functions.py). Later cells read RF_Train['predicted'] and use RF_Pred as a test-set
# column; the fold indices printed below suggest 5-fold out-of-fold predictions — confirm.
RF_Train,RF_Pred = get_sklearn_stack_data(rf,train,col,0.95*train['price_doc']+10,test)

TRAIN: [ 5853  5854  5855 ..., 29259 29260 29261] TEST: [   0    1    2 ..., 5850 5851 5852]
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [ 5853  5854  5855 ..., 11703 11704 11705]
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [11706 11707 11708 ..., 17555 17556 17557]
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [17558 17559 17560 ..., 23407 23408 23409]
TRAIN: [    0     1     2 ..., 23407 23408 23409] TEST: [23410 23411 23412 ..., 29259 29260 29261]
Start Training


## Input 2: Gradient Boosting Regression

In [78]:
from sklearn.ensemble import GradientBoostingRegressor

In [79]:
# Level-1 gradient boosting regressor.
# max_features=0.7 makes each split consider a random subset of the features, so the seed
# is pinned (same value the random forest uses) to keep the level-1 predictions reproducible
# across kernel restarts.
gbt = GradientBoostingRegressor(learning_rate=0.05, n_estimators=300, max_depth=6,
                                max_features=0.7, random_state=42)

In [80]:
# Same stacking call as for the random forest, now with the gradient boosting model and the
# same rescaled target (0.95 * price_doc + 10).
# NOTE(review): the return values are used later as GBT_Train['predicted'] and as the 'gbt'
# test column; the helper itself is not visible in this notebook.
GBT_Train,GBT_Pred = get_sklearn_stack_data(gbt,train,col,0.95*train['price_doc']+10,test)

TRAIN: [ 5853  5854  5855 ..., 29259 29260 29261] TEST: [   0    1    2 ..., 5850 5851 5852]
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [ 5853  5854  5855 ..., 11703 11704 11705]
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [11706 11707 11708 ..., 17555 17556 17557]
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [17558 17559 17560 ..., 23407 23408 23409]
TRAIN: [    0     1     2 ..., 23407 23408 23409] TEST: [23410 23411 23412 ..., 29259 29260 29261]
Start Training


## Input 3: Elastic Net

In [81]:
from sklearn.linear_model import ElasticNet

In [82]:
# NOTE(review): the original comment said this "becomes Ridge"; with l1_ratio=1 the penalty
# is purely L1, i.e. the model is a Lasso. Ridge-like behaviour would need l1_ratio=0.
# The code is left unchanged — confirm which of the two was intended.
enet = ElasticNet(alpha=200,l1_ratio=1) ## l1_ratio=1 -> pure L1 penalty (Lasso), not Ridge
# Run the linear model through the same stacking helper and rescaled target as the tree models.
EN_Train,EN_Pred = get_sklearn_stack_data(enet,train,col,0.95*train['price_doc']+10,test)

TRAIN: [ 5853  5854  5855 ..., 29259 29260 29261] TEST: [   0    1    2 ..., 5850 5851 5852]
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [ 5853  5854  5855 ..., 11703 11704 11705]
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [11706 11707 11708 ..., 17555 17556 17557]
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [17558 17559 17560 ..., 23407 23408 23409]
TRAIN: [    0     1     2 ..., 23407 23408 23409] TEST: [23410 23411 23412 ..., 29259 29260 29261]
Start Training


## Input 4: XGBoost With Additional Features

In [94]:
# Start again from the CSV files (the frames above were filled with -999) and re-apply the
# feature helper; `macro` is reused from the first load.
train, test = get_additional_features(
    pd.read_csv('train_clean.csv'),
    pd.read_csv('test_clean.csv'),
    macro,
)

In [95]:
col = list(test.columns)[2:]

In [97]:
dtrain = xgb.DMatrix(train[col],train['price_doc']*0.95+10)

In [98]:
# Hyper-parameters for the first level-1 XGBoost model (passed to xgb.cv and, via
# get_xgb_stack_data, to the stacking helper).
params1 = {
    'eta': 0.05, ## learning rate; original note read "Try 0.01,3,5" (presumably 0.01 / 0.03 / 0.05 — confirm)
    'max_depth': 5,## Try 4,5,6
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    # NOTE(review): newer XGBoost releases rename 'reg:linear' to 'reg:squarederror' and
    # replace 'silent' with 'verbosity'; left as-is to match the version this was run with.
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

In [99]:
# Cross-validate params1 to choose the number of boosting rounds: at most 1000 rounds,
# stopping early once the test RMSE has not improved for 20 rounds; progress is printed
# every 50 rounds. nfold is not passed, so xgb.cv uses its default fold count.
xgb_cvalid = xgb.cv(params1, dtrain, num_boost_round=1000, early_stopping_rounds=20,
    verbose_eval=50, show_stdv=False,seed=42)

[0]	train-rmse:7.94246e+06	test-rmse:7.9481e+06
[50]	train-rmse:2.13359e+06	test-rmse:2.54057e+06
[100]	train-rmse:1.78606e+06	test-rmse:2.36385e+06
[150]	train-rmse:1.66918e+06	test-rmse:2.33625e+06
[200]	train-rmse:1.57831e+06	test-rmse:2.31639e+06
[250]	train-rmse:1.50492e+06	test-rmse:2.30487e+06
[300]	train-rmse:1.44553e+06	test-rmse:2.29495e+06
[350]	train-rmse:1.39281e+06	test-rmse:2.28973e+06
[400]	train-rmse:1.34553e+06	test-rmse:2.28595e+06
[450]	train-rmse:1.30081e+06	test-rmse:2.28405e+06
[500]	train-rmse:1.26158e+06	test-rmse:2.28342e+06
[550]	train-rmse:1.22528e+06	test-rmse:2.28128e+06


In [100]:
print(len(xgb_cvalid))

554


In [101]:
# Target for the first XGBoost model: price_doc scaled by 0.95 and shifted by 10.
label1 = train['price_doc']*0.95+10
# Take the round count straight from the CV result rather than hand-copying the printed
# number, so it stays correct whenever the CV cell is re-run with different data or settings.
nrounds1 = len(xgb_cvalid)

In [102]:
xgb1_Train,xgb1_Test = get_xgb_stack_data(params1,nrounds1,train,col,label1,test)

TRAIN: [ 5853  5854  5855 ..., 29259 29260 29261] TEST: [   0    1    2 ..., 5850 5851 5852]
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [ 5853  5854  5855 ..., 11703 11704 11705]
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [11706 11707 11708 ..., 17555 17556 17557]
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [17558 17559 17560 ..., 23407 23408 23409]
TRAIN: [    0     1     2 ..., 23407 23408 23409] TEST: [23410 23411 23412 ..., 29259 29260 29261]
Start Training


## Input 5: XGBoost with Original Features

In [103]:
train = pd.read_csv('train_clean.csv')
test = pd.read_csv('test_clean.csv')

In [104]:
col = list(test.columns)[2:]

In [105]:
# DMatrix on the original feature set with the rescaled target (price_doc * 0.95 + 10).
dtrain = xgb.DMatrix(train[col], train['price_doc']*0.95+10)

# Hyper-parameters for the second level-1 XGBoost model.
params2 = dict(
    eta=0.05,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:linear',
    eval_metric='rmse',
    silent=1,
)

# Early-stopped cross-validation; the length of the result is the selected round count.
xgb_cvalid = xgb.cv(
    params2,
    dtrain,
    num_boost_round=1000,
    early_stopping_rounds=20,
    verbose_eval=50,
    show_stdv=False,
    seed=42,
)
print(len(xgb_cvalid))

[0]	train-rmse:7.94259e+06	test-rmse:7.94835e+06
[50]	train-rmse:2.14214e+06	test-rmse:2.55265e+06
[100]	train-rmse:1.80711e+06	test-rmse:2.3833e+06
[150]	train-rmse:1.68467e+06	test-rmse:2.35247e+06
[200]	train-rmse:1.60159e+06	test-rmse:2.33181e+06
[250]	train-rmse:1.5381e+06	test-rmse:2.32198e+06
[300]	train-rmse:1.48355e+06	test-rmse:2.3155e+06
[350]	train-rmse:1.43542e+06	test-rmse:2.31321e+06
[400]	train-rmse:1.38895e+06	test-rmse:2.30794e+06
400


In [106]:
# Target for the second XGBoost model: price_doc scaled by 0.95 and shifted by 10.
label2 = 0.95*train['price_doc']+10
# Use the round count selected by the CV cell above instead of a hand-copied constant.
nround2 = len(xgb_cvalid)

In [108]:
xgb2_Train,xgb2_Test = get_xgb_stack_data(params2,nround2,train,col,label2,test)

TRAIN: [ 5853  5854  5855 ..., 29259 29260 29261] TEST: [   0    1    2 ..., 5850 5851 5852]
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [ 5853  5854  5855 ..., 11703 11704 11705]
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [11706 11707 11708 ..., 17555 17556 17557]
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [17558 17559 17560 ..., 23407 23408 23409]
TRAIN: [    0     1     2 ..., 23407 23408 23409] TEST: [23410 23411 23412 ..., 29259 29260 29261]
Start Training


## Input 6: XGBoost With Additional Features + Average House Prices Over Time

In [110]:
train = pd.read_csv('train_clean.csv')
test = pd.read_csv('test_clean.csv')
train,test = get_additional_features(train,test,macro)

In [111]:
price = pd.read_csv('moscow_avg_price.csv')

In [113]:
price = price[['timestamp','moscow_avg_price_avg_rub','moscow_avg_price_low_rub']]

In [114]:
# Parse the timestamp column into datetimes. `price` was produced by a column selection,
# so assigning into it in place triggers pandas' SettingWithCopy warning (hidden here by the
# global warnings filter); `assign` builds a new frame and keeps the column order.
price = price.assign(timestamp=pd.to_datetime(price['timestamp']))

In [115]:
# Attach the price columns to every row by timestamp.
# A left merge keeps each left-hand row, but it would silently multiply rows if `price`
# held the same timestamp twice. The stacked frames built later rely on row counts staying
# fixed, so that is checked explicitly.
n_train_rows, n_test_rows = len(train), len(test)
train = train.merge(price, on='timestamp', how='left')
test = test.merge(price, on='timestamp', how='left')
assert len(train) == n_train_rows and len(test) == n_test_rows, \
    'merge on timestamp changed the row count (duplicate timestamps in price?)'

In [117]:
# Feature list (all test columns after the first two) and DMatrix with the rescaled target.
col = list(test.columns)[2:]
dtrain = xgb.DMatrix(train[col], train['price_doc']*0.95+10)

# Hyper-parameters for the third level-1 XGBoost model.
params3 = dict(
    eta=0.05,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:linear',
    eval_metric='rmse',
    silent=1,
)

# Early-stopped cross-validation; the length of the result is the selected round count.
xgb_cvalid = xgb.cv(
    params3,
    dtrain,
    num_boost_round=1000,
    early_stopping_rounds=20,
    verbose_eval=50,
    show_stdv=False,
    seed=42,
)
print(len(xgb_cvalid))

[0]	train-rmse:7.94257e+06	test-rmse:7.94813e+06
[50]	train-rmse:2.12844e+06	test-rmse:2.54503e+06
[100]	train-rmse:1.77994e+06	test-rmse:2.35871e+06
[150]	train-rmse:1.66482e+06	test-rmse:2.32385e+06
[200]	train-rmse:1.57173e+06	test-rmse:2.3039e+06
[250]	train-rmse:1.50211e+06	test-rmse:2.29097e+06
[300]	train-rmse:1.4433e+06	test-rmse:2.28311e+06
[350]	train-rmse:1.38899e+06	test-rmse:2.27806e+06
[400]	train-rmse:1.33946e+06	test-rmse:2.27455e+06
[450]	train-rmse:1.29632e+06	test-rmse:2.27239e+06
443


In [118]:
# Target for the third XGBoost model: price_doc scaled by 0.95 and shifted by 10.
label3 = 0.95*train['price_doc']+10
# Use the round count selected by the CV cell above instead of a hand-copied constant.
nround3 = len(xgb_cvalid)
# Produce the stacking inputs for this model through the XGBoost stacking helper
# (not defined in this notebook; presumably from Functions.py).
xgb3_Train,xgb3_Test = get_xgb_stack_data(params3,nround3,train,col,label3,test)

TRAIN: [ 5853  5854  5855 ..., 29259 29260 29261] TEST: [   0    1    2 ..., 5850 5851 5852]
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [ 5853  5854  5855 ..., 11703 11704 11705]
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [11706 11707 11708 ..., 17555 17556 17557]
TRAIN: [    0     1     2 ..., 29259 29260 29261] TEST: [17558 17559 17560 ..., 23407 23408 23409]
TRAIN: [    0     1     2 ..., 23407 23408 23409] TEST: [23410 23411 23412 ..., 29259 29260 29261]
Start Training


## Level 2 XGBoost: Stack All

In [231]:
# Level-2 training frame: the 'predicted' column of each level-1 training frame, one column
# per model, plus the 'label' column taken from the xgb3 frame.
train_level2 = pd.DataFrame({
    'rf': RF_Train['predicted'],
    'gbt': GBT_Train['predicted'],
    'ENet': EN_Train['predicted'],
    'xgb1': xgb1_Train['predicted'],
    'xgb2': xgb2_Train['predicted'],
    'xgb3': xgb3_Train['predicted'],
    'label': xgb3_Train['label'],
})

In [232]:
# Level-2 test frame: one column of test-set predictions per level-1 model, using the same
# column names as the training frame (without the label).
test_level2 = pd.DataFrame({
    'rf': RF_Pred,
    'gbt': GBT_Pred,
    'ENet': EN_Pred,
    'xgb1': xgb1_Test,
    'xgb2': xgb2_Test,
    'xgb3': xgb3_Test,
})

In [207]:
# Level-2 feature columns are exactly the columns of the level-2 test frame.
col = list(test_level2.columns)

# `num` was used here without being defined anywhere in the notebook; the only trace is a
# commented-out `num = round(0.2*len(...))` in a later cell, so it is reconstructed from that.
# NOTE(review): confirm that skipping the first 20% of rows is the intended setup.
num = int(round(0.2*len(train_level2)))

# Slice into a new name rather than overwriting train_level2: re-running the cell no longer
# drops more rows each time, and the full frame stays available for the later cells that
# display its shape and concatenate it with the saved stacking input.
train_level2_fit = train_level2.iloc[num:,:]
label = train_level2_fit['label']
dtrain = xgb.DMatrix(train_level2_fit[col],label)

In [208]:
# Hyper-parameters for the level-2 (stacking) XGBoost model: no row or column subsampling
# and depth-2 trees.
params = {
   'eta': 0.03, ## learning rate; original note read "Try 0.01,3,5" (presumably 0.01 / 0.03 / 0.05 — confirm)
   'max_depth': 2,## note "Try 4,5,6" was carried over from the level-1 cells; the value used here is 2
   'subsample': 1,
   'colsample_bytree':1,
   'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

In [209]:
xgb_cvalid = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
    verbose_eval=50, show_stdv=False,seed=42)

[0]	train-rmse:8.25881e+06	test-rmse:8.25852e+06
[50]	train-rmse:2.86339e+06	test-rmse:2.92721e+06
[100]	train-rmse:2.15927e+06	test-rmse:2.27513e+06
[150]	train-rmse:2.0679e+06	test-rmse:2.2379e+06


In [210]:
len(xgb_cvalid)

136

In [211]:
# Fit the level-2 model for the number of rounds the CV above selected. Reading it from
# xgb_cvalid (as the later cells in this notebook already do) avoids a hand-copied constant
# that goes stale when the CV is re-run.
level2_model = xgb.train(params,dtrain,num_boost_round=len(xgb_cvalid))

In [212]:
dtest = xgb.DMatrix(test_level2)

In [213]:
def get_feature_importance(model):
    """Tabulate a model's feature scores, highest score first.

    Parameters
    ----------
    model : object exposing ``get_fscore()``
        ``get_fscore()`` must return a mapping of feature name -> score.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Score``, sorted by ``Score`` in descending order.
        The index keeps each entry's position in the ``get_fscore()`` mapping.
    """
    fscores = model.get_fscore()
    table = pd.DataFrame({
        'Feature': list(fscores.keys()),
        'Score': list(fscores.values()),
    })
    return table.sort_values(by='Score', ascending=False)

In [214]:
get_feature_importance(level2_model)

Unnamed: 0,Feature,Score
1,xgb3,131
0,xgb2,81
3,rf,72
2,xgb1,47
5,gbt,46
4,ENet,29


In [215]:
pred = level2_model.predict(dtest)

In [216]:
sub = pd.DataFrame({'id':test['id'],'price_doc':pred})

In [217]:
sub.to_csv('Stacking_3SKlearn_3xgb_correct222.csv',index=False)

## Level 2 Stacking: Only RandomForest, XGBoost 2 and XGBoost 3 as Inputs

In [218]:
# Reduced level-2 frames: only the rf, xgb2 and xgb3 predictions (label again from xgb3).
train_level2_2 = pd.DataFrame({
    'rf': RF_Train['predicted'],
    'xgb2': xgb2_Train['predicted'],
    'xgb3': xgb3_Train['predicted'],
    'label': xgb3_Train['label'],
})
test_level2_2 = pd.DataFrame({
    'rf': RF_Pred,
    'xgb2': xgb2_Test,
    'xgb3': xgb3_Test,
})

In [219]:
#num = round(0.2*len(train_level2_2))

In [220]:
#train_level2_2 = train_level2_2.iloc[num:,:]
col = list(test_level2_2.columns)

In [221]:
label = train_level2_2['label']
dtrain = xgb.DMatrix(train_level2_2[col],label)
dtest = xgb.DMatrix(test_level2_2)

In [222]:
xgb_cvalid = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
    verbose_eval=50, show_stdv=False,seed=42)

[0]	train-rmse:8.08991e+06	test-rmse:8.09027e+06
[50]	train-rmse:2.93899e+06	test-rmse:2.98985e+06
[100]	train-rmse:2.30371e+06	test-rmse:2.40704e+06
[150]	train-rmse:2.22778e+06	test-rmse:2.37673e+06


In [223]:
len(xgb_cvalid)

144

In [224]:
# Fit the reduced level-2 model for the round count selected by the CV above, read from
# xgb_cvalid rather than hand-copied so it cannot drift from the CV result.
model = xgb.train(params,dtrain,num_boost_round=len(xgb_cvalid))

In [225]:
pred = model.predict(dtest)

In [226]:
sub2 = pd.DataFrame({'id':test['id'],'price_doc':pred})

In [227]:
sub2.to_csv('Stacking_xgb3_and_2_and_RF_corrected333.csv',index=False)

## Get yesterday's stacking input

In [233]:
train_input = pd.read_csv('Stacking_Input.csv')
test_input = pd.read_csv('Stacking_test_input.csv')

In [234]:
# Remove the label column from the saved input; train_level2 carries its own 'label' column
# and the two frames are concatenated column-wise below.
# `drop(..., errors='ignore')` keeps the cell re-runnable: the original in-place `del` raised
# KeyError on a second execution.
train_input = train_input.drop('label', axis=1, errors='ignore')

In [235]:
train_level2.shape

(29262, 7)

In [236]:
# Column-wise join of the level-2 frame built above with the stacking input loaded from CSV.
# NOTE(review): pd.concat(axis=1) aligns rows on the index, not on position. Both frames must
# share the same index (the shape shown above is the full 29262 rows); if train_level2 has
# been row-sliced beforehand, the missing rows come back as NaN. Also confirm that the saved
# file has no column names in common with train_level2.
new_train = pd.concat([train_level2,train_input],axis=1)

In [237]:
new_test = pd.concat([test_level2,test_input],axis=1)

In [239]:
col = list(new_test.columns)

In [240]:
# Level-2 XGBoost hyper-parameters (same values as the earlier level-2 cells).
params = dict(
    eta=0.03,
    max_depth=2,
    subsample=1,
    colsample_bytree=1,
    objective='reg:linear',
    eval_metric='rmse',
    silent=1,
)

In [244]:
dtrain = xgb.DMatrix(new_train[col],new_train['label'])
dtest = xgb.DMatrix(new_test)

In [245]:
xgb_cvalid = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
    verbose_eval=50, show_stdv=False,seed=42)

[0]	train-rmse:8.08958e+06	test-rmse:8.09049e+06
[50]	train-rmse:2.90971e+06	test-rmse:2.97815e+06
[100]	train-rmse:2.25399e+06	test-rmse:2.38663e+06
[150]	train-rmse:2.1614e+06	test-rmse:2.35052e+06


In [246]:
# Train on all stacked inputs for the round count selected by the CV above, then report it.
n_rounds = len(xgb_cvalid)
model = xgb.train(params, dtrain, num_boost_round=n_rounds)
print(n_rounds)

168


In [248]:
FI=get_feature_importance(model)

In [252]:
# Keep the names of the six highest-scoring inputs (FI is already sorted by Score, descending).
Models = FI['Feature'].iloc[:6]

In [254]:
dtrain = xgb.DMatrix(new_train[Models],new_train['label'])
dtest = xgb.DMatrix(new_test[Models])

In [255]:
xgb_cvalid = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
    verbose_eval=50, show_stdv=False,seed=42)

[0]	train-rmse:8.08958e+06	test-rmse:8.09049e+06
[50]	train-rmse:2.91337e+06	test-rmse:2.97464e+06
[100]	train-rmse:2.26092e+06	test-rmse:2.37958e+06
[150]	train-rmse:2.17232e+06	test-rmse:2.33255e+06
[200]	train-rmse:2.13482e+06	test-rmse:2.32212e+06


In [256]:
model = xgb.train(params,dtrain,num_boost_round=len(xgb_cvalid))

In [258]:
pred = model.predict(dtest)

In [260]:
sub = pd.DataFrame({'id':test['id'],'price_doc':pred})

In [261]:
sub.to_csv('Stacking_0628_FinalRounds.csv',index=False)

In [263]:
new_train.to_csv('Stacking_Input_Saved_0628.csv',index=False)
new_test.to_csv('Stacking_Test_Saved_0628.csv',index=False)