# **Setup**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
!cp "/content/drive/MyDrive/AirQo_Low_Cost_Air_Quality_Monitor_Calibration/data/Train.csv" .
!cp "/content/drive/MyDrive/AirQo_Low_Cost_Air_Quality_Monitor_Calibration/data/Test.csv" .

In [10]:
import os
import sys
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import joblib

from tqdm.notebook import tqdm
from tqdm import tqdm_notebook


import warnings
warnings.simplefilter('ignore')

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error 

In [11]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')

# **Processing**

In [12]:
data=pd.concat([train,test])

In [13]:
data=data[['ID','created_at','site','pm2_5','pm10','ref_pm2_5']]

In [14]:
data['created_at']=pd.to_datetime(data['created_at'])
data['year']=data['created_at'].dt.year
data['month']=data['created_at'].dt.month
data['day']=data['created_at'].dt.day
data['hour']=data['created_at'].dt.hour

In [15]:
from sklearn.preprocessing import LabelEncoder

LE_cols = ['site','created_at']
for le_col in LE_cols :
    LE = LabelEncoder()
    data[le_col] = LE.fit_transform(data[le_col])
cyclic_features=['month','day','hour']
for col in cyclic_features:
    max_val = data[col].max()

    data[col + '_sin'] = np.sin(2 * np.pi * data[col] / max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col] / max_val)

In [16]:
data['days_count']=data['year'].apply(str)+data['month'].apply(str)+data['day'].apply(str)
data.set_index(['site','days_count'],inplace=True)
data['daily_target_mean']=data.groupby(['site','days_count']).ref_pm2_5.mean()
data['daily_target_std']=data.groupby(['site','days_count']).ref_pm2_5.std()
#data['daily_target_median']=data.groupby(['site','days_count']).ref_pm2_5.median()
data.reset_index(inplace=True)

In [17]:
data.drop(cyclic_features,axis=1,inplace=True)

In [18]:
odata=data.copy()
odata['src']=[0]*train.shape[0]+[1]*test.shape[0]
odata.sort_values(by=['created_at'],inplace=True)

In [19]:
shifts= [1,2,3,4,-1,-2,-3,-4]
cols=['ref_pm2_5']
#normal shift
for col in cols:  
    for s in shifts:
        odata[col+'_shift_'+str(s)]=odata.groupby('site')[col].shift(s)
        if s<0:
            odata[col+'_shift_'+str(s)]=odata.groupby('site')[col+'_shift_'+str(s)].bfill()
        else:
            odata[col+'_shift_'+str(s)]=odata.groupby('site')[col+'_shift_'+str(s)].ffill()
            
            #odata[col+'_diff_'+str(s)]=odata.groupby('site')[col+'_shift_'+str(s)].diff()--p
            #odata[col+'_diff_'+str(s)]=odata.groupby('site')[col+'_diff_'+str(s)].ffill()--p
            
            
            

        #shift mean
    for s in list(set(np.abs(shifts))):
        odata[col+'_shift_'+str(s)+'_mean']=(odata[col+'_shift_'+str(s)]+odata[col+'_shift_'+str(-1*s)])/2
        #odata[col+'_shift_'+str(s)+'_deriv']=(odata[col+'_shift_'+str(-1*s)]-odata[col+'_shift_'+str(s)])/(2*s)--p
        odata[col+'_shift_'+str(s)+'_deriv_new']=(odata[col+'_shift_'+str(-1*s)]-odata[col+'_shift_'+str(s)])+odata['pm2_5']
        #odata['pm_plus_shift'+str(s)]=odata[col+'_shift_'+str(s)]-odata['pm10']
        
        odata['rolling_avg'+str(s)] = odata.groupby('site')[col].shift(s).expanding().mean().fillna(method='ffill')
        odata['rolling_avg'+str(-s)] = odata.groupby('site')[col].shift(-s).expanding().mean().fillna(method='bfill')


In [20]:
col='ref_pm2_5'
#odata['2_1']=(odata[col+'_shift_'+str(-2)]-odata[col+'_shift_'+str(1)])/3(p)
#odata['1_2']=(odata[col+'_shift_'+str(-1)]-odata[col+'_shift_'+str(2)])/3(p)

#odata['3_1']=(odata[col+'_shift_'+str(-3)]-odata[col+'_shift_'+str(1)])/4(p)
#odata['1_3']=(odata[col+'_shift_'+str(-1)]-odata[col+'_shift_'+str(3)])/4(p)

#odata['3_2']=(odata[col+'_shift_'+str(-3)]-odata[col+'_shift_'+str(2)])/5
#odata['2_3']=(odata[col+'_shift_'+str(-2)]-odata[col+'_shift_'+str(3)])/5
#odata['-1-2']=(odata[col+'_shift_'+str(1)]-odata[col+'_shift_'+str(2)])
#odata['2f1']=(odata[col+'_shift_'+str(-2)]-odata[col+'_shift_'+str(-1)])#(p)

#no jumps
odata['-1-2']=(odata[col+'_shift_'+str(1)]-odata[col+'_shift_'+str(2)])
odata['-2-3']=(odata[col+'_shift_'+str(2)]-odata[col+'_shift_'+str(3)])
odata['-3-4']=(odata[col+'_shift_'+str(3)]-odata[col+'_shift_'+str(4)])
odata['-1-3']=(odata[col+'_shift_'+str(1)]-odata[col+'_shift_'+str(3)])/2
odata['-1-4']=(odata[col+'_shift_'+str(1)]-odata[col+'_shift_'+str(4)])/3
odata['-2-4']=(odata[col+'_shift_'+str(2)]-odata[col+'_shift_'+str(4)])/2

odata['+1+2']=(odata[col+'_shift_'+str(-2)]-odata[col+'_shift_'+str(-1)])
odata['+2+3']=(odata[col+'_shift_'+str(-3)]-odata[col+'_shift_'+str(-2)])
odata['+3+4']=(odata[col+'_shift_'+str(-4)]-odata[col+'_shift_'+str(-3)])
odata['+1+3']=(odata[col+'_shift_'+str(-3)]-odata[col+'_shift_'+str(-1)])/2
odata['+1+4']=(odata[col+'_shift_'+str(-4)]-odata[col+'_shift_'+str(-1)])/3
odata['+2+4']=(odata[col+'_shift_'+str(-4)]-odata[col+'_shift_'+str(-2)])/2

In [21]:
odata['prox']=data['ref_pm2_5']-data['pm2_5']

In [22]:
cols=['site']
shifts=[i for i in range(1,5)]
for s in shifts:
    odata['lagged_prox'+str(s)]=odata.groupby(cols).prox.shift(s)
    odata['lagged_prox'+str(s)]=odata.groupby(cols)['lagged_prox'+str(s)].ffill()
    odata['lagged_prox'+str(s)].fillna(odata.daily_target_mean-odata.pm2_5,inplace=True)
    
    odata['lagged_prox'+str(-s)]=odata.groupby(cols).prox.shift(-s)
    odata['lagged_prox'+str(-s)]=odata.groupby(cols)['lagged_prox'+str(-s)].bfill()
    odata['lagged_prox'+str(-s)].fillna(odata.daily_target_mean-odata.pm2_5,inplace=True)
    #summing mean ref with lag prox
    odata['from_previous_'+str(s)]=odata['daily_target_mean']+odata['lagged_prox'+str(s)]

    #summing mean ref with lag prox f
    odata['from_previous'+str(-s)]=odata['daily_target_mean']+odata['lagged_prox'+str(-s)]
    
    #summing with lag prox
    odata['from_previous_pm'+str(s)]=odata['pm2_5']+odata['lagged_prox'+str(s)]

    #summing with lag prox f
    odata['from_previous_pm'+str(-s)]=odata['pm2_5']+odata['lagged_prox'+str(-s)]

    odata['from_previous_mean'+str(s)]=(odata['from_previous_pm'+str(s)]+odata['from_previous_pm'+str(-s)])/2

In [23]:
odata['prox10']=data['ref_pm2_5']-data['pm10']

In [24]:
cols=['site']
shifts=[1,2,3,4,5,6]
for s in shifts:
    odata['lagged_prox10'+str(s)]=odata.groupby(cols).prox10.shift(s)
    odata['lagged_prox10'+str(s)]=odata.groupby(cols)['lagged_prox10'+str(s)].ffill()
    odata['lagged_prox10'+str(s)].fillna(odata.daily_target_mean-odata.pm10,inplace=True)
    
    odata['lagged_prox10'+str(-s)]=odata.groupby(cols).prox10.shift(-s)
    odata['lagged_prox10'+str(-s)]=odata.groupby(cols)['lagged_prox10'+str(-s)].bfill()
    odata['lagged_prox10'+str(-s)].fillna(odata.daily_target_mean-odata.pm10,inplace=True)
    #summing mean ref with lag prox
    odata['10from_previous_'+str(s)]=odata['daily_target_mean']+odata['lagged_prox10'+str(s)]

    #summing mean ref with lag prox f
    odata['10from_previous'+str(-s)]=odata['daily_target_mean']+odata['lagged_prox10'+str(-s)]
    
    #summing with lag prox
    odata['10from_previous_pm'+str(s)]=odata['pm10']+odata['lagged_prox10'+str(s)]

    #summing with lag prox f
    odata['10from_previous_pm'+str(-s)]=odata['pm10']+odata['lagged_prox10'+str(-s)]

    odata['10from_previous_mean'+str(s)]=(odata['10from_previous_pm'+str(s)]+odata['10from_previous_pm'+str(-s)])/2

In [25]:
#lagging pm2_5 grouped only by site(not used)
shifts=[1,2,3]
for s in shifts:
    odata['sensor_always_shift'+str(s)]=odata.groupby('site').pm2_5.shift(s)
    odata['sensor_always_shift'+str(s)].fillna(odata.pm2_5,inplace=True)
    odata['sensor_always_shift'+str(-s)]=odata.groupby('site').pm2_5.shift(-s)
    odata['sensor_always_shift'+str(-s)].fillna(odata.pm2_5,inplace=True)
    odata['mean_sensor'+str(s)]=(odata['sensor_always_shift'+str(s)]+odata['sensor_always_shift'+str(-s)])/2

In [26]:
merge_cols=list(data.columns)
data=pd.merge(data,odata,on=merge_cols,how="left")

In [27]:
data.drop('prox',axis=1,inplace=True)

In [28]:
data.drop('prox10',axis=1,inplace=True)

In [32]:
ntrain=data[:train.shape[0]].drop(['ID','src'],axis=1)
ntest=data[train.shape[0]:].drop(['ID','src'],axis=1)

# **Modeling By Site**

In [33]:
from sklearn.model_selection import StratifiedKFold ,KFold
import lightgbm as lgb
import xgboost as xgb

In [34]:
# Install Catboost
!pip install catboost==0.22 --quiet
import catboost as cat
from catboost import CatBoostRegressor, Pool

In [35]:
train=data[:train.shape[0]].drop(['src'],axis=1)
test=data[train.shape[0]:].drop(['src'],axis=1)

In [36]:
target = train['ref_pm2_5']

## 0. Utils

In [37]:
def Train_5Fold_lgbm(X,y,Test,site,kfold,params_lgb) :
  final_preds_per_location = []
  err_cb=[]
  site_oof_preds = np.zeros(len(X))
  for fold,(train_index, test_index) in enumerate(kfold.split(X,y)):
      X_train, X_test = X.values[train_index], X.values[test_index]
      y_train, y_test = y.values[train_index], y.values[test_index]
      
      trn_data = lgb.Dataset(X_train, y_train)
      val_data = lgb.Dataset(X_test, y_test)

      model = lgb.train(params_lgb, trn_data, valid_sets = [trn_data, val_data], verbose_eval=0, early_stopping_rounds = 200)

      preds=model.predict(X_test, num_iteration=model.best_iteration)
      site_oof_preds[test_index] = preds
      err_cb.append(np.sqrt(mean_squared_error(y_test,preds)))
      
      test_pred = model.predict(Test.values, num_iteration=model.best_iteration)
      final_preds_per_location.append(test_pred)
  
  print(f"Site {site} --  Training RMSE :" ,np.mean(err_cb))
  return np.mean(final_preds_per_location,axis=0) , site_oof_preds

# ------------------------------------------------------------------------------------------------------------------------------

def Custom_train_Lgbm() :
  '''SEED The envierment'''
  seed = 0
  random.seed(seed)
  np.random.seed(seed)
  kfold=KFold(n_splits=5, random_state=1901,shuffle=True)
  
  final_preds=[]
  final_ids = []
  train_ids = []
  oof_prediction = []
  for site in tqdm_notebook([0,1,2], leave=False):
    print(60*'-')
    if site ==1 :
      params_lgb = {'objective' :'regression','boosting_type' : 'gbdt','metric': 'rmse' ,
                    'learning_rate' : 0.1,'num_iterations': 1500,'max_depth' :7 ,'num_leaves' : 75,
                    'max_bins': 85,'feature_fraction': 0.8,'bagging_fraction': 0.9,'min_data_in_leaf':30,'reg_lambda' :75}
    else : 
      params_lgb = {'objective' :'regression','boosting_type' : 'gbdt','metric': 'rmse' ,
                    'learning_rate' : 0.1,'num_iterations': 2500,'max_depth' :9 ,'num_leaves' : 81,
                    'feature_fraction': 0.7,'bagging_fraction': 0.9,'min_data_in_leaf':5,'reg_lambda' :50}
    X = train[train['site']==site]  ; train_ids.extend(X['ID'].values)   ; y = X['ref_pm2_5']; X = X[features]
    Test = test[test['site']==site] ; final_ids.extend(Test['ID'].values); Test = Test[features]
    
    preds_per_location, site_oof_preds = Train_5Fold_lgbm(X=X,y=y,Test=Test,site=site,kfold=kfold,params_lgb=params_lgb)
    oof_prediction.extend(site_oof_preds) ; final_preds.extend(preds_per_location)
  
  oof_data = pd.DataFrame() ; oof_data['ID'] = train_ids ; oof_data['OOF'] = np.clip(oof_prediction,a_min=0,a_max=500) 
  preds_data = pd.DataFrame() ; preds_data['ID'] = final_ids ; preds_data['ref_pm2_5'] = np.clip(final_preds,a_min=0,a_max=500) 
  print()
  print(60*'#')
  return oof_data,preds_data

In [38]:
def Train_5Fold_xgb(X,y,Test,site,kfold,params_xgb) :
  final_preds_per_location = []
  err_cb=[]
  site_oof_preds = np.zeros(len(X))
  for fold,(train_index, test_index) in enumerate(kfold.split(X,y)):
      X_train, X_test = X.values[train_index], X.values[test_index]
      y_train, y_test = y.values[train_index], y.values[test_index]
      
      trn_data = xgb.DMatrix(X_train, y_train)
      val_data = xgb.DMatrix(X_test, y_test)
      watchlist = [(trn_data, 'train'), (val_data, 'valid')]

      model = xgb.train(params_xgb, trn_data,10000, evals = watchlist, verbose_eval=0, early_stopping_rounds = 200)

      dX_test = xgb.DMatrix(X_test)
      preds = model.predict(dX_test)
      site_oof_preds[test_index] = preds
      err_cb.append(np.sqrt(mean_squared_error(y_test,preds)))
      
      dTest = xgb.DMatrix(Test.values)
      test_pred = model.predict(dTest)
      final_preds_per_location.append(test_pred)
  
  print(f"Site {site} --  Training RMSE :" ,np.mean(err_cb))
  return np.mean(final_preds_per_location,axis=0) , site_oof_preds

# ------------------------------------------------------------------------------------------------------------------------------

def Custom_train_xgb() :
  '''SEED The envierment'''
  seed = 0
  random.seed(seed)
  np.random.seed(seed)
  kfold=KFold(n_splits=5, random_state=1901,shuffle=True)
  
  final_preds=[]
  final_ids = []
  train_ids = []
  oof_prediction = []
  for site in tqdm_notebook([0,1,2], leave=False):
    print(60*'-')
    if site == 1 :
      params_xgb = {'gpu_id': 0,'objective': 'reg:squarederror','eval_metric': 'rmse','booster': 'gbtree', 
          'n_estimators': 10000,  'max_depth': 7, 'learning_rate': 0.1, 'max_bin': 100,
          'max_leaves': 80,'reg_alpha': 10, 'reg_lambda': 100,'subsample': 0.9 ,'colsample_bytree' : 0.4,
          'silent' : False,}
    else :
      params_xgb = {'objective': 'reg:squarederror','eval_metric': 'rmse','booster': 'gbtree', 
          'n_estimators': 10000,  'max_depth': 6, 'learning_rate': 0.08, 'max_bin': 100,
          'max_leaves': 64,'reg_alpha': 5, 'reg_lambda': 15,'subsample': 0.7 ,'colsample_bytree' : 0.7,
          'silent' : False,}
    X = train[train['site']==site]  ; train_ids.extend(X['ID'].values)   ; y = X['ref_pm2_5']; X = X[features]
    Test = test[test['site']==site] ; final_ids.extend(Test['ID'].values); Test = Test[features]
    
    preds_per_location, site_oof_preds = Train_5Fold_xgb(X=X,y=y,Test=Test,site=site,kfold=kfold,params_xgb=params_xgb)
    oof_prediction.extend(site_oof_preds) ; final_preds.extend(preds_per_location)
  
  oof_data = pd.DataFrame() ; oof_data['ID'] = train_ids ; oof_data['OOF'] = np.clip(oof_prediction,a_min=0,a_max=500) 
  preds_data = pd.DataFrame() ; preds_data['ID'] = final_ids ; preds_data['ref_pm2_5'] = np.clip(final_preds,a_min=0,a_max=500)
   
  print()
  print(60*'#')
  return oof_data,preds_data

In [39]:
def Train_5Fold_catboost_2(X,y,Test,site,kfold) :
  final_preds_per_location = []
  err_cb=[]
  site_oof_preds = np.zeros(len(X))
  for fold,(train_index, test_index) in enumerate(kfold.split(X,y)):
      X_train, X_test = X.values[train_index], X.values[test_index]
      y_train, y_test = y.values[train_index], y.values[test_index]

      model = CatBoostRegressor(**cat_params) 
      model.fit(X_train,y_train,eval_set=[(X_test, y_test)],early_stopping_rounds=20,verbose=0)
      preds=model.predict(X_test)
      site_oof_preds[test_index] = preds
      err_cb.append(np.sqrt(mean_squared_error(y_test,preds)))
      
      test_pred = model.predict(Test.values)
      final_preds_per_location.append(test_pred)
  
  print(f"Site {site} --  Training RMSE :" ,np.mean(err_cb))
  return np.mean(final_preds_per_location,axis=0) , site_oof_preds

# ------------------------------------------------------------------------------------------------------------------------------ 

def Custom_train_catboost_2(features) :
  kfold=KFold(n_splits=5, random_state=1901,shuffle=True)
  
  final_preds=[]
  final_ids = []
  train_ids = []
  oof_prediction = []
  for site in tqdm_notebook([0,1,2], leave=False):
    print(60*'-')
    X = train[train['site']==site]  ; train_ids.extend(X['ID'].values)   ; y = X['ref_pm2_5']; X = X[features]
    Test = test[test['site']==site] ; final_ids.extend(Test['ID'].values); Test = Test[features]
    
    preds_per_location, site_oof_preds = Train_5Fold_catboost_2(X=X,y=y,Test=Test,site=site,kfold=kfold)
    oof_prediction.extend(site_oof_preds) ; final_preds.extend(preds_per_location)
  
  oof_data = pd.DataFrame() ; oof_data['ID'] = train_ids ; oof_data['OOF'] = np.clip(oof_prediction,a_min=0,a_max=500) 
  preds_data = pd.DataFrame() ; preds_data['ID'] = final_ids ; preds_data['ref_pm2_5'] = np.clip(final_preds,a_min=0,a_max=500)
  
  print()
  print(60*'#')
  return oof_data,preds_data

In [40]:
def Train_5Fold_Stacking(X,y,Test,kfold) :
  final_preds = [] ; err_cb = []
  oof_stack = np.zeros(len(X)) ;
  for fold,(train_index, test_index) in enumerate(kfold.split(X,y)):
      X_train, X_test = X.values[train_index], X.values[test_index]
      y_train, y_test = y.values[train_index], y.values[test_index]

      model = Ridge(alpha=0.01,random_state=42)
      model.fit(X_train,y_train)
      preds=model.predict(X_test)
      preds = np.clip(preds,a_min=0,a_max=500)
      oof_stack[test_index] = preds
      err_cb.append(mean_squared_error(y_test,preds,squared=False))
      
      test_pred = model.predict(Test.values)
      final_preds.append(test_pred)
  
  print(2*'--------------------------------------')
  print(f"Stacking RMSE :" ,np.mean(err_cb))
  return oof_stack,np.mean(final_preds,axis=0)

## 1. LGBM

In [41]:
features = [x for x in train.columns 
            if x not in ['ID','ref_pm2_5','created_at',]]

In [42]:
oof_data_lgb,final_preds_lgb = Custom_train_Lgbm() 

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

------------------------------------------------------------
Site 0 --  Training RMSE : 8.220216918335204
------------------------------------------------------------
Site 1 --  Training RMSE : 20.541915725445907
------------------------------------------------------------
Site 2 --  Training RMSE : 9.373808085991936

############################################################


In [43]:
train_oof_data = pd.merge(train[['ID','ref_pm2_5']], oof_data_lgb,on='ID',how='left')
print('OOF RMSE : ',np.sqrt(mean_squared_error(train_oof_data['ref_pm2_5'],train_oof_data['OOF'])))

OOF RMSE :  11.886558086359585


## 2. xgb

In [44]:
features = [x for x in train.columns 
            if x not in ['ID', 'created_at','ref_pm2_5']]

In [45]:
oof_data_xgb,final_preds_xgb = Custom_train_xgb()  # 20.56

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

------------------------------------------------------------
Site 0 --  Training RMSE : 8.314691898541728
------------------------------------------------------------
Site 1 --  Training RMSE : 20.566470849764762
------------------------------------------------------------
Site 2 --  Training RMSE : 9.452009633332592

############################################################


In [46]:
train_oof_data = pd.merge(train[['ID','ref_pm2_5']], oof_data_xgb,on='ID',how='left')
print('OOF RMSE : ',np.sqrt(mean_squared_error(train_oof_data['ref_pm2_5'],train_oof_data['OOF'])))

OOF RMSE :  11.926565662452614


## 3. CATBOOST_2

In [47]:
features = [x for x in train.columns 
            if x not in ['ID', 'created_at','ref_pm2_5',]]
cat_params = {'n_estimators':5000,'eval_metric':'RMSE','learning_rate':0.05, 'random_seed': 0,
                                    'use_best_model':True,'depth':10}

In [48]:
oof_data_catboost_2,final_preds_catboost_2 = Custom_train_catboost_2(features=features) 

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

------------------------------------------------------------
Site 0 --  Training RMSE : 8.159656138710158
------------------------------------------------------------
Site 1 --  Training RMSE : 20.30412646838159
------------------------------------------------------------
Site 2 --  Training RMSE : 9.300702141360285

############################################################


In [49]:
train_oof_data = pd.merge(train[['ID','ref_pm2_5']], oof_data_catboost_2,on='ID',how='left')
print('OOF RMSE : ',np.sqrt(mean_squared_error(train_oof_data['ref_pm2_5'],train_oof_data['OOF'])))

OOF RMSE :  11.750516221958026


## 5. Stacking

In [50]:
from sklearn.linear_model import Ridge ,LinearRegression

In [51]:
stacking_train = oof_data_catboost_2.copy() ; del stacking_train['OOF'] ;
stacking_train['Hela_App2_preds_cat_2'] =  oof_data_catboost_2['OOF'] ;
stacking_train['Hela_App2_preds_xgb'] =  oof_data_xgb['OOF'] ;
stacking_train['Hela_App2_preds_lgb'] =  oof_data_lgb['OOF']
stacking_train = pd.merge(train[['ID','ref_pm2_5']], stacking_train,on='ID',how='left')
stacking_train.rename({'ref_pm2_5': 'target'})

stacking_test = final_preds_catboost_2.copy() ; del stacking_test['ref_pm2_5']
stacking_test['Hela_App2_preds_cat_2'] =  final_preds_catboost_2['ref_pm2_5'] ; 
stacking_test['Hela_App2_preds_xgb'] =  final_preds_xgb['ref_pm2_5'] ;
stacking_test['Hela_App2_preds_lgb'] =  final_preds_lgb['ref_pm2_5']

In [52]:
cols = ['Hela_App2_preds_cat_2','Hela_App2_preds_xgb', 'Hela_App2_preds_lgb']

X , y , Test = stacking_train[cols] , stacking_train['ref_pm2_5'] , stacking_test[cols]
KFOLD = KFold(n_splits=5,random_state=1901,shuffle=True)

oof_stack,stack_preds = Train_5Fold_Stacking(X=X,y=y,Test=Test,kfold=KFOLD)

----------------------------------------------------------------------------
Stacking RMSE : 11.397878940156932


In [55]:
print('Stacking OOF',mean_squared_error(y,oof_stack,squared=False))

Stacking OOF 11.630116279544259


In [54]:
# save stacking data's 
stacking_train.to_csv('Hela_App2_stacking_train.csv',index=False)
stacking_test.to_csv('Hela_App2_stacking_test.csv',index=False)