# **Setup**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!cp "/content/drive/MyDrive/AirQo_Low_Cost_Air_Quality_Monitor_Calibration/data/Train.csv" .
!cp "/content/drive/MyDrive/AirQo_Low_Cost_Air_Quality_Monitor_Calibration/data/Test.csv" .

In [3]:
import os
import sys
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import joblib

from tqdm.notebook import tqdm
from tqdm import tqdm_notebook


import warnings
warnings.simplefilter('ignore')

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error 

# **Utils**

In [4]:
# Generating feature interactions between PM features only 
def add_div_interacts(df, inter_cols):   
    def apply_interacts(x, inter_cols):
        cols = [x + '_div_' + c for c in inter_cols[inter_cols.index(x)+1:]]
        interacts_df[cols] = pd.concat([df[x] / (df[c]+1e-15) for c in inter_cols[inter_cols.index(x)+1:]], axis=1)
    
    interacts_df = pd.DataFrame()
    _ = df[inter_cols[:-1]].apply(lambda x: apply_interacts(x.name, inter_cols))
    df = pd.concat([df, interacts_df], axis=1)
    return df
    
def create_interactions(data_r,cols_1) :
  for num_cols in tqdm_notebook([cols_1], leave = False):
      data_r = add_div_interacts(data_r, num_cols)
  return data_r

In [5]:
def process(train,test) :
  data = pd.concat([train,test]).reset_index(drop=True)
  
  # time features 
  data['created_at'] = pd.to_datetime(data['created_at'])
  data['year'] = data['created_at'].dt.year
  data['year'] = data['year'].astype(float)
  data['month'] = data['created_at'].dt.month
  data['day'] = data['created_at'].dt.day
  data['weekday'] = data['created_at'].dt.weekday
  data['weekofyear'] = data['created_at'].dt.weekofyear
  data['hour'] = data['created_at'].dt.hour

  # cyclic time features
  data['day_cos'] = np.cos(data['day'])
  data['day_sin'] = np.sin(data['day']) 

  data['month_cos'] = np.cos(data['month'])
  data['month_sin'] = np.sin(data['month']) 

  data['hour_cos'] = np.cos(data['hour'])
  data['hour_sin'] = np.sin(data['hour']) 

  data['weekday_cos'] = np.cos(data['weekday'])
  data['weekday_sin'] = np.sin(data['weekday']) 

  data['weekofyear_cos'] = np.cos(data['weekofyear'])
  data['weekofyear_sin'] = np.sin(data['weekofyear']) 
  
  # combination between time features
  data['day_hour'] = data['day'].astype(str) + '-' + data['hour'].astype(str)
  data['month_day'] = data['month'].astype(str) + '-' + data['day'].astype(str)

  from sklearn.preprocessing import LabelEncoder
  LE = LabelEncoder()
  LE_cols = ['site','day_hour','month_day']
  for le_col in LE_cols :
    data[le_col] = LE.fit_transform(data[le_col])

  # Generating feature interactions between PM features only 
  data = create_interactions(data, ['pm2_5','pm10','s2_pm2_5','s2_pm10'])

  # Get our New Train,Test
  train = data[data['ID'].isin(train['ID'].values)]
  test = data[~data['ID'].isin(train['ID'].values)]

  return train, test

# **Load Data - PreProcess**

In [6]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')

In [7]:
train, test= process(train,test)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))



# **Modeling**

In [8]:
from sklearn.model_selection import StratifiedKFold ,KFold
import lightgbm as lgb
import xgboost as xgb

In [9]:
# Install Catboost
!pip install catboost==0.22 --quiet
import catboost as cat
from catboost import CatBoostRegressor, Pool

[K     |████████████████████████████████| 64.4MB 55kB/s 
[?25h

In [10]:
target = train['ref_pm2_5']

## 0. Utils

In [11]:
def Train_5Fold_lgbm(X,y,Test,site,kfold) :
  final_preds_per_location = []
  err_cb=[]
  site_oof_preds = np.zeros(len(X))
  for fold,(train_index, test_index) in enumerate(kfold.split(X,y)):
      X_train, X_test = X.values[train_index], X.values[test_index]
      y_train, y_test = y.values[train_index], y.values[test_index]
      
      trn_data = lgb.Dataset(X_train, y_train)
      val_data = lgb.Dataset(X_test, y_test)

      model = lgb.train(params_lgb, trn_data, valid_sets = [trn_data, val_data], verbose_eval=0, early_stopping_rounds = 200)

      preds=model.predict(X_test, num_iteration=model.best_iteration)
      site_oof_preds[test_index] = preds
      err_cb.append(np.sqrt(mean_squared_error(y_test,preds)))
      
      test_pred = model.predict(Test.values, num_iteration=model.best_iteration)
      final_preds_per_location.append(test_pred)
  
  print(f"Site {site} --  Training RMSE :" ,np.mean(err_cb))
  return np.mean(final_preds_per_location,axis=0) , site_oof_preds

# ------------------------------------------------------------------------------------------------------------------------------

def Custom_train_Lgbm() :
  '''SEED The envierment'''
  seed = 0
  random.seed(seed)
  np.random.seed(seed)
  kfold=KFold(n_splits=5, random_state=1901,shuffle=True)
  
  final_preds=[]
  final_ids = []
  train_ids = []
  oof_prediction = []
  for site in tqdm_notebook([0,1,2], leave=False):
    print(60*'-')
    X = train[train['site']==site]  ; train_ids.extend(X['ID'].values)   ; y = X['ref_pm2_5']; X = X[features]
    Test = test[test['site']==site] ; final_ids.extend(Test['ID'].values); Test = Test[features]
    
    preds_per_location, site_oof_preds = Train_5Fold_lgbm(X=X,y=y,Test=Test,site=site,kfold=kfold)
    oof_prediction.extend(site_oof_preds) ; final_preds.extend(preds_per_location)
  
  oof_data = pd.DataFrame() ; oof_data['ID'] = train_ids ; oof_data['OOF'] = np.clip(oof_prediction,a_min=0,a_max=500) 
  preds_data = pd.DataFrame() ; preds_data['ID'] = final_ids ; preds_data['ref_pm2_5'] = np.clip(final_preds,a_min=0,a_max=500) 
  print()
  print(60*'#')
  return oof_data,preds_data

In [12]:
def Train_5Fold_xgb(X,y,Test,site,kfold) :
  final_preds_per_location = []
  err_cb=[]
  site_oof_preds = np.zeros(len(X))
  for fold,(train_index, test_index) in enumerate(kfold.split(X,y)):
      X_train, X_test = X.values[train_index], X.values[test_index]
      y_train, y_test = y.values[train_index], y.values[test_index]
      
      trn_data = xgb.DMatrix(X_train, y_train)
      val_data = xgb.DMatrix(X_test, y_test)
      watchlist = [(trn_data, 'train'), (val_data, 'valid')]

      model = xgb.train(params_xgb, trn_data,10000, evals = watchlist, verbose_eval=0, early_stopping_rounds = 200)

      dX_test = xgb.DMatrix(X_test)
      preds = model.predict(dX_test)
      site_oof_preds[test_index] = preds
      err_cb.append(np.sqrt(mean_squared_error(y_test,preds)))
      
      dTest = xgb.DMatrix(Test.values)
      test_pred = model.predict(dTest)
      final_preds_per_location.append(test_pred)
  
  print(f"Site {site} --  Training RMSE :" ,np.mean(err_cb))
  return np.mean(final_preds_per_location,axis=0) , site_oof_preds

# ------------------------------------------------------------------------------------------------------------------------------

def Custom_train_xgb() :
  '''SEED The envierment'''
  seed = 0
  random.seed(seed)
  np.random.seed(seed)
  kfold=KFold(n_splits=5, random_state=1901,shuffle=True)
  
  final_preds=[]
  final_ids = []
  train_ids = []
  oof_prediction = []
  for site in tqdm_notebook([0,1,2], leave=False):
    print(60*'-')
    X = train[train['site']==site]  ; train_ids.extend(X['ID'].values)   ; y = X['ref_pm2_5']; X = X[features]
    Test = test[test['site']==site] ; final_ids.extend(Test['ID'].values); Test = Test[features]
    
    preds_per_location, site_oof_preds = Train_5Fold_xgb(X=X,y=y,Test=Test,site=site,kfold=kfold)
    oof_prediction.extend(site_oof_preds) ; final_preds.extend(preds_per_location)
  
  oof_data = pd.DataFrame() ; oof_data['ID'] = train_ids ; oof_data['OOF'] = np.clip(oof_prediction,a_min=0,a_max=500) 
  preds_data = pd.DataFrame() ; preds_data['ID'] = final_ids ; preds_data['ref_pm2_5'] = np.clip(final_preds,a_min=0,a_max=500)
   
  print()
  print(60*'#')
  return oof_data,preds_data

In [13]:
def Train_5Fold_catboost_1(X,y,Test,site,kfold) :
  final_preds_per_location = []
  err_cb=[]
  site_oof_preds = np.zeros(len(X))
  for fold,(train_index, test_index) in enumerate(kfold.split(X,y)):
      X_train, X_test = X.values[train_index], X.values[test_index]
      y_train, y_test = y.values[train_index], y.values[test_index]
      
      model = CatBoostRegressor(n_estimators=5000,eval_metric='RMSE',learning_rate=0.05, random_seed= 0,
                                    use_best_model=True, ) 
      model.fit(X_train,y_train,eval_set=[(X_test, y_test)],early_stopping_rounds=20,verbose=0)

      preds=model.predict(X_test)
      site_oof_preds[test_index] = preds
      err_cb.append(np.sqrt(mean_squared_error(y_test,preds)))
      
      test_pred = model.predict(Test.values)
      final_preds_per_location.append(test_pred)
  
  print(f"Site {site} --  Training RMSE :" ,np.mean(err_cb))
  return np.mean(final_preds_per_location,axis=0) , site_oof_preds
  
# ------------------------------------------------------------------------------------------------------------------------------ 

def Custom_train_catboost_1(features) :
  kfold=KFold(n_splits=5, random_state=1901,shuffle=True)
  
  final_preds=[]
  final_ids = []
  train_ids = []
  oof_prediction = []
  for site in tqdm_notebook([0,1,2], leave=False):
    print(60*'-')
    X = train[train['site']==site]  ; train_ids.extend(X['ID'].values)   ; y = X['ref_pm2_5']; X = X[features]
    Test = test[test['site']==site] ; final_ids.extend(Test['ID'].values); Test = Test[features]
    
    preds_per_location, site_oof_preds = Train_5Fold_catboost_1(X=X,y=y,Test=Test,site=site,kfold=kfold)
    oof_prediction.extend(site_oof_preds) ; final_preds.extend(preds_per_location)
  
  oof_data = pd.DataFrame() ; oof_data['ID'] = train_ids ; oof_data['OOF'] = np.clip(oof_prediction,a_min=0,a_max=500) 
  preds_data = pd.DataFrame() ; preds_data['ID'] = final_ids ; preds_data['ref_pm2_5'] = np.clip(final_preds,a_min=0,a_max=500)
  
  print()
  print(60*'#')
  return oof_data,preds_data

In [14]:
def Train_5Fold_catboost_2(X,y,Test,site,kfold) :
  final_preds_per_location = []
  err_cb=[]
  site_oof_preds = np.zeros(len(X))
  for fold,(train_index, test_index) in enumerate(kfold.split(X,y)):
      X_train, X_test = X.values[train_index], X.values[test_index]
      y_train, y_test = y.values[train_index], y.values[test_index]

      model = CatBoostRegressor(n_estimators=5000,eval_metric='RMSE',learning_rate=0.05, random_seed= 0,
                                    use_best_model=True, ) 
      model.fit(X_train,y_train,eval_set=[(X_test, y_test)],early_stopping_rounds=20,verbose=0)
      preds=model.predict(X_test)
      site_oof_preds[test_index] = preds
      err_cb.append(np.sqrt(mean_squared_error(y_test,preds)))
      
      test_pred = model.predict(Test.values)
      final_preds_per_location.append(test_pred)
  
  print(f"Site {site} --  Training RMSE :" ,np.mean(err_cb))
  return np.mean(final_preds_per_location,axis=0) , site_oof_preds

# ------------------------------------------------------------------------------------------------------------------------------ 

def Custom_train_catboost_2(features) :
  kfold=KFold(n_splits=5, random_state=1901,shuffle=True)
  
  final_preds=[]
  final_ids = []
  train_ids = []
  oof_prediction = []
  for site in tqdm_notebook([0,1,2], leave=False):
    print(60*'-')
    X = train[train['site']==site]  ; train_ids.extend(X['ID'].values)   ; y = X['ref_pm2_5']; X = X[features]
    Test = test[test['site']==site] ; final_ids.extend(Test['ID'].values); Test = Test[features]
    
    preds_per_location, site_oof_preds = Train_5Fold_catboost_2(X=X,y=y,Test=Test,site=site,kfold=kfold)
    oof_prediction.extend(site_oof_preds) ; final_preds.extend(preds_per_location)
  
  oof_data = pd.DataFrame() ; oof_data['ID'] = train_ids ; oof_data['OOF'] = np.clip(oof_prediction,a_min=0,a_max=500) 
  preds_data = pd.DataFrame() ; preds_data['ID'] = final_ids ; preds_data['ref_pm2_5'] = np.clip(final_preds,a_min=0,a_max=500)
  
  print()
  print(60*'#')
  return oof_data,preds_data

In [15]:
def Train_5Fold_Stacking(X,y,Test,kfold) :
  final_preds = [] ; err_cb = []
  oof_stack = np.zeros(len(X)) ;
  for fold,(train_index, test_index) in enumerate(kfold.split(X,y)):
      X_train, X_test = X.values[train_index], X.values[test_index]
      y_train, y_test = y.values[train_index], y.values[test_index]

      model = Ridge(alpha=0.01,random_state=42)
      model.fit(X_train,y_train)
      preds=model.predict(X_test)
      preds = np.clip(preds,a_min=0,a_max=500)
      oof_stack[test_index] = preds
      err_cb.append(mean_squared_error(y_test,preds,squared=False))
      
      test_pred = model.predict(Test.values)
      final_preds.append(test_pred)
  
  print(2*'--------------------------------------')
  print(f"Stacking RMSE :" ,np.mean(err_cb))
  return oof_stack,np.mean(final_preds,axis=0)

## 1. LGBM

In [16]:
features = [x for x in train.columns 
            if x not in ['ID', 'created_at','dist_major_road' , 'greenness' , 'population','lat' , 'altitude','landform_90m','landform_270m',
                         'ref_pm2_5','long','pm10_div_s2_pm10','pm2_5_div_s2_pm2_5','pm10_div_s2_pm2_5','pm2_5_div_s2_pm10','month_day',]]
                                                  
params_lgb = {'objective' :'regression','boosting_type' : 'gbdt','metric': 'rmse' ,
              'learning_rate' : 0.1,'num_iterations': 2500,'max_depth' :9 ,'num_leaves' : 81,
              'feature_fraction': 0.7,'bagging_fraction': 0.9,'min_data_in_leaf':5,'reg_lambda' :50}

In [17]:
oof_data_lgb,final_preds_lgb = Custom_train_Lgbm()  # 20.246212898756532

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

------------------------------------------------------------
Site 0 --  Training RMSE : 8.246766743705319
------------------------------------------------------------
Site 1 --  Training RMSE : 20.246212898756532
------------------------------------------------------------
Site 2 --  Training RMSE : 9.845985263811599

############################################################


In [18]:
train_oof_data = pd.merge(train[['ID','ref_pm2_5']], oof_data_lgb,on='ID',how='left')
print('OOF RMSE : ',np.sqrt(mean_squared_error(train_oof_data['ref_pm2_5'],train_oof_data['OOF'])))

OOF RMSE :  11.996131962768287


## 2. xgb

In [19]:
features = [x for x in train.columns 
            if x not in ['ID', 'created_at','dist_major_road' , 'greenness' , 'population','lat' , 'altitude','landform_90m','landform_270m',
                         'ref_pm2_5','long','pm10_div_s2_pm10','pm2_5_div_s2_pm2_5','pm10_div_s2_pm2_5','pm2_5_div_s2_pm10','month_day',]]

params_xgb = {'objective': 'reg:squarederror','eval_metric': 'rmse','booster': 'gbtree', 
          'n_estimators': 10000,  'max_depth': 6, 'learning_rate': 0.08, 'max_bin': 100,
          'max_leaves': 64,'reg_alpha': 5, 'reg_lambda': 15,'subsample': 0.7 ,'colsample_bytree' : 0.7,
          'silent' : False,}

In [20]:
oof_data_xgb,final_preds_xgb = Custom_train_xgb()  # 20.507548323786573

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

------------------------------------------------------------
Site 0 --  Training RMSE : 8.120389948580696
------------------------------------------------------------
Site 1 --  Training RMSE : 20.425651203290435
------------------------------------------------------------
Site 2 --  Training RMSE : 9.769254383275811

############################################################


In [21]:
train_oof_data = pd.merge(train[['ID','ref_pm2_5']], oof_data_xgb,on='ID',how='left')
print('OOF RMSE : ',np.sqrt(mean_squared_error(train_oof_data['ref_pm2_5'],train_oof_data['OOF'])))

OOF RMSE :  11.987335628280457


## 3. CATBOOST_1

In [22]:
features = ['site', 'pm2_5', 'pm10', 's2_pm2_5', 's2_pm10', 'humidity', 'temp', 'long',
            'year', 'month', 'day', 'weekday', 'hour','weekofyear','day_hour']

In [23]:
oof_data_catboost_1,final_preds_catboost_1 = Custom_train_catboost_1(features=features) 

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

------------------------------------------------------------
Site 0 --  Training RMSE : 8.21782882822482
------------------------------------------------------------
Site 1 --  Training RMSE : 20.44675281291432
------------------------------------------------------------
Site 2 --  Training RMSE : 9.822242475827856

############################################################


In [24]:
train_oof_data = pd.merge(train[['ID','ref_pm2_5']], oof_data_catboost_1,on='ID',how='left')
print('OOF RMSE : ',np.sqrt(mean_squared_error(train_oof_data['ref_pm2_5'],train_oof_data['OOF'])))

OOF RMSE :  12.02234751287189


## 4. CATBOOST_2

In [25]:
features = [x for x in train.columns 
            if x not in ['ID', 'created_at','dist_major_road' , 'greenness' , 'population','lat' , 'altitude','landform_90m','landform_270m',
                         'ref_pm2_5','long','pm10_div_s2_pm10','pm2_5_div_s2_pm2_5','pm10_div_s2_pm2_5','pm2_5_div_s2_pm10','month_day',]]

In [26]:
oof_data_catboost_2,final_preds_catboost_2 = Custom_train_catboost_2(features=features) # 19.956495851102694

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

------------------------------------------------------------
Site 0 --  Training RMSE : 8.025612559316055
------------------------------------------------------------
Site 1 --  Training RMSE : 19.956495851102694
------------------------------------------------------------
Site 2 --  Training RMSE : 9.65985380499935

############################################################


In [27]:
train_oof_data = pd.merge(train[['ID','ref_pm2_5']], oof_data_catboost_2,on='ID',how='left')
print('OOF RMSE : ',np.sqrt(mean_squared_error(train_oof_data['ref_pm2_5'],train_oof_data['OOF'])))

OOF RMSE :  11.784461808774894


## 5. Stacking

In [28]:
from sklearn.linear_model import Ridge ,LinearRegression

In [29]:
stacking_train = oof_data_catboost_2.copy() ; del stacking_train['OOF'] ;
stacking_train['azer_preds_cat_2'] =  oof_data_catboost_2['OOF'] ;
stacking_train['azer_preds_cat_1'] =  oof_data_catboost_1['OOF']
stacking_train['azer_preds_xgb'] =  oof_data_xgb['OOF'] ;
stacking_train['azer_preds_lgb'] =  oof_data_lgb['OOF']
stacking_train = pd.merge(train[['ID','ref_pm2_5']], stacking_train,on='ID',how='left')
stacking_train.rename({'ref_pm2_5': 'target'})

stacking_test = final_preds_catboost_2.copy() ; del stacking_test['ref_pm2_5']
stacking_test['azer_preds_cat_2'] =  final_preds_catboost_2['ref_pm2_5'] ; 
stacking_test['azer_preds_cat_1'] =  final_preds_catboost_1['ref_pm2_5']
stacking_test['azer_preds_xgb'] =  final_preds_xgb['ref_pm2_5'] ;
stacking_test['azer_preds_lgb'] =  final_preds_lgb['ref_pm2_5']

In [30]:
cols = ['azer_preds_cat_2','azer_preds_cat_1', 'azer_preds_xgb', 'azer_preds_lgb']

X , y , Test = stacking_train[cols] , stacking_train['ref_pm2_5'] , stacking_test[cols]
KFOLD = KFold(n_splits=5,random_state=1901,shuffle=True)

oof_stack,stack_preds = Train_5Fold_Stacking(X=X,y=y,Test=Test,kfold=KFOLD)

----------------------------------------------------------------------------
Stacking RMSE : 11.466119699621165


In [33]:
print('Stacking OOF',mean_squared_error(y,oof_stack,squared=False))

Stacking OOF 11.66078552317838


In [32]:
# save stacking data's 
stacking_train.to_csv('azer_stacking_train.csv',index=False)
stacking_test.to_csv('azer_stacking_test.csv',index=False)