# Loading Libraries

In [1]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/5a/41/24e14322b9986cf72a8763e0a0a69cc256cf963cf9502c8f0044a62c1ae8/catboost-0.26-cp37-none-manylinux1_x86_64.whl (69.2MB)
[K     |████████████████████████████████| 69.2MB 44kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26


In [2]:
import os, sys, gc, warnings, random
import numpy as np
import pandas as pd
from sklearn import preprocessing
%matplotlib inline
from sklearn.model_selection import RandomizedSearchCV, KFold
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor, Pool

In [4]:
# Locations of the competition files -- edit these two paths to match your setup.
train_path = '/content/Train.csv'   # path of "Train.csv"
test_path = '/content/Test.csv'     # path of "Test.csv"

# Read both files into memory.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Preprocessing & Feature Engineering

In [5]:
def process(data):
  """Return a new feature frame built from the stacked train/test data.

  The input must contain the columns 'ID', 'created_at', 'site', 'ref_pm2_5',
  'temp' and 'altitude'. The caller's frame is NOT modified.

  Steps, in order (later aggregates read the already-overwritten 'leakage'):
    * drop 'ID' and rename the target 'ref_pm2_5' to 'leakage';
    * derive year / month / day / hour from 'created_at';
    * 'leakage11'      = mean target per (site, year, month);
    * 'leakage'        = mean target per (site, year, month, day);
    * 'h_mean'         = mean of the daily 'leakage' per (site, hour);
    * 'h_mean_monthly' = mean of the daily 'leakage' per (site, month, hour);
    * 'd_mean_all'     = mean of the daily 'leakage' per (year, month, day);
    * shift 'temp' by +273 and derive 'pressure' from altitude and temperature;
    * drop 'created_at' and replace 'site' by integer codes (sorted order).

  Means skip NaN targets, so rows without a target receive the mean of the
  rows of their group that do have one.

  Raises:
    KeyError: if one of the required columns is missing.
  """
  # drop()/rename() return new frames, so the caller's DataFrame stays untouched
  # (the previous inplace=True calls silently stripped 'ID' from the argument).
  data = data.drop(columns=['ID']).rename(columns={'ref_pm2_5': 'leakage'})

  # A unit-less np.datetime64 dtype is rejected by pandas >= 2.0; parse explicitly.
  # Offsets are normalised to UTC and then dropped so the result is a naive
  # timestamp (presumably what the former astype cast produced -- TODO confirm
  # if 'created_at' carries UTC offsets).
  data['created_at'] = pd.to_datetime(data['created_at'], utc=True).dt.tz_localize(None)
  data['year'] = data['created_at'].dt.year
  data['month'] = data['created_at'].dt.month
  data['day'] = data['created_at'].dt.day
  data['hour'] = data['created_at'].dt.hour

  def group_mean(keys):
    # Aggregate only the 'leakage' column: transforming every column with a
    # lambda is slow and fails on non-numeric columns in recent pandas.
    return data.groupby(keys)['leakage'].transform('mean')

  #leakage11 is the mean of the target (ref_pm2_5 that we named leakage) for every site, year and month
  data['leakage11'] = group_mean(['site', 'year', 'month'])

  #leakage is now the mean of the target (ref_pm2_5) for every site, year, month and day
  data['leakage'] = group_mean(['site', 'year', 'month', 'day'])

  #h_mean is the mean of "leakage" for every site and hour
  data['h_mean'] = group_mean(['site', 'hour'])

  #h_mean_monthly is the mean of "leakage" for every site, month and hour
  data['h_mean_monthly'] = group_mean(['site', 'month', 'hour'])

  #d_mean_all is the mean of "leakage" for every year, month and day
  data['d_mean_all'] = group_mean(['year', 'month', 'day'])

  data = data.astype({'leakage': 'float64', 'leakage11': 'float64'})

  #we change the measurement unit of the temperature from Celsius to Kelvin
  # NOTE(review): the exact offset is 273.15; 273 is kept so features are unchanged.
  data['temp'] = data['temp'] + 273

  #we determine the values of the pressure using the altitude and the temperature (using a physics formula to calculate the pressure)
  sea_level_pressure = 101325   # Pa
  molar_mass_air = 28.966e-3    # kg/mol
  gas_constant = 8.314          # J/(mol*K)
  # NOTE(review): standard gravity is 9.80665 m/s^2; 9.05 looks like a typo but is
  # kept because changing it would alter the feature values -- confirm.
  gravity = 9.05
  data['pressure'] = sea_level_pressure * (
      np.exp(-molar_mass_air * gravity * data['altitude'] / (gas_constant * data['temp'])))

  data = data.drop(columns=['created_at'])

  #we apply label encoding to "site"
  # factorize(sort=True) yields codes 0..k-1 in sorted order of the site values;
  # a missing site becomes -1.
  data['site'] = pd.factorize(data['site'], sort=True)[0]
  return data

### Lag Features

1. lagb refers to "Lag before"
1. lagb function adds the following features: for each row,


>*   **t_1**:  the nearest **previous** non-NaN (i.e. training) value of the modelled target **(pm10 - ref_pm2_5)**: it is taken from the training record of the same site that was recorded just **before** this row (earlier on the same day, otherwise the last record of the previous day of the same month). If no such record exists, t_1 and pm10t_1 are both set to 69.
>*  **pm10t_1**:  the value of pm10 of the record that t_1 was taken from.
>*  **difft_1**:  the difference **(pm10t_1 - t_1)**, which equals the **ref_pm2_5** value of that nearest **previous** record.
>*  **tm**:  the average of t_1 and t__1, so `laga` has to be applied before `lagb`.

In [6]:
def lagb(te,train1):
  """Add "lag before" features to a copy of `te` and return it.

  For every row of `te` the reference frame `train1` is searched for a row with
  the same site, year and month:
    1. on the same day with a strictly smaller hour -> the one with the largest hour;
    2. otherwise on day - 1                          -> the one with the largest hour;
    3. otherwise both values fall back to 69.
  If several reference rows share the selected hour, the one that appears last
  in `train1` is used.

  Added columns:
    t_1      'ref' of the selected reference row
    pm10t_1  'pm10' of the selected reference row
    tm       0.5 * (t_1 + t__1)  -- needs the 't__1' column created by laga
    difft_1  pm10t_1 - t_1

  Args:
    te: frame with 'site', 'year', 'month', 'day', 'hour' and 't__1'.
    train1: frame with 'site', 'year', 'month', 'day', 'hour', 'ref', 'pm10'.

  Raises:
    KeyError: if a required column is missing. The previous bare `except`
      turned such errors into the fallback value for every row.
  """
  if 't__1' not in te.columns:
    # Fail before the row loop instead of after it.
    raise KeyError("lagb needs the 't__1' column; apply laga to the frame first")

  fallback = 69  # value used when no earlier reference row exists
  te = te.copy()  # never write into the caller's frame (often a slice of a bigger one)

  # Sort once; within each day the arrays are then ordered by ascending hour,
  # so the last matching entry is the most recent one.
  ordered = train1.sort_values(by='hour', kind='stable')
  per_day = {
      key: (grp['hour'].to_numpy(), grp['ref'].to_numpy(), grp['pm10'].to_numpy())
      for key, grp in ordered.groupby(['site', 'year', 'month', 'day'], sort=False)
  }

  n_rows = len(te)
  t_prev = np.full(n_rows, float(fallback))
  pm10_prev = np.full(n_rows, float(fallback))

  # Iterate positionally: the index labels of `te` are irrelevant here.
  rows = zip(te['site'].to_numpy(), te['year'].to_numpy(), te['month'].to_numpy(),
             te['day'].to_numpy(), te['hour'].to_numpy())
  for pos, (site, year, month, day, hour) in enumerate(rows):
    same_day = per_day.get((site, year, month, day))
    if same_day is not None:
      hours, refs, pm10s = same_day
      earlier = np.flatnonzero(hours < hour)
      if earlier.size:
        t_prev[pos] = refs[earlier[-1]]
        pm10_prev[pos] = pm10s[earlier[-1]]
        continue

    # Nothing earlier on the same day: use the last record of the previous day.
    # As before, this does not cross month boundaries.
    day_before = per_day.get((site, year, month, day - 1))
    if day_before is not None:
      _, refs, pm10s = day_before
      t_prev[pos] = refs[-1]
      pm10_prev[pos] = pm10s[-1]

  te['t_1'] = t_prev
  te['pm10t_1'] = pm10_prev
  te['tm'] = 0.5 * (te['t_1'] + te['t__1'])
  te['difft_1'] = te['pm10t_1'] - te['t_1']
  return te

1. laga refers to "Lag after"
1. laga function adds the following features: for each row,


>*   **t__1**:  the nearest **next** non-NaN (i.e. training) value of the modelled target **(pm10 - ref_pm2_5)**: it is taken from the training record of the same site that was recorded just **after** this row (later on the same day, otherwise the first record of the next day of the same month). If no such record exists, t__1 and pm10t__1 are both set to 69.
>*  **pm10t__1**:  the value of pm10 of the record that t__1 was taken from.
>*  **difft__1**:  the difference **(pm10t__1 - t__1)**, which equals the **ref_pm2_5** value of that nearest **next** record.




In [7]:
def laga(te,train1):
  """Add "lag after" features to a copy of `te` and return it.

  For every row of `te` the reference frame `train1` is searched for a row with
  the same site, year and month:
    1. on the same day with a strictly larger hour -> the one with the smallest hour;
    2. otherwise on day + 1                         -> the one with the smallest hour;
    3. otherwise both values fall back to 69.
  If several reference rows share the selected hour, the one that appears first
  in `train1` is used.

  Added columns:
    t__1      'ref' of the selected reference row
    pm10t__1  'pm10' of the selected reference row
    difft__1  pm10t__1 - t__1

  Args:
    te: frame with 'site', 'year', 'month', 'day' and 'hour'.
    train1: frame with 'site', 'year', 'month', 'day', 'hour', 'ref', 'pm10'.

  Raises:
    KeyError: if a required column is missing. The previous bare `except`
      turned such errors into the fallback value for every row.
  """
  fallback = 69  # value used when no later reference row exists
  te = te.copy()  # never write into the caller's frame (often a slice of a bigger one)

  # Sort once; within each day the arrays are then ordered by ascending hour,
  # so the first matching entry is the next one in time.
  ordered = train1.sort_values(by='hour', kind='stable')
  per_day = {
      key: (grp['hour'].to_numpy(), grp['ref'].to_numpy(), grp['pm10'].to_numpy())
      for key, grp in ordered.groupby(['site', 'year', 'month', 'day'], sort=False)
  }

  n_rows = len(te)
  t_next = np.full(n_rows, float(fallback))
  pm10_next = np.full(n_rows, float(fallback))

  # Iterate positionally: the index labels of `te` are irrelevant here.
  rows = zip(te['site'].to_numpy(), te['year'].to_numpy(), te['month'].to_numpy(),
             te['day'].to_numpy(), te['hour'].to_numpy())
  for pos, (site, year, month, day, hour) in enumerate(rows):
    same_day = per_day.get((site, year, month, day))
    if same_day is not None:
      hours, refs, pm10s = same_day
      later = np.flatnonzero(hours > hour)
      if later.size:
        t_next[pos] = refs[later[0]]
        pm10_next[pos] = pm10s[later[0]]
        continue

    # Nothing later on the same day: use the first record of the next day.
    # As before, this does not cross month boundaries.
    day_after = per_day.get((site, year, month, day + 1))
    if day_after is not None:
      _, refs, pm10s = day_after
      t_next[pos] = refs[0]
      pm10_next[pos] = pm10s[0]

  te['t__1'] = t_next
  te['pm10t__1'] = pm10_next
  te['difft__1'] = te['pm10t__1'] - te['t__1']
  return te

### Preprocessing Data and generating features

In [8]:
# The model does not predict ref_pm2_5 directly: its target is the gap pm10 - ref_pm2_5.
y = train['pm10'] - train['ref_pm2_5']
n_train = len(train)

# Train is stacked on top of test so both receive exactly the same feature engineering.
data = pd.concat([train, test])
# PS: the column keeps the name "ref_pm2_5" although it now holds the new target.
data['ref_pm2_5'] = data['pm10'] - data['ref_pm2_5']

# Zscore (additional feature): pm10 expressed relative to the mean of the raw
# training ref_pm2_5, in units of its standard deviation.
ref_mean = train['ref_pm2_5'].mean()
ref_std = train['ref_pm2_5'].std()
data['Zscore'] = (data['pm10'] - ref_mean) / ref_std

data = process(data)

# Split the processed frame back into its two parts.
train = data.iloc[:n_train]
test = data.iloc[n_train:]

# Reference frame for the lag look-ups: processed training rows plus their target.
train1 = train.copy()
train1['ref'] = y

# laga first, then lagb (lagb reads the t__1 column that laga creates) -- train ...
train = lagb(laga(train, train1), train1)
# ... and test, both looked up against the training reference frame.
test = lagb(laga(test, train1), train1)

# One-hot encode "site" on the re-stacked frame so train and test share the same columns.
data = pd.get_dummies(pd.concat([train, test]), columns=['site'])

train = data.iloc[:n_train]
test = data.iloc[n_train:]

# Models

In [9]:
#we use two models (each one corresponds to a random_state value: 42 and 49)
# Each model contributes one column of test predictions. The columns are collected in
# a list and joined explicitly; the former `np.array(0)` + bare try/except accumulation
# would have silently discarded earlier predictions on any genuine concatenation error.
seed_preds = []

for i in [42, 49]:
  # NOTE(review): no eval_set is passed to fit(), so od_type/od_wait and
  # early_stopping_rounds presumably never trigger -- confirm against the CatBoost docs.
  model= CatBoostRegressor(learning_rate=0.05, depth=16,
                         l2_leaf_reg=1, min_child_samples=4,
                         grow_policy='Depthwise', iterations=2000, eval_metric='RMSE', od_type='iter',
                         od_wait=20, random_state= i, verbose=True)

  # verbose=False here takes the place of the constructor's verbose=True for this fit.
  model.fit(train, y, plot=True, verbose=False, early_stopping_rounds=20)

  # Column vector of shape (n_test, 1) for this seed.
  y_pred_test= model.predict(test).reshape(-1, 1)
  seed_preds.append(y_pred_test)

  # (n_test, number of models fitted so far)
  lvl1_preds= np.concatenate(seed_preds, axis=1)

  print("Done: ", i, "\t", lvl1_preds.shape)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Done:  42 	 (2733, 1)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Done:  49 	 (2733, 2)


In [10]:
#our prediction is the mean of the predictions found by the two models
lvl1_mean= np.mean(lvl1_preds, axis=1)

# Only the ID column is needed from the raw test file. Reuse `test_path` instead of a
# second hard-coded path, and avoid shadowing the builtin `id`.
test_ids= pd.read_csv(test_path, usecols=['ID'])['ID']

#since we predicted the difference between ref_pm2_5 and pm10, we determine now the desired value of ref_pm2_5
# Plain arrays make the row pairing positional (file order) rather than dependent on
# the index labels of `test` matching those of the freshly read file.
output = pd.DataFrame({'ID': test_ids.to_numpy(),
                       'ref_pm2_5': test['pm10'].to_numpy()-lvl1_mean})
output.to_csv('r42_r49_verif.csv', index=False)