In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data = pd.read_csv('/content/drive/MyDrive/Projects/Air Quality Index/extracted_data.csv')
data.head()

Unnamed: 0,PM2.5 AQI Value,AQI Category,T,TM,Tm,SLP,H,PP,VV,V,VM,VG,RA,SN,TS,FG
0,64,Moderate,4.4,11.7,-1.7,1027.7,68.0,0.0,16.1,7.2,16.5,-,,,,
1,51,Moderate,7.3,11.7,-1.7,1014.8,71.0,0.0,15.8,3.1,7.6,-,o,,,
2,25,Good,1.2,8.9,-4.4,1016.4,69.0,13.97,12.6,32.4,51.9,68.3,o,o,,
3,31,Good,,,,,,,,,,,,,,
4,22,Good,,,,,,,,,,,,,,


In [4]:
data.shape

(2439, 16)

# Feature Engineering

In [5]:
# Checking for null values
data.isnull().sum()

PM2.5 AQI Value       0
AQI Category          0
T                  1193
TM                 1193
Tm                 1193
SLP                1193
H                  1193
PP                 1193
VV                 1193
V                  1193
VM                 1193
VG                 1193
RA                 1432
SN                 2374
TS                 2148
FG                 2164
dtype: int64

In [6]:
# NaN values in RA, SN, TS, FG represent the absence of respective events on that day
data['RA'] = np.where(data['RA']=='o',1,0)
data['SN'] = np.where(data['SN']=='o',1,0)
data['TS'] = np.where(data['TS']=='o',1,0)
data['FG'] = np.where(data['FG']=='o',1,0)

In [7]:
# dropping all the rows containing null values
data.dropna(inplace=True)

In [8]:
data.reset_index(drop=True,inplace=True)

In [9]:
data['AQI Category'].value_counts()

Good        1139
Moderate     107
Name: AQI Category, dtype: int64

In [10]:
data['AQI Category'] = np.where(data['AQI Category']=='Good',1,0)

In [11]:
# Replacing the '-' characters in features with NaN values
data.replace('-',np.nan,inplace=True)

In [12]:
data.head()

Unnamed: 0,PM2.5 AQI Value,AQI Category,T,TM,Tm,SLP,H,PP,VV,V,VM,VG,RA,SN,TS,FG
0,64,0,4.4,11.7,-1.7,1027.7,68,0.0,16.1,7.2,16.5,,0,0,0,0
1,51,0,7.3,11.7,-1.7,1014.8,71,0.0,15.8,3.1,7.6,,1,0,0,0
2,25,1,1.2,8.9,-4.4,1016.4,69,13.97,12.6,32.4,51.9,68.3,1,1,0,0
3,17,1,-5.9,2.8,-10.0,1024.1,37,3.05,16.1,25.7,37.0,55.4,0,0,0,0
4,43,1,-4.7,2.8,-10.6,1034.3,46,0.0,16.1,7.6,13.0,,0,0,0,0


In [13]:
data.isnull().mean()

PM2.5 AQI Value    0.000000
AQI Category       0.000000
T                  0.000000
TM                 0.000803
Tm                 0.000803
SLP                0.000000
H                  0.001605
PP                 0.000000
VV                 0.000000
V                  0.000000
VM                 0.000000
VG                 0.400482
RA                 0.000000
SN                 0.000000
TS                 0.000000
FG                 0.000000
dtype: float64

In [14]:
data.dropna(subset=['TM','Tm','H'],inplace=True)

In [15]:
x = data.drop('PM2.5 AQI Value',axis=1).copy()
y = data['PM2.5 AQI Value']

In [23]:
# train_test_split
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=0)

In [None]:
# changing the datatype of features
xtrain['TM'] = pd.to_numeric(xtrain['TM'])
xtrain['Tm'] = pd.to_numeric(xtrain['Tm'])
xtrain['H'] = pd.to_numeric(xtrain['H'])
xtrain['VG'] = pd.to_numeric(xtrain['VG'])

In [None]:
xtrain['VG_nan'] = np.where(xtrain['VG'].isnull(),1,0)

In [None]:
# changing the datatype of features
xtest['TM'] = pd.to_numeric(xtest['TM'])
xtest['Tm'] = pd.to_numeric(xtest['Tm'])
xtest['H'] = pd.to_numeric(xtest['H'])
xtest['VG'] = pd.to_numeric(xtest['VG'])

In [None]:
xtest['VG_nan'] = np.where(xtest['VG'].isnull(),1,0)

In [None]:
xtrain['VG'] = np.where(xtrain['VG'].isnull(),xtrain['VG'].median(),xtrain['VG'])

In [None]:
xtest['VG'] = np.where(xtest['VG'].isnull(),xtrain['VG'].median(),xtest['VG'])

# Model Building

In [31]:
#pip install catboost

In [33]:
from sklearn.linear_model import LinearRegression,BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,BaggingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from mlxtend.regressor import StackingRegressor
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV,cross_val_score,KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error

In [70]:
# 10 Fold Cross validation

kf = KFold(n_splits=10, random_state=42, shuffle=True)

cv_scores = []
cv_std = []

baseline_models = ['Linear_Reg.','Bayesian_Ridge_Reg.','LGBM_Reg.','SVR',
                   'Dec_Tree_Reg.','Random_Forest_Reg.', 'XGB_Reg.',
                   'Grad_Boost_Reg.','Cat_Boost_Reg.','Stacked_Reg.']

In [78]:
# Creation of the RMSE metric:
    
def rmse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def cv_rmse(model):
    rmse = np.sqrt(-cross_val_score(model, xtrain, ytrain, cv=kf))
    return (rmse)

In [None]:
# Linear Regression

lreg = LinearRegression()
score_lreg = cv_rmse(lreg)
cv_scores.append(score_lreg.mean())
cv_std.append(score_lreg.std())

# Bayesian Ridge Regression

brr = BayesianRidge(compute_score=True)
score_brr = cv_rmse(brr)
cv_scores.append(score_brr.mean())
cv_std.append(score_brr.std())

# Light Gradient Boost Regressor

l_gbm = LGBMRegressor(objective='regression')
score_l_gbm = cv_rmse(l_gbm)
cv_scores.append(score_l_gbm.mean())
cv_std.append(score_l_gbm.std())

# Support Vector Regression

svr = SVR()
score_svr = cv_rmse(svr)
cv_scores.append(score_svr.mean())
cv_std.append(score_svr.std())

# Decision Tree Regressor

dtr = DecisionTreeRegressor()
score_dtr = cv_rmse(dtr)
cv_scores.append(score_dtr.mean())
cv_std.append(score_dtr.std())

# Random Forest Regressor

rfr = RandomForestRegressor()
score_rfr = cv_rmse(rfr)
cv_scores.append(score_rfr.mean())
cv_std.append(score_rfr.std())

# XGB Regressor

xgb = XGBRegressor()
score_xgb = cv_rmse(xgb)
cv_scores.append(score_xgb.mean())
cv_std.append(score_xgb.std())

# Gradient Boost Regressor

gbr = GradientBoostingRegressor()
score_gbr = cv_rmse(gbr)
cv_scores.append(score_gbr.mean())
cv_std.append(score_gbr.std())

# Cat Boost Regressor

catb = CatBoostRegressor()
score_catb = cv_rmse(catb)
cv_scores.append(score_catb.mean())
cv_std.append(score_catb.std())

# Stacked Regressor

stack_gen = StackingRegressor(regressors=(CatBoostRegressor(),
                                          BayesianRidge()),
                              meta_regressor = CatBoostRegressor(),
                              use_features_in_secondary = True)

score_stack_gen = cv_rmse(stack_gen)
cv_scores.append(score_stack_gen.mean())
cv_std.append(score_stack_gen.std())

In [45]:
final_cv_score = pd.DataFrame(baseline_models, columns = ['Regressors'])
final_cv_score['RMSE_mean'] = cv_scores
final_cv_score['RMSE_std'] = cv_std

In [54]:
final_cv_score.sort_values(by='RMSE_mean')

Unnamed: 0,Regressors,RMSE_mean,RMSE_std
6,XGB_Reg.,8.233865,0.472751
8,Cat_Boost_Reg.,8.281464,0.603249
5,Random_Forest_Reg.,8.290073,0.569968
7,Grad_Boost_Reg.,8.319878,0.485085
2,LGBM_Reg.,8.530351,0.574829
9,Stacked_Reg.,8.598517,0.67986
0,Linear_Reg.,9.023674,0.39481
1,Bayesian_Ridge_Reg.,9.026131,0.390162
4,Dec_Tree_Reg.,11.660389,0.810923
3,SVR,12.846905,1.335178


## Hyperparameter Tuning

### XGBoost

In [55]:
n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
booster=['gbtree','gblinear']
learning_rate=[0.05,0.1,0.15,0.20]
min_child_weight=[1,2,3,4]
base_score=[0.25,0.5,0.75,1]

# Define the grid of hyperparameters to search
hyperparameter_grid = {
    'n_estimators': n_estimators,
    'max_depth':max_depth,
    'learning_rate':learning_rate,
    'min_child_weight':min_child_weight,
    'booster':booster,
    'base_score':base_score
    }

In [58]:
# Set up the random search with 4-fold cross validation
import xgboost
xgreg = xgboost.XGBRegressor()

random_cv = RandomizedSearchCV(estimator=xgreg,
            param_distributions=hyperparameter_grid,
            n_iter=100,cv=3,
            scoring = 'neg_mean_absolute_error',n_jobs = 4,
            verbose = 5, 
            return_train_score = True,
            random_state=42)

random_cv.fit(xtrain,ytrain)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    5.1s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   27.8s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:   48.2s
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:  1.6min




[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:  1.7min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          importance_type='gain',
                                          learning_rate=0.1, max_delta_step=0,
                                          max_depth=3, min_child_weight=1,
                                          missing=None, n_estimators=100,
                                          n_jobs=1, nthread=None,
                                          objective='reg:linear',
                                          random_state=0, reg_alpha=...
                   iid='deprecated', n_iter=100, n_jobs=4,
                   param_distributions={'base_score': [0.25, 0.5, 0.75, 1],
                                        'booster': ['

In [96]:
from sklearn.metrics import r2_score
xgb_pred = random_cv.predict(xtest)
xgb_rmse = rmse(xgb_pred,ytest)
xgb_rmse

8.13443139408054

### Catboost

In [80]:
params = {'iterations': 6000,
          'learning_rate': 0.005,
          'depth': 4,
          'l2_leaf_reg': 1,
          'eval_metric':'RMSE',
          'early_stopping_rounds': 200,
          'verbose': 200,
          'random_seed': 42}
         
cat = CatBoostRegressor(**params)
cat_model = cat.fit(xtrain,ytrain,
                     plot=True,
                     verbose = False)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [97]:
cat_pred = cat.predict(xtest)
cat_rmse = rmse(cat_pred,ytest)
cat_rmse

8.247874288233705

### Random forest

In [91]:
from pprint import pprint
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]
max_features = ['auto', 'sqrt']

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features':max_features,'min_samples_split':[2,10,20],\
               'max_depth':[None,2,5,10],'min_samples_leaf':[1,5,10],\
               'max_leaf_nodes':[None,5,10,20],'bootstrap':[True,False]}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [None, 2, 5, 10],
 'max_features': ['auto', 'sqrt'],
 'max_leaf_nodes': [None, 5, 10, 20],
 'min_samples_leaf': [1, 5, 10],
 'min_samples_split': [2, 10, 20],
 'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}


In [92]:
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                              verbose=1, random_state=42,n_jobs=4,n_iter=50)

rf_random.fit(xtrain,ytrain)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    8.6s
[Parallel(n_jobs=4)]: Done 236 tasks      | elapsed:   27.7s
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:   29.4s finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                           

In [93]:
rf_pred = rf_random.predict(xtest)
rf_rmse = rmse(rf_pred,ytest)
rf_rmse

8.341033461217162

XGBoost Regressor, CatBoost Regressor, Random Forest Regressor give the best results among all the models