### Importing libraries

In [71]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore")
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_absolute_percentage_error,mean_squared_error

### Importing the datasets

In [72]:
ad_so2=pd.read_excel(r'Alibad_so2.xlsx')
ad_nox=pd.read_excel(r'alibad_no2.xlsx')
ad_pm10=pd.read_excel(r'alibad_PM10.xlsx')
ad_aqi=pd.read_excel(r'adilabad_aqi.xlsx')

### Filling the null values row-wise as there seems to more yearly relation than monthly relation for each pollutant

In [73]:
for i in range(0,7):
    ad_so2.iloc[i,1:]=ad_so2.iloc[i,1:].fillna(ad_so2.iloc[i,1:].mean())
    ad_nox.iloc[i,1:]=ad_nox.iloc[i,1:].fillna(ad_nox.iloc[i,1:].mean())
    ad_pm10.iloc[i,1:]=ad_pm10.iloc[i,1:].fillna(ad_pm10.iloc[i,1:].mean())
    ad_aqi.iloc[i,1:]=ad_aqi.iloc[i,1:].fillna(ad_aqi.iloc[i,1:].mean())

### Reorganizing the data for a combined dataset.

In [74]:
ad_so2 = ad_so2.melt(id_vars=["Year"], var_name="Month", value_name="SO2")
ad_nox = ad_nox.melt(id_vars=["Year"], var_name="Month", value_name="NOX")
ad_pm10 = ad_pm10.melt(id_vars=["Year"], var_name="Month", value_name="PM10")
ad_aqi = ad_aqi.melt(id_vars=["Year"], var_name="Month", value_name="AQI")


### Organizing the data for better model performance

In [75]:
ad_poll = pd.merge(ad_pm10, ad_nox, on=['Month','Year'])
ad_poll = pd.merge(ad_poll, ad_so2, on=['Month','Year'])
ad_final=pd.merge(ad_poll,ad_aqi,on=['Month','Year'])
ad_final.index = pd.to_datetime(ad_final['Year'].astype(str) + '-' + ad_final['Month'], format='%Y-%b')
ad_final.drop(['Year', 'Month'], axis=1, inplace=True)
ad_final=ad_final.sort_index()
ad_final['2016':'2021']

Unnamed: 0,PM10,NOX,SO2,AQI
2016-01-01,62.0,18.428571,4.571429,67.333333
2016-02-01,68.0,18.428571,4.571429,68.333333
2016-03-01,71.0,18.428571,4.571429,71.111111
2016-04-01,70.0,18.428571,4.571429,69.555556
2016-05-01,69.0,18.428571,4.571429,70.555556
...,...,...,...,...
2021-08-01,69.0,24.150000,5.950000,69.000000
2021-09-01,69.0,24.150000,5.950000,69.000000
2021-10-01,69.0,24.150000,5.950000,69.000000
2021-11-01,69.0,24.150000,5.950000,69.000000


### Test Train Split

In [76]:
X_=ad_final.iloc[:,:-1]
y=ad_final.iloc[:,-1]

In [77]:
#Scalling the Data to improve model perfomance and bring features into similar range
ss=StandardScaler()
X=ss.fit_transform(X_)
X=pd.DataFrame(X,columns=ad_final.columns[:-1])

In [78]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=66,test_size=0.25)

### Perfoming Hyper-parameter tuning with comparison with other models

In [79]:
#Listing Parameters

rf_params = {'n_estimators': [500,300,100,800,1000], 'max_depth': [12,15,10,5,7,3]}

lr_param_grid = {}

svr_param_grid = {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10]}

In [80]:
# create models
rf_model = RandomForestRegressor(random_state=42)
lr_model = LinearRegression()
svr_model = SVR()

In [81]:
# create GridSearchCV objects
rf_grid = GridSearchCV(rf_model, rf_params, cv=5)
lr_grid = GridSearchCV(lr_model, lr_param_grid)
svr_grid = GridSearchCV(svr_model, svr_param_grid)

In [82]:
# fit the models
rf_grid.fit(X_train, y_train)
lr_grid.fit(X_train, y_train)
svr_grid.fit(X_train, y_train)

GridSearchCV(estimator=SVR(),
             param_grid={'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10],
                         'kernel': ['linear', 'rbf']})

In [83]:
# get the best hyperparameters
rf_best_params = rf_grid.best_params_
lr_best_params = lr_grid.best_params_
svr_best_params = svr_grid.best_params_

In [84]:
# create new models with the best hyperparameters
rf_best_model = RandomForestRegressor(**rf_best_params, random_state=42)
lr_best_model = LinearRegression(**lr_best_params)
svr_best_model = SVR(**svr_best_params)

In [85]:
# fit the best models
rf_best_model.fit(X_train, y_train)
lr_best_model.fit(X_train, y_train)
svr_best_model.fit(X_train, y_train)

SVR(C=10, gamma=0.1)

In [86]:
# make predictions
rf_preds = rf_best_model.predict(X_test)
lr_preds = lr_best_model.predict(X_test)
svr_preds = svr_best_model.predict(X_test)

### Checking individual model performace

In [87]:
# calculate r2 scores

rf_r2 = r2_score(y_test, rf_preds)
print(f"Random Forest R2 Score: {rf_r2}")

lr_r2 = r2_score(y_test, lr_preds)
print(f"Linear Regression R2 Score: {lr_r2}")

svr_r2 = r2_score(y_test, svr_preds)
print(f"Support Vector Regressor R2 Score: {svr_r2}")

Random Forest R2 Score: 0.8603987559458011
Linear Regression R2 Score: 0.9553439343559368
Support Vector Regressor R2 Score: 0.915883235716348


### Checking Ridge and Lasso Regression and cross validation to prevent overfitting

In [88]:
lr = LinearRegression()
ridge = Ridge()
lasso = Lasso()
rf = RandomForestRegressor()
svr = SVR()

lr_params = {'normalize': [True, False]}
ridge_params = {'alpha': [0.01, 0.1, 1, 10, 100]}
lasso_params = {'alpha': [0.01, 0.1, 1, 10, 100]}
rf_params = {'n_estimators': [500,300,100,800,1000], 'max_depth':[12,15,10,5,7,3] }
svr_params = {'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}

In [89]:
models = {'Linear Regression': (lr, lr_params),
          'Ridge Regression': (ridge, ridge_params),
          'Lasso Regression': (lasso, lasso_params),
          'Random Forest Regression': (rf, rf_params),
          'Support Vector Regression': (svr, svr_params)}

In [90]:
for name, (model, params) in models.items():
    gs = GridSearchCV(model, params, cv=2,n_jobs=-1)
    scores = cross_val_score(gs, X_test, y_test, cv=2, scoring='r2')
    print(f'{name}: {scores.mean():.3f} (±{scores.std():.3f})')

Linear Regression: 0.978 (±0.008)
Ridge Regression: 0.943 (±0.028)
Lasso Regression: 0.980 (±0.011)
Random Forest Regression: 0.789 (±0.176)
Support Vector Regression: 0.917 (±0.054)


### Performing ensemble methods and full model accuracy

In [91]:
import numpy as np
f_pred=[]
f_pred.append((rf_preds+lr_preds+svr_preds)/3)
f_pred=np.array(f_pred)
f_pred=f_pred.reshape(-1,1)

In [92]:
rr2 = r2_score(y_test, f_pred)
print(f" R2 Score: {rr2}")

mape = mean_absolute_percentage_error(y_test, f_pred)
print(f" Mean Absolute Percentage Error: {mape}")

mse=mean_squared_error(y_test,f_pred)
print(f"Mean Squared Error : {mape}")

 R2 Score: 0.9330395567535132
 Mean Absolute Percentage Error: 0.021954157271185555
Mean Squared Error : 0.021954157271185555


### Final Predictions


In [93]:
loc=[]
for i in f_pred:
    if (i>=0) and i<=50:
        loc.append('Good')
    elif i>=51 and i<=100:
        loc.append('Moderate')
    elif i>=101 and i<=150:
        loc.append('Unhealthy for Sensitive Groups')
    elif i>=151 and i<=200:
        loc.append('Unhealthy')
    elif i>=201 and i<=300:
        loc.append('Very Unhealthy')
    else:
        loc.append('Hazardous')
y_test=pd.DataFrame(y_test)
y_test['Level of Concern']=loc

In [94]:
y_test

Unnamed: 0,AQI,Level of Concern
2019-08-01,79.0,Moderate
2021-09-01,69.0,Moderate
2017-04-01,68.962963,Moderate
2018-10-01,67.111111,Moderate
2019-11-01,78.666667,Moderate
2020-01-01,78.0,Moderate
2021-11-01,69.0,Moderate
2020-05-01,71.779835,Moderate
2022-09-01,56.0,Moderate
2022-02-01,58.414815,Moderate


### Predictor Function

In [95]:
def predictor(df):
    df=ss.transform(df.values)
    rfp=rf_best_model.predict(df)
    lrp=lr_best_model.predict(df)
    svrp=svr_best_model.predict(df)
    f_pred=[]
    f_pred.append((rfp+lrp+svrp)/3)
    f_pred=np.array(f_pred)
    f_pred=f_pred.reshape(-1,1)
    loc=[]
    for i in f_pred:
        if (i>=0) and i<=50:
            loc.append('Good')
        elif i>=51 and i<=100:
            loc.append('Moderate')
        elif i>=101 and i<=150:
            loc.append('Unhealthy for Sensitive Groups')
        elif i>=151 and i<=200:
            loc.append('Unhealthy')
        elif i>=201 and i<=300:
            loc.append('Very Unhealthy')
        else:
            loc.append('Hazardous')
    df_p=pd.DataFrame(f_pred,columns=['AQI'])
    df_p['AQI']=f_pred
    df_p['Level of Concern']=np.array(loc).reshape(-1,1)
    print(df_p)
    return df_p

In [96]:
res=predictor(X_['2022'])

          AQI Level of Concern
0   59.303101         Moderate
1   59.303101         Moderate
2   58.009091         Moderate
3   64.696709         Moderate
4   60.910562         Moderate
5   58.690805         Moderate
6   55.888752         Moderate
7   58.144096         Moderate
8   57.733556         Moderate
9   63.495554         Moderate
10  59.992046         Moderate
11  61.738568         Moderate


In [108]:
print("Mean Absolute Error:",mean_absolute_error(y['2022'],res['AQI']))
print("Mean Squared Error:",mean_squared_error(y['2022'],res['AQI']))
print('Mean Absolute Percentage:',mean_absolute_percentage_error(y['2022'],res['AQI']))

Mean Absolute Error: 1.7555418755002397
Mean Squared Error: 6.90831274394269
Mean Absolute Percentage: 0.0325701760912322


### Predicting future pollutant values

In [98]:
ad_so2.index = pd.to_datetime(ad_so2['Year'].astype(str) + '-' + ad_so2['Month'], format='%Y-%b')
ad_so2.drop(['Year', 'Month'], axis=1, inplace=True)
ad_so2=ad_so2.sort_index()

In [99]:
ad_nox.index = pd.to_datetime(ad_nox['Year'].astype(str) + '-' + ad_nox['Month'], format='%Y-%b')
ad_nox.drop(['Year', 'Month'], axis=1, inplace=True)
ad_nox=ad_nox.sort_index()

In [100]:
ad_pm10.index = pd.to_datetime(ad_pm10['Year'].astype(str) + '-' + ad_pm10['Month'], format='%Y-%b')
ad_pm10.drop(['Year', 'Month'], axis=1, inplace=True)
ad_pm10=ad_pm10.sort_index()

In [101]:
#Performing exponential smoothing for the data to predict future dependent variables

from statsmodels.tsa.api import ExponentialSmoothing
model_so2= ExponentialSmoothing(ad_so2[:'2020'], trend='add', seasonal='mul')
fit_so2 = model_so2.fit(optimized=True)
fore_so2 = fit_so2.forecast(12)

from statsmodels.tsa.api import ExponentialSmoothing
model_nox= ExponentialSmoothing(ad_nox[:'2020'], trend='add', seasonal='mul')
fit_nox = model_nox.fit(optimized=True)
fore_nox = fit_nox.forecast(12)


from statsmodels.tsa.api import ExponentialSmoothing
model_pm10= ExponentialSmoothing(ad_pm10[:'2020'], trend='add', seasonal='mul')
fit_pm10 = model_pm10.fit(optimized=True)
fore_pm10 = fit_pm10.forecast(12)


In [102]:
#Accuracy
from sklearn.metrics import mean_absolute_percentage_error,mean_absolute_error
mae_so2 = mean_absolute_error(ad_so2['2021'],fore_so2['2021'] )
print('Mean Absolute Error for so2:', mae_so2)
mae_nox = mean_absolute_error(ad_nox['2022'],fore_nox )
print('Mean Absolute Error for nox:', mae_nox)
mae_pm10 = mean_absolute_error(ad_pm10['2022'],fore_pm10 )
print('MAPE for PM10:', mae_pm10)


Mean Absolute Error for so2: 0.9929669673588934
Mean Absolute Error for nox: 2.8551150685656084
MAPE for PM10: 16.74121576656466


### Combining the data

In [103]:
forecast_df=pd.DataFrame(fore_so2,columns=['SO2'])
forecast_df['NOX']=fore_so2
forecast_df['PM10']=fore_pm10

In [104]:
forecast_df

Unnamed: 0,SO2,NOX,PM10
2021-01-01,5.30057,5.30057,74.714109
2021-02-01,5.314102,5.314102,77.343808
2021-03-01,5.209848,5.209848,76.89083
2021-04-01,4.598006,4.598006,73.637062
2021-05-01,4.916682,4.916682,79.220416
2021-06-01,4.652429,4.652429,75.902724
2021-07-01,5.012746,5.012746,72.623358
2021-08-01,4.65677,4.65677,72.71372
2021-09-01,4.889956,4.889956,66.725228
2021-10-01,5.016688,5.016688,73.558264


In [105]:
final_pred=predictor(forecast_df)


         AQI Level of Concern
0   5.156858             Good
1   3.900699             Good
2   4.055301             Good
3   5.248870             Good
4   2.757542             Good
5   4.192525             Good
6   5.987639             Good
7   5.728432             Good
8   8.749041             Good
9   5.540525             Good
10  5.206016             Good
11  4.117250             Good


In [109]:
final_pred.to_csv('Alidabad2023.csv')