### Importing libraries

In [30]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error,mean_squared_error,mean_absolute_error

### Importing Datasets

In [31]:
krm_so2=pd.read_excel(r'Karimnagar_so2.xlsx')
krm_nox=pd.read_excel(r'Karimnagar_no2.xlsx')
krm_pm10=pd.read_excel(r'Karimnagar_PM10.xlsx')
krm_aqi=pd.read_excel(r'Karimnagar_aqi.xlsx')

### Filling the null values row-wise as there seems to more yearly relation than monthly relation for each pollutant

In [32]:
for i in range(0,7):
    krm_so2.iloc[i,1:]=krm_so2.iloc[i,1:].fillna(krm_so2.iloc[i,1:].mean())
    krm_nox.iloc[i,1:]=krm_nox.iloc[i,1:].fillna(krm_nox.iloc[i,1:].mean())
    krm_pm10.iloc[i,1:]=krm_pm10.iloc[i,1:].fillna(krm_pm10.iloc[i,1:].mean())
    krm_aqi.iloc[i,1:]=krm_aqi.iloc[i,1:].fillna(krm_aqi.iloc[i,1:].mean())

### Reorganizing the data for a combined dataset.

In [33]:
krm_so2 = krm_so2.melt(id_vars=["Year"], var_name="Month", value_name="SO2")
krm_nox = krm_nox.melt(id_vars=["Year"], var_name="Month", value_name="NOX")
krm_pm10 = krm_pm10.melt(id_vars=["Year"], var_name="Month", value_name="PM10")
krm_aqi = krm_aqi.melt(id_vars=["Year"], var_name="Month", value_name="AQI")

### Organizing the data for better model usage

In [34]:
krm_poll = pd.merge(krm_pm10, krm_nox, on=['Month','Year'])
krm_poll = pd.merge(krm_poll, krm_so2, on=['Month','Year'])
krm_final=pd.merge(krm_poll,krm_aqi,on=['Month','Year'])
krm_final.index = pd.to_datetime(krm_final['Year'].astype(str) + '-' + krm_final['Month'], format='%Y-%b')
krm_final.drop(['Year', 'Month'], axis=1, inplace=True)
krm_final=krm_final.sort_index()
krm_final['2016':'2021']

Unnamed: 0,PM10,NOX,SO2,AQI
2016-01-01,69.0,22.0,8.0,69.222222
2016-02-01,59.0,25.0,9.0,59.125000
2016-03-01,42.0,32.1,7.7,44.000000
2016-04-01,62.0,29.0,7.0,62.194444
2016-05-01,52.0,21.0,5.0,52.250000
...,...,...,...,...
2021-08-01,64.0,32.4,6.9,64.000000
2021-09-01,66.0,34.7,6.7,65.521923
2021-10-01,68.0,35.3,7.2,68.000000
2021-11-01,73.0,34.2,6.8,73.000000


### Test Train Split

In [35]:
X_=krm_final.iloc[:,:-1]
y=krm_final.iloc[:,-1]

In [36]:
#Scalling the Data to improve model perfomance and bring features into similar range
ss=StandardScaler()
X=ss.fit_transform(X_)
X=pd.DataFrame(X,columns=krm_final.columns[:-1])

In [37]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=66,test_size=0.25)

### Perfoming Hyper-parameter tuning with comparison with other models

In [38]:
#Listing Parameters

rf_params = {'n_estimators': [500,300,100,800,1000], 'max_depth': [12,15,10,5,7,3]}

lr_param_grid = {}

svr_param_grid = {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10]}

In [39]:
# create models
rf_model = RandomForestRegressor(random_state=42)
lr_model = LinearRegression()
svr_model = SVR()

In [40]:
# create GridSearchCV objects
rf_grid = GridSearchCV(rf_model, rf_params, cv=5)
lr_grid = GridSearchCV(lr_model, lr_param_grid)
svr_grid = GridSearchCV(svr_model, svr_param_grid)

In [41]:
# fit the models
rf_grid.fit(X_train, y_train)
lr_grid.fit(X_train, y_train)
svr_grid.fit(X_train, y_train)

GridSearchCV(estimator=SVR(),
             param_grid={'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10],
                         'kernel': ['linear', 'rbf']})

In [42]:
# get the best hyperparameters
rf_best_params = rf_grid.best_params_
lr_best_params = lr_grid.best_params_
svr_best_params = svr_grid.best_params_

In [43]:
# create new models with the best hyperparameters
rf_best_model = RandomForestRegressor(**rf_best_params, random_state=42)
lr_best_model = LinearRegression(**lr_best_params)
svr_best_model = SVR(**svr_best_params)

In [44]:
# fit the best models
rf_best_model.fit(X_train, y_train)
lr_best_model.fit(X_train, y_train)
svr_best_model.fit(X_train, y_train)

SVR(C=10, gamma=0.1, kernel='linear')

In [45]:
# make predictions
rf_preds = rf_best_model.predict(X_test)
lr_preds = lr_best_model.predict(X_test)
svr_preds = svr_best_model.predict(X_test)

### Checking individual model performace

In [46]:
# calculate r2 scores

rf_r2 = r2_score(y_test, rf_preds)
print(f"Random Forest R2 Score: {rf_r2}")

lr_r2 = r2_score(y_test, lr_preds)
print(f"Linear Regression R2 Score: {lr_r2}")

svr_r2 = r2_score(y_test, svr_preds)
print(f"Support Vector Regressor R2 Score: {svr_r2}")

Random Forest R2 Score: 0.989805391447411
Linear Regression R2 Score: 0.9949182880159755
Support Vector Regressor R2 Score: 0.9954504094053316


### Checking Ridge and Lasso Regression and cross validation to prevent overfitting

In [47]:
lr = LinearRegression()
ridge = Ridge()
lasso = Lasso()
rf = RandomForestRegressor()
svr = SVR()

lr_params = {'normalize': [True, False]}
ridge_params = {'alpha': [0.01, 0.1, 1, 10, 100]}
lasso_params = {'alpha': [0.01, 0.1, 1, 10, 100]}
rf_params = {'n_estimators': [500,300,100,800,1000], 'max_depth':[12,15,10,5,7,3] }
svr_params = {'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}

In [48]:
models = {'Linear Regression': (lr, lr_params),
          'Ridge Regression': (ridge, ridge_params),
          'Lasso Regression': (lasso, lasso_params),
          'Random Forest Regression': (rf, rf_params),
          'Support Vector Regression': (svr, svr_params)}

In [49]:
models = {'Linear Regression': (lr, lr_params),
          'Ridge Regression': (ridge, ridge_params),
          'Lasso Regression': (lasso, lasso_params),
          'Random Forest Regression': (rf, rf_params),
          'Support Vector Regression': (svr, svr_params)}
for name, (model, params) in models.items():
    gs = GridSearchCV(model, params, cv=2,n_jobs=-1)
    scores = cross_val_score(gs, X_test, y_test, cv=2, scoring='r2')
    print(f'{name}: {scores.mean():.3f} (±{scores.std():.3f})')

Linear Regression: 0.986 (±0.002)
Ridge Regression: 0.988 (±0.001)
Lasso Regression: 0.989 (±0.002)
Random Forest Regression: 0.815 (±0.077)
Support Vector Regression: 0.989 (±0.003)


### Performing ensemble methods and full model accuracy

In [50]:
import numpy as np
f_pred=[]
f_pred.append((rf_preds+lr_preds+svr_preds)/3)
f_pred=np.array(f_pred)
f_pred=f_pred.reshape(-1,1)

In [51]:
rr2 = r2_score(y_test, f_pred)
print(f" R2 Score: {rr2}")

mape = mean_absolute_percentage_error(y_test, f_pred)
print(f" Mean Absolute Percentage Error: {mape}")

mse=mean_squared_error(y_test,f_pred)
print(f"Mean Squared Error : {mape}")


 R2 Score: 0.9957776957821972
 Mean Absolute Percentage Error: 0.017272635028228244
Mean Squared Error : 0.017272635028228244


### Final Predictions


In [52]:
loc=[]
for i in f_pred:
    if (i>=0) and i<=50:
        loc.append('Good')
    elif i>=51 and i<=100:
        loc.append('Moderate')
    elif i>=101 and i<=150:
        loc.append('Unhealthy for Sensitive Groups')
    elif i>=151 and i<=200:
        loc.append('Unhealthy')
    elif i>=201 and i<=300:
        loc.append('Very Unhealthy')
    else:
        loc.append('Hazardous')
y_test=pd.DataFrame(y_test)
y_test['Level of Concern']=loc

In [53]:
y_test

Unnamed: 0,AQI,Level of Concern
2019-08-01,99.375,Moderate
2021-09-01,65.521923,Moderate
2017-04-01,78.851852,Moderate
2018-10-01,97.777778,Moderate
2019-11-01,104.555556,Unhealthy for Sensitive Groups
2020-01-01,110.666667,Unhealthy for Sensitive Groups
2021-11-01,73.0,Moderate
2020-05-01,72.0,Moderate
2022-09-01,58.0,Moderate
2022-02-01,90.333333,Moderate


### Predictor function

In [54]:
def predictor(df):
    df=ss.transform(df.values)
    rfp=rf_best_model.predict(df)
    lrp=lr_best_model.predict(df)
    svrp=svr_best_model.predict(df)
    f_pred=[]
    f_pred.append((rfp+lrp+svrp)/3)
    f_pred=np.array(f_pred)
    f_pred=f_pred.reshape(-1,1)
    loc=[]
    for i in f_pred:
        if (i>=0) and i<=50:
            loc.append('Good')
        elif i>=51 and i<=100:
            loc.append('Moderate')
        elif i>=101 and i<=150:
            loc.append('Unhealthy for Sensitive Groups')
        elif i>=151 and i<=200:
            loc.append('Unhealthy')
        elif i>=201 and i<=300:
            loc.append('Very Unhealthy')
        else:
            loc.append('Hazardous')
    df_p=pd.DataFrame(f_pred,columns=['AQI'])
    df_p['AQI']=f_pred
    df_p['Level of Concern']=np.array(loc).reshape(-1,1)
    print(df_p)
    return df_p

In [55]:

res=predictor(X_['2022'])

           AQI                Level of Concern
0    81.722622                        Moderate
1    89.238211                        Moderate
2    95.520921                        Moderate
3    70.786568                        Moderate
4    76.604705                        Moderate
5    89.246416                        Moderate
6    69.687679                        Moderate
7    56.292342                        Moderate
8    59.427412                        Moderate
9    63.017907                        Moderate
10   99.601845                        Moderate
11  127.144069  Unhealthy for Sensitive Groups


In [56]:
print("Mean Absolute Error:",mean_absolute_error(y['2022'],res['AQI']))
print("Mean Squared Error:",mean_squared_error(y['2022'],res['AQI']))
print('Mean Absolute Percentage:',mean_absolute_percentage_error(y['2022'],res['AQI']))

Mean Absolute Error: 0.7298508725815491
Mean Squared Error: 0.7868476261874217
Mean Absolute Percentage: 0.009605243720525009


In [57]:
krm_so2.index = pd.to_datetime(krm_so2['Year'].astype(str) + '-' + krm_so2['Month'], format='%Y-%b')
krm_so2.drop(['Year', 'Month'], axis=1, inplace=True)
krm_so2=krm_so2.sort_index()

In [59]:
krm_nox.index = pd.to_datetime(krm_nox['Year'].astype(str) + '-' + krm_nox['Month'], format='%Y-%b')
krm_nox.drop(['Year', 'Month'], axis=1, inplace=True)
krm_nox=krm_nox.sort_index()

In [60]:
krm_pm10.index = pd.to_datetime(krm_pm10['Year'].astype(str) + '-' + krm_pm10['Month'], format='%Y-%b')
krm_pm10.drop(['Year', 'Month'], axis=1, inplace=True)
krm_pm10=krm_pm10.sort_index()

In [61]:
#Performing exponential smoothing for the data to predict future dependent variables

from statsmodels.tsa.api import ExponentialSmoothing
model_so2= ExponentialSmoothing(krm_so2[:'2020'], trend='add', seasonal='mul')
fit_so2 = model_so2.fit(optimized=True)
fore_so2 = fit_so2.forecast(12)

from statsmodels.tsa.api import ExponentialSmoothing
model_nox= ExponentialSmoothing(krm_nox[:'2020'], trend='add', seasonal='mul')
fit_nox = model_nox.fit(optimized=True)
fore_nox = fit_nox.forecast(12)


from statsmodels.tsa.api import ExponentialSmoothing
model_pm10= ExponentialSmoothing(krm_pm10[:'2020'], trend='add', seasonal='mul')
fit_pm10 = model_pm10.fit(optimized=True)
fore_pm10 = fit_pm10.forecast(12)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [62]:
#Accuracy
from sklearn.metrics import mean_absolute_percentage_error,mean_absolute_error
mae_so2 = mean_absolute_error(krm_so2['2021'],fore_so2['2021'] )
print('Mean Absolute Error for so2:', mae_so2)
mae_nox = mean_absolute_error(krm_nox['2022'],fore_nox )
print('Mean Absolute Error for nox:', mae_nox)
mae_pm10 = mean_absolute_error(krm_pm10['2022'],fore_pm10 )
print('MAPE for PM10:', mae_pm10)

Mean Absolute Error for so2: 0.6336863919665294
Mean Absolute Error for nox: 25.129272446035728
MAPE for PM10: 36.90340590622562


In [63]:
forecast_df=pd.DataFrame(fore_so2,columns=['SO2'])
forecast_df['NOX']=fore_so2
forecast_df['PM10']=fore_pm10
forecast_df

Unnamed: 0,SO2,NOX,PM10
2021-01-01,8.287897,8.287897,135.98575
2021-02-01,8.292993,8.292993,125.531648
2021-03-01,8.290843,8.290843,123.850837
2021-04-01,8.505888,8.505888,104.603244
2021-05-01,8.100089,8.100089,114.666562
2021-06-01,7.954748,7.954748,103.552678
2021-07-01,7.975556,7.975556,108.880641
2021-08-01,7.412959,7.412959,101.703308
2021-09-01,7.752569,7.752569,111.667111
2021-10-01,7.841634,7.841634,134.489927


In [64]:
final_pred=predictor(forecast_df)

          AQI Level of Concern
0   85.211035         Moderate
1   80.255349         Moderate
2   79.456850         Moderate
3   70.452000         Moderate
4   74.989843         Moderate
5   69.633922         Moderate
6   72.173181         Moderate
7   68.442376         Moderate
8   73.365499         Moderate
9   84.242610         Moderate
10  84.226856         Moderate
11  85.007575         Moderate


In [65]:
final_pred.to_csv('Khammam2023.csv')