### Importing libraries

In [201]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error,mean_squared_error,mean_absolute_error

### Importing Datasets

In [202]:
wrg_so2=pd.read_excel(r'Warangal_so2.xlsx')
wrg_nox=pd.read_excel(r'Warangal_no2.xlsx')
wrg_pm10=pd.read_excel(r'Warangal_PM10.xlsx')
wrg_aqi=pd.read_excel(r'Warangal_aqi.xlsx')

### Filling the null values row-wise as there seems to more yearly relation than monthly relation for each pollutant

In [203]:
for i in range(0,7):
    wrg_so2.iloc[i,1:]=wrg_so2.iloc[i,1:].fillna(wrg_so2.iloc[i,1:].mean())
    wrg_nox.iloc[i,1:]=wrg_nox.iloc[i,1:].fillna(wrg_nox.iloc[i,1:].mean())
    wrg_pm10.iloc[i,1:]=wrg_pm10.iloc[i,1:].fillna(wrg_pm10.iloc[i,1:].mean())
    wrg_aqi.iloc[i,1:]=wrg_aqi.iloc[i,1:].fillna(wrg_aqi.iloc[i,1:].mean())

### Reorganizing the data for a combined dataset.

In [204]:
wrg_so2 = wrg_so2.melt(id_vars=["Year"], var_name="Month", value_name="SO2")
wrg_nox = wrg_nox.melt(id_vars=["Year"], var_name="Month", value_name="NOX")
wrg_pm10 = wrg_pm10.melt(id_vars=["Year"], var_name="Month", value_name="PM10")
wrg_aqi = wrg_aqi.melt(id_vars=["Year"], var_name="Month", value_name="AQI")

### Organizing the data for better model usage

In [205]:
wrg_poll = pd.merge(wrg_pm10, wrg_nox, on=['Month','Year'])
wrg_poll = pd.merge(wrg_poll, wrg_so2, on=['Month','Year'])
wrg_final=pd.merge(wrg_poll,wrg_aqi,on=['Month','Year'])
wrg_final.index = pd.to_datetime(wrg_final['Year'].astype(str) + '-' + wrg_final['Month'], format='%Y-%b')
wrg_final.drop(['Year', 'Month'], axis=1, inplace=True)
wrg_final=wrg_final.sort_index()
wrg_final['2016':'2021']

Unnamed: 0,PM10,NOX,SO2,AQI
2016-01-01,76.0,26.0,7.0,75.777778
2016-02-01,72.0,19.0,7.0,72.111111
2016-03-01,63.0,21.8,7.1,63.111111
2016-04-01,73.0,20.0,7.0,73.370370
2016-05-01,84.0,19.0,8.0,84.111111
...,...,...,...,...
2021-07-01,57.0,28.7,5.8,57.000000
2021-08-01,60.0,32.7,6.8,60.000000
2021-09-01,55.0,31.2,7.3,55.134376
2021-11-01,65.0,34.3,8.3,65.000000


### Test Train Split

In [206]:
X_=wrg_final.iloc[:,:-1]
y=wrg_final.iloc[:,-1]

In [207]:
#Scalling the Data to improve model perfomance and bring features into similar range
ss=StandardScaler()
X=ss.fit_transform(X_)
X=pd.DataFrame(X,columns=wrg_final.columns[:-1])

In [208]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=66,test_size=0.25)

### Perfoming Hyper-parameter tuning with comparison with other models

In [209]:
#Listing Parameters

rf_params = {'n_estimators': [500,300,100,800,1000], 'max_depth': [12,15,10,5,7,3]}

lr_param_grid = {}

svr_param_grid = {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10]}

In [210]:
# create models
rf_model = RandomForestRegressor(random_state=42)
lr_model = LinearRegression()
svr_model = SVR()

In [211]:
# create GridSearchCV objects
rf_grid = GridSearchCV(rf_model, rf_params, cv=5)
lr_grid = GridSearchCV(lr_model, lr_param_grid)
svr_grid = GridSearchCV(svr_model, svr_param_grid)

In [212]:
# fit the models
rf_grid.fit(X_train, y_train)
lr_grid.fit(X_train, y_train)
svr_grid.fit(X_train, y_train)

GridSearchCV(estimator=SVR(),
             param_grid={'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10],
                         'kernel': ['linear', 'rbf']})

In [213]:
# get the best hyperparameters
rf_best_params = rf_grid.best_params_
lr_best_params = lr_grid.best_params_
svr_best_params = svr_grid.best_params_

In [214]:
# create new models with the best hyperparameters
rf_best_model = RandomForestRegressor(**rf_best_params, random_state=42)
lr_best_model = LinearRegression(**lr_best_params)
svr_best_model = SVR(**svr_best_params)

In [215]:
# fit the best models
rf_best_model.fit(X_train, y_train)
lr_best_model.fit(X_train, y_train)
svr_best_model.fit(X_train, y_train)

SVR(C=10, gamma=0.1)

In [216]:
# make predictions
rf_preds = rf_best_model.predict(X_test)
lr_preds = lr_best_model.predict(X_test)
svr_preds = svr_best_model.predict(X_test)

### Checking individual model performace

In [217]:
# calculate r2 scores

rf_r2 = r2_score(y_test, rf_preds)
print(f"Random Forest R2 Score: {rf_r2}")

lr_r2 = r2_score(y_test, lr_preds)
print(f"Linear Regression R2 Score: {lr_r2}")

svr_r2 = r2_score(y_test, svr_preds)
print(f"Support Vector Regressor R2 Score: {svr_r2}")

Random Forest R2 Score: 0.9652738831657484
Linear Regression R2 Score: 0.9735753246167103
Support Vector Regressor R2 Score: 0.9840841642253106


### Checking Ridge and Lasso Regression and cross validation to prevent overfitting

In [218]:
lr = LinearRegression()
ridge = Ridge()
lasso = Lasso()
rf = RandomForestRegressor()
svr = SVR()

lr_params = {'normalize': [True, False]}
ridge_params = {'alpha': [0.01, 0.1, 1, 10, 100]}
lasso_params = {'alpha': [0.01, 0.1, 1, 10, 100]}
rf_params = {'n_estimators': [500,300,100,800,1000], 'max_depth':[12,15,10,5,7,3] }
svr_params = {'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}

In [219]:
models = {'Linear Regression': (lr, lr_params),
          'Ridge Regression': (ridge, ridge_params),
          'Lasso Regression': (lasso, lasso_params),
          'Random Forest Regression': (rf, rf_params),
          'Support Vector Regression': (svr, svr_params)}

In [220]:
models = {'Linear Regression': (lr, lr_params),
          'Ridge Regression': (ridge, ridge_params),
          'Lasso Regression': (lasso, lasso_params),
          'Random Forest Regression': (rf, rf_params),
          'Support Vector Regression': (svr, svr_params)}
for name, (model, params) in models.items():
    gs = GridSearchCV(model, params, cv=2,n_jobs=-1)
    scores = cross_val_score(gs, X_test, y_test, cv=2, scoring='r2')
    print(f'{name}: {scores.mean():.3f} (±{scores.std():.3f})')

Linear Regression: 0.995 (±0.003)
Ridge Regression: 0.995 (±0.002)
Lasso Regression: 0.996 (±0.001)
Random Forest Regression: 0.719 (±0.082)
Support Vector Regression: 0.992 (±0.002)


### Performing ensemble methods and full model accuracy

In [221]:
import numpy as np
f_pred=[]
f_pred.append((rf_preds+lr_preds+svr_preds)/3)
f_pred=np.array(f_pred)
f_pred=f_pred.reshape(-1,1)

In [222]:
rr2 = r2_score(y_test, f_pred)
print(f" R2 Score: {rr2}")

mape = mean_absolute_percentage_error(y_test, f_pred)
print(f" Mean Absolute Percentage Error: {mape}")

mse=mean_squared_error(y_test,f_pred)
print(f"Mean Squared Error : {mape}")


 R2 Score: 0.9828899398386829
 Mean Absolute Percentage Error: 0.014866089952342166
Mean Squared Error : 0.014866089952342166


### Final Predictions


In [223]:
loc=[]
for i in f_pred:
    if (i>=0) and i<=50:
        loc.append('Good')
    elif i>=51 and i<=100:
        loc.append('Moderate')
    elif i>=101 and i<=150:
        loc.append('Unhealthy for Sensitive Groups')
    elif i>=151 and i<=200:
        loc.append('Unhealthy')
    elif i>=201 and i<=300:
        loc.append('Very Unhealthy')
    else:
        loc.append('Hazardous')
y_test=pd.DataFrame(y_test)
y_test['Level of Concern']=loc

In [224]:
y_test

Unnamed: 0,AQI,Level of Concern
2020-02-01,88.0,Moderate
2021-11-01,65.0,Moderate
2017-05-01,68.041667,Moderate
2021-04-01,71.0,Moderate
2019-09-01,72.916667,Moderate
2022-02-01,72.514053,Moderate
2019-01-01,90.555556,Moderate
2020-06-01,58.25,Moderate
2018-05-01,81.111111,Moderate
2022-04-01,60.080031,Moderate


### Predictor function

In [225]:
def predictor(df):
    df=ss.transform(df.values)
    rfp=rf_best_model.predict(df)
    lrp=lr_best_model.predict(df)
    svrp=svr_best_model.predict(df)
    f_pred=[]
    f_pred.append((rfp+lrp+svrp)/3)
    f_pred=np.array(f_pred)
    f_pred=f_pred.reshape(-1,1)
    loc=[]
    for i in f_pred:
        if (i>=0) and i<=50:
            loc.append('Good')
        elif i>=51 and i<=100:
            loc.append('Moderate')
        elif i>=101 and i<=150:
            loc.append('Unhealthy for Sensitive Groups')
        elif i>=151 and i<=200:
            loc.append('Unhealthy')
        elif i>=201 and i<=300:
            loc.append('Very Unhealthy')
        else:
            loc.append('Hazardous')
    df_p=pd.DataFrame(f_pred,columns=['AQI'])
    df_p['AQI']=f_pred
    df_p['Level of Concern']=np.array(loc).reshape(-1,1)
    print(df_p)
    return df_p

In [226]:

res=predictor(X_['2022'])

          AQI Level of Concern
0   73.223204         Moderate
1   73.214048         Moderate
2   75.978603         Moderate
3   61.504043         Moderate
4   76.658215         Moderate
5   84.378233         Moderate
6   50.854867        Hazardous
7   57.798713         Moderate
8   51.990740         Moderate
9   92.876397         Moderate
10  92.684760         Moderate


In [227]:
print("Mean Absolute Error:",mean_absolute_error(y['2022'],res['AQI']))
print("Mean Squared Error:",mean_squared_error(y['2022'],res['AQI']))
print('Mean Absolute Percentage:',mean_absolute_percentage_error(y['2022'],res['AQI']))

Mean Absolute Error: 1.3908257373487176
Mean Squared Error: 3.4696106266203333
Mean Absolute Percentage: 0.024318528131646564


In [231]:
wrg_nox

Unnamed: 0,Year,Month,NOX
0,2016,Jan,26.0
1,2017,Jan,25.0
2,2018,Jan,28.5
3,2019,Jan,54.5
4,2020,Jan,49.2
...,...,...,...
79,2018,Dec,42.1
80,2019,Dec,50.5
81,2020,Dec,35.2
82,2021,Dec,31.0


### Predicting future Pollutant values

In [228]:
wrg_so2.index = pd.to_datetime(wrg_so2['Year'].astype(str) + '-' + wrg_so2['Month'], format='%Y-%b')
wrg_so2.drop(['Year', 'Month'], axis=1, inplace=True)
wrg_so2=wrg_so2.sort_index()

In [230]:
wrg_nox.index = pd.to_datetime(wrg_nox['Year'].astype(str) + '-' + wrg_nox['Month'], format='%Y-%b')
wrg_nox.drop(['Year', 'Month'], axis=1, inplace=True)
wrg_nox=wrg_nox.sort_index()

ValueError: unconverted data remains:  

In [None]:
wrg_pm10.index = pd.to_datetime(wrg_pm10['Year'].astype(str) + '-' + wrg_pm10['Month'], format='%Y-%b')
wrg_pm10.drop(['Year', 'Month'], axis=1, inplace=True)
wrg_pm10=wrg_pm10.sort_index()

In [None]:
#Performing exponential smoothing for the data to predict future dependent variables

from statsmodels.tsa.api import ExponentialSmoothing
model_so2= ExponentialSmoothing(wrg_so2[:'2020'], trend='add', seasonal='mul')
fit_so2 = model_so2.fit(optimized=True)
fore_so2 = fit_so2.forecast(12)

from statsmodels.tsa.api import ExponentialSmoothing
model_nox= ExponentialSmoothing(wrg_nox[:'2020'], trend='add', seasonal='mul')
fit_nox = model_nox.fit(optimized=True)
fore_nox = fit_nox.forecast(12)


from statsmodels.tsa.api import ExponentialSmoothing
model_pm10= ExponentialSmoothing(wrg_pm10[:'2020'], trend='add', seasonal='mul')
fit_pm10 = model_pm10.fit(optimized=True)
fore_pm10 = fit_pm10.forecast(12)

NameError: name 'niz_so2' is not defined

In [None]:
#Accuracy
from sklearn.metrics import mean_absolute_percentage_error,mean_absolute_error
mae_so2 = mean_absolute_error(wrg_so2['2021'],fore_so2['2021'] )
print('Mean Absolute Error for so2:', mae_so2)
mae_nox = mean_absolute_error(wrg_nox['2022'],fore_nox )
print('Mean Absolute Error for nox:', mae_nox)
mae_pm10 = mean_absolute_error(wrg_pm10['2022'],fore_pm10 )
print('MAPE for PM10:', mae_pm10)

In [None]:
forecast_df=pd.DataFrame(fore_so2,columns=['SO2'])
forecast_df['NOX']=fore_so2
forecast_df['PM10']=fore_pm10
forecast_df

In [None]:
final_pred=predictor(forecast_df)

In [None]:
final_pred.to_csv('Nizamabad2023.csv')