### Importing libraries

In [28]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error,mean_squared_error,mean_absolute_error

### Importing Datasets

In [2]:
niz_so2=pd.read_excel(r'nizamabad_so2.xlsx')
niz_nox=pd.read_excel(r'nizamabad_no2.xlsx')
niz_pm10=pd.read_excel(r'nizamabad_PM10.xlsx')
niz_aqi=pd.read_excel(r'nizamabad_aqi.xlsx')

### Filling the null values row-wise as there seems to more yearly relation than monthly relation for each pollutant

In [3]:
for i in range(0,7):
    niz_so2.iloc[i,1:]=niz_so2.iloc[i,1:].fillna(niz_so2.iloc[i,1:].mean())
    niz_nox.iloc[i,1:]=niz_nox.iloc[i,1:].fillna(niz_nox.iloc[i,1:].mean())
    niz_pm10.iloc[i,1:]=niz_pm10.iloc[i,1:].fillna(niz_pm10.iloc[i,1:].mean())
    niz_aqi.iloc[i,1:]=niz_aqi.iloc[i,1:].fillna(niz_aqi.iloc[i,1:].mean())

### Reorganizing the data for a combined dataset.

In [4]:
niz_so2 = niz_so2.melt(id_vars=["Year"], var_name="Month", value_name="SO2")
niz_nox = niz_nox.melt(id_vars=["Year"], var_name="Month", value_name="NOX")
niz_pm10 = niz_pm10.melt(id_vars=["Year"], var_name="Month", value_name="PM10")
niz_aqi = niz_aqi.melt(id_vars=["Year"], var_name="Month", value_name="AQI")

### Organizing the data for better model usage

In [5]:
niz_poll = pd.merge(niz_pm10, niz_nox, on=['Month','Year'])
niz_poll = pd.merge(niz_poll, niz_so2, on=['Month','Year'])
niz_final=pd.merge(niz_poll,niz_aqi,on=['Month','Year'])
niz_final.index = pd.to_datetime(niz_final['Year'].astype(str) + '-' + niz_final['Month'], format='%Y-%b')
niz_final.drop(['Year', 'Month'], axis=1, inplace=True)
niz_final=niz_final.sort_index()
niz_final['2016':'2021']

Unnamed: 0,PM10,NOX,SO2,AQI
2016-01-01,67.0,18.857143,4.857143,62.333333
2016-02-01,64.0,18.857143,4.857143,63.666667
2016-03-01,66.0,18.857143,4.857143,66.333333
2016-04-01,68.0,18.857143,4.857143,68.333333
2016-05-01,71.0,18.857143,4.857143,68.666667
...,...,...,...,...
2021-08-01,45.0,22.400000,5.300000,45.000000
2021-09-01,43.0,23.100000,5.000000,43.000000
2021-10-01,59.0,27.300000,6.200000,59.000000
2021-11-01,66.0,26.700000,6.400000,66.000000


### Test Train Split

In [6]:
X_=niz_final.iloc[:,:-1]
y=niz_final.iloc[:,-1]

In [7]:
#Scalling the Data to improve model perfomance and bring features into similar range
ss=StandardScaler()
X=ss.fit_transform(X_)
X=pd.DataFrame(X,columns=niz_final.columns[:-1])

In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=66,test_size=0.25)

### Perfoming Hyper-parameter tuning with comparison with other models

In [9]:
#Listing Parameters

rf_params = {'n_estimators': [500,300,100,800,1000], 'max_depth': [12,15,10,5,7,3]}

lr_param_grid = {}

svr_param_grid = {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10]}

In [10]:
# create models
rf_model = RandomForestRegressor(random_state=42)
lr_model = LinearRegression()
svr_model = SVR()

In [11]:
# create GridSearchCV objects
rf_grid = GridSearchCV(rf_model, rf_params, cv=5)
lr_grid = GridSearchCV(lr_model, lr_param_grid)
svr_grid = GridSearchCV(svr_model, svr_param_grid)

In [12]:
# fit the models
rf_grid.fit(X_train, y_train)
lr_grid.fit(X_train, y_train)
svr_grid.fit(X_train, y_train)

GridSearchCV(estimator=SVR(),
             param_grid={'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10],
                         'kernel': ['linear', 'rbf']})

In [13]:
# get the best hyperparameters
rf_best_params = rf_grid.best_params_
lr_best_params = lr_grid.best_params_
svr_best_params = svr_grid.best_params_

In [14]:
# create new models with the best hyperparameters
rf_best_model = RandomForestRegressor(**rf_best_params, random_state=42)
lr_best_model = LinearRegression(**lr_best_params)
svr_best_model = SVR(**svr_best_params)

In [15]:
# fit the best models
rf_best_model.fit(X_train, y_train)
lr_best_model.fit(X_train, y_train)
svr_best_model.fit(X_train, y_train)

SVR(C=10, gamma=0.1, kernel='linear')

In [16]:
# make predictions
rf_preds = rf_best_model.predict(X_test)
lr_preds = lr_best_model.predict(X_test)
svr_preds = svr_best_model.predict(X_test)

### Checking individual model performace

In [17]:
# calculate r2 scores

rf_r2 = r2_score(y_test, rf_preds)
print(f"Random Forest R2 Score: {rf_r2}")

lr_r2 = r2_score(y_test, lr_preds)
print(f"Linear Regression R2 Score: {lr_r2}")

svr_r2 = r2_score(y_test, svr_preds)
print(f"Support Vector Regressor R2 Score: {svr_r2}")

Random Forest R2 Score: 0.7048089653069679
Linear Regression R2 Score: 0.8095658681712304
Support Vector Regressor R2 Score: 0.8301594663408847


### Checking Ridge and Lasso Regression and cross validation to prevent overfitting

In [18]:
lr = LinearRegression()
ridge = Ridge()
lasso = Lasso()
rf = RandomForestRegressor()
svr = SVR()

lr_params = {'normalize': [True, False]}
ridge_params = {'alpha': [0.01, 0.1, 1, 10, 100]}
lasso_params = {'alpha': [0.01, 0.1, 1, 10, 100]}
rf_params = {'n_estimators': [500,300,100,800,1000], 'max_depth':[12,15,10,5,7,3] }
svr_params = {'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}

In [19]:
models = {'Linear Regression': (lr, lr_params),
          'Ridge Regression': (ridge, ridge_params),
          'Lasso Regression': (lasso, lasso_params),
          'Random Forest Regression': (rf, rf_params),
          'Support Vector Regression': (svr, svr_params)}

In [20]:
models = {'Linear Regression': (lr, lr_params),
          'Ridge Regression': (ridge, ridge_params),
          'Lasso Regression': (lasso, lasso_params),
          'Random Forest Regression': (rf, rf_params),
          'Support Vector Regression': (svr, svr_params)}
for name, (model, params) in models.items():
    gs = GridSearchCV(model, params, cv=2,n_jobs=-1)
    scores = cross_val_score(gs, X_test, y_test, cv=2, scoring='r2')
    print(f'{name}: {scores.mean():.3f} (±{scores.std():.3f})')

Linear Regression: 0.845 (±0.071)
Ridge Regression: 0.782 (±0.009)
Lasso Regression: 0.844 (±0.064)
Random Forest Regression: 0.657 (±0.037)
Support Vector Regression: 0.849 (±0.069)


### Performing ensemble methods and full model accuracy

In [21]:
import numpy as np
f_pred=[]
f_pred.append((rf_preds+lr_preds+svr_preds)/3)
f_pred=np.array(f_pred)
f_pred=f_pred.reshape(-1,1)

In [22]:
rr2 = r2_score(y_test, f_pred)
print(f" R2 Score: {rr2}")

mape = mean_absolute_percentage_error(y_test, f_pred)
print(f" Mean Absolute Percentage Error: {mape}")

mse=mean_squared_error(y_test,f_pred)
print(f"Mean Squared Error : {mape}")


 R2 Score: 0.8489642812825178
 Mean Absolute Percentage Error: 0.03522768098123049
Mean Squared Error : 0.03522768098123049


### Final Predictions


In [23]:
loc=[]
for i in f_pred:
    if (i>=0) and i<=50:
        loc.append('Good')
    elif i>=51 and i<=100:
        loc.append('Moderate')
    elif i>=101 and i<=150:
        loc.append('Unhealthy for Sensitive Groups')
    elif i>=151 and i<=200:
        loc.append('Unhealthy')
    elif i>=201 and i<=300:
        loc.append('Very Unhealthy')
    else:
        loc.append('Hazardous')
y_test=pd.DataFrame(y_test)
y_test['Level of Concern']=loc

In [24]:
y_test

Unnamed: 0,AQI,Level of Concern
2019-08-01,70.555556,Moderate
2021-09-01,43.0,Good
2017-04-01,67.37037,Moderate
2018-10-01,48.0,Hazardous
2019-11-01,61.777778,Moderate
2020-01-01,64.0,Moderate
2021-11-01,66.0,Moderate
2020-05-01,50.0,Moderate
2022-09-01,54.0,Moderate
2022-02-01,59.0,Moderate


### Predictor function

In [25]:
def predictor(df):
    df=ss.transform(df.values)
    rfp=rf_best_model.predict(df)
    lrp=lr_best_model.predict(df)
    svrp=svr_best_model.predict(df)
    f_pred=[]
    f_pred.append((rfp+lrp+svrp)/3)
    f_pred=np.array(f_pred)
    f_pred=f_pred.reshape(-1,1)
    loc=[]
    for i in f_pred:
        if (i>=0) and i<=50:
            loc.append('Good')
        elif i>=51 and i<=100:
            loc.append('Moderate')
        elif i>=101 and i<=150:
            loc.append('Unhealthy for Sensitive Groups')
        elif i>=151 and i<=200:
            loc.append('Unhealthy')
        elif i>=201 and i<=300:
            loc.append('Very Unhealthy')
        else:
            loc.append('Hazardous')
    df_p=pd.DataFrame(f_pred,columns=['AQI'])
    df_p['AQI']=f_pred
    df_p['Level of Concern']=np.array(loc).reshape(-1,1)
    print(df_p)
    return df_p

In [26]:

res=predictor(X_['2022'])

          AQI Level of Concern
0   56.845038         Moderate
1   59.356672         Moderate
2   65.489366         Moderate
3   58.387081         Moderate
4   56.930694         Moderate
5   53.051480         Moderate
6   47.828198             Good
7   53.098181         Moderate
8   52.833937         Moderate
9   61.121499         Moderate
10  56.947525         Moderate
11  59.520000         Moderate


In [29]:
print("Mean Absolute Error:",mean_absolute_error(y['2022'],res['AQI']))
print("Mean Squared Error:",mean_squared_error(y['2022'],res['AQI']))
print('Mean Absolute Percentage:',mean_absolute_percentage_error(y['2022'],res['AQI']))

Mean Absolute Error: 0.9311346528366619
Mean Squared Error: 1.1682990630185845
Mean Absolute Percentage: 0.016665636792293305


Predicting Future Pollutant values


In [30]:
niz_so2.index = pd.to_datetime(niz_so2['Year'].astype(str) + '-' + niz_so2['Month'], format='%Y-%b')
niz_so2.drop(['Year', 'Month'], axis=1, inplace=True)
niz_so2=niz_so2.sort_index()

In [31]:
niz_nox.index = pd.to_datetime(niz_nox['Year'].astype(str) + '-' + niz_nox['Month'], format='%Y-%b')
niz_nox.drop(['Year', 'Month'], axis=1, inplace=True)
niz_nox=niz_nox.sort_index()

In [32]:
niz_pm10.index = pd.to_datetime(niz_pm10['Year'].astype(str) + '-' + niz_pm10['Month'], format='%Y-%b')
niz_pm10.drop(['Year', 'Month'], axis=1, inplace=True)
niz_pm10=niz_pm10.sort_index()

In [33]:
#Performing exponential smoothing for the data to predict future dependent variables

from statsmodels.tsa.api import ExponentialSmoothing
model_so2= ExponentialSmoothing(niz_so2[:'2020'], trend='add', seasonal='mul')
fit_so2 = model_so2.fit(optimized=True)
fore_so2 = fit_so2.forecast(12)

from statsmodels.tsa.api import ExponentialSmoothing
model_nox= ExponentialSmoothing(niz_nox[:'2020'], trend='add', seasonal='mul')
fit_nox = model_nox.fit(optimized=True)
fore_nox = fit_nox.forecast(12)


from statsmodels.tsa.api import ExponentialSmoothing
model_pm10= ExponentialSmoothing(niz_pm10[:'2020'], trend='add', seasonal='mul')
fit_pm10 = model_pm10.fit(optimized=True)
fore_pm10 = fit_pm10.forecast(12)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [34]:
#Accuracy
from sklearn.metrics import mean_absolute_percentage_error,mean_absolute_error
mae_so2 = mean_absolute_error(niz_so2['2021'],fore_so2['2021'] )
print('Mean Absolute Error for so2:', mae_so2)
mae_nox = mean_absolute_error(niz_nox['2022'],fore_nox )
print('Mean Absolute Error for nox:', mae_nox)
mae_pm10 = mean_absolute_error(niz_pm10['2022'],fore_pm10 )
print('MAPE for PM10:', mae_pm10)

Mean Absolute Error for so2: 0.32864689429605903
Mean Absolute Error for nox: 1.9005767411529595
MAPE for PM10: 5.638941170433765


In [35]:
forecast_df=pd.DataFrame(fore_so2,columns=['SO2'])
forecast_df['NOX']=fore_so2
forecast_df['PM10']=fore_pm10
forecast_df

Unnamed: 0,SO2,NOX,PM10
2021-01-01,5.97813,5.97813,61.9623
2021-02-01,6.005965,6.005965,60.971072
2021-03-01,5.95996,5.95996,62.776207
2021-04-01,5.963667,5.963667,64.95808
2021-05-01,5.944375,5.944375,61.596362
2021-06-01,5.716075,5.716075,60.220693
2021-07-01,5.667271,5.667271,59.272349
2021-08-01,5.56422,5.56422,59.769461
2021-09-01,5.693629,5.693629,59.572024
2021-10-01,6.273429,6.273429,58.785554


In [36]:
final_pred=predictor(forecast_df)

          AQI Level of Concern
0   20.453248             Good
1   20.390425             Good
2   20.508834             Good
3   20.702611             Good
4   20.392451             Good
5   20.077293             Good
6   19.952764             Good
7   19.908183             Good
8   20.001455             Good
9   20.427922             Good
10  20.277262             Good
11  20.727640             Good


In [37]:
final_pred.to_csv('Nizamabad2023.csv')