## Air Quality Index Prediction

In [1]:
# To handle dataset
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt

# to divide train and test set
from sklearn.model_selection import train_test_split

# To build model
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
import xgboost as xg
import lightgbm as ltb

# to evaluate the model
from sklearn.metrics import mean_squared_error, r2_score

# to save the trained model
import pickle



In [2]:
# Loading dataset to pandas dataframe

data = pd.read_csv("station_day.csv")

In [3]:
# visualise the dataset

data.head()

Unnamed: 0,StationId,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,AP001,2017-11-24,71.36,115.75,1.75,20.65,12.4,12.19,0.1,10.76,109.26,0.17,5.92,0.1,,
1,AP001,2017-11-25,81.4,124.5,1.44,20.5,12.08,10.72,0.12,15.24,127.09,0.2,6.5,0.06,184.0,Moderate
2,AP001,2017-11-26,78.32,129.06,1.26,26.0,14.85,10.28,0.14,26.96,117.44,0.22,7.95,0.08,197.0,Moderate
3,AP001,2017-11-27,88.76,135.32,6.6,30.85,21.77,12.91,0.11,33.59,111.81,0.29,7.63,0.12,198.0,Moderate
4,AP001,2017-11-28,64.18,104.09,2.56,28.07,17.01,11.42,0.09,19.0,138.18,0.17,5.02,0.07,188.0,Moderate


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108035 entries, 0 to 108034
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   StationId   108035 non-null  object 
 1   Date        108035 non-null  object 
 2   PM2.5       86410 non-null   float64
 3   PM10        65329 non-null   float64
 4   NO          90929 non-null   float64
 5   NO2         91488 non-null   float64
 6   NOx         92535 non-null   float64
 7   NH3         59930 non-null   float64
 8   CO          95037 non-null   float64
 9   SO2         82831 non-null   float64
 10  O3          82467 non-null   float64
 11  Benzene     76580 non-null   float64
 12  Toluene     69333 non-null   float64
 13  Xylene      22898 non-null   float64
 14  AQI         87025 non-null   float64
 15  AQI_Bucket  87025 non-null   object 
dtypes: float64(13), object(3)
memory usage: 13.2+ MB


In [5]:
data.describe()

Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI
count,86410.0,65329.0,90929.0,91488.0,92535.0,59930.0,95037.0,82831.0,82467.0,76580.0,69333.0,22898.0,87025.0
mean,80.272571,157.968427,23.123424,35.24076,41.195055,28.732875,1.605749,12.257634,38.134836,3.358029,15.345394,2.423446,179.74929
std,76.526403,123.418672,34.491019,29.510827,45.145976,24.897797,4.369578,12.984723,39.128004,11.156234,29.348587,6.472409,131.324339
min,0.02,0.01,0.01,0.01,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.0,8.0
25%,31.88,70.15,4.84,15.09,13.97,11.9,0.53,5.04,18.895,0.16,0.69,0.0,86.0
50%,55.95,122.09,10.29,27.21,26.66,23.59,0.91,8.95,30.84,1.21,4.33,0.4,132.0
75%,99.92,208.67,24.98,46.93,50.5,38.1375,1.45,14.92,47.14,3.61,17.51,2.11,254.0
max,1000.0,1000.0,470.0,448.05,467.63,418.9,175.81,195.65,963.0,455.03,454.85,170.37,2049.0


In [6]:
# Droping columns which are not important

data = data.drop(['StationId', 'Date', 'AQI_Bucket'], axis=1)

In [7]:
# checking for '0' values in data

zero_values = (data == 0).sum()

print(zero_values)

PM2.5          0
PM10           0
NO             0
NO2            0
NOx         4776
NH3            0
CO          7280
SO2            0
O3             0
Benzene    12602
Toluene    10455
Xylene      6083
AQI            0
dtype: int64


In [8]:
# Function for filling nan values

def fill_missing_with_mean(data, columns):
    # Iterate over each column
    for col in columns:
        # Calculate the mean of the column
        col_mean = np.nanmean(data[col])
        
        # Replace NaN values with mean value
        data[col].fillna(col_mean, inplace=True)
        
    return data

In [9]:
# Specifying the columns to fill

columns_to_fill = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI']  # Names of columns to fill with mean

# Fill nan values in specific columns with mean using the function
filled_data = fill_missing_with_mean(data, columns_to_fill)

In [10]:
# Function for replacing 0 with mean

def fill_zeros_with_mean(df, columns):
    # Calculate the mean of specified columns
    mean_values = df[columns].mean()

    # Fill 0 values in specified columns with mean
    df_filled = df.copy()
    df_filled[columns] = df_filled[columns].replace(0, np.nan).fillna(mean_values)
    
    return df_filled

In [11]:
# Specifying the columns to fill
columns_to_fill = ['NOx', 'CO', 'Benzene', 'Toluene', 'Xylene']

# Fill 0 values in specific columns with mean using the function
df_filled = fill_zeros_with_mean(filled_data, columns_to_fill)

### Seperate dataset into train and test set

In [12]:
X = df_filled.drop(['AQI'], axis=1)
y = df_filled['AQI']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

## Model Building

### 1. Regularised Linear Regression : Lasso

In [14]:
# set up the model

lasso_model = Lasso(alpha=0.001, random_state=0)

# train the model

lasso_model.fit(X_train, y_train)

In [15]:
# evaluate the model:
# ====================


# We will evaluate performance using the mean squared error and
# the root of the mean squared error and r2

# make predictions for train set
pred = lasso_model.predict(X_train)

# determine mse, rmse and r2
print('train mse: {}'.format(int(
    mean_squared_error(y_train, pred))))
print('train rmse: {}'.format(int(
    mean_squared_error(y_train, pred, squared=False))))
print('train r2: {}'.format(
    r2_score(y_train, pred)))
print()

# make predictions for test set
pred = lasso_model.predict(X_test)

# determine mse, rmse and r2
print('test mse: {}'.format(int(
    mean_squared_error(y_test, pred))))
print('test rmse: {}'.format(int(
    mean_squared_error(y_test, pred, squared=False))))
print('test r2: {}'.format(
    r2_score(y_test, pred)))
print()


train mse: 2955
train rmse: 54
train r2: 0.7875298389115255

test mse: 2773
test rmse: 52
test r2: 0.7978186568867717



### 2. Linear Regression

In [16]:
# set up the model

lin_model = LinearRegression()

# train the model

lin_model.fit(X_train, y_train)

In [17]:
# evaluate the model:
# ====================

# We will evaluate performance using the mean squared error and
# the root of the mean squared error and r2

# make predictions for train set
pred = lin_model.predict(X_train)

# determine mse, rmse and r2
print('train mse: {}'.format(int(
    mean_squared_error(y_train, pred))))
print('train rmse: {}'.format(int(
    mean_squared_error(y_train, pred, squared=False))))
print('train r2: {}'.format(
    r2_score(y_train, pred)))
print()

# make predictions for test set
pred = lin_model.predict(X_test)

# determine mse, rmse and r2
print('test mse: {}'.format(int(
    mean_squared_error(y_test, pred))))
print('test rmse: {}'.format(int(
    mean_squared_error(y_test, pred, squared=False))))
print('test r2: {}'.format(
    r2_score(y_test, pred)))
print()

train mse: 2955
train rmse: 54
train r2: 0.7875298389260742

test mse: 2773
test rmse: 52
test r2: 0.7978185799180499



### 3. Decision Tree

In [18]:
# set up the model

dtr_model = DecisionTreeRegressor()

# train the model

dtr_model.fit(X_train, y_train)

In [19]:
# evaluate the model:
# ====================


# We will evaluate performance using the mean squared error and
# the root of the mean squared error and r2

# make predictions for train set
pred = dtr_model.predict(X_train)

# determine mse, rmse and r2
print('train mse: {}'.format(int(
    mean_squared_error(y_train, pred))))
print('train rmse: {}'.format(int(
    mean_squared_error(y_train, pred, squared=False))))
print('train r2: {}'.format(
    r2_score(y_train, pred)))
print()

# make predictions for test set
pred = dtr_model.predict(X_test)

# determine mse, rmse and r2
print('test mse: {}'.format(int(
    mean_squared_error(y_test, pred))))
print('test rmse: {}'.format(int(
    mean_squared_error(y_test, pred, squared=False))))
print('test r2: {}'.format(
    r2_score(y_test, pred)))
print()

train mse: 40
train rmse: 6
train r2: 0.99711169429428

test mse: 2857
test rmse: 53
test r2: 0.7916486776325855



### 4. XgBoost Regressor

In [20]:
# set up the model

xgb_model = xg.XGBRegressor()

# train the model

xgb_model.fit(X_train, y_train)

In [21]:
# evaluate the model:
# ====================


# We will evaluate performance using the mean squared error and
# the root of the mean squared error and r2

# make predictions for train set
pred = xgb_model.predict(X_train)

# determine mse, rmse and r2
print('train mse: {}'.format(int(
    mean_squared_error(y_train, pred))))
print('train rmse: {}'.format(int(
    mean_squared_error(y_train, pred, squared=False))))
print('train r2: {}'.format(
    r2_score(y_train, pred)))
print()

# make predictions for test set
pred = xgb_model.predict(X_test)

# determine mse, rmse and r2
print('test mse: {}'.format(int(
    mean_squared_error(y_test, pred))))
print('test rmse: {}'.format(int(
    mean_squared_error(y_test, pred, squared=False))))
print('test r2: {}'.format(
    r2_score(y_test, pred)))
print()

train mse: 915
train rmse: 30
train r2: 0.9341571142311965

test mse: 1467
test rmse: 38
test r2: 0.8929986478151111



### LightGBM Regressor

In [22]:
# set up the model

lgbm_model = ltb.LGBMRegressor()

# train the model

lgbm_model.fit(X_train, y_train)

In [23]:
# evaluate the model:
# ====================


# We will evaluate performance using the mean squared error and
# the root of the mean squared error and r2

# make predictions for train set
pred = lgbm_model.predict(X_train)

# determine mse, rmse and r2
print('train mse: {}'.format(int(
    mean_squared_error(y_train, pred))))
print('train rmse: {}'.format(int(
    mean_squared_error(y_train, pred, squared=False))))
print('train r2: {}'.format(
    r2_score(y_train, pred)))
print()

# make predictions for test set
pred = lgbm_model.predict(X_test)

# determine mse, rmse and r2
print('test mse: {}'.format(int(
    mean_squared_error(y_test, pred))))
print('test rmse: {}'.format(int(
    mean_squared_error(y_test, pred, squared=False))))
print('test r2: {}'.format(
    r2_score(y_test, pred)))
print()

train mse: 1251
train rmse: 35
train r2: 0.9099993760786396

test mse: 1370
test rmse: 37
test r2: 0.9000518027508108



#### Saving the trained model

In [24]:
filename = 'lasso_model.sav'
pickle.dump(lasso_model, open(filename,'wb'))