# Importing Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Loading the Dataset

In [None]:
# Read the raw daily air-quality records into a DataFrame
raw_csv_path = '/content/city_day.csv'
df = pd.read_csv(raw_csv_path)

# Initial Data Exploration

In [None]:
# Preview the first five rows (five is the default for head())
df.head()

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,


In [None]:
# Print column dtypes and non-null counts to see which columns have gaps
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28169 entries, 0 to 28168
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        28169 non-null  object 
 1   Date        28169 non-null  object 
 2   PM2.5       23617 non-null  float64
 3   PM10        17164 non-null  float64
 4   NO          24710 non-null  float64
 5   NO2         24704 non-null  float64
 6   NOx         24027 non-null  float64
 7   NH3         18048 non-null  float64
 8   CO          26149 non-null  float64
 9   SO2         24371 non-null  float64
 10  O3          24281 non-null  float64
 11  Benzene     22783 non-null  float64
 12  Toluene     20454 non-null  float64
 13  Xylene      11180 non-null  float64
 14  AQI         23594 non-null  float64
 15  AQI_Bucket  23594 non-null  object 
dtypes: float64(13), object(3)
memory usage: 3.4+ MB


In [None]:
# Count the missing values in each column of the raw data
df.isna().sum()

Unnamed: 0,0
City,0
Date,0
PM2.5,4552
PM10,11005
NO,3459
NO2,3465
NOx,4142
NH3,10121
CO,2020
SO2,3798


In [None]:
# Parse the 'Date' strings into pandas datetime values
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates

In [None]:
# Dropping duplicate rows.
# The index is rebuilt as a contiguous 0..n-1 range afterwards: dropping rows
# would otherwise leave gaps in the index, and the column-wise pd.concat with
# the one-hot encoded city frame (which has a default RangeIndex) aligns on
# index labels. reset_index also returns an independent frame, so the
# column assignments in the following cells do not act on a filtered view.
data = df.drop_duplicates().reset_index(drop=True)

# Handling Missing Values

In [None]:

# Fill the gaps in every float64 column with that column's mean.
# NOTE(review): 'AQI' is a float64 column with missing values and is later
# used as the prediction target, so its gaps are mean-filled here too.
# Confirm that imputing the target is intended.
numeric_columns = data.select_dtypes(include=['float64']).columns
column_means = data[numeric_columns].mean()
data[numeric_columns] = data[numeric_columns].fillna(column_means)

In [None]:
# Fill missing 'AQI_Bucket' labels with the most frequent bucket
most_common_bucket = data['AQI_Bucket'].mode()[0]
data['AQI_Bucket'] = data['AQI_Bucket'].fillna(most_common_bucket)

In [None]:
# Confirm that no missing values remain after imputation
data.isna().sum()

Unnamed: 0,0
City,0
Date,0
PM2.5,0
PM10,0
NO,0
NO2,0
NOx,0
NH3,0
CO,0
SO2,0


# Outlier Detection and Replacement

In [None]:
# Defining a function to detect and replace outliers using the IQR method
def replace_outliers(df, columns):
    """Cap outliers in the given columns at the 1.5 * IQR fences.

    For each column, values below ``Q1 - 1.5 * IQR`` are raised to that lower
    fence and values above ``Q3 + 1.5 * IQR`` are lowered to that upper fence.
    Missing values are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is NOT modified; the capping is applied to a copy so
        the caller can still compare the data before and after.
    columns : iterable of str
        Names of the numeric columns to cap.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same shape, index and column order as ``df``.
    """
    # Work on a copy so the caller's frame keeps its original values.
    capped = df.copy()
    for col in columns:
        Q1 = capped[col].quantile(0.25)
        Q3 = capped[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # clip() replaces values outside [lower_bound, upper_bound] with the
        # nearest bound and leaves everything else (including NaN) unchanged.
        capped[col] = capped[col].clip(lower=lower_bound, upper=upper_bound)
    return capped

In [None]:
# Collect the names of the float64 columns that will be checked for outliers
numeric_columns = data.select_dtypes(include='float64').columns

In [None]:
# Cap the outliers of every selected column at its IQR bounds
data_replaced = replace_outliers(df=data, columns=numeric_columns)

In [None]:
# Shapes before and after outlier replacement (values are capped, rows are never dropped)
(data.shape, data_replaced.shape)

((28169, 16), (28169, 16))

In [None]:
# Checking the dataset summary after replacing outliers:
# the min/max of each capped column should now sit at or inside its IQR bounds
data_replaced.describe()

Unnamed: 0,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI
count,28169,28169.0,28169.0,28169.0,28169.0,28169.0,28169.0,28169.0,28169.0,28169.0,28169.0,28169.0,28169.0,28169.0
mean,2018-04-10 06:13:44.310412288,62.299648,112.429396,14.543764,27.538464,29.497341,21.425835,1.235389,12.13595,33.890519,2.302043,6.974455,2.589073,158.233378
min,2015-01-01 00:00:00,0.04,30.385856,0.02,0.01,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.331178,13.0
25%,2017-03-16 00:00:00,34.2,85.96,6.37,13.39,14.88,12.51,0.55,6.1,20.95,0.28,1.44,1.98,92.0
50%,2018-06-28 00:00:00,60.83,123.00943,11.81,25.99,28.15,24.096256,0.97,10.7,34.698675,1.95,7.24,3.079215,145.0
75%,2019-07-16 00:00:00,74.51,123.00943,17.92,35.3,36.62,24.096256,1.75,14.830197,42.78,3.330679,8.989373,3.079215,186.0
max,2020-07-01 00:00:00,134.975,178.583574,35.245,68.165,69.23,41.475639,3.55,27.925492,75.525,7.906696,20.313432,4.728037,327.0
std,,35.183317,40.395794,10.294798,17.500591,19.171594,10.717556,0.972426,7.491894,17.614358,2.19659,6.062154,1.243207,84.493469


In [None]:
# Persist the cleaned data (without the index column) for reuse
data_replaced.to_csv(path_or_buf='cleaned_dataset.csv', index=False)

# Feature Engineering

In [None]:
# Make sure 'Date' holds datetime values before using the .dt accessor
date_values = pd.to_datetime(data_replaced['Date'])
data_replaced['Date'] = date_values

In [None]:
# Split the date into calendar components, one feature column each
date_accessor = data_replaced['Date'].dt
data_replaced['Year'] = date_accessor.year
data_replaced['Month'] = date_accessor.month
data_replaced['Day'] = date_accessor.day

In [None]:
# Encode Month and Day as points on a circle so that the end of a cycle
# (December, day 31) lands next to its start (January, day 1).
# A fixed period of 31 is used for the day of month.
month_angle = 2 * np.pi * data_replaced['Month'] / 12
day_angle = 2 * np.pi * data_replaced['Day'] / 31

data_replaced['Month_sin'] = np.sin(month_angle)
data_replaced['Month_cos'] = np.cos(month_angle)
data_replaced['Day_sin'] = np.sin(day_angle)
data_replaced['Day_cos'] = np.cos(day_angle)

In [None]:
# One-hot encode the 'City' column. drop='first' omits the indicator of the
# first category, so a row with all zeros denotes that city.
encoder = OneHotEncoder(sparse_output=False, drop='first')
city_encoded = encoder.fit_transform(data_replaced[['City']])

# Carry over the source frame's index. Without it the new frame gets a fresh
# 0..n-1 index, and a column-wise pd.concat (which aligns on index labels)
# would pair rows incorrectly and introduce NaN rows whenever data_replaced
# does not have a default RangeIndex (e.g. after rows were dropped).
city_encoded_df = pd.DataFrame(
    city_encoded,
    columns=encoder.get_feature_names_out(['City']),
    index=data_replaced.index,
)

In [None]:
# Append the one-hot city columns to the right of the existing features.
# Note: re-running this cell appends the city columns again.
data_replaced = pd.concat([data_replaced, city_encoded_df], axis='columns')

In [None]:
# Remove the columns that have been replaced by engineered features
# ('Date', 'City') and the categorical form of the target ('AQI_Bucket')
columns_to_drop = ['Date', 'City', 'AQI_Bucket']
data_replaced = data_replaced.drop(columns=columns_to_drop)


In [None]:
# Preview the first five rows of the feature-engineered dataset
data_replaced.head(5)

Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,...,City_Jorapokhar,City_Kochi,City_Kolkata,City_Lucknow,City_Mumbai,City_Patna,City_Shillong,City_Talcher,City_Thiruvananthapuram,City_Visakhapatnam
0,69.773367,123.00943,0.92,18.22,17.15,24.096256,0.92,27.64,75.525,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,69.773367,123.00943,0.97,15.69,16.46,24.096256,0.97,24.55,34.06,3.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,69.773367,123.00943,17.4,19.3,29.7,24.096256,3.55,27.925492,30.7,6.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,69.773367,123.00943,1.7,18.48,17.97,24.096256,1.7,18.59,36.08,4.43,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,69.773367,123.00943,22.1,21.42,37.76,24.096256,3.55,27.925492,39.31,7.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Preparing Data for Model Training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Separate the target (y) from the predictor columns (X)
target_column = 'AQI'
y = data_replaced[target_column]
X = data_replaced.drop(columns=[target_column])

In [None]:
# Splitting data into training (80%) and testing (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Scale features with a Min-Max scaler.
# The scaler learns each feature's min/max from the training split only and
# the same transformation is then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Fit a 100-tree random forest (fixed seed) on the scaled training features
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train_scaled, y_train)

# Predict AQI for the held-out test rows
rf_predictions = rf_model.predict(X_test_scaled)

# Score the predictions against the true test targets
rf_r2 = r2_score(y_test, rf_predictions)
rf_mse = mean_squared_error(y_test, rf_predictions)

print(f"Random Forest MSE: {rf_mse}")
print(f"Random Forest R2 Score: {rf_r2}")


Random Forest MSE: 715.0321596529731
Random Forest R2 Score: 0.8998093737619848


# LSTM Model

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense

# Reshaping data for LSTM: a new axis of length 1 is inserted at position 1,
# turning (samples, features) into (samples, 1, features). Every row is
# therefore fed to the LSTM as a sequence with a single time step.
X_train_lstm = np.expand_dims(X_train_scaled, axis=1)
X_test_lstm = np.expand_dims(X_test_scaled, axis=1)

# Building the LSTM model.
# The input shape is declared with an explicit Input layer instead of the
# `input_shape` keyword on the first layer; recent Keras versions emit a
# UserWarning when `input_shape` is passed to a layer of a Sequential model.
lstm_model = Sequential([
    Input(shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])),
    LSTM(64, return_sequences=False),
    Dense(32, activation='relu'),
    Dense(1)
])

lstm_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Training the LSTM model.
# NOTE(review): the test split is passed as validation data, so the per-epoch
# val_loss/val_mae are test-set metrics. That is fine for monitoring, but do
# not use them for early stopping or hyper-parameter tuning.
lstm_model.fit(X_train_lstm, y_train, epochs=10, batch_size=32, validation_data=(X_test_lstm, y_test), verbose=1)

# Making predictions
lstm_predictions = lstm_model.predict(X_test_lstm)

# Evaluating the LSTM Model with the same metrics as the random forest
lstm_mse = mean_squared_error(y_test, lstm_predictions)
lstm_r2 = r2_score(y_test, lstm_predictions)

print(f"LSTM MSE: {lstm_mse}")
print(f"LSTM R2 Score: {lstm_r2}")


  super().__init__(**kwargs)


Epoch 1/10
[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - loss: 19845.3086 - mae: 112.1815 - val_loss: 2655.3989 - val_mae: 38.4656
Epoch 2/10
[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 2162.3408 - mae: 34.8456 - val_loss: 1431.2856 - val_mae: 28.2443
Epoch 3/10
[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 1407.9423 - mae: 27.2228 - val_loss: 1268.5980 - val_mae: 25.4523
Epoch 4/10
[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 1281.1078 - mae: 25.2340 - val_loss: 1230.1519 - val_mae: 24.6773
Epoch 5/10
[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - loss: 1266.4323 - mae: 24.7640 - val_loss: 1183.7528 - val_mae: 24.4505
Epoch 6/10
[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 1236.6790 - mae: 24.3779 - val_loss: 1146.9351 - val_mae: 23.7305
Epoch 7/10
[1m705/705[0m [32m━━━━━━