In [79]:
#Air Quality Monitoring Assistant

In [80]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# LOAD DATASET
data=pd.read_csv("AQI.csv")
data.head()

Unnamed: 0,country,state,city,station,last_update,latitude,longitude,pollutant_id,pollutant_min,pollutant_max,pollutant_avg
0,India,Andhra_Pradesh,Vijayawada,"Rajiv Nagar, Vijayawada - APPCB",19-05-2025 10:00:00,16.554731,80.64911,SO2,20.0,23.0,21.0
1,India,Andhra_Pradesh,Vijayawada,"Rajiv Nagar, Vijayawada - APPCB",19-05-2025 10:00:00,16.554731,80.64911,OZONE,17.0,35.0,22.0
2,India,Andhra_Pradesh,Visakhapatnam,"GVM Corporation, Visakhapatnam - APPCB",19-05-2025 10:00:00,17.72,83.3,PM10,23.0,66.0,36.0
3,India,Arunachal_Pradesh,Naharlagun,"Naharlagun, Naharlagun - APSPCB",19-05-2025 10:00:00,27.103358,93.679645,NO2,4.0,5.0,5.0
4,India,Arunachal_Pradesh,Naharlagun,"Naharlagun, Naharlagun - APSPCB",19-05-2025 10:00:00,27.103358,93.679645,CO,10.0,19.0,12.0


In [81]:
# Checking shape
data.shape

(3077, 11)

In [82]:
# Information 
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3077 entries, 0 to 3076
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   country        3077 non-null   object 
 1   state          3077 non-null   object 
 2   city           3077 non-null   object 
 3   station        3077 non-null   object 
 4   last_update    3077 non-null   object 
 5   latitude       3077 non-null   float64
 6   longitude      3077 non-null   float64
 7   pollutant_id   3077 non-null   object 
 8   pollutant_min  2778 non-null   float64
 9   pollutant_max  2778 non-null   float64
 10  pollutant_avg  2778 non-null   float64
dtypes: float64(5), object(6)
memory usage: 264.6+ KB
None


In [83]:
# Describe
print(data.describe())

          latitude    longitude  pollutant_min  pollutant_max  pollutant_avg
count  3077.000000  3077.000000    2778.000000    2778.000000    2778.000000
mean     22.250067    78.682006      21.670986      59.251620      33.933765
std       5.519664     4.854371      25.535470      65.896529      36.125555
min       8.514909    70.909168       1.000000       1.000000       1.000000
25%      19.036458    75.521300       5.000000      15.000000       9.000000
50%      23.023389    77.508730      13.000000      40.000000      23.000000
75%      26.833997    80.723178      29.000000      79.000000      45.000000
max      34.066206    94.636574     303.000000     500.000000     336.000000


In [84]:
# HANDLING MISSING DATA

# Return null values or not
print(data.isnull())

      country  state   city  station  last_update  latitude  longitude  \
0       False  False  False    False        False     False      False   
1       False  False  False    False        False     False      False   
2       False  False  False    False        False     False      False   
3       False  False  False    False        False     False      False   
4       False  False  False    False        False     False      False   
...       ...    ...    ...      ...          ...       ...        ...   
3072    False  False  False    False        False     False      False   
3073    False  False  False    False        False     False      False   
3074    False  False  False    False        False     False      False   
3075    False  False  False    False        False     False      False   
3076    False  False  False    False        False     False      False   

      pollutant_id  pollutant_min  pollutant_max  pollutant_avg  
0            False          False          Fa

In [85]:
# number of null values
print(data.isnull().sum())

country            0
state              0
city               0
station            0
last_update        0
latitude           0
longitude          0
pollutant_id       0
pollutant_min    299
pollutant_max    299
pollutant_avg    299
dtype: int64


In [86]:
# Filling null values
min_median = data.groupby('city')['pollutant_min'].median()
data['pollutant_min']=data['pollutant_min'].fillna(data['city'].map(min_median))

In [87]:
max_median = data.groupby('city')['pollutant_max'].median()
data['pollutant_max'] = data['pollutant_max'].fillna(data['city'].map(max_median))

In [88]:
avg_median = data.groupby('city')['pollutant_avg'].median()
data['pollutant_avg'] = data['pollutant_avg'].fillna(data['city'].map(avg_median))

In [89]:
print(data.isnull().sum())

country           0
state             0
city              0
station           0
last_update       0
latitude          0
longitude         0
pollutant_id      0
pollutant_min    84
pollutant_max    84
pollutant_avg    84
dtype: int64


In [90]:
# PIVOTING / RESHAPING THE DATASET
data_wide=data.pivot_table(index=['city','last_update'],columns='pollutant_id',values='pollutant_avg',aggfunc='mean').reset_index()
print(data_wide.head())

pollutant_id        city          last_update         CO        NH3  \
0               Agartala  19-05-2025 10:00:00   2.000000        NaN   
1                   Agra  19-05-2025 10:00:00  23.833333   3.200000   
2              Ahmedabad  19-05-2025 10:00:00  20.375000   4.857143   
3             Ahmednagar  19-05-2025 10:00:00  31.000000  13.000000   
4                 Aizawl  19-05-2025 10:00:00   3.000000   2.000000   

pollutant_id        NO2       OZONE       PM10      PM2.5        SO2  
0             64.500000  127.000000        NaN  64.500000  64.500000  
1             26.333333   25.500000  97.833333  60.416667  20.166667  
2             37.777778   12.222222  66.111111  63.750000  12.000000  
3             28.000000    6.000000  12.000000  11.000000  10.000000  
4                   NaN    1.000000   2.000000   2.000000   2.000000  


In [91]:
# ADDING AQI 
# To derive AQI we will use CPCB threshold rule

def aqi_catagory_pm25(pm25):
    if pm25<=50:
        return 'Good'
    elif pm25<=100:
        return 'Satisfactory'
    elif pm25<=200:
        return 'Moderate'
    elif pm25<=300:
        return 'Poor'
    elif pm25<=400:
        return 'Very Poor'
    else:
        return 'Severe'

data_wide['AQI_Catagory']=data_wide['PM2.5'].apply(aqi_catagory_pm25)

In [101]:
# Encoding AQI catagory
from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()
data_wide['AQI_Catagory']=le.fit_transform(data_wide['AQI_Catagory'])

In [107]:
X=data_wide[['PM2.5','PM10','NO2','SO2','CO','OZONE']]
y=data_wide['AQI_Catagory']

In [109]:
# Train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [111]:
# Train model ML
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [113]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         9

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



In [115]:
# Feature Importance
import pandas as pd

importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)

importance_df


Unnamed: 0,Feature,Importance
0,PM2.5,0.6257
1,PM10,0.157175
2,NO2,0.083845
4,CO,0.060912
5,OZONE,0.042826
3,SO2,0.029541


In [119]:
# Sample demo
sample = [[120, 180, 45, 12, 1.2, 30]]
prediction = model.predict(sample)

print("Predicted AQI Category:", prediction)


Predicted AQI Category: [1]




In [123]:
# Connect to AI
aqi_labels = {
    0: "Good",
    1: "Satisfactory",
    2: "Moderate",
    3: "Poor",
    4: "Very Poor",
    5: "Severe"
}

print("Air Quality:", aqi_labels[prediction[0]])


Air Quality: Satisfactory


In [125]:
# Saving the model
import joblib
joblib.dump(model, "aqi_model.pkl")

['aqi_model.pkl']