# 🌍 AQI Analysis – Hyderabad, Pakistan

### Project Objectives:
- Analyze real-time AQI data from Hyderabad
- Understand patterns in PM2.5, PM10, NO2, O3
- Prepare data for ML forecasting
- Explore temporal trends (hour/day/month)
- Apply SHAP to explain feature importance


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
import shap
import joblib
from config import MONGO_URI, DB_NAME

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and 3.8 removed the old names, so a hard-coded "seaborn" raises on current
# releases. Prefer the new name and fall back to the legacy one on older
# Matplotlib versions that do not ship it.
plt.style.use(
    "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
)
# Apply seaborn's own default theme on top of the Matplotlib style.
sns.set_theme()


# Open a client for the MongoDB (Atlas) feature store using the URI from
# config, then select the project database by name.
client = MongoClient(MONGO_URI)
db = client[DB_NAME]

# Pull every document of the `engineered_features` collection into a
# DataFrame and discard MongoDB's `_id` key so only stored fields remain.
df = pd.DataFrame(list(db.engineered_features.find())).drop(columns="_id")

# Preview the first rows (rendered as the cell output in a notebook).
df.head()



# Dimensions of the frame, followed by per-column dtype / non-null summary.
dataset_shape = df.shape
print("Dataset shape:", dataset_shape)
df.info()


# Summary statistics of the columns.
df.describe()

# Number of missing values in each column.
df.isna().sum()


# One 30-bin histogram with a KDE overlay per particulate-matter column.
# Each row is (column, bar colour, figure title, x-axis label).
for pm_column, bar_colour, hist_title, hist_xlabel in (
    ('pm2_5', 'skyblue', "Distribution of PM2.5 Levels", "PM2.5"),
    ('pm10', 'orange', "Distribution of PM10 Levels", "PM10"),
):
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(df[pm_column], kde=True, bins=30, color=bar_colour, ax=ax)
    ax.set_title(hist_title)
    ax.set_xlabel(hist_xlabel)
    ax.set_ylabel("Frequency")
    plt.show()


# Side-by-side box plots of the four pollutant columns on a shared y-axis.
pollutant_columns = ['pm2_5', 'pm10', 'no2', 'o3']
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=df[pollutant_columns], ax=ax)
ax.set_title("Pollutant Concentration Comparison")
ax.set_ylabel("Concentration")
plt.show()


# Annotated heatmap of the pairwise correlations between features.
plt.figure(figsize=(8,6))
# Since pandas 2.0, DataFrame.corr() no longer silently skips non-numeric
# columns and raises instead (for example on a string label column, or when
# this cell is re-run after such a column has been added). Restrict the
# frame to numeric/bool columns first; select_dtypes works on old and new
# pandas alike.
numeric_features = df.select_dtypes(include=["number", "bool"])
sns.heatmap(numeric_features.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Between Features")
plt.show()


# PM2.5 box plots grouped by each calendar feature in turn.
# Each row is (grouping column, figure title, x-axis label).
for time_column, box_title, box_xlabel in (
    ('hour', "Hourly PM2.5 Distribution", "Hour of Day"),
    ('day', "Daily PM2.5 Distribution", "Day of Month"),
    ('month', "Monthly PM2.5 Distribution", "Month"),
):
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.boxplot(x=time_column, y='pm2_5', data=df, ax=ax)
    ax.set_title(box_title)
    ax.set_xlabel(box_xlabel)
    ax.set_ylabel("PM2.5")
    plt.show()


def aqi_category(pm25):
    """Map a single PM2.5 reading to a coarse air-quality label.

    Bands (upper bounds inclusive):
        <= 50   -> "Good"
        <= 100  -> "Moderate"
        <= 150  -> "Unhealthy"
        > 150   -> "Hazardous"

    Args:
        pm25: One numeric PM2.5 value, or a missing value (NaN/None).

    Returns:
        The label string for the band the value falls in, or ``None`` when
        the reading is missing.

    NOTE(review): the thresholds are applied directly to the raw ``pm2_5``
    value; confirm they are meant for that column's units rather than for a
    computed AQI index.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing reading would fall through every branch and be reported as
    # "Hazardous", inflating the worst category.
    if pd.isna(pm25):
        return None
    if pm25 <= 50:
        return "Good"
    elif pm25 <= 100:
        return "Moderate"
    elif pm25 <= 150:
        return "Unhealthy"
    else:
        return "Hazardous"

# Label every row by passing its PM2.5 value through aqi_category, then
# tally how many rows land in each label.
category_order = ["Good","Moderate","Unhealthy","Hazardous"]
df['AQI_Category'] = df['pm2_5'].apply(aqi_category)
df['AQI_Category'].value_counts()


# Bar chart of the label counts, with bars in a fixed severity order rather
# than in order of first appearance.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='AQI_Category', data=df, order=category_order, ax=ax)
ax.set_title("AQI Category Distribution")
plt.show()


# Add lag1..lag3: the PM2.5 value from one, two and three rows earlier.
# NOTE(review): shift() works on the current row order, so this assumes the
# rows are already in chronological order — confirm against the loader.
for lag_steps in (1, 2, 3):
    df[f'lag{lag_steps}'] = df['pm2_5'].shift(lag_steps)

# Drop every row that has a missing value in any column; this includes the
# leading rows whose lagged values do not exist.
df.dropna(inplace=True)
df.head()


# Restore the previously trained estimator from disk.
model = joblib.load("../models/best_model.pkl")

# Everything except the target and its derived label is treated as a
# model input.
# NOTE(review): presumably these columns match the features the model was
# trained on — verify, since lag1..lag3 are added in this notebook.
non_feature_columns = ['pm2_5', 'AQI_Category']
X = df.drop(columns=non_feature_columns)

# Build the explainer with X as its second argument, then compute SHAP
# values for those same rows.
explainer = shap.Explainer(model, X)
shap_values = explainer(X)

# Summary plot of the SHAP values across the feature columns.
shap.summary_plot(shap_values, X)


# Write the working frame (including the columns added above) to CSV,
# leaving out the index column.
output_csv_path = "../data/hyderabad_aqi_eda_clean.csv"
df.to_csv(output_csv_path, index=False)