# AQI Data Analysis Notebook

This notebook performs exploratory data analysis (EDA) on the Air Quality Index (AQI) and weather data used by the AQI prediction system.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta

# Import our custom modules
import sys
sys.path.append('../src')

from features.feature_store import AQIFeatureStore
from data.data_collector import AQIWeatherDataCollector

# Plot styling: matplotlib's 'default' style sheet plus seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# Banner: notebook title followed by a 50-character rule
print("AQI Analysis Notebook", "=" * 50, sep="\n")

## 1. Data Collection and Loading

In [None]:
# Instantiate the feature store and the data collector; later cells reuse
# `feature_store` to load the historical dataset.
feature_store = AQIFeatureStore()
data_collector = AQIWeatherDataCollector()

# Run a single collection cycle and report how many entries came back.
# NOTE(review): the message calls the length "cities" -- presumably one row
# per city; confirm against run_collection_cycle().
print("Collecting current AQI data...")
current_data = data_collector.run_collection_cycle()
collected_total = len(current_data)
print(f"Collected data for {collected_total} cities")

# Preview of the collected data (cell output)
current_data.head()

In [None]:
# Fetch the training dataset from the feature store, requesting days=30
print("Loading historical data...")
historical_data = feature_store.get_training_data(days=30)
record_total = len(historical_data)
print(f"Loaded {record_total} historical records")

# Preview of the loaded data (cell output)
historical_data.head()

## 2. Data Quality Assessment

In [None]:
# Per-column missing-value report: absolute count and share of all rows
print("Missing values analysis:")
missing_data = historical_data.isna().sum()
missing_percent = missing_data.div(len(historical_data)).mul(100)
missing_df = pd.DataFrame({'Missing Count': missing_data, 'Missing %': missing_percent})

# Keep only the columns that actually have gaps, largest share first (cell output)
has_gaps = missing_df['Missing Count'].gt(0)
missing_df.loc[has_gaps].sort_values(by='Missing %', ascending=False)

In [None]:
# Column data types of the historical dataset. The bare expression on the
# last line is what the notebook renders as this cell's output.
print("Data types:")
historical_data.dtypes

In [None]:
# Summary statistics from DataFrame.describe(). The bare expression on the
# last line is what the notebook renders as this cell's output.
print("Basic statistics:")
historical_data.describe()

## 3. Exploratory Data Analysis

In [None]:
# Box plot of the 'aqi' column, one box per value of 'city'
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='city', y='aqi', data=historical_data, ax=ax)
ax.set_title('AQI Distribution by City')

# Tilt the city names so they do not overlap
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Columns entering the correlation matrix: weather readings, the AQI itself,
# then the individual pollutant columns (order determines the heatmap layout).
weather_cols = ['temperature', 'humidity', 'pressure', 'wind_speed']
pollutant_cols = ['pm2_5', 'pm10', 'co', 'no2', 'o3', 'so2']
numeric_cols = weather_cols + ['aqi'] + pollutant_cols

# Pairwise correlations (DataFrame.corr() defaults); reused by the summary cell
correlation_matrix = historical_data[numeric_cols].corr()

# Annotated heatmap with a diverging colour map centred on zero
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix of Environmental Factors')
fig.tight_layout()
plt.show()

In [None]:
# Time series analysis for a specific city.
# Change `target_city` to inspect another city; the filter and both plot
# titles follow it, so the name only has to be edited in one place.
target_city = 'Delhi'
city_data = historical_data[historical_data['city'] == target_city].copy()
city_data = city_data.sort_values('timestamp')

if city_data.empty:
    # Without this guard the cell would silently render two blank axes when
    # the city is absent from the loaded data.
    print(f"No records found for {target_city!r}; skipping time series plots.")
else:
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

    # AQI over time
    ax1.plot(city_data['timestamp'], city_data['aqi'], linewidth=2, label='AQI')
    ax1.set_title(f'{target_city} AQI Over Time')
    ax1.set_ylabel('AQI')
    ax1.grid(True, alpha=0.3)
    ax1.legend()  # the label= above is only displayed once a legend is drawn

    # PM2.5 over time
    ax2.plot(city_data['timestamp'], city_data['pm2_5'], linewidth=2, color='orange', label='PM2.5')
    ax2.set_title(f'{target_city} PM2.5 Over Time')
    ax2.set_ylabel('PM2.5 (μg/m³)')
    ax2.grid(True, alpha=0.3)
    ax2.legend()

    plt.tight_layout()
    plt.show()

## 4. Weather vs Air Quality Analysis

In [None]:
# Scatter of 'temperature' against 'aqi', one colour per city
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='temperature', y='aqi', hue='city', alpha=0.6,
                data=historical_data, ax=ax)
ax.set_title('Temperature vs AQI by City')
ax.set_xlabel('Temperature (°C)')
ax.set_ylabel('AQI')

# Anchor the legend just outside the right edge of the axes
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:
# Scatter of 'wind_speed' against 'pm2_5', one colour per city
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='wind_speed', y='pm2_5', hue='city', alpha=0.6,
                data=historical_data, ax=ax)
ax.set_title('Wind Speed vs PM2.5 by City')
ax.set_xlabel('Wind Speed (m/s)')
ax.set_ylabel('PM2.5 (μg/m³)')

# Anchor the legend just outside the right edge of the axes
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

## 5. Temporal Patterns (Hour of Day and Day of Week)

In [None]:
# Derive the hour of day from the timestamp and average AQI per hour.
# The `.dt` accessor requires 'timestamp' to already be a datetime column.
historical_data['hour'] = historical_data['timestamp'].dt.hour
hourly_aqi = historical_data.groupby('hour')['aqi'].mean()

# Line plot of the hourly means with a tick for every hour 0..23
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(hourly_aqi.index, hourly_aqi.values, marker='o', linewidth=2)
ax.set_title('Average AQI by Hour of Day')
ax.set_xlabel('Hour')
ax.set_ylabel('Average AQI')
ax.grid(True, alpha=0.3)
ax.set_xticks(range(0, 24))
fig.tight_layout()
plt.show()

In [None]:
# Day of week patterns: mean AQI per weekday, shown Monday through Sunday.
# The `.dt` accessor requires 'timestamp' to already be a datetime column.
historical_data['day_of_week'] = historical_data['timestamp'].dt.day_name()
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

plt.figure(figsize=(12, 6))
# reindex() forces calendar order; weekdays absent from the data become NaN
daily_aqi = historical_data.groupby('day_of_week')['aqi'].mean().reindex(day_order)
bars = plt.bar(daily_aqi.index, daily_aqi.values)
plt.title('Average AQI by Day of Week')
plt.xlabel('Day')
plt.ylabel('Average AQI')
plt.xticks(rotation=45)

# Color bars based on AQI levels. The upper bounds are inclusive so that a
# mean of exactly 50/100/150 gets the same band as get_aqi_category() assigns
# (the previous strict `<` pushed those boundary values one band higher).
aqi_color_bands = [(50, 'green'), (100, 'yellow'), (150, 'orange')]
for bar, aqi in zip(bars, daily_aqi.values):
    # First band whose upper bound covers the value; anything above 150
    # (or NaN, which fails every comparison) falls back to red.
    band_color = next((color for upper, color in aqi_color_bands if aqi <= upper), 'red')
    bar.set_color(band_color)

plt.tight_layout()
plt.show()

## 6. Air Quality Index Categories Analysis

In [None]:
# AQI category distribution
def get_aqi_category(aqi):
    """Map a numeric AQI value to its category label.

    Bands use inclusive upper bounds: <=50 'Good', <=100 'Moderate',
    <=150 'Unhealthy for Sensitive Groups', <=200 'Unhealthy',
    <=300 'Very Unhealthy', anything higher 'Hazardous'.

    Missing values (None, NaN, pd.NA) return 'Unknown'. Previously NaN failed
    every comparison and was reported as 'Hazardous', and None raised
    TypeError.
    """
    # Must be checked first: NaN compares False against every threshold below.
    if pd.isna(aqi):
        return 'Unknown'
    if aqi <= 50:
        return 'Good'
    elif aqi <= 100:
        return 'Moderate'
    elif aqi <= 150:
        return 'Unhealthy for Sensitive Groups'
    elif aqi <= 200:
        return 'Unhealthy'
    elif aqi <= 300:
        return 'Very Unhealthy'
    else:
        return 'Hazardous'

# Label every record with its AQI category
historical_data['aqi_category'] = historical_data['aqi'].apply(get_aqi_category)

# Records per category: raw counts (most frequent first, as value_counts
# returns them) and the same counts as a share of all records
category_counts = historical_data['aqi_category'].value_counts()
category_percentages = category_counts.div(len(historical_data)).mul(100)

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(category_counts.index, category_counts.values)
ax.set_title('AQI Category Distribution')
ax.set_xlabel('AQI Category')
ax.set_ylabel('Count')
plt.xticks(rotation=45, ha='right')

# Annotate each bar with its share, centred horizontally and placed
# 5 count-units above the top of the bar
for rect, share in zip(bars, category_percentages):
    label_x = rect.get_x() + rect.get_width()/2
    label_y = rect.get_height() + 5
    ax.text(label_x, label_y, f'{share:.1f}%', ha='center', va='bottom')

fig.tight_layout()
plt.show()

## 7. City-wise Analysis

In [None]:
# Per-city aggregates: which statistics to compute for each column
city_aggregations = {
    'aqi': ['mean', 'std', 'min', 'max'],
    'pm2_5': ['mean', 'std'],
    'temperature': 'mean',
    'humidity': 'mean'
}
city_stats = historical_data.groupby('city').agg(city_aggregations)
city_stats = city_stats.round(2)

# Flatten the two-level (column, statistic) header into names such as
# 'aqi_mean' and 'pm2_5_std', then turn the 'city' index back into a column
flat_names = ['_'.join(level_pair).strip() for level_pair in city_stats.columns.to_flat_index()]
city_stats.columns = flat_names
city_stats = city_stats.reset_index()

print("City-wise Statistics:")
city_stats

In [None]:
# Radar chart for city comparison: one closed polygon per city over the
# four per-city means computed in `city_stats`.
# NOTE(review): the four axes carry different units (AQI, PM2.5, temperature,
# humidity) but share a single radial scale -- consider normalising.
categories = ['aqi_mean', 'pm2_5_mean', 'temperature_mean', 'humidity_mean']
closed_theta = categories + [categories[0]]  # repeat the first axis to close the loop
fig = go.Figure()

# Walk the rows directly instead of re-filtering `city_stats` for every city.
# This also avoids rebinding the notebook-global `city_data`, which the
# time-series cell uses for its per-city frame.
metric_rows = city_stats[categories].to_numpy().tolist()
for city, values in zip(city_stats['city'], metric_rows):
    values = values + values[:1]  # Close the loop

    fig.add_trace(go.Scatterpolar(
        r=values,
        theta=closed_theta,
        fill='toself',
        name=city
    ))

# Largest plotted value plus 10% headroom. DataFrame/Series .max() skips NaN,
# whereas the builtin max() gives order-dependent results when NaN is present.
radial_upper = city_stats[categories].max().max() * 1.1

fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, radial_upper]
        )
    ),
    showlegend=True,
    title="City Comparison - Environmental Factors"
)

fig.show()

## 8. Summary and Insights

In [None]:
# Quantities reused by several lines of the report
n_records = len(historical_data)
timestamps = historical_data['timestamp']
aqi_means = city_stats['aqi_mean']
aqi_stds = city_stats['aqi_std']
# Absolute correlation of every other numeric column with AQI (sign dropped)
aqi_corr_strength = correlation_matrix['aqi'].drop('aqi').abs()

print("AQI Data Analysis Summary")
print("=" * 40)
print(f"Total records analyzed: {n_records}")
print(f"Cities covered: {len(historical_data['city'].unique())}")
print(f"Date range: {timestamps.min()} to {timestamps.max()}")
print()

# Key insights
print("Key Insights:")
print(f"1. Average AQI across all cities: {historical_data['aqi'].mean():.1f}")
print(f"2. City with best air quality: {city_stats.loc[aqi_means.idxmin(), 'city']} (AQI: {aqi_means.min():.1f})")
print(f"3. City with worst air quality: {city_stats.loc[aqi_means.idxmax(), 'city']} (AQI: {aqi_means.max():.1f})")
print(f"4. Most variable city (highest AQI std): {city_stats.loc[aqi_stds.idxmax(), 'city']} (Std: {aqi_stds.max():.1f})")
print(f"5. Strongest correlation with AQI: {aqi_corr_strength.idxmax()} ({aqi_corr_strength.max():.3f})")

# AQI category breakdown: count and share of all records per category
print("\nAQI Category Distribution:")
for category_name, n_in_category in category_counts.items():
    share = (n_in_category / n_records) * 100
    print(f"  {category_name}: {n_in_category} ({share:.1f}%)")