In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import warnings
# NOTE(review): this silences every warning category for the whole session,
# not just a specific noisy one -- confirm that is intended.
warnings.filterwarnings(action='ignore')

# Plot appearance: matplotlib's seaborn-style dark grid plus the "husl" palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette(palette="husl")

In [None]:
# Load data: read the CSV and convert its 'date' column to datetimes
df = pd.read_csv('../data/delhi_aqi.csv').assign(
    date=lambda raw: pd.to_datetime(raw['date'])
)

# Quick overview: dimensions, column names, and the time span covered
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    f"\nDate range: {df['date'].min()} to {df['date'].max()}",
    sep="\n",
)


In [None]:
"""
## 3. Basic Statistics
"""

# Descriptive statistics as reported by DataFrame.describe()
print("Basic Statistics:", df.describe(), sep="\n")

# Number of null entries in each column
print("\nMissing Values:", df.isna().sum(), sep="\n")

In [None]:
"""
## 4. Time Series Analysis
"""

# Interactive line chart of the PM2.5 column against the date column
fig = go.Figure(
    data=[go.Scatter(x=df['date'], y=df['pm2_5'], mode='lines', name='PM2.5')]
)
fig.update_layout(
    title={'text': 'PM2.5 Time Series'},
    xaxis={'title': {'text': 'Date'}},
    yaxis={'title': {'text': 'PM2.5 (μg/m³)'}},
)
fig.show()

In [None]:
"""
## 5. Correlation Analysis
"""

# Pairwise correlations between the seven pollutant columns
corr_matrix = df.loc[:, ['pm2_5', 'pm10', 'no2', 'o3', 'co', 'so2', 'nh3']].corr()

# Annotated heatmap with a diverging colour map centred on zero
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=corr_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
).set_title('Correlation Matrix of Pollutants')
plt.tight_layout()
plt.show()


In [None]:
# Histogram grid: one panel per pollutant column
pollutants = ['pm2_5', 'pm10', 'no2', 'o3', 'co', 'so2', 'nh3']
fig, axes = plt.subplots(3, 3, figsize=(15, 12))

# axes.flat walks the grid row by row, so panels fill left-to-right, top-to-bottom
for ax, pollutant in zip(axes.flat, pollutants):
    df[pollutant].hist(bins=50, ax=ax, edgecolor='black')
    ax.set_title(f'{pollutant} Distribution')
    ax.set_xlabel('Concentration (μg/m³)')
    ax.set_ylabel('Frequency')

# The 3x3 grid has more panels than there are pollutants; hide the leftovers
# so the figure does not end with blank, framed axes.
for unused_ax in axes.flat[len(pollutants):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:

# Hour of day taken from each timestamp
df['hour'] = df['date'].dt.hour

# Mean concentration per hour of day for the three plotted pollutants
hourly_avg = (
    df.groupby('hour')[['pm2_5', 'pm10', 'no2']]
    .mean()
    .reset_index()
)

# One marked line per pollutant against hour of day
fig, ax = plt.subplots(figsize=(12, 6))
for pollutant in ('pm2_5', 'pm10', 'no2'):
    ax.plot(hourly_avg['hour'], hourly_avg[pollutant], marker='o', label=pollutant)

ax.set(
    xlabel='Hour of Day',
    ylabel='Concentration (μg/m³)',
    title='Hourly Pollution Patterns',
)
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()


In [None]:
# Define AQI categories based on PM2.5
def get_aqi_category(pm25):
    """Map a PM2.5 reading to an AQI category label.

    Bands are upper-inclusive:
    <= 30 'Good', <= 60 'Satisfactory', <= 90 'Moderate',
    <= 120 'Poor', <= 250 'Very Poor', anything higher 'Severe'.

    Args:
        pm25: Numeric PM2.5 reading (presumably in μg/m³, matching the axis
            labels used elsewhere in this notebook -- confirm). None, NaN and
            pd.NA are treated as missing.

    Returns:
        The category label as a string, or None when the reading is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # reading would fall through all bands and be labelled 'Severe'.
    if pd.isna(pm25):
        return None

    bands = (
        (30, 'Good'),
        (60, 'Satisfactory'),
        (90, 'Moderate'),
        (120, 'Poor'),
        (250, 'Very Poor'),
    )
    for upper_bound, label in bands:
        if pm25 <= upper_bound:
            return label
    return 'Severe'

# Label every row with the AQI category of its PM2.5 reading
df['aqi_category'] = df['pm2_5'].apply(get_aqi_category)

# Count by category (value_counts() orders the result by frequency)
category_counts = df['aqi_category'].value_counts()

# The colour list is positional and runs from least to most severe. Because
# value_counts() orders by frequency, plotting it directly would pair bars with
# the wrong colours. Plot the counts in a fixed severity order instead, with
# absent categories shown as zero so positions always line up with colours.
category_order = ['Good', 'Satisfactory', 'Moderate', 'Poor', 'Very Poor', 'Severe']
colors = ['#00E400', '#FFFF00', '#FF7E00', '#FF0000', '#8F3F97', '#7E0023']
ordered_counts = category_counts.reindex(category_order, fill_value=0)

# Plot categories
fig, ax = plt.subplots(figsize=(10, 6))
ordered_counts.plot(kind='bar', color=colors, ax=ax, edgecolor='black')
ax.set_xlabel('AQI Category')
ax.set_ylabel('Count')
ax.set_title('Distribution of AQI Categories')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Calendar features derived from the timestamp column
date_parts = df['date'].dt
df['hour'] = date_parts.hour
df['day'] = date_parts.day
df['month'] = date_parts.month
df['day_of_week'] = date_parts.dayofweek
# dayofweek counts from Monday=0, so 5 and 6 are Saturday and Sunday
df['is_weekend'] = df['day_of_week'].ge(5).astype(int)

# Derived features: row-wise sum of the gas columns, and the PM2.5/PM10 ratio
# (the small constant keeps the denominator away from zero)
df['total_pollutants'] = df.loc[:, ['co', 'no', 'no2', 'o3', 'so2', 'nh3']].sum(axis='columns')
df['pm_ratio'] = df['pm2_5'].div(df['pm10'].add(1e-5))

print(
    "New features created:",
    df[['hour', 'day', 'month', 'day_of_week', 'is_weekend', 'total_pollutants', 'pm_ratio']].head(),
    sep="\n",
)


In [None]:
# Write the enriched frame to CSV without the row index, then confirm the path
df.to_csv(path_or_buf='../data/processed_delhi_aqi.csv', index=False)
print("Processed data saved to '../data/processed_delhi_aqi.csv'")