# Week 3: EDA & Visualization

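This notebook builds on the earlier feature exploration with deeper visualizations, focusing on temporal patterns (by month and year) and geographic patterns (by location and area type) in pollutant levels.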


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Global plot styling for every figure in this notebook.
sns.set(style="whitegrid")

# Load the raw measurements and derive calendar parts from the `date` column
# in one chained expression.
df = (
    pd.read_csv("../data/data.csv", encoding="ISO-8859-1")
    .assign(
        # errors='coerce' turns unparseable date strings into NaT instead of raising.
        date=lambda frame: pd.to_datetime(frame['date'], errors='coerce'),
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
        day=lambda frame: frame['date'].dt.day,
    )
)

# Pollutant columns analysed in the cells below.
pollutants = ['so2', 'no2', 'rspm', 'spm', 'pm2_5']


In [None]:
# Mean of each pollutant per calendar month, pooled across all years.
monthly_avg = df.groupby('month')[pollutants].mean()

# Keep the Axes returned by pandas so the figure is configured explicitly
# rather than through pyplot's implicit "current axes" state.
ax = monthly_avg.plot(figsize=(10, 6), title="Monthly Pollution Trends")
ax.set_ylabel("Concentration")
ax.set_xticks(range(1, 13))  # one tick per calendar month
# Turn the grid on explicitly. A bare plt.grid() *toggles* visibility, so with
# the "whitegrid" style already showing grid lines it would switch them off.
ax.grid(True)
plt.show()

In [None]:
# Mean NO2 for every (year, month) combination: rows are years, columns are months.
pivot = df.pivot_table(values='no2', index='year', columns='month', aggfunc='mean')

# `year` and `month` come from a date parsed with errors='coerce', so they are
# float whenever any date failed to parse, and the heatmap ticks would read
# "1990.0" / "1.0". The grouped labels never contain NaN, so an int cast is safe.
pivot.index = pivot.index.astype(int)
pivot.columns = pivot.columns.astype(int)

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(pivot, cmap='coolwarm', annot=True, fmt=".1f", ax=ax)
ax.set_title("NO2 Levels by Year and Month")
ax.set_xlabel("Month")
ax.set_ylabel("Year")
plt.show()

In [None]:
# The six `location` values with the most rows in df. value_counts() sorts by
# descending frequency, so the first six index labels are the ones wanted.
top_cities = df['location'].value_counts().index[:6].tolist()

# Keep only the rows belonging to those locations.
in_top_cities = df['location'].isin(top_cities)
df_top_cities = df.loc[in_top_cities]

# One NO2-vs-year line per location, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=df_top_cities, x='year', y='no2', hue='location', ax=ax)
ax.set_title("NO2 Trends Over Time (Top Cities)")
ax.set_ylabel("NO2 Level")
plt.show()

In [None]:
# Yearly mean NO2 per area type: rows are years, one column per `type` value.
type_trends = (
    df.groupby(['year', 'type'])['no2']
    .mean()
    .unstack(level='type')
)

# pandas returns the Axes it drew on; label it directly.
ax = type_trends.plot(figsize=(12, 6), title="NO2 Trends by Area Type")
ax.set_ylabel("NO2 Level")
plt.show()

## Summary

- Seasonal (monthly) NO2 and SO2 patterns emerge, with spikes in certain months.
- The "top cities" (the six locations with the most records in the dataset) show consistent long-term NO2 trends.
- Industrial and residential areas show differing NO2 dynamics over the years (other pollutants were not broken down by area type here).
