# Week 4: Data Cleaning & Preparation

This notebook focuses on cleaning the dataset and preparing it for modeling.


In [None]:
import pandas as pd
import numpy as np

# Load the raw CSV (ISO-8859-1 encoded) and derive calendar fields in one chain.
# Unparseable dates are coerced to NaT, so `year`/`month` are missing for those rows.
df = pd.read_csv("../data/data.csv", encoding="ISO-8859-1").assign(
    date=lambda raw: pd.to_datetime(raw['date'], errors='coerce'),
    year=lambda parsed: parsed['date'].dt.year,
    month=lambda parsed: parsed['date'].dt.month,
)


In [None]:
# Keep only the columns used by the rest of the notebook: the descriptive and
# date-derived columns first, followed by the five pollutant columns.
descriptive_cols = ['state', 'location', 'type', 'date', 'year', 'month']
pollutant_cols = ['so2', 'no2', 'rspm', 'spm', 'pm2_5']
cols_to_keep = descriptive_cols + pollutant_cols

# Label-based selection; raises KeyError if any listed column is absent.
df = df.loc[:, cols_to_keep]

In [None]:
# Pollutant columns that drive both the row filter and the imputation below.
pollutant_cols = ['so2', 'no2', 'rspm', 'spm', 'pm2_5']

# Drop rows in which every pollutant reading is missing; a row with at least
# one reading is kept.
df = df.dropna(subset=pollutant_cols, how='all')

# Fill the remaining gaps with each column's median (computed after the row
# filter, NaNs skipped). Passing the medians as a Series makes fillna match
# them to columns by label and return a new frame, which avoids assigning
# column-by-column into the filtered frame (the source of
# SettingWithCopyWarning) and leaves non-pollutant columns untouched.
df = df.fillna(df[pollutant_cols].median())

In [None]:
# Row-wise sum of the five pollutant columns.
pollutant_cols = ['so2', 'no2', 'rspm', 'spm', 'pm2_5']
df['total_pollution'] = df[pollutant_cols].sum(axis=1)

# Season label for each month number, expanded from a season -> months table.
# Months that are missing (or not 1-12) map to NaN.
season_months = {
    'Winter': (12, 1, 2),
    'Spring': (3, 4, 5),
    'Summer': (6, 7, 8),
    'Fall': (9, 10, 11),
}
month_to_season = {
    month: season
    for season, months in season_months.items()
    for month in months
}
df['season'] = df['month'].map(month_to_season)

In [None]:
# Report the (rows, columns) size of the cleaned frame, then preview the
# first five rows as the cell's displayed output.
n_rows, n_cols = df.shape
print("Cleaned dataset shape:", (n_rows, n_cols))
df.head(n=5)

In [None]:
# Persist the cleaned frame as CSV; index=False leaves the row index out of the file.
cleaned_csv_path = "../data/cleaned_air_quality.csv"
df.to_csv(cleaned_csv_path, index=False)