# Week 4: Basic Modeling - Predicting NO2 Levels

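This notebook builds a baseline linear regression model that predicts NO2 levels from the other pollutant measurements (SO2, RSPM, SPM, PM2.5) and metadata (the `type` category plus the month and year of each reading).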


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Apply seaborn's white-grid style to the figures drawn in this notebook.
sns.set(style="whitegrid")

# Read the raw CSV (ISO-8859-1 encoded), parse `date`, and derive the
# calendar year and month from it. errors='coerce' turns unparseable dates
# into NaT, so the derived year/month are missing for those rows.
df = pd.read_csv("../data/data.csv", encoding="ISO-8859-1").assign(
    date=lambda frame: pd.to_datetime(frame['date'], errors='coerce'),
    year=lambda frame: frame['date'].dt.year,
    month=lambda frame: frame['date'].dt.month,
)


In [None]:
# Predictor columns plus the 'no2' column that is modeled as the target.
cols_needed = ['so2', 'rspm', 'spm', 'pm2_5', 'type', 'month', 'year', 'no2']

# Keep only complete rows (dropna removes a row if ANY selected column is
# missing), then one-hot encode the categorical `type` column. drop_first=True
# omits the first category so the dummy columns are not perfectly collinear.
df_model = (
    df.loc[:, cols_needed]
    .dropna()
    .pipe(pd.get_dummies, columns=['type'], drop_first=True)
)
df_model.head()

In [None]:
# Separate the regression target from the predictor columns.
target_col = 'no2'
y = df_model[target_col]
X = df_model.drop(columns=[target_col])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an ordinary least-squares model on the training portion only.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Score the held-out rows with the fitted model.
y_pred = model.predict(X_test)

# RMSE is the square root of the mean squared error. It is computed
# explicitly because the `squared=False` shortcut of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6; this form works on
# every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"RMSE: {rmse:.2f}")

# Plot actual vs predicted
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.3, ax=ax)
ax.set_xlabel("Actual NO2")
ax.set_ylabel("Predicted NO2")
ax.set_title("Actual vs. Predicted NO2 Levels")

# Identity line (perfect prediction). Extend it to the larger of the actual
# and predicted maxima so it spans the whole point cloud, and use the
# vectorized .max() rather than the Python builtin iterating element-wise.
upper = max(y_test.max(), y_pred.max())
ax.plot([0, upper], [0, upper], color='red', linestyle='--')
plt.show()

## Summary

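- A linear regression model was trained to predict NO2 levels from the other pollutant readings and metadata features.
- The RMSE on the held-out test set gives us a baseline error against which future models can be compared.
- The actual vs. predicted plot shows a positive correlation, but the scatter around the diagonal leaves clear room for improvement.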
