In [1]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [4]:
# 1. Load the model-ready dataset
# Timestamp is parsed to datetime so the year-based split below can use `.dt`.
df = pd.read_csv('../data/Processed_data/model_ready_data.csv')
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# 2. Split Train vs Test (Time-based split)
# Train: every row dated 2023 or earlier (the author's note says 2019 - 2023)
# Test: 2024
# Splitting by time rather than at random keeps future rows out of training.
print("Splitting data...")
train_data = df[df['Timestamp'].dt.year <= 2023]
test_data = df[df['Timestamp'].dt.year == 2024]

# 3. Define Features (X) and Target (y)
target = 'PM2_5'

# We drop these because:
# - Timestamp: Model can't read dates directly (we used Month/DayOfWeek instead)
# - PM2_5: This is the answer we are trying to predict!
# - PM2_5_AQI: This is calculated FROM the answer, so it's cheating if we include it.
cols_to_drop = ['Timestamp', 'PM2_5', 'PM2_5_AQI']
features = [c for c in df.columns if c not in cols_to_drop]

X_train = train_data[features]
y_train = train_data[target]
X_test = test_data[features]
y_test = test_data[target]

print(f"Training on {len(X_train)} rows. Testing on {len(X_test)} rows.")

# Fail early with a readable message: an empty split would otherwise only
# surface as an opaque error inside fit() / predict(), after wasted work.
if train_data.empty or test_data.empty:
    raise ValueError(
        "Time-based split produced an empty set "
        f"(train={len(train_data)} rows, test={len(test_data)} rows); "
        "check the 'Timestamp' range in the input CSV."
    )

# 4. Train the Model
# random_state pins the forest so reruns give the same trees; n_jobs=-1 uses
# every available CPU core.
print("Training Random Forest... (Hold tight!)")
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# 5. Evaluate
# MAE is expressed in the same units as the target column.
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
print("\nModel Results (2024 Data):")
print(f"MAE: {mae:.2f} µg/m³")

# 6. Save the Brain
model_path = '../models/pm25_prediction_model.pkl'
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists instead of losing the trained model to FileNotFoundError.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)
print(f"Model saved to: {model_path}")

Splitting data...
Training on 9296 rows. Testing on 1826 rows.
Training Random Forest... (Hold tight!)

Model Results (2024 Data):
MAE: 8.02 µg/m³
Model saved to: ../models/pm25_prediction_model.pkl
