In [36]:
# Set up

import pandas as pd
import numpy as np

# from sklearn.model_selection import train_test_split   # <- not used, you can remove
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

import sys; sys.path.append("..")
from src.data import ASSETS_DIR
from src.features import make_features, time_split
from src.evaluate import compute_metrics as eval_metrics

# Column names shared by every cell below.
TARGET = "temperature_celsius"   # regression target
TIME_COL = "last_updated"        # timestamp column; becomes the index after loading

# Seed handed to the RandomForestRegressor so reruns are repeatable.
RANDOM_STATE = 42

# Location of the cleaned dataset that the loading cell reads.
CLEAN_PATH_CSV = ASSETS_DIR / "clean_weather.csv"


In [37]:
# Load and time index

df = pd.read_csv(CLEAN_PATH_CSV)

# Basic checks: fail fast if the columns the rest of the notebook relies on are absent.
assert TIME_COL in df.columns, f"Missing {TIME_COL}"
assert TARGET in df.columns, f"Missing {TARGET}"

# Parse timestamps; values that cannot be parsed become NaT instead of raising.
df[TIME_COL] = pd.to_datetime(df[TIME_COL], errors="coerce")

# A coerced NaT would otherwise stay in the frame and end up in the time index,
# where it has no meaningful position in a chronological ordering. Drop such
# rows explicitly and report how many were removed, rather than carrying them on.
n_bad_ts = int(df[TIME_COL].isna().sum())
if n_bad_ts:
    print(f"Dropping {n_bad_ts} rows with unparseable {TIME_COL}")
    df = df.dropna(subset=[TIME_COL])

# Chronological order, with the timestamp as the index.
df = df.sort_values(TIME_COL).set_index(TIME_COL)

df.head()

Unnamed: 0_level_0,location_name,country,latitude,longitude,temperature_celsius,feels_like_celsius,humidity,pressure_mb,wind_kph,precip_mm,cloud,uv_index,year,month,dayofyear,dow,sin_doy,cos_doy
last_updated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-05-16 09:45:00+00:00,London,United Kingdom,51.52,-0.11,14.0,14.5,88,1005.0,4.0,0.025,50,3.0,2024,5,137,3,0.706727,-0.707487
2024-05-16 15:15:00+00:00,London,United Kingdom,51.52,-0.11,15.0,15.1,77,1005.0,11.2,0.01,50,3.0,2024,5,137,3,0.706727,-0.707487
2024-05-16 17:45:00+00:00,Tokyo,Japan,35.69,139.69,24.0,25.3,47,1001.0,33.1,0.0,25,2.5,2024,5,137,3,0.706727,-0.707487
2024-05-16 23:00:00+00:00,Tokyo,Japan,35.69,139.69,18.3,18.3,44,1005.0,19.1,0.0,0,1.0,2024,5,137,3,0.706727,-0.707487
2024-05-17 17:15:00+00:00,London,United Kingdom,51.52,-0.11,21.0,21.0,43,1010.0,6.1,0.025,0,4.0,2024,5,138,4,0.694452,-0.719539


In [29]:
# Feature Engineering (via src.features)

# Lag offsets and rolling-window sizes handed to make_features.
LAG_STEPS = [1, 2, 3, 7, 14]
ROLL_WINDOWS = [3, 7]

# make_features is a project helper; its three return values are unpacked as the
# feature frame, the target, and the list of feature column names.
X, y, feature_cols = make_features(
    df, target=TARGET, lag_list=LAG_STEPS, roll_windows=ROLL_WINDOWS, include_raw=True
)

# column groups for preprocessing: 'dow' is one-hot encoded later,
# every other feature column is treated as numeric.
cat_cols = ["dow"]
num_cols = []
for col in feature_cols:
    if col not in cat_cols:
        num_cols.append(col)

(X.head(3), y.head(3))


(                           humidity  pressure_mb  wind_kph  precip_mm  cloud  \
 last_updated                                                                   
 2024-05-22 15:15:00+00:00        94       1007.0      20.2      0.025     75   
 2024-05-22 23:15:00+00:00        78       1017.0      16.9      0.000     25   
 2024-05-23 15:00:00+00:00        59       1014.0      24.1      0.010     50   
 
                            uv_index   sin_doy   cos_doy  lag1  lag2  lag3  \
 last_updated                                                                
 2024-05-22 15:15:00+00:00       3.0  0.630072 -0.776537  20.6  14.0  20.0   
 2024-05-22 23:15:00+00:00       1.0  0.630072 -0.776537  14.0  20.6  14.0   
 2024-05-23 15:00:00+00:00       4.0  0.616621 -0.787260  20.0  14.0  20.6   
 
                            lag7  lag14  roll3_mean  roll7_mean  roll3_std  \
 last_updated                                                                
 2024-05-22 15:15:00+00:00  20.0   14.0      

In [32]:
# Train-test split (time-based, no shuffle)

# Value passed as `split` to the project helper; presumably the share of the
# earliest rows used for training (the recorded shapes, 716 vs 179, match 80/20).
TRAIN_FRACTION = 0.8

split_parts = time_split(X, y, split=TRAIN_FRACTION)
X_train, X_test, y_train, y_test = split_parts

(X_train.shape, X_test.shape)


((716, 18), (179, 18))

In [33]:
# Preprocess and model definitions


def make_preprocess():
    """Return a new, unfitted preprocessing transformer.

    - Numeric columns (`num_cols`) are standardised, which helps Ridge.
    - The categorical column(s) in `cat_cols` ('dow') are one-hot encoded;
      categories unseen at fit time are ignored at transform time.
    - Any column not listed in either group is dropped.

    A fresh object is built on every call so that each pipeline owns its
    preprocessing step. A Pipeline fits its steps in place, so one transformer
    instance placed in two pipelines would be refitted whenever either
    pipeline is fitted, silently changing the other.
    """
    return ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ],
        remainder="drop",
    )


# Kept under the original name for any cell that refers to it directly.
preprocess = make_preprocess()

# Models
ridge = Pipeline(steps=[
    ("prep", make_preprocess()),
    ("model", Ridge(alpha=1.0))
])

rf = Pipeline(steps=[
    # Trees don't strictly need scaling, but using the same preprocessing keeps
    # the columns aligned with the one-hot encoding used for Ridge.
    ("prep", make_preprocess()),
    ("model", RandomForestRegressor(
        n_estimators=300, max_depth=None, min_samples_split=2,
        n_jobs=-1, random_state=RANDOM_STATE
    ))
])

models = {
    "Ridge": ridge,
    "RandomForest": rf,
}
list(models.keys())


['Ridge', 'RandomForest']

In [35]:
# Train, predict and evaluate 

def eval_metrics(y_true, y_pred):
    """Compute RMSE, MAE, MAPE (in percent) and R² for single-target predictions.

    NOTE(review): this definition rebinds the name ``eval_metrics`` that the
    setup cell imports (``from src.evaluate import compute_metrics as
    eval_metrics``), so the imported function is never used once this cell has
    run. Confirm which of the two is intended and remove the other.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order and of the same length as `y_true`.

    Returns
    -------
    tuple of float
        ``(rmse, mae, mape, r2)``.
    """
    # Compare strictly by position. Without this, two pandas Series would be
    # aligned by index label in the MAPE subtraction (mismatched labels become
    # NaN, which Series.mean() skips silently), and a column-vector prediction
    # would broadcast against a flat target.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    mae = float(mean_absolute_error(y_true, y_pred))

    # Floor the denominator so a target of exactly 0 does not divide by zero.
    # Targets close to 0 still produce very large percentage errors.
    denom = np.clip(np.abs(y_true), 1e-8, None)
    mape = float(np.abs((y_true - y_pred) / denom).mean() * 100)

    r2 = float(r2_score(y_true, y_pred))
    return rmse, mae, mape, r2

# Fit each candidate on the training part and score it on the test part.
preds = {}     # model name -> test-set predictions
results = []   # one metrics record per model

for name, pipe in models.items():
    # Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
    y_hat = pipe.fit(X_train, y_train).predict(X_test)
    preds[name] = y_hat

    rmse, mae, mape, r2 = eval_metrics(y_test, y_hat)
    record = dict(model=name, RMSE=rmse, MAE=mae, MAPE=mape, R2=r2)
    results.append(record)

# Best (lowest RMSE) model first.
pd.DataFrame(results).sort_values("RMSE")


Unnamed: 0,model,RMSE,MAE,MAPE,R2
0,Ridge,2.318154,1.79985,8.734474,0.860015
1,RandomForest,2.544411,1.939438,8.881474,0.831356


In [25]:
# Simple ensemble

# Pick top-2 by RMSE.
# NOTE(review): the ranking uses test-set RMSE, so the ensemble's members are
# chosen on the same data it is then scored on; with more than two candidates
# this makes the ensemble score optimistic.
df_res = pd.DataFrame(results).sort_values("RMSE").reset_index(drop=True)
top2 = df_res["model"].head(2).tolist()
assert top2, "No fitted models available to ensemble"

# Unweighted average of the selected predictions. Divide by the number of
# models actually selected rather than a fixed 2, so the average stays correct
# if fewer than two models were evaluated.
y_hat_ens = sum(preds[m] for m in top2) / len(top2)

rmse, mae, mape, r2 = eval_metrics(y_test, y_hat_ens)

# Append the ensemble row to the per-model results and re-rank by RMSE.
pd.concat([
    df_res,
    pd.DataFrame([{"model": f"Ensemble({'+'.join(top2)})", "RMSE": rmse, "MAE": mae, "MAPE": mape, "R2": r2}])
], ignore_index=True).sort_values("RMSE")

Unnamed: 0,model,RMSE,MAE,MAPE,R2
2,Ensemble(Ridge+RandomForest),2.31491,1.786137,8.465478,0.860406
0,Ridge,2.318154,1.79985,8.734474,0.860015
1,RandomForest,2.544411,1.939438,8.881474,0.831356


## 📊 Multivariate Baseline Interpretation

Compared to the univariate baseline from notebook 02:
- RMSE dropped from ~9–10°C to ~2.3°C.
- R² improved from negative to ~0.86, showing strong explanatory power.
- MAE and MAPE indicate predictions are, on average, within ~1.8°C or ~8.5% of the actual temperature.

The best performer was the **Ensemble(Ridge+RandomForest)**, with the lowest MAE/MAPE and a marginally lower RMSE than Ridge alone (2.315 vs. 2.318).

**Key drivers of improvement:**
- Inclusion of multiple lag features and rolling statistics
- Addition of weather variables and seasonal encodings (`sin_doy`, `cos_doy`)
- Use of models suited for both linear and non-linear patterns

**Next steps:**
- Tune hyperparameters for Ridge (`alpha`) and RandomForest (`max_depth`, `n_estimators`)
- Explore boosting methods (XGBoost, LightGBM)
- Experiment with different ensemble strategies (weighted averaging, stacking)
