Here we want to build a model. We decide on building an LSTM for the following reasons: 
- The model needs to capture long-term information.
- Other model choices would not be appropriate because the time series is non-stationary.
- The complexity of the time series can be properly captured.

We will first build a model for all states, then build a model per state.

In [None]:
import pickle
import polars as pl
import ml_utils
import absl.logging
absl.logging.set_verbosity(absl.logging.ERROR)
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import mean_absolute_error
import constants
import numpy as np

In [None]:
# Restore the previously pickled frame from disk.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open("df_all.pkl", "rb") as pickled_frame:
    df_all = pickle.load(pickled_frame)

In [None]:
# Derive calendar features from the "date" column and append them to the frame.
date_col = pl.col("date")
calendar_features = [
    date_col.dt.weekday().alias("weekday"),
    date_col.dt.month().alias("month"),
    date_col.dt.ordinal_day().alias("day_of_year"),
]
df_all = df_all.with_columns(calendar_features)

# Preview the first eight rows to check the new columns.
df_all.head(8)

In [None]:
# Add one column per lag: "all_lag_k" holds the value of "all" from k rows earlier.
# All lag expressions read the untouched "all" column, so they can be added in one call.
lag_exprs = [
    pl.col("all").shift(lag).alias(f"all_lag_{lag}")
    for lag in range(1, constants.WINDOW_SIZE + 1)
]

# Shifting leaves nulls in the leading rows; drop every row that contains a null.
df_all = df_all.with_columns(lag_exprs).drop_nulls()

In [None]:
# Display the frame after the lag columns were added and rows containing nulls were dropped.
df_all

In [None]:
# Column groups feeding the two model inputs.
lag_columns = [f"all_lag_{lag}" for lag in range(1, constants.WINDOW_SIZE + 1)]
static_feature_columns = [
    "weekday",
    "month",
    "day_of_year",
    "is_religion_or_culture_holiday",
    "is_other_holiday",
    "is_low_donation_holiday",
    "is_high_donation_holiday",
]

# Sequence input, reshaped to (rows, WINDOW_SIZE, 1).
# NOTE(review): columns run from lag 1 to lag WINDOW_SIZE, so each window is ordered
# newest -> oldest; confirm this is the step order the recurrent layer should see.
lag_matrix = df_all.select(lag_columns).to_numpy()
X_seq = lag_matrix.reshape(-1, constants.WINDOW_SIZE, 1)

# Per-row calendar/holiday features and the prediction target.
X_features = df_all.select(static_feature_columns).to_numpy()
y = df_all["all"].to_numpy()

In [None]:
# Create two independent standard scalers: one for the target (also applied to the
# lagged target values), one for the calendar/holiday feature matrix.
from sklearn.preprocessing import StandardScaler

y_scaler, X_scaler = StandardScaler(), StandardScaler()

In [None]:
# Fit both scalers on the training portion only, so that the mean/variance of the
# validation and test rows do not leak into preprocessing.
# NOTE(review): this assumes ml_utils.train_val_split_lstm_feature_data uses the first
# TRAIN_FRAC share of the rows as the training set -- confirm against the helper.
n_train = int(len(y) * constants.TRAIN_FRAC)
y_scaler.fit(y[:n_train].reshape(-1, 1))
X_scaler.fit(X_features[:n_train])

# Apply the fitted scalers to the full arrays; the split happens afterwards.
y_scaled = y_scaler.transform(y.reshape(-1, 1)).flatten()
X_features_scaled = X_scaler.transform(X_features)

# The lag windows hold past target values, so they share the target scaler.
# Flatten to one column for the scaler, then restore the (rows, WINDOW_SIZE, 1) shape.
X_seq_scaled = y_scaler.transform(X_seq.reshape(-1, 1)).reshape(-1, constants.WINDOW_SIZE, 1)

In [None]:
# Persist both fitted scalers to disk, one pickle file each, feature scaler first.
for scaler_path, fitted_scaler in (("x_scaler.pkl", X_scaler), ("y_scaler.pkl", y_scaler)):
    with open(scaler_path, "wb") as scaler_file:
        pickle.dump(fitted_scaler, scaler_file)

In [None]:
# Split the scaled sequence input, feature input and target into train/validation/test
# parts. The helper returns nine arrays, unpacked below in the order it yields them.
splits = ml_utils.train_val_split_lstm_feature_data(
    X_seq_scaled, X_features_scaled, y_scaled, constants.TRAIN_FRAC, constants.VAL_FRAC
)
(
    X_seq_train, X_seq_val, X_seq_test,
    X_features_train, X_features_val, X_features_test,
    y_train, y_val, y_test,
) = splits

In [None]:
# Resolve the MLflow experiment id through the project helper.
# (Going by its name, the helper reuses an existing experiment or creates one -- confirm in ml_utils.)
experiment_name = "NewTest101"
experiment_id = ml_utils.get_or_create_mlflow_experiment(experiment_name)

In [None]:
# Candidate values for the hyper-parameter search.
# The full grid is 2 * 3 * 3 * 3 * 2 * 2 = 216 combinations.
model_type = ["LSTM", "SimpleRNN"]  # recurrent layer type, passed as `seq_type`
seq_units = [12, 24, 48]  # passed as `seq_units`
dense_units = [4, 8, 12]  # passed as `dense_units`
activation_list = ['mish', 'relu', 'sigmoid']  # passed as `activation`
optimizer_list = ['adam', 'rmsprop']  # passed as `optimizer`
dropout_list = [0.05, 0.1]  # passed as `dropout`

In [None]:
from itertools import product

# Exhaustive grid search: run one experiment per hyper-parameter combination.
# product() varies the right-most iterable fastest, so the runs happen in exactly the
# same order as six nested for-loops over these lists would produce.
search_space = product(
    model_type, seq_units, dense_units, activation_list, optimizer_list, dropout_list
)

for seq_type, seq_unit, dense_unit, activation, optimizer, dropout in search_space:
    print(f"Running {seq_type} experiment with seq_units={seq_unit}, dense_units={dense_unit}, activation={activation}, optimizer={optimizer}, dropout={dropout}.")
    # Each run is given the train and validation splits plus the experiment id.
    ml_utils.run_experiment(
        X_seq_train=X_seq_train, X_features_train=X_features_train, y_train=y_train,
        X_seq_val=X_seq_val, X_features_val=X_features_val, y_val=y_val,
        seq_type=seq_type, seq_units=seq_unit, dense_units=dense_unit, activation=activation, dropout=dropout,
        optimizer=optimizer, experiment_id=experiment_id
    )

In [None]:
# Fetch the model chosen by the project helper for this experiment, using the
# validation-loss metric key as the selection criterion.
selection_metric = "metrics.val_loss"
model = ml_utils.get_best_model(experiment_id, selection_metric)

In [None]:
# Predict on the test split, then map the predictions back to the target's original units.
scaled_predictions = model.predict([X_seq_test, X_features_test])
y_pred = y_scaler.inverse_transform(scaled_predictions)

# NOTE: despite its name, y_test_scaled holds the test targets after inverse scaling,
# i.e. in original units. The name is kept because a later cell reads it.
y_test_scaled = y_scaler.inverse_transform(y_test.reshape(-1, 1))

# Mean absolute error on the test split, in original units (shown as the cell output).
mean_absolute_error(y_test_scaled, y_pred)

In [None]:
import matplotlib.pyplot as plt

# Table of actual values next to predictions rounded to whole numbers.
pred_df = pl.DataFrame({
    "y_true": y_test_scaled.flatten(),
    "y_pred": np.round(y_pred).flatten(),
})

# Signed error plus the matching dates, taken as the last `n_predictions` rows of df_all.
# NOTE(review): this assumes the test split is the tail of df_all -- confirm against the split helper.
n_predictions = pred_df.height
signed_error = pl.col("y_true") - pl.col("y_pred")
pred_df = pred_df.with_columns([
    signed_error.alias("diff"),
    pl.Series("date", df_all[-n_predictions:]["date"]),
])

# Absolute error, used below to rank the worst predictions.
pred_df = pred_df.with_columns(pl.col("diff").abs().alias("abs_diff"))

# Line chart: actual vs predicted values over the test dates.
plt.figure(figsize=(14, 6))
for column_name, legend_label in (("y_true", "Actual"), ("y_pred", "Predicted")):
    plt.plot(pred_df["date"], pred_df[column_name], label=legend_label)
plt.title("Actual vs Predicted Donations")
plt.xlabel("Date")
plt.ylabel("Donations")
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Cell output: rows ordered from largest to smallest absolute error.
pred_df.sort(by="abs_diff", descending=True)

In [None]:
# Histogram of the signed prediction errors (y_true - y_pred) in 25 bins.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(pred_df["diff"], bins=25, edgecolor='k', alpha=0.7)
ax.set_title("Distribution of Prediction Errors (diff)")
ax.set_xlabel("Prediction Error (y_true - y_pred)")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()