Here we want to build a model. We decided to build an LSTM for the following reasons:
- We need to capture long-term information in the model.
- Other model choices would not be appropriate (due to the non-stationary time series).
- The complexity of the time series can be properly captured.

We will first build a model for all states, then build a model per state.

In [1]:
import pickle
import polars as pl
import numpy as np
import ml_utils
import mlflow
import warnings

# NOTE(review): this silences every warning category for the rest of the
# kernel session, including deprecation and numerical warnings raised by the
# libraries imported above. Consider narrowing the filter once the noisy
# source is identified.
warnings.filterwarnings("ignore")

In [2]:
# Deserialize the donations frame from the local pickle file.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'blood_donations.pkl' when it comes from a trusted source.
with open('blood_donations.pkl', 'rb') as pickle_handle:
    df = pickle.load(pickle_handle)

# Preview the first ten rows.
df.head(10)

date,state,a,b,o,ab,all
date,str,i64,i64,i64,i64,i64
2006-01-01,"""Johor""",19,20,45,3,87
2006-01-02,"""Johor""",4,3,6,2,15
2006-01-03,"""Johor""",2,2,4,0,8
2006-01-04,"""Johor""",7,11,12,3,33
2006-01-05,"""Johor""",3,8,8,1,20
2006-01-06,"""Johor""",2,0,2,0,4
2006-01-07,"""Johor""",1,0,0,0,1
2006-01-08,"""Johor""",20,27,30,9,86
2006-01-09,"""Johor""",0,1,1,1,3
2006-01-10,"""Johor""",1,0,1,0,2


In [3]:
# Derive calendar features from the `date` column; each keyword becomes the
# name of the new column.
df = df.with_columns(
    weekday=pl.col("date").dt.weekday(),
    # Week numbers do not restart on 1 January: the preview below shows
    # 2006-01-01 assigned to week 52.
    week=pl.col("date").dt.week(),
    month=pl.col("date").dt.month(),
    day_of_year=pl.col("date").dt.ordinal_day(),
)

df.head(10)

date,state,a,b,o,ab,all,weekday,week,month,day_of_year
date,str,i64,i64,i64,i64,i64,i8,i8,i8,i16
2006-01-01,"""Johor""",19,20,45,3,87,7,52,1,1
2006-01-02,"""Johor""",4,3,6,2,15,1,1,1,2
2006-01-03,"""Johor""",2,2,4,0,8,2,1,1,3
2006-01-04,"""Johor""",7,11,12,3,33,3,1,1,4
2006-01-05,"""Johor""",3,8,8,1,20,4,1,1,5
2006-01-06,"""Johor""",2,0,2,0,4,5,1,1,6
2006-01-07,"""Johor""",1,0,0,0,1,6,1,1,7
2006-01-08,"""Johor""",20,27,30,9,86,7,1,1,8
2006-01-09,"""Johor""",0,1,1,1,3,1,2,1,9
2006-01-10,"""Johor""",1,0,1,0,2,2,2,1,10


In [4]:
# Start with the model for all states combined: collapse the per-state rows
# into a single row per date.
df_all = (
    df.group_by('date')
    .agg(
        # Total donations across every state reporting on that date.
        pl.col('all').sum(),
        # The calendar features depend only on the date, so the first value
        # within each group is representative.
        pl.col(['weekday', 'week', 'month', 'day_of_year']).first(),
    )
    # group_by does not guarantee row order; restore chronological order.
    .sort('date')
)

df_all.head(10)

date,all,weekday,week,month,day_of_year
date,i64,i8,i8,i8,i16
2006-01-01,525,7,52,1,1
2006-01-02,227,1,1,1,2
2006-01-03,112,2,1,1,3
2006-01-04,391,3,1,1,4
2006-01-05,582,4,1,1,5
2006-01-06,324,5,1,1,6
2006-01-07,118,6,1,1,7
2006-01-08,795,7,1,1,8
2006-01-09,346,1,2,1,9
2006-01-10,2,2,2,1,10


In [5]:
# Columns fed to the model, listed in the order they appear in the array.
feature_cols = [
    "all",
    "weekday",
    "week",
    "month",
    "day_of_year",
]

# Dense NumPy matrix with one row per date and one column per feature.
feature_data = (
    df_all
    .select(feature_cols)
    .to_numpy()
)

In [6]:
# Number of consecutive days in each input window.
SEQ_LENGTH = 7

# Reuse the explicitly ordered feature matrix built from `feature_cols`
# instead of re-deriving it with `df_all.drop(['date'])`. Both give the same
# array today, but the explicit selection keeps the column order fixed if
# `df_all` ever gains extra columns.
# NOTE(review): presumably create_sequences_for_lstm relies on a specific
# column position for the target ("all" is column 0) — confirm in ml_utils.
donations_array = feature_data
X, y = ml_utils.create_sequences_for_lstm(donations_array, SEQ_LENGTH)

In [7]:
# Split the windows into train / validation / test partitions. 85% and 10% are
# requested for train and validation; the shapes printed below show the
# remainder going to the test partition.
# NOTE(review): whether the split is chronological or shuffled is decided
# inside ml_utils.create_train_val_test — confirm there.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = ml_utils.create_train_val_test(X, y, train_frac=0.85, val_frac=0.1)

# Report the shape of each partition as "<inputs shape> <targets shape>".
for inputs, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    print(inputs.shape, targets.shape)

(6003, 7, 5) (6003,)
(706, 7, 5) (706,)
(354, 7, 5) (354,)


In [8]:
# Resolve the MLflow experiment that the runs below are logged under; the id
# is passed to every ml_utils.run_experiment call.
# NOTE(review): the helper name suggests the experiment is created when it
# does not exist yet — confirm in ml_utils.
experiment_id = ml_utils.get_or_create_mlflow_experiment("MalaysiaBloodDonations")

In [9]:
# Raise absl's logging threshold to ERROR so lower-severity absl messages are
# not printed during the training runs below.
# NOTE(review): this import sits apart from the notebook's import cell;
# consider moving it there so dependencies are visible in one place.
import absl.logging 
absl.logging.set_verbosity(absl.logging.ERROR)

In [10]:
# Hyperparameter values explored by the grid search.
units_list = [125, 250, 500, 1000]
activation_list = ["relu"]
optimizer_list = ["adam", "rmsprop", "adagrad"]
dropout_list = [0.05, 0.1, 0.2]

# Enable MLflow's TensorFlow autologging (models logged, datasets not).
mlflow.tensorflow.autolog(log_models=True, log_datasets=False)

# Flatten the grid up front. The clause order keeps the same iteration order
# as nesting the loops: units outermost, dropout innermost.
param_grid = [
    (units, activation, optimizer, dropout)
    for units in units_list
    for activation in activation_list
    for optimizer in optimizer_list
    for dropout in dropout_list
]

for units, activation, optimizer, dropout in param_grid:
    print(f"Running experiment with units={units}, activation={activation}, optimizer={optimizer}, dropout={dropout}")
    # Positional order here is (units, activation, dropout, optimizer): dropout
    # comes before optimizer, unlike the grid tuple above.
    ml_utils.run_experiment(
        X_train, y_train, X_val, y_val,
        units, activation, dropout, optimizer, experiment_id
    )

Running experiment with units=125, activation=relu, optimizer=adam, dropout=0.05


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step
Validation Loss: 183526.375, Validation MAE: 284.10784912109375
Running experiment with units=125, activation=relu, optimizer=adam, dropout=0.1


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step
Validation Loss: 213515.859375, Validation MAE: 311.9988708496094
Running experiment with units=125, activation=relu, optimizer=adam, dropout=0.2


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
Validation Loss: 201352.8125, Validation MAE: 301.1484069824219
Running experiment with units=125, activation=relu, optimizer=rmsprop, dropout=0.05


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step
Validation Loss: 200111.0, Validation MAE: 299.67156982421875
Running experiment with units=125, activation=relu, optimizer=rmsprop, dropout=0.1






[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step
Validation Loss: 193374.140625, Validation MAE: 290.1282653808594
Running experiment with units=125, activation=relu, optimizer=rmsprop, dropout=0.2






[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
Validation Loss: 213586.671875, Validation MAE: 318.6155090332031
Running experiment with units=125, activation=relu, optimizer=adagrad, dropout=0.05




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step
Validation Loss: 243134.6875, Validation MAE: 360.8981628417969
Running experiment with units=125, activation=relu, optimizer=adagrad, dropout=0.1




KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

# NOTE(review): `model` is not assigned in any cell visible in this notebook.
# The grid-search cell discards whatever ml_utils.run_experiment returns, so
# this cell raises NameError on a fresh kernel. Bind the chosen model before
# running it — TODO confirm how it should be obtained (for example returned
# from run_experiment or loaded back from MLflow).
y_pred = model.predict(X_test)

# Compare held-out targets with the model's predictions on one set of axes.
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(y_test, label='Actual Donations')
ax.plot(y_pred, label='Predicted Donations')

# Both series are plotted without explicit x values, so the x-axis is the
# position of each sample within the test set, not a calendar date. The title
# and axis label say so instead of claiming dates.
ax.set_title("LSTM Predictions vs. Actual (Test Set)")
ax.set_xlabel("Test sample index")
ax.set_ylabel("Donations")
ax.legend()
ax.grid(True)
ax.tick_params(axis="x", rotation=45)
fig.tight_layout()
plt.show()
