In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random 
import os

In [2]:
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.metrics import mean_squared_error, r2_score
import keras_tuner as kt

2024-09-18 01:43:21.043678: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-18 01:43:21.994097: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-18 01:43:21.994202: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-18 01:43:22.123506: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-18 01:43:22.378342: I tensorflow/core/platform/cpu_feature_guar

In [3]:
SEED = 0

def set_seeds(seed=SEED):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)

def set_global_determinism(seed=SEED):
    set_seeds(seed=seed)

    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

    tf.config.threading.set_inter_op_parallelism_threads(1)
    tf.config.threading.set_intra_op_parallelism_threads(1)

set_global_determinism(seed=SEED)

In [4]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [5]:
df = pd.read_csv("/home/ubuntu/datasets/covid19vaccinesbycountybydemographic.csv")

df

Unnamed: 0,county,county_type,demographic_category,demographic_value,est_population,est_age_12plus_pop,est_age_5plus_pop,administered_date,partially_vaccinated,total_partially_vaccinated,...,cumulative_fully_vaccinated,at_least_one_dose,cumulative_at_least_one_dose,cumulative_unvax_total_pop,cumulative_unvax_12plus_pop,cumulative_unvax_5plus_pop,suppress_data,up_to_date_count,cumulative_up_to_date_count,administered_year
0,,HPI_RCP_TRACT,HPI Quartile Census Mixed,1,,,,2024-08-23,0.0,17088.0,...,69129.0,0.0,86225.0,,,,False,0,8609,2024
1,,HPI_RCP_TRACT,HPI Quartile Census Mixed,1,,,,2024-08-22,0.0,17088.0,...,69129.0,0.0,86225.0,,,,False,0,8609,2024
2,,HPI_RCP_TRACT,HPI Quartile Census Mixed,1,,,,2024-08-21,4.0,17088.0,...,69129.0,4.0,86225.0,,,,False,8,8609,2024
3,,HPI_RCP_TRACT,HPI Quartile Census Mixed,1,,,,2024-08-20,0.0,17084.0,...,69129.0,0.0,86221.0,,,,False,0,8601,2024
4,,HPI_RCP_TRACT,HPI Quartile Census Mixed,1,,,,2024-08-19,2.0,17084.0,...,69129.0,2.0,86221.0,,,,False,2,8601,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185832,Yuba,DERIVED FROM RECIP ZIP,VEM Quartile,4,39.0,28.0,39.0,2020-07-31,0.0,0.0,...,0.0,0.0,0.0,39.0,28.0,39.0,False,0,0,2020
2185833,Yuba,DERIVED FROM RECIP ZIP,VEM Quartile,4,39.0,28.0,39.0,2020-07-30,0.0,0.0,...,0.0,0.0,0.0,39.0,28.0,39.0,False,0,0,2020
2185834,Yuba,DERIVED FROM RECIP ZIP,VEM Quartile,4,39.0,28.0,39.0,2020-07-29,0.0,0.0,...,0.0,0.0,0.0,39.0,28.0,39.0,False,0,0,2020
2185835,Yuba,DERIVED FROM RECIP ZIP,VEM Quartile,4,39.0,28.0,39.0,2020-07-28,0.0,0.0,...,0.0,0.0,0.0,39.0,28.0,39.0,False,0,0,2020


In [6]:
df = df.iloc[::-1].reset_index(drop=True)

df

Unnamed: 0,county,county_type,demographic_category,demographic_value,est_population,est_age_12plus_pop,est_age_5plus_pop,administered_date,partially_vaccinated,total_partially_vaccinated,...,cumulative_fully_vaccinated,at_least_one_dose,cumulative_at_least_one_dose,cumulative_unvax_total_pop,cumulative_unvax_12plus_pop,cumulative_unvax_5plus_pop,suppress_data,up_to_date_count,cumulative_up_to_date_count,administered_year
0,Yuba,DERIVED FROM RECIP ZIP,VEM Quartile,4,39.0,28.0,39.0,2020-07-27,0.0,0.0,...,0.0,0.0,0.0,39.0,28.0,39.0,False,0,0,2020
1,Yuba,DERIVED FROM RECIP ZIP,VEM Quartile,4,39.0,28.0,39.0,2020-07-28,0.0,0.0,...,0.0,0.0,0.0,39.0,28.0,39.0,False,0,0,2020
2,Yuba,DERIVED FROM RECIP ZIP,VEM Quartile,4,39.0,28.0,39.0,2020-07-29,0.0,0.0,...,0.0,0.0,0.0,39.0,28.0,39.0,False,0,0,2020
3,Yuba,DERIVED FROM RECIP ZIP,VEM Quartile,4,39.0,28.0,39.0,2020-07-30,0.0,0.0,...,0.0,0.0,0.0,39.0,28.0,39.0,False,0,0,2020
4,Yuba,DERIVED FROM RECIP ZIP,VEM Quartile,4,39.0,28.0,39.0,2020-07-31,0.0,0.0,...,0.0,0.0,0.0,39.0,28.0,39.0,False,0,0,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185832,,HPI_RCP_TRACT,HPI Quartile Census Mixed,1,,,,2024-08-19,2.0,17084.0,...,69129.0,2.0,86221.0,,,,False,2,8601,2024
2185833,,HPI_RCP_TRACT,HPI Quartile Census Mixed,1,,,,2024-08-20,0.0,17084.0,...,69129.0,0.0,86221.0,,,,False,0,8601,2024
2185834,,HPI_RCP_TRACT,HPI Quartile Census Mixed,1,,,,2024-08-21,4.0,17088.0,...,69129.0,4.0,86225.0,,,,False,8,8609,2024
2185835,,HPI_RCP_TRACT,HPI Quartile Census Mixed,1,,,,2024-08-22,0.0,17088.0,...,69129.0,0.0,86225.0,,,,False,0,8609,2024


In [7]:
df = df[df['demographic_category'].str.contains("Race/Ethnicity")]

df = df[~df['demographic_value'].str.contains("Other Race|Unknown")]

In [8]:
encoder = LabelEncoder()
df['county_encoded'] = encoder.fit_transform(df['county'])
df['demographic_encoded'] = encoder.fit_transform(df['demographic_value'])

In [9]:
df['group'] = df['county_encoded'].astype(str) + "_" + df['demographic_encoded'].astype(str)

df['group']

5956       58_6
5957       58_6
5958       58_6
5959       58_6
5960       58_6
           ... 
2161592     0_0
2161593     0_0
2161594     0_0
2161595     0_0
2161596     0_0
Name: group, Length: 614957, dtype: object

In [10]:
df['percent_fully_vaccinated'] = df['cumulative_fully_vaccinated'] / df['est_population']

df = df[df['percent_fully_vaccinated'] != 0]
df = df[df['percent_fully_vaccinated'].notnull()]
df['percent_fully_vaccinated'] = df['percent_fully_vaccinated'].clip(upper=1)

In [11]:
prep_df = df[['percent_fully_vaccinated', 'county_encoded', 'demographic_encoded', 'group']]

prep_df = prep_df.reset_index(drop = True)

prep_df

Unnamed: 0,percent_fully_vaccinated,county_encoded,demographic_encoded,group
0,0.000025,58,6,58_6
1,0.000025,58,6,58_6
2,0.000025,58,6,58_6
3,0.000025,58,6,58_6
4,0.000025,58,6,58_6
...,...,...,...,...
522139,1.000000,0,0,0_0
522140,1.000000,0,0,0_0
522141,1.000000,0,0,0_0
522142,1.000000,0,0,0_0


In [12]:
prep_df = prep_df[prep_df['percent_fully_vaccinated'] != 0]
prep_df = prep_df[prep_df['percent_fully_vaccinated'].notnull()]


prep_df

Unnamed: 0,percent_fully_vaccinated,county_encoded,demographic_encoded,group
0,0.000025,58,6,58_6
1,0.000025,58,6,58_6
2,0.000025,58,6,58_6
3,0.000025,58,6,58_6
4,0.000025,58,6,58_6
...,...,...,...,...
522139,1.000000,0,0,0_0
522140,1.000000,0,0,0_0
522141,1.000000,0,0,0_0
522142,1.000000,0,0,0_0


In [13]:
time_step = 10
validation_percentage = 0.05

X_train = []
y_train = []
X_val = []
y_val = []

grouped_county = df.groupby('group')

for name, group in grouped_county:
    values = group[['percent_fully_vaccinated', 'county_encoded', 'demographic_encoded']].values

    validation_split = int(len(values) * validation_percentage)
    
    if len(values) > time_step + validation_split:
        for i in range(len(values) - time_step - validation_split):
            X_train.append(values[i:i+time_step, :])
            y_train.append(values[i+time_step, 0])
        
        for i in range(len(values) - time_step - validation_split, len(values) - time_step):
            X_val.append(values[i:i+time_step, :])
            y_val.append(values[i+time_step, 0])


X_train = np.array(X_train)
y_train = np.array(y_train)
X_val = np.array(X_val)
y_val = np.array(y_val)

In [14]:
print("Training data shape:", X_train.shape)
print("Validation data shape:", X_val.shape)

Training data shape: (492355, 10, 3)
Validation data shape: (25959, 10, 3)


In [15]:
print("Labels data shape:", y_train.shape)
print("Validation data shape:", y_val.shape)

Labels data shape: (492355,)
Validation data shape: (25959,)


In [16]:
def model_builder(hp):
    model = Sequential()

    hp_activation = hp.Choice('activation', values=['relu', 'tanh', 'sigmoid'])

    hp_dense_layers = hp.Choice('num_dense_layers', [1,2])

    hp_layer_1 = hp.Int('layer_1', min_value=16, max_value=512, step=16)

    hp_layer_2 = hp.Int('layer_2', min_value=16, max_value=512, step=16)

    hp_layer_3 = hp.Int('layer_3', min_value=16, max_value=512, step=16)


    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    model.add(LSTM(units = hp_layer_1, activation=hp_activation, input_shape = (X_train.shape[1],X_train.shape[2])))

    model.add(Dense(units = hp_layer_2, activation = hp_activation))

    if hp_dense_layers >= 2:
        model.add(Dense(units = hp_layer_3, activation = hp_activation))

    model.add(Dense(units=1))


    model.compile(optimizer = Adam(learning_rate=hp_learning_rate), loss='mse', metrics=['mae'])   

    return model

In [19]:
tuner = kt.Hyperband(model_builder, 
                     objective='val_mae',
                     max_epochs=10,
                     factor = 3,
                     directory = '/home/ubuntu/trials',
                     project_name = 'one')

Reloading Tuner from /home/ubuntu/trials/one/tuner0.json


In [17]:
stop_early = EarlyStopping(monitor = 'val_mae', patience = 5)

In [19]:
tuner.search(X_train, y_train, epochs=50, validation_data = (X_val, y_val), callbacks=[stop_early], batch_size = 128)

Trial 30 Complete [00h 05m 17s]
val_mae: 0.0007575827185064554

Best val_mae So Far: 0.0006517667789012194
Total elapsed time: 00h 49m 43s


In [20]:
best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]

In [21]:
print("Best hyperparameters:")
for key, value in best_hp.values.items():
    print(f"{key}: {value}")

Best hyperparameters:
activation: tanh
num_dense_layers: 2
layer_1: 256
layer_2: 208
layer_3: 464
learning_rate: 0.001
tuner/epochs: 10
tuner/initial_epoch: 0
tuner/bracket: 0
tuner/round: 0


In [22]:
model_1 = tuner.hypermodel.build(best_hp)

model_1.summary()

2024-09-18 01:44:11.925450: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 79091 MB memory:  -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0002:00:01.0, compute capability: 8.0


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 256)               266240    
                                                                 
 dense (Dense)               (None, 208)               53456     
                                                                 
 dense_1 (Dense)             (None, 464)               96976     
                                                                 
 dense_2 (Dense)             (None, 1)                 465       
                                                                 
Total params: 417137 (1.59 MB)
Trainable params: 417137 (1.59 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [23]:
history = model_1.fit(X_train,y_train, epochs = 50, validation_data = (X_val, y_val), batch_size = 128)

Epoch 1/50


2024-09-18 01:44:15.298752: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8906
2024-09-18 01:44:15.908882: I external/local_xla/xla/service/service.cc:168] XLA service 0x7ff6e7d69ad0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-09-18 01:44:15.909064: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A100 80GB PCIe, Compute Capability 8.0
2024-09-18 01:44:15.930004: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1726623856.048152    1485 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [29]:
model_1.save('/home/ubuntu/models/LSTM_1', save_format='tf')

INFO:tensorflow:Assets written to: /home/ubuntu/models/LSTM_1/assets


INFO:tensorflow:Assets written to: /home/ubuntu/models/LSTM_1/assets


In [None]:
# Model 1 - Overfitting and Exhibit extreme predictions - Using Sigmoid to constrain the output between 0-1

In [24]:
def model_reg(hp):
    model = Sequential()
    hp_dropout = hp.Choice('dropout_rate', values=[0.2, 0.3, 0.4, 0.5])
    hp_l2_reg = hp.Choice('l2_reg_strength', values=[1e-2, 1e-3, 1e-4])


    hp_dense_layers = hp.Choice('num_dense_layers', [1,2])

    hp_layer_1 = hp.Int('layer_1', min_value=16, max_value=512, step=16)

    hp_layer_2 = hp.Int('layer_2', min_value=16, max_value=512, step=16)

    hp_layer_3 = hp.Int('layer_3', min_value=16, max_value=512, step=16)


    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    model.add(LSTM(units = hp_layer_1, activation='sigmoid', input_shape = (X_train.shape[1],X_train.shape[2]),  kernel_regularizer=l2(hp_l2_reg)))
    model.add(Dropout(rate=hp_dropout))

    model.add(Dense(units = hp_layer_2, activation = 'sigmoid',  kernel_regularizer=l2(hp_l2_reg)))
    model.add(Dropout(rate=hp_dropout))

    if hp_dense_layers >= 2:
        model.add(Dense(units = hp_layer_3, activation = 'sigmoid',  kernel_regularizer=l2(hp_l2_reg)))
        model.add(Dropout(rate=hp_dropout))

    model.add(Dense(units=1))


    model.compile(optimizer = Adam(learning_rate=hp_learning_rate), loss='mse', metrics=['mae'])   

    return model

In [25]:
tuner = kt.Hyperband(model_reg, 
                     objective='val_mae',
                     max_epochs=10,
                     factor = 3,
                     directory = '/home/ubuntu/trials',
                     project_name = 'second')

Reloading Tuner from /home/ubuntu/trials/second/tuner0.json


In [24]:
tuner.search(X_train, y_train, epochs=50, validation_data = (X_val, y_val), callbacks=[stop_early], batch_size = 128)

Trial 30 Complete [00h 05m 16s]
val_mae: 0.1892874836921692

Best val_mae So Far: 0.013418631628155708
Total elapsed time: 01h 04m 10s


In [26]:
best_hp1 = tuner.get_best_hyperparameters(num_trials=1)[0]

In [27]:
model_2 = tuner.hypermodel.build(best_hp1)

model_2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 512)               1056768   
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_3 (Dense)             (None, 128)               65664     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_4 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1122561 (4.28 MB)
Trainable params: 1122561 (4.28 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [28]:
history = model_2.fit(X_train,y_train, epochs = 50, validation_data = (X_val, y_val), batch_size = 128)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [30]:
model_2.save('/home/ubuntu/models/LSTM_2', save_format='tf')

INFO:tensorflow:Assets written to: /home/ubuntu/models/LSTM_2/assets


INFO:tensorflow:Assets written to: /home/ubuntu/models/LSTM_2/assets
