In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random 
import os

In [2]:
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.metrics import mean_squared_error, r2_score
import keras_tuner as kt

In [3]:
SEED = 0

def set_seeds(seed=SEED):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)

def set_global_determinism(seed=SEED):
    set_seeds(seed=seed)

    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

    tf.config.threading.set_inter_op_parallelism_threads(1)
    tf.config.threading.set_intra_op_parallelism_threads(1)

set_global_determinism(seed=SEED)

In [4]:
import sys
print(sys.executable)

c:\Users\dangn\anaconda3\envs\tensorflowgpu\python.exe


In [5]:
import tensorflow as tf   
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [6]:
df = pd.read_csv("datasets\covid19vaccinesbycountybydemographic.csv")

df

Unnamed: 0,county,county_type,demographic_category,demographic_value,est_population,est_age_12plus_pop,est_age_5plus_pop,administered_date,partially_vaccinated,total_partially_vaccinated,...,cumulative_fully_vaccinated,at_least_one_dose,cumulative_at_least_one_dose,cumulative_unvax_total_pop,cumulative_unvax_12plus_pop,cumulative_unvax_5plus_pop,suppress_data,up_to_date_count,cumulative_up_to_date_count,administered_year
0,,HPI_RCP_TRACT,HPI Quartile Census Mixed,1,,,,2024-08-23,0.0,17088.0,...,69129.0,0.0,86225.0,,,,False,0,8609,2024
1,,HPI_RCP_TRACT,HPI Quartile Census Mixed,1,,,,2024-08-22,0.0,17088.0,...,69129.0,0.0,86225.0,,,,False,0,8609,2024
2,,HPI_RCP_TRACT,HPI Quartile Census Mixed,1,,,,2024-08-21,4.0,17088.0,...,69129.0,4.0,86225.0,,,,False,8,8609,2024
3,,HPI_RCP_TRACT,HPI Quartile Census Mixed,1,,,,2024-08-20,0.0,17084.0,...,69129.0,0.0,86221.0,,,,False,0,8601,2024
4,,HPI_RCP_TRACT,HPI Quartile Census Mixed,1,,,,2024-08-19,2.0,17084.0,...,69129.0,2.0,86221.0,,,,False,2,8601,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185832,Yuba,DERIVED FROM RECIP ZIP,VEM Quartile,4,39.0,28.0,39.0,2020-07-31,0.0,0.0,...,0.0,0.0,0.0,39.0,28.0,39.0,False,0,0,2020
2185833,Yuba,DERIVED FROM RECIP ZIP,VEM Quartile,4,39.0,28.0,39.0,2020-07-30,0.0,0.0,...,0.0,0.0,0.0,39.0,28.0,39.0,False,0,0,2020
2185834,Yuba,DERIVED FROM RECIP ZIP,VEM Quartile,4,39.0,28.0,39.0,2020-07-29,0.0,0.0,...,0.0,0.0,0.0,39.0,28.0,39.0,False,0,0,2020
2185835,Yuba,DERIVED FROM RECIP ZIP,VEM Quartile,4,39.0,28.0,39.0,2020-07-28,0.0,0.0,...,0.0,0.0,0.0,39.0,28.0,39.0,False,0,0,2020


In [7]:
df = df[df['demographic_category'].str.contains("Race/Ethnicity")]

df['demographic_category'].unique()

array(['Race/Ethnicity'], dtype=object)

In [8]:
df['demographic_value'].unique()

array(['American Indian or Alaska Native', 'Asian',
       'Black or African American', 'Latino', 'Multiracial',
       'Native Hawaiian or Other Pacific Islander', 'Other Race',
       'Unknown', 'White'], dtype=object)

In [9]:
df = df[~df['demographic_value'].str.contains("Other Race|Unknown")]

df['demographic_value'].unique()

array(['American Indian or Alaska Native', 'Asian',
       'Black or African American', 'Latino', 'Multiracial',
       'Native Hawaiian or Other Pacific Islander', 'White'], dtype=object)

In [10]:
df['county'].unique()

array(['Alameda', 'Alpine', 'Amador', 'Butte', 'Calaveras', 'Colusa',
       'Contra Costa', 'Del Norte', 'El Dorado', 'Fresno', 'Glenn',
       'Humboldt', 'Imperial', 'Inyo', 'Kern', 'Kings', 'Lake', 'Lassen',
       'Los Angeles', 'Madera', 'Marin', 'Mariposa', 'Mendocino',
       'Merced', 'Modoc', 'Mono', 'Monterey', 'Napa', 'Nevada', 'Orange',
       'Placer', 'Plumas', 'Riverside', 'Sacramento', 'San Benito',
       'San Bernardino', 'San Diego', 'San Francisco', 'San Joaquin',
       'San Luis Obispo', 'San Mateo', 'Santa Barbara', 'Santa Clara',
       'Santa Cruz', 'Shasta', 'Sierra', 'Siskiyou', 'Solano', 'Sonoma',
       'Stanislaus', 'Statewide', 'Sutter', 'Tehama', 'Trinity', 'Tulare',
       'Tuolumne', 'Ventura', 'Yolo', 'Yuba'], dtype=object)

In [11]:
encoder = LabelEncoder()
df['county_encoded'] = encoder.fit_transform(df['county'])
df['demographic_encoded'] = encoder.fit_transform(df['demographic_value'])

In [12]:
df['percent_fully_vaccinated'] = df['cumulative_fully_vaccinated'] / df['est_population']

df = df[df['percent_fully_vaccinated'] != 0]
df = df[df['percent_fully_vaccinated'].notnull()]
df['percent_fully_vaccinated'] = df['percent_fully_vaccinated'].clip(upper=1)

In [13]:
df['administered_date'] = pd.to_datetime(df['administered_date'], format = '%Y-%m-%d')

df['time_since_start'] = (df['administered_date'] - df['administered_date'].min()).dt.days

In [14]:
prep_df = df[['percent_fully_vaccinated', 'county_encoded', 'demographic_encoded', 'time_since_start']]

prep_df

Unnamed: 0,percent_fully_vaccinated,county_encoded,demographic_encoded,time_since_start
24240,1.000000,0,0,1465
24241,1.000000,0,0,1464
24242,1.000000,0,0,1463
24243,1.000000,0,0,1462
24244,1.000000,0,0,1461
...,...,...,...,...
2179837,0.000025,58,6,20
2179838,0.000025,58,6,19
2179839,0.000025,58,6,18
2179840,0.000025,58,6,17


In [15]:
prep_df = prep_df[prep_df['percent_fully_vaccinated'] != 0]
prep_df = prep_df[prep_df['percent_fully_vaccinated'].notnull()]


prep_df

Unnamed: 0,percent_fully_vaccinated,county_encoded,demographic_encoded,time_since_start
24240,1.000000,0,0,1465
24241,1.000000,0,0,1464
24242,1.000000,0,0,1463
24243,1.000000,0,0,1462
24244,1.000000,0,0,1461
...,...,...,...,...
2179837,0.000025,58,6,20
2179838,0.000025,58,6,19
2179839,0.000025,58,6,18
2179840,0.000025,58,6,17


In [16]:
time_step = 10


grouped_county = prep_df.groupby(['county_encoded', 'demographic_encoded'])

In [17]:
X = []
y = []

for name, group in grouped_county:
    values = group[['percent_fully_vaccinated', 'county_encoded', 'demographic_encoded', 'time_since_start']].values

    for i in range(len(values) - time_step):
        X.append(values[i:i+time_step,:])
        y.append(values[i+time_step, 0])

X = np.array(X)
y = np.array(y)

In [18]:
X.shape

(518314, 10, 4)

In [19]:
y.shape

(518314,)

In [18]:
np.set_printoptions(suppress=True)

X

array([[[   1.        ,    0.        ,    0.        , 1465.        ],
        [   1.        ,    0.        ,    0.        , 1464.        ],
        [   1.        ,    0.        ,    0.        , 1463.        ],
        ...,
        [   1.        ,    0.        ,    0.        , 1458.        ],
        [   1.        ,    0.        ,    0.        , 1457.        ],
        [   1.        ,    0.        ,    0.        , 1456.        ]],

       [[   1.        ,    0.        ,    0.        , 1464.        ],
        [   1.        ,    0.        ,    0.        , 1463.        ],
        [   1.        ,    0.        ,    0.        , 1462.        ],
        ...,
        [   1.        ,    0.        ,    0.        , 1457.        ],
        [   1.        ,    0.        ,    0.        , 1456.        ],
        [   1.        ,    0.        ,    0.        , 1455.        ]],

       [[   1.        ,    0.        ,    0.        , 1463.        ],
        [   1.        ,    0.        ,    0.        , 1462. 

In [19]:
def model_builder(hp):
    model = Sequential()

    hp_activation = hp.Choice('activation', values=['relu', 'tanh'])

    hp_dense_layers = hp.Choice('num_dense_layers', [1,2,3])

    hp_layer_1 = hp.Int('layer_1', min_value=16, max_value=512, step=16)

    hp_layer_2 = hp.Int('layer_2', min_value=16, max_value=512, step=16)

    hp_layer_3 = hp.Int('layer_3', min_value=16, max_value=512, step=16)

    hp_layer_4 = hp.Int('layer_4', min_value=16, max_value=512, step=16)


    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    model.add(LSTM(units = hp_layer_1, activation=hp_activation, input_shape = (X.shape[1],X.shape[2])))

    model.add(Dense(units = hp_layer_2, activation = hp_activation))

    if hp_dense_layers >= 2:
        model.add(Dense(units = hp_layer_3, activation = hp_activation))
    if hp_dense_layers == 3:
        model.add(Dense(units = hp_layer_4, activation = hp_activation))

    model.add(Dense(units=1, activation=hp_activation))


    model.compile(optimizer = Adam(learning_rate=hp_learning_rate), loss='mse', metrics=['mae'])   

    return model

In [20]:
tuner = kt.Hyperband(model_builder, 
                     objective='val_mae',
                     max_epochs=10,
                     factor = 3,
                     directory = 'trials\county',
                     project_name = 'one')

Reloading Tuner from trials\county\one\tuner0.json


In [21]:
stop_early = EarlyStopping(monitor = 'val_mae', patience = 5)

In [24]:
tuner.search(X, y, epochs=50, validation_split = 0.2, callbacks=[stop_early])

Trial 30 Complete [00h 15m 37s]
val_mae: 0.0017375904135406017

Best val_mae So Far: 0.001705626375041902
Total elapsed time: 04h 01m 31s


In [22]:
best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]

In [26]:
# model = tuner.hypermodel.build(best_hp)

# model.summary()

In [23]:
def model_regularizers(hp):
    model = Sequential()

    hp_dropout = hp.Choice('dropout_rate', values=[0.0, 0.2, 0.3, 0.4, 0.5])
    hp_l2_reg = hp.Choice('l2_reg_strength', values=[0.0, 1e-2, 1e-3, 1e-4])

    model.add(LSTM(units = best_hp.get("layer_1"), activation=best_hp.get("activation"), input_shape = (X.shape[1],X.shape[2]),  kernel_regularizer=l2(hp_l2_reg)))
    model.add(Dropout(rate=hp_dropout))

    model.add(Dense(units = best_hp.get("layer_2"), activation=best_hp.get("activation"), kernel_regularizer=l2(hp_l2_reg)))
    model.add(Dropout(rate=hp_dropout))

    if best_hp.get("num_dense_layers") >= 2:
        model.add(Dense(units = best_hp.get("layer_3"), activation=best_hp.get("activation"), kernel_regularizer=l2(hp_l2_reg)))
        model.add(Dropout(rate=hp_dropout))
    if best_hp.get("num_dense_layers") == 3:
        model.add(Dense(units = best_hp.get("layer_4"), activation=best_hp.get("activation"), kernel_regularizer=l2(hp_l2_reg)))
        model.add(Dropout(rate=hp_dropout))

    model.add(Dense(units=1, activation=best_hp.get("activation")))

    model.compile(optimizer = Adam(learning_rate=best_hp.get("learning_rate")), loss='mse', metrics=['mae'])

    return model

    

In [24]:
tuner = kt.Hyperband(model_regularizers, 
                     objective='val_mae',
                     max_epochs=10,
                     factor = 3,
                     directory = 'trials\county',
                     project_name = 'second')

Reloading Tuner from trials\county\second\tuner0.json


In [29]:
tuner.search(X, y, epochs=50, validation_split = 0.2, callbacks=[stop_early])

Trial 27 Complete [00h 08m 57s]
val_mae: 0.15486955642700195

Best val_mae So Far: 0.0014074437785893679
Total elapsed time: 02h 05m 20s


In [25]:
best_hp1 = tuner.get_best_hyperparameters(num_trials=1)[0]

In [26]:
model = tuner.hypermodel.build(best_hp1)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 224)               205184    
                                                                 
 dropout (Dropout)           (None, 224)               0         
                                                                 
 dense (Dense)               (None, 224)               50400     
                                                                 
 dropout_1 (Dropout)         (None, 224)               0         
                                                                 
 dense_1 (Dense)             (None, 16)                3600      
                                                                 
 dropout_2 (Dropout)         (None, 16)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 1

In [27]:
history = model.fit(X, y, epochs = 50, validation_split = 0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [28]:
model.save('models/LSTM_county_r', save_format='tf')



INFO:tensorflow:Assets written to: models/LSTM_county_r\assets


INFO:tensorflow:Assets written to: models/LSTM_county_r\assets
