In [1]:
import math
import random

import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor


In [2]:
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so pandas SettingWithCopyWarning and
# library deprecation notices are hidden as well; consider narrowing it.
warnings.filterwarnings('ignore')

In [3]:
def convert_month(df):
    """Add a per-zip running row counter named ``month_continuous``.

    For every distinct value in ``df['zip']`` the rows belonging to that zip
    are numbered 1, 2, ..., n in the order they appear in ``df``.
    (Assumes the rows of each zip are already in chronological order, one row
    per period -- TODO confirm against the input files.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``zip`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows grouped by ascending zip (original index
        labels preserved) and the extra ``month_continuous`` column.
    """
    # Group on the frame that was passed in. The previous version read the zip
    # list from the global ``df_train``, which silently dropped zips that only
    # occur in ``df`` and broke when ``df_train`` was not defined yet.
    frames = [
        # ``assign`` returns a new frame, so no chained assignment on a slice.
        group.assign(month_continuous=np.arange(1, len(group) + 1))
        for _, group in df.groupby('zip', sort=True)
    ]

    if not frames:
        # pd.concat([]) raises; an empty input simply gains an empty column.
        return df.assign(month_continuous=np.array([], dtype=int))

    return pd.concat(frames)

def agg_monthly(col_to_agg, df_train, df_test, groupby_cols):
    """Add the month counter to both frames and aggregate them by ``groupby_cols``.

    Parameters
    ----------
    col_to_agg : dict
        Column name -> aggregation, handed to ``GroupBy.aggregate``.
    df_train, df_test : pandas.DataFrame
        Frames to process; each one goes through ``convert_month`` first.
    groupby_cols : list
        Grouping keys; they become regular columns again in the result.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_aggregated, test_aggregated)``.
    """
    def _convert_and_aggregate(frame):
        # convert month, then aggregate on the requested keys
        with_month = convert_month(frame)
        grouped = with_month.groupby(groupby_cols)
        return grouped.aggregate(col_to_agg).reset_index()

    return _convert_and_aggregate(df_train), _convert_and_aggregate(df_test)

def split_sequence3(df, x_cols, y_col, n_steps):
    """Cut each zip's rows into sliding (input window, following target window) pairs.

    For every start row ``j`` within one zip, the inputs are rows
    ``j .. j+n_steps-1`` of ``x_cols`` and the targets are the next
    ``n_steps`` rows of ``y_col``. Start positions that would run past the
    zip's last row are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``zip`` column plus ``x_cols`` and ``y_col``.
    x_cols : list
        Input feature columns.
    y_col : list or str
        Target column selector, used as ``df[y_col]``.
    n_steps : int
        Length of both the input window and the target window.

    Returns
    -------
    tuple of numpy.ndarray
        ``X`` is built from one list of windows *per zip*; ``y`` is built from
        a single flat list of target windows across all zips. The two arrays
        therefore have different leading layouts.
    """
    X, y = [], []

    for zip_code in np.unique(df.zip):
        rows_for_zip = df[df.zip == zip_code]
        n_rows = rows_for_zip.shape[0]

        # Number of start positions that still leave room for n_steps inputs
        # followed by n_steps targets (never more than the row count itself).
        n_starts = min(n_rows, n_rows - 2 * n_steps + 1)

        windows_for_zip = []
        for start in range(n_starts):
            split_at = start + n_steps
            stop = split_at + n_steps

            windows_for_zip.append(rows_for_zip[x_cols].iloc[start:split_at].values)
            y.append(rows_for_zip[y_col].iloc[split_at:stop].values)

        X.append(windows_for_zip)

    return np.array(X), np.array(y)


def normalize_data(X_train, X_test=None):
    """Min-max scale ``X_train`` and, optionally, ``X_test`` with the same scaler.

    The scaler is always fitted on ``X_train`` only; ``X_test`` is transformed
    with the statistics learned from the training data.

    Parameters
    ----------
    X_train : array-like, 2D
        Data used to fit the ``MinMaxScaler``.
    X_test : array-like, 2D, optional
        Extra data to transform with the fitted scaler.

    Returns
    -------
    numpy.ndarray or tuple of numpy.ndarray
        The scaled training data when ``X_test`` is omitted, otherwise
        ``(X_train_scaled, X_test_scaled)``.
    """
    scaler = MinMaxScaler()
    X_train_normalized = scaler.fit_transform(X_train)

    # Identity check, not ``== None``: comparing an array or DataFrame to None
    # is elementwise, and using that result in ``if`` raises "truth value is
    # ambiguous" as soon as a real test set is passed.
    if X_test is None:
        return X_train_normalized

    X_test_normalized = scaler.transform(X_test)
    return X_train_normalized, X_test_normalized


In [4]:
# Column -> aggregation mapping handed to agg_monthly(), which forwards it to
# GroupBy.aggregate(): 'mean' averages a column within each group and the
# builtin `sum` totals it. Key order here is the column order of the result.
# Entries left in trailing comments are currently disabled.
col_to_agg = {'sex':'mean', 'age':'mean','deaths':sum, # 'death':sum, 'dead':'mean', 
       'dual':sum, 'poverty': 'mean', 'popdensity': 'mean', 
       'medianhousevalue': 'mean','pct_blk': 'mean', 'medhouseholdincome' : 'mean', 'pct_owner_occ': 'mean',
       'hispanic': 'mean','education': 'mean', 'smoke_rate': 'mean', 'mean_bmi': 'mean', 
       'rmax': 'mean', 'pr': 'mean', 'population':sum,
       'race_0':'mean', 'race_1':'mean', 'race_2':'mean', 'race_3':'mean', 
       'race_4':'mean', 'race_5':'mean', 'race_6':'mean',
       'ICU_DAY':sum, 'CCI_DAY':sum, 'LOS':'mean', 'Parkinson_pdx2dx_25':sum,
       'Alzheimer_pdx2dx_25':sum, 'Dementia_pdx2dx_25':sum, 'CHF_pdx2dx_25':sum,
       'AMI_pdx2dx_25':sum, 'COPD_pdx2dx_25':sum, 'DM_pdx2dx_25':sum, 'Stroke_pdx2dx_25':sum,
       'CVD_pdx2dx_25':sum, 'CSD_pdx2dx_25':sum, 'Ischemic_stroke_pdx2dx_25':sum,
       'Hemo_Stroke_pdx2dx_25':sum, 'neo_140_149':sum, 'neo_150_159':sum, 'neo_160_165':sum,
       'neo_170_176':sum, 'neo_179_189':sum, 'neo_190_199':sum, 'neo_200_209':sum,
       'neo_210_229':sum, 'neo_230_234':sum, 'neo_235_238':sum, 'neo_239':sum, 
       'pm25_summer_4y_avg':'mean', 'pm25_winter_4y_avg':'mean', 'pm25_fall_4y_avg':'mean', 'pm25_spring_4y_avg': 'mean',
       'ozone_summer_4y_avg':'mean', 'ozone_winter_4y_avg':'mean', 'ozone_fall_4y_avg': 'mean', 'ozone_spring_4y_avg': 'mean',
       'no2_summer_4y_avg':'mean', 'no2_winter_4y_avg': 'mean', 'no2_fall_4y_avg': 'mean', 'no2_spring_4y_avg': 'mean', 
       'summer_tmmx_4y_avg': 'mean','summer_rmax_4y_avg': 'mean', 
       'winter_tmmx_4y_avg': 'mean', 'winter_rmax_4y_avg': 'mean',
        'm_count':sum, 'f_count':sum, 'mean_age':'mean',
       'white_count':sum, 'black_count':sum, 'hispanic_count':sum, 'asian_count':sum,
       'native_count':sum, 'monthly_pop':'mean' #,'deaths_next_year': sum
        }

In [5]:
# Input feature columns: passed as `x_cols` to split_sequence3(), and their
# count (`len(x_col2)`) is the feature dimension of the network input.
# The target column 'deaths' is deliberately not part of this list.
x_col2 = ['sex','age','poverty', 'popdensity', 'medianhousevalue',
       'medhouseholdincome', 'pct_owner_occ', 'education',
       'smoke_rate', 'mean_bmi', 'rmax', 'pr', 'ICU_DAY',
       'CCI_DAY', 'LOS', 'Parkinson_pdx2dx_25', 'Alzheimer_pdx2dx_25',
       'Dementia_pdx2dx_25', 'CHF_pdx2dx_25', 'AMI_pdx2dx_25',
       'COPD_pdx2dx_25', 'DM_pdx2dx_25', 'Stroke_pdx2dx_25', 'CVD_pdx2dx_25',
       'CSD_pdx2dx_25', 'Ischemic_stroke_pdx2dx_25', 'Hemo_Stroke_pdx2dx_25',
       'neo_140_149', 'neo_150_159', 'neo_160_165', 'neo_170_176',
       'neo_179_189', 'neo_190_199', 'neo_200_209', 'neo_210_229',
       'neo_230_234', 'neo_235_238', 'neo_239', 'pm25_summer_4y_avg',
       'pm25_winter_4y_avg', 'pm25_fall_4y_avg', 'pm25_spring_4y_avg',
       'ozone_summer_4y_avg', 'ozone_winter_4y_avg', 'ozone_fall_4y_avg',
       'ozone_spring_4y_avg', 'no2_summer_4y_avg', 'no2_winter_4y_avg',
       'no2_fall_4y_avg', 'no2_spring_4y_avg', 'summer_tmmx_4y_avg',
       'summer_rmax_4y_avg', 'winter_tmmx_4y_avg', 'winter_rmax_4y_avg',
       'm_count', 'f_count', 'mean_age', 'white_count', 'black_count',
       'hispanic_count', 'asian_count', 'native_count', 'monthly_pop']

# Target column selector; kept as a one-element list, so `df[y_col1]` is a
# single-column DataFrame rather than a Series.
y_col1 = ['deaths']

# Load the train / test tables, discarding the 'Unnamed: 1' and 'zip.1' columns.
cols_to_drop = ['Unnamed: 1','zip.1']
df_train = pd.read_csv('zip_train_v3.csv').drop(columns=cols_to_drop)
df_test = pd.read_csv('zip_test_v3.csv').drop(columns=cols_to_drop)

# Add the per-zip month counter and aggregate on the grouping keys below.
groupby_cols1 = ['zip','AYEAR','month_continuous']
df_train_agg2, df_test_agg2 = agg_monthly(col_to_agg, df_train, df_test, groupby_cols1)

# Window length: n_steps input rows are paired with the following n_steps target rows.
n_steps = 12

# Only the training set is windowed.
# problem: test data - does not have y value for 12 months/1 year in advance
X_train_rnn3, y_train_rnn3 = split_sequence3(df_train_agg2, x_col2, y_col1, n_steps)
#X_test_rnn3, y_test_rnn3 = split_sequence3(test_temp, x_col2, y_col1, n_steps)

# The scaler needs 2D input: flatten every (zip, window, step) row into one
# (row, feature) matrix. The targets keep only their first two axes.
xdim2 = len(x_col2)
n_windows_total = X_train_rnn3.shape[0] * X_train_rnn3.shape[1]

X_train_rnn3_re = X_train_rnn3.reshape((n_windows_total * n_steps, xdim2))
y_train_rnn3_re = y_train_rnn3.reshape(y_train_rnn3.shape[:2])

# Normalize, then go back to the 3D (window, step, feature) layout for RNN input.
X_train_rnn_normalized3 = normalize_data(X_train_rnn3_re).reshape(
    (n_windows_total, n_steps, xdim2)
)

In [6]:
# Sanity check: number of input features per time step (len(x_col2)).
xdim2
# Reference for grid-searching Keras hyperparameters through scikit-learn:
# https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/

63

In [7]:
def create_model_n_units(n_units=12, n_steps=12, n_features=63, drop_rate=0.1):
    """Build and compile the stacked bidirectional-GRU regressor.

    Architecture: two bidirectional GRU layers that return full sequences
    (each followed by batch normalisation), a third bidirectional GRU that
    returns only its final state, a 64-unit ReLU dense layer, dropout, and a
    linear output layer with one unit per forecast step.

    Parameters
    ----------
    n_units : int
        Units in each GRU direction. This is the hyperparameter tuned by the
        grid search.
    n_steps : int
        Length of the input window and number of output values. The default
        matches the previously hard-coded 12.
    n_features : int
        Features per time step. The default matches the previously
        hard-coded 63.
    drop_rate : float
        Dropout rate applied before the output layer.

    Returns
    -------
    tf.keras.Sequential
        Model compiled with the Adam optimizer, mean-squared-error loss and a
        root-mean-squared-error metric.
    """
    layers = tf.keras.layers
    rmse = tf.keras.metrics.RootMeanSquaredError()

    # create model
    bigru_model = tf.keras.Sequential()
    bigru_model.add(layers.Input(shape=(n_steps, n_features)))
    bigru_model.add(layers.Bidirectional(layers.GRU(n_units, return_sequences=True)))
    bigru_model.add(layers.BatchNormalization())
    bigru_model.add(layers.Bidirectional(layers.GRU(n_units, return_sequences=True)))
    bigru_model.add(layers.BatchNormalization())
    bigru_model.add(layers.Bidirectional(layers.GRU(n_units)))
    bigru_model.add(layers.Dense(64, activation='relu'))
    bigru_model.add(layers.Dropout(drop_rate))
    bigru_model.add(layers.Dense(n_steps, activation='linear'))

    # compile model -- `metrics` is documented as a list; a bare Metric object
    # is not accepted by every Keras version.
    bigru_model.compile(optimizer='adam', loss="mean_squared_error",
                        metrics=[rmse])

    return bigru_model

In [None]:
# Grid search over the GRU width with 3-fold cross-validation.
# The network is a regressor (MSE loss, linear multi-step output), so it is
# wrapped in KerasRegressor: its score is the negated loss, whereas
# KerasClassifier would score the folds by classification accuracy.
model1 = KerasRegressor(build_fn=create_model_n_units, epochs=20,
                        batch_size=256, verbose=0)

n_units = [10, 25, 50, 75, 100]
param_grid1 = dict(n_units=n_units)

grid = GridSearchCV(estimator=model1, param_grid=param_grid1, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train_rnn_normalized3, y_train_rnn3_re)


In [None]:
# hyperparameters
n_units = 50
# Kept for experiments with Dropout layers; none of the layers added below use it.
drop_rate = 0.2

# Define learning rate scheduler
# NOTE(review): `scheduler` and `get_lr_metric` are not defined in the visible
# part of this notebook -- make sure the cell that defines them runs first.
lrate = LearningRateScheduler(scheduler)
callbacks_list = [lrate]

optimizer = tf.keras.optimizers.Adam()
lr_metric = get_lr_metric(optimizer)
loss = "mean_squared_error"
# Regression target: track RMSE (as create_model_n_units does) rather than
# 'accuracy', which is not meaningful for continuous outputs.
metrics = [tf.keras.metrics.RootMeanSquaredError(), lr_metric]

batch_size = 64
epochs = 10
validation_split = 0.1
verbose = 1

# define model: four stacked bidirectional GRU layers (the first three return
# full sequences and are batch-normalised), then a dense head that emits one
# value per forecast step.
bigru_model = tf.keras.Sequential()
bigru_model.add(tf.keras.layers.Input(shape=(n_steps, xdim2)))
bigru_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(n_units, return_sequences=True)))
bigru_model.add(tf.keras.layers.BatchNormalization())
bigru_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(n_units, return_sequences=True)))
bigru_model.add(tf.keras.layers.BatchNormalization())
bigru_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(n_units, return_sequences=True)))
bigru_model.add(tf.keras.layers.BatchNormalization())
bigru_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(n_units)))
bigru_model.add(tf.keras.layers.Dense(64, activation='relu'))
bigru_model.add(tf.keras.layers.Dense(n_steps, activation='linear'))

# Compile model and show the layer summary (fitting happens in a later cell)
bigru_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
bigru_model.summary()

In [None]:
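# Training call, currently disabled.
# NOTE(review): the original referenced X_train_rnn_normalized1 / y_train_rnn1_re,
# which are not defined in the visible notebook; the names below are the arrays
# built in the preprocessing cell -- confirm before re-enabling.
# bigru_history = bigru_model.fit(X_train_rnn_normalized3,
#                                 y_train_rnn3_re, batch_size=batch_size, epochs=epochs,
#                                 validation_split=validation_split,
#                                 callbacks=callbacks_list,verbose=verbose)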
