In [None]:
import pandas as pd 
import numpy as np 
import json
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tqdm import tqdm 
from sklearn.metrics import mean_squared_error
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")
import random
import time 

In [None]:
# Load the microbusiness-density table and the clusters JSON.
mbd_data = pd.read_csv('./outputs/mbd_data.csv')
# Use a context manager so the JSON file handle is closed deterministically.
with open('./outputs/clusters_xgboost.json', "r") as clusters_file:
    clusters = json.load(clusters_file)
cluster_keys = ["0","1","2","3","4","5","6","-1"]
mbd_data = mbd_data.sort_values(by=["cfips","first_day_of_month"]).reset_index(drop=True)

# Assign the filled columns back explicitly: calling `ffill(inplace=True)` on a
# column obtained via attribute access is a chained in-place operation and does
# not update the frame when pandas Copy-on-Write is active.
# NOTE(review): the fill runs over the whole frame sorted by (cfips, date), so a
# county's leading gaps are filled from the previous county's last value —
# confirm this is intended.
mbd_data["median_hh_inc"] = mbd_data["median_hh_inc"].ffill()
mbd_data["pct_it_workers"] = mbd_data["pct_it_workers"].ffill()

# Rescale household income to [0, 1]; the scaler works on a 2-D column vector.
sc = MinMaxScaler()
mbd_data["median_hh_inc"] = sc.fit_transform(
    mbd_data["median_hh_inc"].values.reshape(-1, 1)
).reshape(-1)

cfips = mbd_data.cfips.unique()

In [None]:
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error as a fraction.

    Each element contributes ``|y_pred - y_true| / ((|y_pred| + |y_true|) / 2)``.
    Elements where both values are zero contribute 0 instead of the NaN that a
    0/0 division would produce, so one exact zero-vs-zero pair no longer turns
    the whole score into NaN.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values with broadcast-compatible shapes.

    Returns
    -------
    float
        Mean of the per-element ratios (range 0..2, not multiplied by 100).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_pred) + np.abs(y_true)) / 2

    # A zero denominator implies a zero numerator; divide by 1 there so no
    # 0/0 is evaluated, then force that element's contribution to 0.
    is_zero = denominator == 0
    safe_denominator = np.where(is_zero, 1.0, denominator)
    ratios = np.where(is_zero, 0.0, numerator / safe_denominator)
    return np.mean(ratios)

In [None]:
# Plot a county 

def plot_county(cfip):
    """Plot the microbusiness-density time series of one county.

    Parameters
    ----------
    cfip : int
        County identifier, compared against ``mbd_data.cfips``.

    Raises
    ------
    IndexError
        If ``mbd_data`` contains no rows for ``cfip``.
    """
    # Filter once and reuse the selection for the title, dates and values.
    county_rows = mbd_data[mbd_data.cfips == cfip]
    if county_rows.empty:
        raise IndexError(f"no rows in mbd_data for cfips {cfip}")
    location = county_rows.county.iloc[0] + ", " + county_rows.state.iloc[0]

    # Take the x-axis from the county's own rows rather than a fixed date
    # range, so the x and y lengths always agree.
    dates = pd.to_datetime(county_rows["first_day_of_month"]).to_numpy()
    values = county_rows["microbusiness_density"].values

    # Create a figure and axis object
    fig, ax = plt.subplots(figsize=(10, 6))

    # Plot the line
    ax.plot(dates, values)

    # Set the axis labels and title
    ax.set_xlabel('Date')
    ax.set_ylabel('Microbusiness Density')
    ax.set_title(f'{location} ( CFIP {cfip} )  ')

    # Rotate the x-axis tick labels
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    # Show the plot
    plt.show()

In [None]:
def time_series_split(arr, k):
    """Return every length-``k`` sliding window over ``arr`` as rows of an array.

    Parameters
    ----------
    arr : array-like
        Sequence to window along its first axis.
    k : int
        Window length; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Array with ``len(arr) - k + 1`` rows, row ``i`` being ``arr[i:i+k]``.
        When ``k > len(arr)`` an empty array with shape ``(0, k, ...)`` is
        returned, so callers can always use 2-D indexing on the result.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"window length k must be >= 1, got {k}")

    series = np.asarray(arr)
    n = len(series)
    if k > n:
        # Keep the return type consistent (ndarray, not a bare list).
        return np.empty((0, k) + series.shape[1:], dtype=series.dtype)

    # Index matrix: row i holds the positions i, i+1, ..., i+k-1.
    starts = np.arange(n - k + 1)[:, np.newaxis]
    offsets = np.arange(k)[np.newaxis, :]
    return series[starts + offsets]

In [None]:
# Window length: k-1 lagged values are the inputs, the k-th value is the target.
k = 4
cfip = clusters["0"][9]

# Build the sliding windows for the chosen county.
data = time_series_split(
    mbd_data.loc[mbd_data.cfips == int(cfip), "microbusiness_density"].values,
    k,
)

# Chronological 90/10 split: the earliest windows train, the latest ones test.
train_size = int(0.9*len(data))
X_train, y_train = data[:train_size, :-1], data[:train_size, -1:]
X_test, y_test = data[train_size:, :-1], data[train_size:, -1:]

# Insert a length-1 axis between samples and features.
X_train = X_train[:, np.newaxis, :]
X_test = X_test[:, np.newaxis, :]

plot_county(int(cfip))

In [None]:
# Seed Python, NumPy and TensorFlow before any weights are created.
random.seed(123)
np.random.seed(123)
tf.random.set_seed(123)

# One LSTM layer over a single time step carrying k-1 features, followed by a
# one-unit dense output.
lstm_layer = tf.keras.layers.LSTM(32, input_shape=(1, k - 1))
output_layer = tf.keras.layers.Dense(1)
model = tf.keras.Sequential([lstm_layer, output_layer])

# Adam optimizer with mean-squared-error loss.
model.compile(loss='mse', optimizer='adam')


In [None]:
# Train silently for 150 epochs on the training windows.
model.fit(x=X_train, y=y_train, epochs=150, verbose=0)

In [None]:
# Training-split predictions get their own names; previously they were stored
# in `y_pred` / `y_true` and overwritten two lines later without being used.
y_pred_train = model.predict(X_train).reshape(-1)
y_true_train = y_train.reshape(-1)

# Test-split predictions; the cells below read `y_pred` and `y_true`.
y_pred = model.predict(X_test).reshape(-1)
y_true = y_test.reshape(-1)

In [None]:
# Display the test-split targets (flattened to 1-D in the prediction cell).
y_true

In [None]:
# Display the model's test-split predictions (flattened to 1-D in the prediction cell).
y_pred

In [None]:
# SMAPE between the current `y_true` and `y_pred`.
smape(y_true=y_true, y_pred=y_pred)

In [None]:
# Residuals (actual minus predicted) for the current `y_true` / `y_pred`.
fig, ax = plt.subplots()
ax.plot(y_true - y_pred)
# Zero line makes over- and under-prediction easy to tell apart.
ax.axhline(0, color='gray', linewidth=0.8, linestyle='--')
ax.set_xlabel('Window index')
ax.set_ylabel('y_true - y_pred')
ax.set_title('Prediction residuals')
plt.show()

In [None]:
# Overlay actual and predicted values; labels and a legend tell the lines apart.
fig, ax = plt.subplots()
ax.plot(y_true, label='y_true')
ax.plot(y_pred, label='y_pred')
ax.set_xlabel('Window index')
ax.set_ylabel('Microbusiness Density')
ax.set_title('Actual vs. predicted')
ax.legend()
plt.show()

In [None]:
# Re-seed Python, NumPy and TensorFlow once, when this cell is executed.
for _set_seed in (random.seed, np.random.seed, tf.random.set_seed):
    _set_seed(123)

def get_model(k,units):
    """Build and compile an LSTM regressor for windows of length ``k``.

    The network is an LSTM layer with ``units`` cells reading inputs of shape
    ``(1, k-1)``, followed by a single-unit dense output. It is compiled with
    the Adam optimizer and mean-squared-error loss.
    """
    recurrent = tf.keras.layers.LSTM(units, input_shape=(1, k - 1))
    head = tf.keras.layers.Dense(1)
    network = tf.keras.Sequential([recurrent, head])

    network.compile(loss='mse', optimizer='adam')
    return network

    
def get_data(k, cfip, train_frac=0.9):
    """Build a chronological train/test split of sliding windows for one county.

    Parameters
    ----------
    k : int
        Window length; the first ``k-1`` values of each window are the inputs
        and the last value is the target.
    cfip : int or str
        County identifier; converted with ``int`` before matching
        ``mbd_data.cfips``.
    train_frac : float, optional
        Fraction of windows (taken from the start) used for training.
        Defaults to 0.9.

    Returns
    -------
    X_train, X_test : numpy.ndarray
        Inputs with shape ``(samples, 1, k-1)``.
    y_train, y_test : numpy.ndarray
        Targets with shape ``(samples, 1)``.

    Raises
    ------
    ValueError
        If the county's series yields no window of length ``k``.
    """
    series = mbd_data.loc[mbd_data.cfips == int(cfip), "microbusiness_density"].values
    windows = np.asarray(time_series_split(series, k))
    # Fail with a clear message instead of an obscure indexing error below.
    if windows.ndim != 2 or len(windows) == 0:
        raise ValueError(
            f"cfips {cfip}: series of length {len(series)} has no window of length {k}"
        )

    train_size = int(train_frac * len(windows))
    features, targets = windows[:, :-1], windows[:, -1:]
    X_train, X_test = features[:train_size], features[train_size:]
    y_train, y_test = targets[:train_size], targets[train_size:]

    # Add the length-1 middle axis so the inputs are (samples, 1, k-1).
    X_train, X_test = X_train[:, np.newaxis, :], X_test[:, np.newaxis, :]
    return X_train, X_test, y_train, y_test


In [None]:
# Grid-search the window length and LSTM width for the selected county and
# record the train / validation SMAPE of every configuration (previously the
# scores were overwritten on each iteration and never stored).
window_sizes = range(3, 13)
unit_options = [4, 8, 12, 16, 32, 64]

grid_records = []
# The context manager closes the progress bar even if a fit raises.
with tqdm(total=len(window_sizes) * len(unit_options)) as pbar:
    for k in window_sizes:
        # The split depends only on k, so build it once per window length.
        X_train, X_test, y_train, y_test = get_data(k, cfip)
        for units in unit_options:
            model = get_model(k, units)
            model.fit(X_train, y_train, epochs=200, verbose=0)

            y_pred = model.predict(X_train, verbose=0).reshape(-1)
            y_true = y_train.reshape(-1)
            train_smape = smape(y_true, y_pred)

            y_pred = model.predict(X_test, verbose=0).reshape(-1)
            y_true = y_test.reshape(-1)
            val_smape = smape(y_true, y_pred)

            grid_records.append({
                "k": k,
                "units": units,
                "train_smape": train_smape,
                "val_smape": val_smape,
            })
            pbar.update(1)

# One row per configuration, best validation score first.
grid_results = pd.DataFrame(grid_records).sort_values("val_smape").reset_index(drop=True)
grid_results.head(10)