In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [4]:
# Load both cleaned CSVs with 'Date' parsed as datetime, order each frame
# chronologically, and use 'Date' as the index.

df1 = (
    pd.read_csv('cleaned_data/cleaned_US_cases_and_deaths.csv', parse_dates=['Date'])
    .sort_values('Date')
    .set_index('Date')
)
df2 = (
    pd.read_csv('cleaned_data/cleaned_us_covid_daily_reports.csv', parse_dates=['Date'])
    .sort_values('Date')
    .set_index('Date')
)

# Preview the first rows of each frame (df1 first, then df2).
display(df1.head())
display(df2.head())

Unnamed: 0_level_0,UID,iso2,iso3,code3,Province_State,Country_Region,Lat,Long_,Combined_Key,Cases,Population,Deaths
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-01-22,84040005,US,USA,840,Oklahoma,US,34.373666,-96.038025,"Atoka, Oklahoma, US",0,13758,0
2020-01-22,84090006,US,USA,840,California,US,0.0,0.0,"Unassigned, California, US",0,0,0
2020-01-22,84016025,US,USA,840,Idaho,US,43.466657,-114.806589,"Camas, Idaho, US",0,1106,0
2020-01-22,84025009,US,USA,840,Massachusetts,US,42.668763,-70.946872,"Essex, Massachusetts, US",0,789034,0
2020-01-22,84051157,US,USA,840,Virginia,US,38.682956,-78.15827,"Rappahannock, Virginia, US",0,7370,0


Unnamed: 0_level_0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,FIPS,Incident_Rate,Total_Test_Results,Case_Fatality_Ratio,UID,ISO3,Testing_Rate,Coordinates
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2020-04-12,Alabama,US,2020-04-12 23:18:15,32.3182,-86.9023,3667,93,1.0,74.788122,10878490.0,1.625139,84000001.0,USA,114536.357556,"(32.3182, -86.9023)"
2020-04-12,New Hampshire,US,2020-04-12 23:18:15,43.4525,-71.5639,929,23,33.0,68.323342,2374730.0,1.126753,84000033.0,USA,849.592303,"(43.4525, -71.5639)"
2020-04-12,New Jersey,US,2020-04-12 23:18:15,40.2989,-74.521,61850,2350,34.0,696.337277,16949240.0,2.058999,84000034.0,USA,1426.844055,"(40.2989, -74.521)"
2020-04-12,New Mexico,US,2020-04-12 23:18:15,34.8405,-106.2485,1245,26,35.0,59.375371,4932993.0,1.655681,84000035.0,USA,1720.625619,"(34.8405, -106.2485)"
2020-04-12,New York,US,2020-04-12 23:18:15,42.1657,-74.9481,189033,9382,36.0,971.714125,461601.0,1.998068,84000036.0,USA,2372.835493,"(42.1657, -74.9481)"


In [6]:
# Merge the two sources into a single frame keyed on (UID, Date).

# df2 columns carried into the merge (every df2 column except ISO3).
keep = [
    'Province_State', 'Country_Region', 'Last_Update', 'Lat', 'Long_',
    'Confirmed', 'Deaths', 'FIPS', 'Incident_Rate', 'Total_Test_Results',
    'Case_Fatality_Ratio', 'UID', 'Testing_Rate', 'Coordinates',
]

# Left join: every df1 row survives; df2 values are attached where UID and
# Date both match. Columns present on both sides get `_x` (df1) / `_y` (df2)
# suffixes; the df2 copies are discarded and the df1 copies get their plain
# names back.
# NOTE(review): the null counts shown further down indicate only 4,038 of
# 3,819,954 rows received df2 values. The df1 preview shows county-style UIDs
# (e.g. 84040005) while the df2 preview shows state-style UIDs (e.g. 84000001),
# so UID rarely matches -- confirm whether Province_State + Date is the
# intended join key.
merged_df = (
    df1.merge(df2[keep], how='left', on=['UID', 'Date'])
    .drop(
        columns=['Province_State_y', 'Country_Region_y', 'Lat_y', 'Long__y', 'Deaths_y'],
        errors='ignore',
    )
    .rename(columns={
        'Province_State_x': 'Province_State',
        'Country_Region_x': 'Country_Region',
        'Lat_x': 'Lat',
        'Long__x': 'Long_',
        'Deaths_x': 'Deaths',
    })
)

# Unparseable timestamps become NaT instead of raising.
merged_df['Last_Update'] = pd.to_datetime(merged_df['Last_Update'], errors='coerce')

# UID as a plain integer; missing values (if any) become 0 first.
merged_df['UID'] = merged_df['UID'].fillna(0).astype(int)

# FIPS as text: the integer part rendered as a string, or '' when missing.
merged_df['FIPS'] = merged_df['FIPS'].apply(lambda v: '' if pd.isna(v) else str(int(v)))

merged_df.head()

Unnamed: 0_level_0,UID,iso2,iso3,code3,Province_State,Country_Region,Lat,Long_,Combined_Key,Cases,Population,Deaths,Last_Update,Confirmed,FIPS,Incident_Rate,Total_Test_Results,Case_Fatality_Ratio,Testing_Rate,Coordinates
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2020-01-22,84040005,US,USA,840,Oklahoma,US,34.373666,-96.038025,"Atoka, Oklahoma, US",0,13758,0,NaT,,,,,,,
2020-01-22,84090006,US,USA,840,California,US,0.0,0.0,"Unassigned, California, US",0,0,0,NaT,,,,,,,
2020-01-22,84016025,US,USA,840,Idaho,US,43.466657,-114.806589,"Camas, Idaho, US",0,1106,0,NaT,,,,,,,
2020-01-22,84025009,US,USA,840,Massachusetts,US,42.668763,-70.946872,"Essex, Massachusetts, US",0,789034,0,NaT,,,,,,,
2020-01-22,84051157,US,USA,840,Virginia,US,38.682956,-78.15827,"Rappahannock, Virginia, US",0,7370,0,NaT,,,,,,,


In [None]:
# Encode the state name as an integer feature and carve out the single-state
# subset used for the first modelling experiment.

label_encoder = LabelEncoder()

# Add the integer code next to the original column. 'Province_State' is
# intentionally left on merged_df: dropping it in place made this cell fail
# with a KeyError when re-run (its own input was gone) and removed the only
# human-readable state label from the merged frame.
merged_df['Province_State_encoded'] = label_encoder.fit_transform(merged_df['Province_State'])

# testing on 'Wyoming' (label_encoder.transform raises if the label is unseen)
wyoming_encoded = label_encoder.transform(['Wyoming'])[0]

# Rows for Wyoming only, as an independent copy. The raw state name and
# 'Deaths' (not used for now) are dropped from the subset, so subset_df has
# the same columns as before.
subset_df = (
    merged_df.loc[merged_df['Province_State_encoded'] == wyoming_encoded]
    .drop(columns=['Province_State', 'Deaths'])
    .copy()
)
subset_df.head()

In [None]:

# Ensure the subset is indexed by a chronologically sorted datetime index,
# whether 'Date' arrives as a column or is already the index.
if 'Date' in subset_df.columns:
    subset_df['Date'] = pd.to_datetime(subset_df['Date'])
    subset_df = subset_df.sort_values('Date').set_index('Date')
else:
    subset_df.index = pd.to_datetime(subset_df.index)
    subset_df = subset_df.sort_index()

features = ['Cases', 'Population', 'Province_State_encoded']
target = 'Cases'  # predicting cases for now
lookback = 14

# The source data has one row per county per date, so a window of `lookback`
# consecutive rows would cover a mix of counties on the same day rather than
# `lookback` days. Collapse to exactly one row per date so that each row is a
# day: cases and population are summed across the subset, and the state code
# (expected to be constant within this single-state subset) is carried over.
# NOTE(review): if per-county forecasts are wanted instead, build the windows
# per county rather than summing.
subset_df = subset_df.groupby(level=0).agg(
    {'Cases': 'sum', 'Population': 'sum', 'Province_State_encoded': 'first'}
)

# had better MSE and val loss with scaling
# Fit the scaler on the earliest part of the series only. The most recent
# rows are held out for testing by the chronological 80/20 split later in
# this cell, and fitting min/max on them would leak test information into
# training. Later values may therefore scale above 1.
scaler_fit_fraction = 0.8  # keep <= the train fraction used for the split
n_fit_rows = max(int(scaler_fit_fraction * len(subset_df)), 1)
scaler = MinMaxScaler()
scaler.fit(subset_df[features].iloc[:n_fit_rows])
subset_df[features] = scaler.transform(subset_df[features])
# sequences
def create_sequences(data, features, lookback, target):
    """Build sliding-window samples for one-step-ahead prediction.

    Sample ``i`` uses rows ``i .. i + lookback - 1`` of ``data[features]`` as
    input and the value of ``data[target]`` at row ``i + lookback`` as label.
    Rows are taken in their existing order, so ``data`` must already be
    sorted the way the windows should run.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows.
    features : list of str
        Column names used as model inputs.
    lookback : int
        Number of consecutive rows per input window.
    target : str
        Column whose next-row value is predicted.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_samples, lookback, n_features)``.
    y : numpy.ndarray
        Shape ``(n_samples,)``.

    ``n_samples`` is ``max(len(data) - lookback, 0)``. When there are too few
    rows, correctly shaped empty arrays are returned, so callers can still
    read ``X.shape[1]`` and ``X.shape[2]``.
    """
    # Pull the columns out of pandas once instead of slicing the frame on
    # every loop iteration.
    feature_values = data[features].to_numpy()
    target_values = data[target].to_numpy()

    n_samples = max(len(data) - lookback, 0)

    # Row i of window_idx holds the row positions i, i+1, ..., i+lookback-1.
    window_idx = np.arange(n_samples)[:, None] + np.arange(lookback)[None, :]

    X = feature_values[window_idx]
    # Copy so the returned labels never alias the DataFrame's own buffer.
    y = target_values[lookback:lookback + n_samples].copy()
    return X, y

# Build the windowed samples from the prepared subset.
X, y = create_sequences(subset_df, features, lookback, target)
if len(X) == 0:
    raise ValueError(
        f"Need more than {lookback} rows to build a sequence; got {len(subset_df)}"
    )

# Chronological split: the first 80% of windows train the model and the
# most recent 20% are held out.
split = int(0.8 * len(X))
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(42)

# build lstm and train
# The input shape (timesteps, features) is declared with an explicit Input
# object rather than the `input_shape=` argument on the first layer.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1], X_train.shape[2])),
    LSTM(64, activation='relu'),
    Dropout(0.2),
    Dense(1)  # predicting a single value: Cases
])

model.compile(optimizer='adam', loss='mse')
model.summary()

# NOTE(review): the held-out windows serve both as validation data during
# fitting and as the final test set below; the reported test loss is
# therefore the same quantity as the last epoch's val_loss.
history = model.fit(X_train, y_train,
                    epochs=50,
                    batch_size=32,
                    validation_data=(X_test, y_test))

test_loss = model.evaluate(X_test, y_test)
print("Test loss (MSE):", test_loss)

In [30]:
# Inspect the column labels available in df2.
df2.columns

Index(['Province_State', 'Country_Region', 'Last_Update', 'Lat', 'Long_',
       'Confirmed', 'Deaths', 'FIPS', 'Incident_Rate', 'Total_Test_Results',
       'Case_Fatality_Ratio', 'UID', 'ISO3', 'Testing_Rate', 'Coordinates'],
      dtype='object')

In [30]:
# (rows, columns) of the merged frame.
merged_df.shape

(3819954, 20)

In [24]:
# Missing-value count per column; high counts in the columns that came from
# df2 mean few rows found a match in the merge.
merged_df.isnull().sum()

UID                          0
iso2                         0
iso3                         0
code3                        0
Province_State               0
Country_Region               0
Lat                          0
Long_                        0
Combined_Key                 0
Cases                        0
Population                   0
Deaths                       0
Last_Update            3815916
Confirmed              3815916
FIPS                         0
Incident_Rate          3815916
Total_Test_Results     3815916
Case_Fatality_Ratio    3815916
Testing_Rate           3815916
Coordinates            3815916
dtype: int64

In [28]:
# Summary statistics, transposed so each described column is one row.
merged_df.describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
UID,3819954.0,83428874.204288,16.0,84018105.0,84029205.0,84046119.0,84099999.0,4323530.293789
code3,3819954.0,834.489962,16.0,840.0,840.0,840.0,850.0,36.524886
Lat,3819954.0,36.721258,-14.271,33.89666,38.004509,41.580717,69.314792,9.078591
Long_,3819954.0,-88.640757,-174.1596,-97.805006,-89.48851,-82.312372,145.6739,21.781304
Cases,3819954.0,14087.564984,-3073.0,330.0,2272.0,8159.0,3710586.0,63145.061001
Population,3819954.0,99603.521135,0.0,9917.0,24909.0,64979.0,10039107.0,324115.647989
Deaths,3819954.0,186.881503,-82.0,4.0,37.0,122.0,35545.0,772.752361
Last_Update,4038.0,2021-09-21 03:22:16.336800512,2020-04-12 23:18:15,2021-01-08 11:30:46.500000,2021-09-15 03:31:56,2022-06-14 04:32:28,2023-03-10 04:31:51,
Confirmed,4038.0,10684.91258,0.0,132.0,5565.5,13212.0,61027.0,15728.773888
Incident_Rate,4038.0,9496.411871,0.0,155.955317,4636.209196,18208.599023,37159.697739,11028.719099
