In [3]:
import sys
import numpy as np
import pandas as pd

sys.path.append("../wave_cluster")
from data_tools import *

### Data Import

In [5]:
# Load the US state-level demographics table and the Covid-19 daily
# time-series (number of new cases per day, one column per location).
demographics = pd.read_csv("data/us_demographics.csv", index_col=0)
data = pd.read_csv("data/us_state_daily.csv", index_col=0)

# The first 45 rows (days) are treated as unreliable and discarded.
data = data.iloc[45:]

# A few territories have irregular data; remove their columns.
dr = ['US_VI', 'US_MP', 'US_GU', 'US_AS']
data = data.drop(columns=dr)

# Normalize every location's series to cases per 100,000 persons:
# divide each column by that location's population, then rescale.
population = demographics.loc[data.columns, "population"]
norm_data = data.div(population, axis="columns")
data = norm_data * 100000  # cases per 100,000

# Windowed-average smoothing of each time-series.
# A 7 day average: 3 days in front of and 3 days behind the current time-stamp.
front = 3
back = 3
smooth_data = window_average(data.to_numpy(), front=front, back=back)

# Rebuild the frame with an index trimmed to match the smoothed output:
# `front` labels are dropped from the start and `back + 1` from the end.
smoothed_index = data.index[front:-(back + 1)]
data = pd.DataFrame(smooth_data, index=smoothed_index, columns=data.columns)

# Negative entries (reporting errors that survive smoothing) are set to zero.
data = data.mask(data < 0, 0)

# A few extreme spikes remain which are believed to be reporting errors.
# They are averaged out below, hopefully without losing real signal.

# US_DC: replace the single value at row 565 with the mean of its neighbours.
idd = data.columns.get_loc("US_DC")
data.iloc[565, idd] = 0.5 * (data.iloc[564, idd] + data.iloc[566, idd])

# US_MO: re-smooth rows 350..419 with a wider window (10 in front, 10 behind)
# and write the result back into rows 361..409 (i.e. 350 + 11 : 420 - 10).
# NOTE(review): the first smoothing pass labels its output starting at offset
# `front`, whereas this write-back starts at offset 11 (front + 1 for a
# front of 10) -- confirm against window_average which alignment is intended.
idd = data.columns.get_loc("US_MO")
mo_col = data["US_MO"].to_numpy().flatten()
smooth_vec = window_average(mo_col[350:420], 10, 10)
data.iloc[361:410, idd] = smooth_vec