# Imports

In [15]:
import pandas as pd
import numpy as np
import os
import pickle

import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling for the whole notebook: seaborn's default theme, the larger
# "talk" context, and a 16 x 8 default figure size.
sns.set()
sns.set_context('talk')
plt.rcParams.update({'figure.figsize': [16, 8]})

# Date Time, Lag, Window Features

In [16]:
# Load the existing flu data set.
# NOTE(review): pickle.load can execute arbitrary code -- only load files produced by this project.
flu_path = os.path.join('..', 'data', 'df_flu_all_features.pkl')
with open(flu_path, 'rb') as fh:  # context manager guarantees the handle is closed
    df_flu = pickle.load(fh)

# Work on a clean positional index so every shift below lines up row-for-row.
df_flu = df_flu.reset_index(drop=True)

# Targets: y1 is the current week's total, y2..y4 are the following three weeks.
# Renaming first is a no-op when 'y1' already exists, which keeps this cell
# re-runnable after the save cell has overwritten the input pickle with the
# engineered frame (previously a second run failed with KeyError: 'Total Cases').
df_flu = df_flu.rename(columns={'Total Cases': 'y1'})
for weeks_ahead in (1, 2, 3):
    df_flu['y' + str(weeks_ahead + 1)] = df_flu['y1'].shift(-weeks_ahead)

# add season (meteorological seasons keyed by calendar month)
months_by_season = {
    'Winter': (12, 1, 2),
    'Spring': (3, 4, 5),
    'Summer': (6, 7, 8),
    'Fall': (9, 10, 11),
}
season_dict = {
    month: season
    for season, months in months_by_season.items()
    for month in months
}
df_flu["season"] = df_flu["month"].map(season_dict)

# include 7 lag features (based on autocorrelation plot showing significant
# autocorrelation for 7 previous values); column 't - k' holds y1 from k rows earlier.
# Direct assignment overwrites existing lag columns instead of duplicating them on re-run.
N_LAGS = 7
for lag in range(1, N_LAGS + 1):
    df_flu['t - ' + str(lag)] = df_flu['y1'].shift(lag)

# same week from previous year
# NOTE(review): assumes exactly one row per week and 52 rows per year -- confirm
# there are no missing weeks or 53-week years, otherwise this drifts.
df_flu['this week last year'] = df_flu['y1'].shift(52)

# use 2-week moving average (based on EDA - simplifies the pattern without losing too much data)
# Computed on 't - 1' so the window covers only past weeks, never the current-week target y1.
df_flu['two-week moving average'] = df_flu['t - 1'].rolling(window=2).mean()

# expanding mean of all past weeks (again based on 't - 1', so y1 itself is excluded)
df_flu['expanding mean'] = df_flu['t - 1'].expanding().mean()

In [17]:
# Split the engineered frame at the start of March 2020:
# weeks ending before the cutoff are "pre-COVID", the rest are "COVID".
covid_cutoff = pd.to_datetime('2020-03-01')
week_ending = df_flu['Week Ending']

df_flu_pre_COVID = df_flu.loc[week_ending < covid_cutoff]
df_flu_COVID = df_flu.loc[week_ending >= covid_cutoff]

In [18]:
# Persist the full frame and the two period subsets as pickles.
# NOTE: the first file is the same one the feature cell loads, so the input is overwritten.
data_dir = os.path.join('..', 'data')
frames_to_save = [
    ('df_flu_all_features.pkl', df_flu),
    ('df_flu_pre_COVID_all_features.pkl', df_flu_pre_COVID),
    ('df_flu_COVID_all_features.pkl', df_flu_COVID),
]
for file_name, frame in frames_to_save:
    # 'with' flushes and closes each file even if pickling raises
    with open(os.path.join(data_dir, file_name), 'wb') as fh:
        pickle.dump(frame, fh)

In [22]:
# Display the full engineered frame (the rendered table elides the middle rows/columns with "...")
df_flu

Unnamed: 0,year,month,Week Ending,y1,Perceived health very good or excellent 13,Perceived health fair or poor 13,Perceived mental health very good or excellent 14,Perceived mental health fair or poor 14,Perceived life stress most days quite a bit or extremely stressful 15,Body mass index adjusted self-reported adult (18 years and over) overweight 16 17 18 19 20 21,...,t - 1,t - 2,t - 3,t - 4,t - 5,t - 6,t - 7,this week last year,two-week moving average,expanding mean
0,2015,9,2015-09-05,9.0,,,,,,,...,,,,,,,,,,
1,2015,9,2015-09-12,18.0,,,,,,,...,9.0,,,,,,,,,9.000000
2,2015,9,2015-09-19,15.0,,,,,,,...,18.0,9.0,,,,,,,13.5,13.500000
3,2015,9,2015-09-26,29.0,,,,,,,...,15.0,18.0,9.0,,,,,,16.5,14.000000
4,2015,10,2015-10-03,59.0,,,,,,,...,29.0,15.0,18.0,9.0,,,,,22.0,17.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
374,2022,11,2022-11-05,2407.0,19416200.0,3702600.0,18653600.0,3713700.0,6605100.0,10058800.0,...,1031.0,438.0,251.0,149.0,124.0,84.0,33.0,19.0,734.5,707.256684
375,2022,11,2022-11-12,4144.0,19416200.0,3702600.0,18653600.0,3713700.0,6605100.0,10058800.0,...,2407.0,1031.0,438.0,251.0,149.0,124.0,84.0,20.0,1719.0,711.789333
376,2022,11,2022-11-19,5460.0,19416200.0,3702600.0,18653600.0,3713700.0,6605100.0,10058800.0,...,4144.0,2407.0,1031.0,438.0,251.0,149.0,124.0,20.0,3275.5,720.917553
377,2022,11,2022-11-26,7773.0,19416200.0,3702600.0,18653600.0,3713700.0,6605100.0,10058800.0,...,5460.0,4144.0,2407.0,1031.0,438.0,251.0,149.0,27.0,4802.0,733.488064


In [21]:
# Display the pre-COVID subset (rows whose 'Week Ending' is before 2020-03-01)
df_flu_pre_COVID

Unnamed: 0,year,month,Week Ending,y1,Perceived health very good or excellent 13,Perceived health fair or poor 13,Perceived mental health very good or excellent 14,Perceived mental health fair or poor 14,Perceived life stress most days quite a bit or extremely stressful 15,Body mass index adjusted self-reported adult (18 years and over) overweight 16 17 18 19 20 21,...,t - 1,t - 2,t - 3,t - 4,t - 5,t - 6,t - 7,this week last year,two-week moving average,expanding mean
0,2015,9,2015-09-05,9.0,,,,,,,...,,,,,,,,,,
1,2015,9,2015-09-12,18.0,,,,,,,...,9.0,,,,,,,,,9.000000
2,2015,9,2015-09-19,15.0,,,,,,,...,18.0,9.0,,,,,,,13.5,13.500000
3,2015,9,2015-09-26,29.0,,,,,,,...,15.0,18.0,9.0,,,,,,16.5,14.000000
4,2015,10,2015-10-03,59.0,,,,,,,...,29.0,15.0,18.0,9.0,,,,,22.0,17.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,2020,2,2020-02-01,4789.0,19464600.0,3464200.0,20701100.0,2494800.0,6729400.0,9850600.0,...,4182.0,3549.0,3811.0,3845.0,3126.0,2206.0,1290.0,2037.0,3865.5,942.056522
231,2020,2,2020-02-08,4654.0,19464600.0,3464200.0,20701100.0,2494800.0,6729400.0,9850600.0,...,4789.0,4182.0,3549.0,3811.0,3845.0,3126.0,2206.0,1709.0,4485.5,958.709957
232,2020,2,2020-02-15,3989.0,19464600.0,3464200.0,20701100.0,2494800.0,6729400.0,9850600.0,...,4654.0,4789.0,4182.0,3549.0,3811.0,3845.0,3126.0,1521.0,4721.5,974.637931
233,2020,2,2020-02-22,3656.0,19464600.0,3464200.0,20701100.0,2494800.0,6729400.0,9850600.0,...,3989.0,4654.0,4789.0,4182.0,3549.0,3811.0,3845.0,1558.0,4321.5,987.575107


In [23]:
# Display the COVID-era subset (rows whose 'Week Ending' is on or after 2020-03-01)
df_flu_COVID

Unnamed: 0,year,month,Week Ending,y1,Perceived health very good or excellent 13,Perceived health fair or poor 13,Perceived mental health very good or excellent 14,Perceived mental health fair or poor 14,Perceived life stress most days quite a bit or extremely stressful 15,Body mass index adjusted self-reported adult (18 years and over) overweight 16 17 18 19 20 21,...,t - 1,t - 2,t - 3,t - 4,t - 5,t - 6,t - 7,this week last year,two-week moving average,expanding mean
235,2020,3,2020-03-07,3404.0,19464600.0,3464200.0,20701100.0,2494800.0,6729400.0,9850600.0,...,3576.0,3656.0,3989.0,4654.0,4789.0,4182.0,3549.0,1759.0,3616.0,1009.944681
236,2020,3,2020-03-14,2991.0,19464600.0,3464200.0,20701100.0,2494800.0,6729400.0,9850600.0,...,3404.0,3576.0,3656.0,3989.0,4654.0,4789.0,4182.0,1634.0,3490.0,1020.088983
237,2020,3,2020-03-21,1619.0,19464600.0,3464200.0,20701100.0,2494800.0,6729400.0,9850600.0,...,2991.0,3404.0,3576.0,3656.0,3989.0,4654.0,4789.0,1625.0,3197.5,1028.405063
238,2020,3,2020-03-28,512.0,19464600.0,3464200.0,20701100.0,2494800.0,6729400.0,9850600.0,...,1619.0,2991.0,3404.0,3576.0,3656.0,3989.0,4654.0,1621.0,2305.0,1030.886555
239,2020,4,2020-04-04,161.0,19464600.0,3464200.0,20701100.0,2494800.0,6729400.0,9850600.0,...,512.0,1619.0,2991.0,3404.0,3576.0,3656.0,3989.0,1593.0,1065.5,1028.715481
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
374,2022,11,2022-11-05,2407.0,19416200.0,3702600.0,18653600.0,3713700.0,6605100.0,10058800.0,...,1031.0,438.0,251.0,149.0,124.0,84.0,33.0,19.0,734.5,707.256684
375,2022,11,2022-11-12,4144.0,19416200.0,3702600.0,18653600.0,3713700.0,6605100.0,10058800.0,...,2407.0,1031.0,438.0,251.0,149.0,124.0,84.0,20.0,1719.0,711.789333
376,2022,11,2022-11-19,5460.0,19416200.0,3702600.0,18653600.0,3713700.0,6605100.0,10058800.0,...,4144.0,2407.0,1031.0,438.0,251.0,149.0,124.0,20.0,3275.5,720.917553
377,2022,11,2022-11-26,7773.0,19416200.0,3702600.0,18653600.0,3713700.0,6605100.0,10058800.0,...,5460.0,4144.0,2407.0,1031.0,438.0,251.0,149.0,27.0,4802.0,733.488064
