In [1]:
import pandas as pd
import numpy as np
#!pip install linearmodels
from linearmodels import PanelOLS
#from linearmodels import RandomEffects
#!pip install statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
# !pip install gpboost
# import gpboost as gpb
# that is the package we will use to do the boosted mixed effects
!pip install plotly
import plotly.express as px
#import plotly.graph_objects as go
#import plotly.figure_factory as ff




# Data Wrangling

In [None]:
#df_ned = pd.read_csv("../data/non_epidemic_data.csv")
#df_ned.head()

# will not all this data simply be 1 fixed effect or random effect?
# every one of these will be measured by a group-based intercept
# don't think it's useful to include these
# if wrong, feel free to adapt model to include these

## CDC Data

In [3]:
cdc_data = ["cdc_community_transmission", "cdc_gathering_bans", "cdc_mask_mandates",  "cdc_stay_at_home"]
# "cdc_vaccinations", "cdc_vaccine_hesitancy" have not been included yet
# these still need to be cleaned by Achilleas
# have thusfar also been omitted due to lots of vars and very unclear
# "cdc_bar_closings", "cdc_restaurant_closings

In [4]:
li = []
for i in cdc_data:
    df = pd.read_csv(f"../data/cdc_scraper/{i}.csv", low_memory=False)
    df["date"] = pd.to_datetime(df["date"])
    df["state_fips"] = df["state_fips"].astype("int")
    df["county_fips"] = df["state_fips"].astype("int")
    df.set_index(['date', 'state_fips', 'county_fips'])
    li.append(df)
df_indep = pd.concat(li)

In [5]:
df_indep.drop(["state_code", "county_name", "fips_code_text"], axis=1, inplace=True)
df_indep.set_index(['date', 'state_fips', 'county_fips'], inplace=True)

In [6]:
numerical_vars = ["cases_per_100k_7_day_count", "general_gb_order_code", "percent_test_results_reported"]
categorical_vars = [x for x in df_indep.columns if not x in numerical_vars or numerical_vars.remove(x)]
for i in numerical_vars:
    df_indep[i] = df_indep[i].astype("float64")
for i in categorical_vars:
    df_indep[i] = df_indep[i].astype("category")

## Infection Data

In [7]:
county_daily_df = pd.read_csv("../data/county_daily.csv")

In [8]:
county_daily_df["county_fips"] = county_daily_df["fips"]%1000
county_daily_df["state_fips"] = county_daily_df["fips"]//1000 
county_daily_df["state_fips"].fillna(0)
county_daily_df.drop(["Unnamed: 0" ,"fips", "county", "state", "cases_avg", "deaths_avg"], axis=1, inplace=True)

In [9]:
county_daily_df["date"] = pd.to_datetime(county_daily_df["date"])
county_daily_df["state_fips"] = county_daily_df["state_fips"].astype("int")
county_daily_df["county_fips"] = county_daily_df["state_fips"].astype("int")
county_daily_df.set_index(['date', 'state_fips', 'county_fips'], inplace=True)
county_daily_df.head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,cases,deaths
date,state_fips,county_fips,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-21,53,53,1,0
2020-01-22,53,53,0,0
2020-01-23,53,53,0,0
2020-01-24,53,53,0,0
2020-01-24,17,17,1,0
2020-01-25,53,53,0,0
2020-01-25,17,17,0,0
2020-01-25,6,6,1,0
2020-01-26,53,53,0,0
2020-01-26,17,17,0,0


## Merging

In [10]:
df = pd.concat([county_daily_df, df_indep])

In [11]:
df.reset_index(inplace=True)

In [12]:
# time should be integer encoded for mixed models
df["ts"] = df["date"].values.astype(np.int64)
df["ts"] = df["ts"] - df["ts"].min()

In [13]:
# only 1 category is allowed in the model.
# therefore, combining state and county fips
df["fips"] = df["state_fips"]*1000 + df["county_fips"]

In [None]:
df

# Panel Data Analysis

## Infection Numbers

Remember for the paper that we deliberately have not chosen for multivariate ARIMA.

We will extend the same functional form by a boosting procedure. This will also help us with the disbalance in the data.

In [15]:
dfs = df.copy()
dfs = dfs[["fips", "cases", "ts"]]
dfs.dropna(inplace=True)
dfs

Unnamed: 0,fips,cases,ts
0,53053,1.0,0
1,53053,0.0,86400000000000
2,53053,0.0,172800000000000
3,53053,0.0,259200000000000
4,17017,1.0,259200000000000
...,...,...,...
2603925,69069,0.0,73526400000000000
2603926,69069,0.0,73526400000000000
2603927,78078,0.0,73526400000000000
2603928,78078,0.0,73526400000000000


In [None]:
model = smf.mixedlm("cases ~ ts", dfs, groups=dfs["fips"])
result = model.fit(method=["bfgs"])
print(result.summary())

In [None]:
# Mixed Model - to reformulate in more python based format
#endog = 
#exog = 
#groups = 
#exog_re = 
#mod = sm.MixedLM(endog, exog, groups=data["tree"], exog_re=exog["Intercept"])


In [27]:
fig = px.line(dfs.iloc[-20000:], x="ts", y="cases")
fig.show()