# Current App 12/31/21

## Entire App

```python
import streamlit as st

# from fsds.imports import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

import os,glob,sys,joblib,zipfile,json
import datetime as dt
import re

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_dark"

plt.rcParams['figure.figsize'] = (12,6)
pd.set_option('display.max_columns',0)
# fs.check_package_versions(['statsmodels'],fpath=True)



### FORECAST SPECIFIC FUNCTIONS
import statsmodels.api as sms
import statsmodels.tsa.api as tsa
from pmdarima import auto_arima
import project_functions as fn
# from fsds import pandemic as fn
import os,json,glob

with open("FILE_DIRECTORY.json") as f:
    FPATHS = json.load(f)


### Title and introduction text
st.markdown('# Planning for the Pandemic')
st.markdown("""
- James M. Irving, PhD.
    - james.irving.phd@gmail.com
    - [LinkedIn](https://www.linkedin.com/in/james-irving-4246b571/)
    - [GitHub Repo](https://github.com/jirvingphd/predicting-the-pandemic)

___""")

st.markdown("""## ***Goal***
- Covid-19 and the various strains that have since emerged have upended modern life and fundamentally changed how we function as a society.
- Part of what has made it difficult to tackle the pandemic is the differences between states, state laws/policies, and a lack of public understanding about the predictability of the surges in cases. 
- The goal of this dashboard is to provide easy access to state-level coronavirus and hospital capacity statistics.
    - Furthermore, I wanted to provide on-demand timeseries forecasts into the near future for all/any of these statistics.
""")



st.markdown('## ***The Data***')
st.markdown('- This dashboard uses data from several APIs and kaggle datasets. To fetch the latest data, click the button below.')
# The button's value is handed to load_data() further down to decide whether
# the full download workflow is re-run or the saved files are used.
WORKFLOW_BUTTON = st.button("Fetch new data.")

st.markdown('> Note: it can take up to 2 minutes to download the data.')

# NOTE: "John Hopkins" below is the dataset title as quoted in the link text,
# so it is left exactly as written.
st.markdown("""### Sources
- Coronavirus Data by State- # of Cases/Deaths by State
    - [Kaggle Dataset: "COVID-19 data from John Hopkins University"](https://www.kaggle.com/antgoldbloom/covid19-data-from-john-hopkins-university) 
    - Repackaged version of the data from the [official Johns Hopkins Repository](https://github.com/CSSEGISandData/COVID-19)
- Hospital & ICU Occupancy Data:
    - [HealthData.gov API: "COVID-19 Reported Patient Impact and Hospital Capacity by State Timeseries API"](https://healthdata.gov/Hospital/COVID-19-Reported-Patient-Impact-and-Hospital-Capa/g62h-syeh)
""")
# RUN_FULL_WORKFLOW=False


def load_data(WORKFLOW_BUTTON=False):
    """Return the combined states DataFrame and the per-state lookup object.

    When ``WORKFLOW_BUTTON`` compares equal to ``True`` both objects come from
    ``fn.data_acquisition.FULL_WORKFLOW(merge_hospital_data=True)``.
    Otherwise the DataFrame is read from the gzip-compressed CSV at
    ``FPATHS['fpath_final_df_csv']`` (``Date`` parsed as dates, the first two
    columns used as the index) and the lookup is loaded with joblib from
    ``FPATHS['fpath_final_states']``.

    Returns:
        tuple: ``(df_states, STATES)``.
    """
    # Equality with True (not plain truthiness) is kept on purpose so the
    # branch taken is the same as before for every possible argument.
    if WORKFLOW_BUTTON == True:
        fresh_df, fresh_states = fn.data_acquisition.FULL_WORKFLOW(
            merge_hospital_data=True
        )
        return fresh_df, fresh_states

    # Fall back to the files written by a previous workflow run.
    saved_df = pd.read_csv(
        FPATHS['fpath_final_df_csv'],
        compression='gzip',
        parse_dates=['Date'],
        index_col=[0, 1],
    )
    saved_states = joblib.load(FPATHS['fpath_final_states'])
    return saved_df, saved_states

## load data and save options
df, STATES = load_data(WORKFLOW_BUTTON)
# Every column except 'Deaths' and 'Cases' is offered as a selectable statistic.
options_stats = df.drop(['Deaths','Cases'],axis=1).columns.tolist()

st.markdown("___")
st.markdown("## ***Overview - Comparing All States***")

# Date window for the map, anchored on the most recent date present in the
# data (second index level) instead of today's date.
latest_date = df.droplevel(0).index.max()
end_date = latest_date

## plot state map
n_days = st.slider(f"PAST N # OF DAYS BEFORE {latest_date.strftime('%m/%d/%Y')}",value=30,min_value=7,max_value=180)
col = st.selectbox("Which statistic to map?", options_stats)

# NOTE(review): start_date/end_date are not passed to plot_map_columns below;
# only n_days is, so the helper decides its own window.
start_date = pd.Timestamp(latest_date) - pd.Timedelta(f'{str(n_days)} days')


## get map (named state_map so the builtin `map` is not shadowed)
state_map = fn.app_functions.plot_map_columns(df, col=col, last_n_days=n_days,
                                              plot_map=False, return_map=True)

# get just df (same call with return_map=False returns the aggregated table)
df_rank = fn.app_functions.plot_map_columns(df, col=col, last_n_days=n_days,
                                            plot_map=False, return_map=False)

# show map
st.plotly_chart(state_map)

### Plot same stat for different states
st.markdown("___")
st.markdown('## ***Comparing Selected States***')

## select states and stats
stat_to_compare = st.multiselect("Which statistic to compare?",options_stats,
                                 default=["Cases-New"])
states_to_compare = st.multiselect("Which states to compare?",list(STATES.keys()),
                                   default=["NY",'MD','FL','CA','TX'])

## get and show plot
plot_df = fn.app_functions.get_states_to_plot(df, state_list=states_to_compare,
                                              plot_cols=stat_to_compare,
                                              agg_func='mean',
                                              rename_cols=True, fill_method='interpolate',
                                              remove_outliers=False, state_first=True,
                                              threshold_type=['0','%'], diagnose=False)
st.plotly_chart(px.line(plot_df))


st.markdown("___")

# ############################## PRIOR TO  09/21 ###########################
st.markdown('## ***Timeseries Forecasting by State/Statistic***')


# Default training window: the 365 days leading up to the newest date in the data.
default_model_start = latest_date - pd.to_timedelta('365 days')
state_name = st.selectbox('Select State', list(STATES.keys()))
col = st.selectbox("Select statistic",options_stats)
start_date = st.date_input('Start Date for Training Data',
                           value=default_model_start)

# The date widget hands back a plain date object; convert it to a Timestamp so
# the slice bound has the same type as the (presumably datetime) index labels.
df_state = STATES[state_name].loc[pd.Timestamp(start_date):].copy()

ts = df_state[col].copy()
ax = ts.plot(title=f"{state_name}-{col}")
ax.set_ylabel(col)



st.pyplot(ax.get_figure())


st.markdown("""> **Click "`Run model`" below to start the modeling process for the selected state and statistic.**
-  [!] Warning: the gridsearch process may take several minutes. Try selecting a more recent start date to increase performance.""")


# NOTE(review): the callback receives STATES, the state and the statistic but
# not start_date -- confirm the model is meant to ignore the chosen start date.
model_q = st.button('Run model.',
                    on_click=fn.modeling.make_timeseries_model,
                    args=(STATES,state_name,col))



# st.button('Hit me')
# st.checkbox('Check me out')
# st.radio('Radio', [1,2,3])
# st.multiselect('Multiselect', [1,2,3])
# st.slider('Slide me', min_value=0, max_value=10)
# st.select_slider('Slide to select', options=[1,'2'])
# st.text_input('Enter some text')
# st.number_input('Enter a number')
# st.text_area('Area for textual entry')
# st.date_input('Date input')
# st.time_input('Time entry')
# st.file_uploader('File uploader')
# st.color_picker('Pick a color')
```

## Parts of App

In [None]:
# !pip install fsds

```python
import streamlit as st

from fsds.imports import *
import pandas as pd

import os,glob,sys,joblib,zipfile,json
import datetime as dt
import re

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_dark"

plt.rcParams['figure.figsize'] = (12,6)
pd.set_option('display.max_columns',0)
# fs.check_package_versions(['statsmodels'],fpath=True)



### FORECAST SPECIFIC FUNCTIONS
import statsmodels.api as sms
import statsmodels.tsa.api as tsa
from pmdarima import auto_arima
import project_functions as fn
# from fsds import pandemic as fn
import os,json,glob

with open("FILE_DIRECTORY.json") as f:
    FPATHS = json.load(f)
```

```python
### Title and introduction text
st.markdown('# Planning for the Pandemic')
st.markdown("""
- James M. Irving, PhD.
    - james.irving.phd@gmail.com
    - [LinkedIn](https://www.linkedin.com/in/james-irving-4246b571/)
    - [GitHub Repo](https://github.com/jirvingphd/predicting-the-pandemic)

___""")

st.markdown("""## ***Goal***
- Covid-19 and the various strains that have since emerged have upended modern life and fundamentally changed how we function as a society.
- Part of what has made it difficult to tackle the pandemic is the differences between states, state laws/policies, and a lack of public understanding about the predictability of the surges in cases. 
- The goal of this dashboard is to provide easy access to state-level coronavirus and hospital capacity statistics.
    - Furthermore, I wanted to provide on-demand timeseries forecasts into the near future for all/any of these statistics.
""")



st.markdown('## ***The Data***')
st.markdown('- This dashboard uses data from several APIs and kaggle datasets. To fetch the latest data, click the button below.')
# The button's value is later handed to load_data() to decide whether the
# full download workflow is re-run or the saved files are used.
WORKFLOW_BUTTON = st.button("Fetch new data.")

st.markdown('> Note: it can take up to 2 minutes to download the data.')

# NOTE: "John Hopkins" below is the dataset title as quoted in the link text,
# so it is left exactly as written.
st.markdown("""### Sources
- Coronavirus Data by State- # of Cases/Deaths by State
    - [Kaggle Dataset: "COVID-19 data from John Hopkins University"](https://www.kaggle.com/antgoldbloom/covid19-data-from-john-hopkins-university) 
    - Repackaged version of the data from the [official Johns Hopkins Repository](https://github.com/CSSEGISandData/COVID-19)
- Hospital & ICU Occupancy Data:
    - [HealthData.gov API: "COVID-19 Reported Patient Impact and Hospital Capacity by State Timeseries API"](https://healthdata.gov/Hospital/COVID-19-Reported-Patient-Impact-and-Hospital-Capa/g62h-syeh)
""")
# RUN_FULL_WORKFLOW=False
```

```python
def load_data(WORKFLOW_BUTTON=False):
    """Return the combined states DataFrame and the per-state lookup object.

    When ``WORKFLOW_BUTTON`` compares equal to ``True`` both objects come from
    ``fn.data_acquisition.FULL_WORKFLOW(merge_hospital_data=True)``.
    Otherwise the DataFrame is read from the gzip-compressed CSV at
    ``FPATHS['fpath_final_df_csv']`` (``Date`` parsed as dates, the first two
    columns used as the index) and the lookup is loaded with joblib from
    ``FPATHS['fpath_final_states']``.

    Returns:
        tuple: ``(df_states, STATES)``.
    """
    # Equality with True (not plain truthiness) is kept on purpose so the
    # branch taken is the same as before for every possible argument.
    if WORKFLOW_BUTTON == True:
        fresh_df, fresh_states = fn.data_acquisition.FULL_WORKFLOW(
            merge_hospital_data=True
        )
        return fresh_df, fresh_states

    # Fall back to the files written by a previous workflow run.
    saved_df = pd.read_csv(
        FPATHS['fpath_final_df_csv'],
        compression='gzip',
        parse_dates=['Date'],
        index_col=[0, 1],
    )
    saved_states = joblib.load(FPATHS['fpath_final_states'])
    return saved_df, saved_states

## load data and save options
df, STATES = load_data(WORKFLOW_BUTTON)
# Every column except 'Deaths' and 'Cases' is offered as a selectable statistic.
options_stats = df.drop(['Deaths','Cases'],axis=1).columns.tolist()

st.markdown("___")
st.markdown("## ***Overview - Comparing All States***")
## plot state map
n_days = st.slider("PAST N # OF DAYS",value=30,min_value=7,max_value=180)
col = st.selectbox("Which statistic to map?", options_stats)

# calc dates from the newest date present in the data (second index level)
# rather than from today's date, so the window follows the data itself.
latest_date = df.droplevel(0).index.max()
end_date = latest_date
start_date = pd.Timestamp(latest_date) - pd.Timedelta(f'{str(n_days)} days')

## get map (named state_map so the builtin `map` is not shadowed)
state_map = fn.app_functions.plot_map_columns(df, col=col, last_n_days=n_days,
                                              plot_map=False, return_map=True)

# get just df (same call with return_map=False returns the aggregated table)
df_rank = fn.app_functions.plot_map_columns(df, col=col, last_n_days=n_days,
                                            plot_map=False, return_map=False)

# show map
st.plotly_chart(state_map)

### Plot same stat for different states
st.markdown("___")
st.markdown('## ***Comparing Selected States***')

## select states and stats
stat_to_compare = st.multiselect("Which statistic to compare?",options_stats,
                                 default=["Cases-New"])
states_to_compare = st.multiselect("Which states to compare?",list(STATES.keys()),
                                   default=["NY",'MD','FL','CA','TX'])

## get and show plot
plot_df = fn.app_functions.get_states_to_plot(df, state_list=states_to_compare,
                                              plot_cols=stat_to_compare,
                                              agg_func='mean',
                                              rename_cols=True, fill_method='interpolate',
                                              remove_outliers=False, state_first=True,
                                              threshold_type=['0','%'], diagnose=False)
st.plotly_chart(px.line(plot_df))


st.markdown("___")

# ############################## PRIOR TO  09/21 ###########################
st.markdown('## ***Timeseries Forecasting by State/Statistic***')


# Default training window: the 365 days leading up to the newest date in the data.
default_model_start = latest_date - pd.to_timedelta('365 days')
state_name = st.selectbox('Select State', list(STATES.keys()))
col = st.selectbox("Select statistic",options_stats)
start_date = st.date_input('Start Date for Training Data',
                           value=default_model_start)

# The date widget hands back a plain date object; convert it to a Timestamp so
# the slice bound has the same type as the (presumably datetime) index labels.
df_state = STATES[state_name].loc[pd.Timestamp(start_date):].copy()

ts = df_state[col].copy()
ax = ts.plot(title=f"{state_name}-{col}")
ax.set_ylabel(col)



st.pyplot(ax.get_figure())


st.markdown("""> **Click "`Run model`" below to start the modeling process for the selected state and statistic.**
-  [!] Warning: the gridsearch process may take several minutes. Try selecting a more recent start date to increase performance.""")


# NOTE(review): the callback receives STATES, the state and the statistic but
# not start_date -- confirm the model is meant to ignore the chosen start date.
model_q = st.button('Run model.',
                    on_click=fn.modeling.make_timeseries_model,
                    args=(STATES,state_name,col))



# st.button('Hit me')
# st.checkbox('Check me out')
# st.radio('Radio', [1,2,3])
# st.multiselect('Multiselect', [1,2,3])
# st.slider('Slide me', min_value=0, max_value=10)
# st.select_slider('Slide to select', options=[1,'2'])
# st.text_input('Enter some text')
# st.number_input('Enter a number')
# st.text_area('Area for textual entry')
# st.date_input('Date input')
# st.time_input('Time entry')
# st.file_uploader('File uploader')
# st.color_picker('Pick a color')

```

# Test Workflow

### Notes on Data Acquisition:

- I should create a function for each dataframe that I create, including:
    - population
    - hospital data
    - kaggle covid data
    
>- **Need to add back normalizing by population**
    - Issue was...? 
        - Old population data, not 2020 census


## Test App Code

In [1]:
import streamlit as st

from fsds.imports import *
import pandas as pd

import os,glob,sys,joblib,zipfile,json
import datetime as dt
import re

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_dark"

plt.rcParams['figure.figsize'] = (12,6)
pd.set_option('display.max_columns',0)
# fs.check_package_versions(['statsmodels'],fpath=True)



### FORECAST SPECIFIC FUNCTIONS
import statsmodels.api as sms
import statsmodels.tsa.api as tsa
from pmdarima import auto_arima
import project_functions as fn
# from fsds import pandemic as fn
import os,json,glob

with open("FILE_DIRECTORY.json") as f:
    FPATHS = json.load(f)
FPATHS

fsds v0.4.14 loaded.


Package,Handle,Version,Documentation,Imported
pandas,pd,1.1.3,https://pandas.pydata.org/docs/,Y
fsds,fs,0.4.14,https://fs-ds.readthedocs.io/en/latest/,Y
numpy,np,1.21.5,https://numpy.org/doc/stable/reference/,Y
matplotlib,mpl,3.3.1,https://matplotlib.org/stable/api/index.html,Y
matplotlib.pyplot,plt,,https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.html#module-matplotlib.pyplot,Y
seaborn,sns,0.11.0,https://seaborn.pydata.org/api.html,Y
IPython.display,dp,,https://ipython.readthedocs.io/en/stable/api/generated/IPython.display.html,Y
sklearn,,0.23.2,,N


{'fpath_raw': './data_raw/',
 'fpath_clean': './data/',
 'fpath_reference': './reference_data/',
 'fpath_state_data': './data/state_data/',
 'fpath_final_states': './data/FINAL_STATES.joblib',
 'fpath_final_df_csv': './data/FINAL_STATES.csv.gz',
 'fpath_final_df_pickle': './data/FINAL_STATES.pickle',
 'name_of_fpath_vars': ['fpath_raw',
  'fpath_clean',
  'fpath_reference',
  'fpath_state_data',
  'fpath_final_states',
  'fpath_final_df_csv',
  'fpath_final_df_pickle',
  'name_of_fpath_vars']}

In [2]:
def load_data(WORKFLOW_BUTTON=False):
    """Return the combined states DataFrame and the per-state lookup object.

    When ``WORKFLOW_BUTTON`` compares equal to ``True`` both objects come from
    ``fn.data_acquisition.FULL_WORKFLOW(merge_hospital_data=True)``.
    Otherwise the DataFrame is read from the gzip-compressed CSV at
    ``FPATHS['fpath_final_df_csv']`` (``Date`` parsed as dates, the first two
    columns used as the index) and the lookup is loaded with joblib from
    ``FPATHS['fpath_final_states']``.

    Returns:
        tuple: ``(df_states, STATES)``.
    """
    # Equality with True (not plain truthiness) is kept on purpose so the
    # branch taken is the same as before for every possible argument.
    if WORKFLOW_BUTTON == True:
        fresh_df, fresh_states = fn.data_acquisition.FULL_WORKFLOW(
            merge_hospital_data=True
        )
        return fresh_df, fresh_states

    # Fall back to the files written by a previous workflow run.
    saved_df = pd.read_csv(
        FPATHS['fpath_final_df_csv'],
        compression='gzip',
        parse_dates=['Date'],
        index_col=[0, 1],
    )
    saved_states = joblib.load(FPATHS['fpath_final_states'])
    return saved_df, saved_states

In [3]:
WORKFLOW_BUTTON=False
## load data and save options
df, STATES = load_data(WORKFLOW_BUTTON)
options_stats= df.drop(['Deaths','Cases'],axis=1).columns.tolist()

[i] Retrieving kaggle dataset: antgoldbloom/covid19-data-from-john-hopkins-university
- Loading data from RAW_us_confirmed_cases.csv
- Loading data from RAW_us_deaths.csv
[i] Retrieving hospital data from https://healthdata.gov/resource/g62h-syeh.csv
[i] Workflow completed.
	Run time=0:00:51.464473 sec.
[i]The final files of note:
	./data/combined_us_states_full_data.csv
	./data/STATE_DICT.joblib
[i] Final joined data (DF) saved as ./data/FINAL_STATES.csv.gz
[i] Final joined data (DF) saved as ./data/FINAL_STATES.pickle


In [4]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Deaths,Cases,Hospitalized Currently,ICU-Covid Currently,Deaths-New,Cases-New
Unnamed: 0_level_1,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AK,2020-03-23,0,39,0.0,0.0,0.0,0.0
AK,2020-03-24,0,43,0.0,0.0,0.0,4.0
AK,2020-03-25,1,50,0.0,0.0,1.0,7.0
AK,2020-03-26,1,64,0.0,0.0,0.0,14.0
AK,2020-03-27,1,75,0.0,0.0,0.0,11.0
...,...,...,...,...,...,...,...
WY,2021-12-26,1526,114242,63.0,28.0,0.0,0.0
WY,2021-12-27,1526,114624,64.0,26.0,0.0,382.0
WY,2021-12-28,1526,114917,69.0,21.0,0.0,293.0
WY,2021-12-29,1526,115242,74.0,25.0,0.0,325.0


In [5]:
df.loc['MD']

Unnamed: 0_level_0,Deaths,Cases,Hospitalized Currently,ICU-Covid Currently,Deaths-New,Cases-New
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-01,0,0,0.0,0.0,0.0,0.0
2020-03-02,0,0,0.0,0.0,0.0,0.0
2020-03-03,0,0,0.0,0.0,0.0,0.0
2020-03-04,0,0,0.0,0.0,0.0,0.0
2020-03-05,0,0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
2021-12-26,11255,663414,1654.0,340.0,0.0,25035.0
2021-12-27,11255,668790,1805.0,367.0,0.0,5376.0
2021-12-28,11672,675364,1929.0,375.0,417.0,6574.0
2021-12-29,11703,686237,2113.0,403.0,31.0,10873.0


In [21]:
## 
state_df = df.loc['MD'].copy()
state_df

Unnamed: 0_level_0,Deaths,Cases,Hospitalized Currently,ICU-Covid Currently,Deaths-New,Cases-New
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-01,0,0,0.0,0.0,0.0,0.0
2020-03-02,0,0,0.0,0.0,0.0,0.0
2020-03-03,0,0,0.0,0.0,0.0,0.0
2020-03-04,0,0,0.0,0.0,0.0,0.0
2020-03-05,0,0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
2021-12-26,11255,663414,1654.0,340.0,0.0,25035.0
2021-12-27,11255,668790,1805.0,367.0,0.0,5376.0
2021-12-28,11672,675364,1929.0,375.0,417.0,6574.0
2021-12-29,11703,686237,2113.0,403.0,31.0,10873.0


In [22]:
state_df.index

DatetimeIndex(['2020-03-01', '2020-03-02', '2020-03-03', '2020-03-04',
               '2020-03-05', '2020-03-07', '2020-03-08', '2020-03-09',
               '2020-03-10', '2020-03-11',
               ...
               '2021-12-20', '2021-12-22', '2021-12-23', '2021-12-24',
               '2021-12-25', '2021-12-26', '2021-12-27', '2021-12-28',
               '2021-12-29', '2021-12-30'],
              dtype='datetime64[ns]', name='Date', length=669, freq=None)

In [None]:
fn.data_acquisition.

In [16]:
# map_ = fn.app_functions.plot_map_columns(state_df,col="Cases-New", last_n_days=30,
# plot_map=False,return_map=True)

In [17]:
# ##### MAP CODE
# # calc dates for map
# # today = dt.date.today()
# # end_state = today
# # start_date = pd.Timestamp(today) - pd.Timedelta(f'{str(n_days)} days')
# latest_date = df.droplevel(0).index.max()
# end_date = latest_date

# ## plot state map
# n_days = st.slider(f"PAST N # OF DAYS BEFORE {latest_date.strftime('%m/%d/%Y')}",value=30,min_value=7,max_value=180)
# col = st.selectbox("Which statistic to map?", options_stats)

# start_date = pd.Timestamp(latest_date) - pd.Timedelta(f'{str(n_days)} days')


# ## get map
# map_ = fn.app_functions.plot_map_columns(df,col=col, last_n_days=n_days,
# plot_map=False,return_map=True)

# # get just df
# df_rank= fn.app_functions.plot_map_columns(df,col=col, last_n_days=n_days,
# plot_map=False,return_map=False)

# # show map
# st.plotly_chart(map)


In [18]:
# STATES.keys()

In [None]:
df

### NOTES:

- [x] Will want to change how the start date is determined. Instead of creating `today`, get the last date from one of the states

```python
## CURRENT WAY
# calc dates
today = dt.date.today()
end_state = today
start_date = pd.Timestamp(today) - pd.Timedelta(f'{str(n_days)} days')


## PROPOSED NEW WAY
latest_date = df.droplevel(0).index.max()
end_date = latest_date
start_date = pd.Timestamp(latest_date) - pd.Timedelta(f'{str(n_days)} days')
```

## Updating Functions

In [23]:
## from project_functions > app_functions
def plot_map_columns(DF,col='Cases-New',last_n_days=90,
               map_metric='sum',plot_map=True,return_map=False):
    """Aggregate one statistic per state over a trailing window and map it.

    The per-state series are built with ``get_states_to_plot`` (all states,
    outlier removal on), restricted to the last ``last_n_days`` days ending
    at the newest date present in the data, and reduced per state with
    ``map_metric``.

    Parameters:
        - DF: data passed straight to ``get_states_to_plot``.
        - col (str): statistic to aggregate; also used as the colour column.
        - last_n_days: length of the trailing window, in days.
        - map_metric (str): aggregation name handed to ``DataFrame.agg``
          and shown (title-cased) in the figure title.
        - plot_map (bool): when true, ``.show()`` is called on the figure.
        - return_map (bool): selects what is returned (see below).

    Returns:
        The plotly choropleth figure when ``return_map`` is true, otherwise
        the aggregated DataFrame with columns ``['state', col]``.
    """
    plot_df = get_states_to_plot(DF,state_list=None, plot_cols=col,
                             agg_func= 'mean',
                      rename_cols=True,fill_method='interpolate',
                      remove_outliers=True, state_first=True,
                      threshold_type=['0','%'], diagnose=False)

    # Columns come back as "<state> - <statistic>"; keep only the state part.
    plot_df.columns = [c.split(' - ')[0] for c in plot_df.columns]

    # Anchor the window on the newest date in the data instead of today's
    # date: with data that is a few days (or months) old, a window ending
    # "today" silently covers fewer days than requested, or none at all.
    if not plot_df.empty:
        end_date = plot_df.index.max()
        start_date = end_date - pd.Timedelta(f'{last_n_days} days')
        plot_df = plot_df.loc[start_date:end_date]

    agg_data = plot_df.agg(map_metric).reset_index()
    agg_data.columns = ['state', col]

    map_ = px.choropleth(agg_data, color=col, locations='state',
                         locationmode="USA-states", scope='usa',
                         title=f"{map_metric.title()} {col} by State (Last {last_n_days} Days)",
                         color_continuous_scale=px.colors.sequential.Reds)
    if plot_map:
        map_.show(config={'scrollZoom':False})

    if return_map:
        return map_
    return agg_data
    
    
    
def get_states_to_plot(DF,state_list=["NY",'MD','TX','PA', 'FL'],
                       plot_cols=None, 
                      agg_func= 'mean',
              rename_cols=True,fill_method='interpolate',
              remove_outliers=False, state_first=False,
              threshold_type=['0','%'], diagnose=False):
    """Build one wide DataFrame holding the requested columns for several states.

    Each state is processed with ``get_state_df_ts_final`` and the results
    are concatenated side by side with the columns sorted by name.

    Parameters:
        - DF: DataFrame whose first index level is the state, or a dict
          keyed by state.
        - state_list: states to include; ``None`` means every state found
          in ``DF`` (first index level, or the sorted dict keys).
        - plot_cols: column name(s) forwarded as ``ts_col``.
        - agg_func, rename_cols, fill_method, remove_outliers, state_first,
          threshold_type, diagnose: forwarded unchanged to
          ``get_state_df_ts_final``.

    Returns:
        The concatenated DataFrame, or the list of per-state frames when the
        concatenation fails.

    Raises:
        TypeError: ``state_list`` is ``None`` and ``DF`` is neither a
        DataFrame nor a dict.
    """
    # The list defaults in the signature are only read, never mutated.

    # agg_func and remove_outliers used to be accepted here but dropped, so
    # the callee's own defaults (outlier removal ON) always won regardless
    # of what the caller asked for.
    get_states_kwargs = dict(agg_func=agg_func,
                             rename_cols=rename_cols,
                             fill_method=fill_method,
                             remove_outliers=remove_outliers,
                             state_first=state_first,
                             threshold_type=threshold_type,
                             diagnose=diagnose)

    if state_list is None:
        if isinstance(DF, pd.DataFrame):
            state_list = list(DF.index.get_level_values(0).unique())
        elif isinstance(DF, dict):
            state_list = sorted(DF.keys())
        else:
            raise TypeError(
                "state_list=None requires DF to be a DataFrame or a dict, "
                f"got {type(DF).__name__}")

    ## Get each state
    dfs_to_concat = [
        get_state_df_ts_final(DF, state, ts_col=plot_cols, **get_states_kwargs)
        for state in state_list
    ]

    ## Concatenate final dfs
    try:
        plot_df = pd.concat(dfs_to_concat, axis=1)
        plot_df = plot_df[sorted(plot_df.columns.to_list())]
    except Exception as e:
        # Deliberate best-effort fallback: hand back the pieces so the caller
        # can inspect them, but say why the concatenation failed.
        print('[!] pd.concat failed, returning list...')
        print("\t", e)
        plot_df = dfs_to_concat
    return plot_df



############################################################### PRE-09/21/21
def get_state_df_ts(STATES, state_name,ts_col=None,group_col='state',# legacy compatible
                     freq='D', start_date='08-2020', agg_func='mean', #formerly sum
                    fill_nans=True, fill_method='interpolate',
                    rename_cols=True,name_sep=' - ',state_first=True,diagnose=True):
    """Extract one state's data, resampled to ``freq`` and optionally renamed.

    Parameters:
        - STATES: dict keyed by state, a DataFrame indexed so that
          ``STATES.loc[state_name]`` yields the state's rows, or any other
          object supporting ``groupby(group_col).get_group(state_name)``.
        - state_name: state to extract.
        - ts_col: column name or list of names to keep (matched with
          ``str.endswith``); ``None`` keeps every column.
        - group_col: grouping key used only for the ``groupby`` fallback.
        - freq: resampling frequency.
        - start_date: rows before this label are dropped.
        - agg_func: aggregation applied after resampling; ``'as_freq'``
          re-indexes without aggregating and ``None`` skips resampling.
        - fill_nans: unused; kept for backward compatibility.
        - fill_method: ``'interpolate'``, ``None`` (leave nulls) or a method
          name passed to ``fillna(method=...)``.
        - rename_cols: add the state name to every column name.
        - name_sep: separator between state and column name.
        - state_first: ``"<state><sep><col>"`` when true, otherwise
          ``"<col><sep><state>"``.
        - diagnose: show plotly line plots before and after processing.

    Returns:
        The processed DataFrame for the state.
    """
    ## Slice state_df depending on which datatype was supplied
    if isinstance(STATES, dict):
        state_df = STATES[state_name].copy()
    elif isinstance(STATES, pd.DataFrame):
        state_df = STATES.loc[state_name].copy()
    else:
        state_df = STATES.groupby(group_col).get_group(state_name)

    ## visualize pre-resampling
    if diagnose:
        pfig = px.line(state_df,title='Pre-Resampling')
        pfig.show()

    ## Resampling and aggregating
    if agg_func == 'as_freq':
        try:
            # Resampler.asfreq takes a fill value, not a frequency: passing
            # `freq` here used to fill every new row with the string itself.
            state_df = state_df.resample(freq).asfreq()
        except Exception as e:
            # Deliberate best-effort: fall back to this function's own
            # default aggregation, looked up only when actually needed.
            import inspect
            agg_func = inspect.signature(get_state_df_ts).parameters['agg_func'].default
            state_df = state_df.resample(freq).agg(agg_func)
            print(f"[!] Erorr using agg_func='as_freq'; Using default agg_func ('{agg_func}') instead.")
            print('\tError message below:')
            print("\t",e)
    elif agg_func is not None:
        state_df = state_df.resample(freq).agg(agg_func)

    ## Slice out time period desired.
    state_df = state_df.loc[start_date:]

    ## Deal with null values:
    if fill_method == 'interpolate':
        state_df = state_df.interpolate()
    elif fill_method is not None:
        state_df = state_df.fillna(method=fill_method)

    ## Rename columns with state name
    orig_names = list(state_df.columns)
    if rename_cols == True:
        if state_first == True:
            new_names = [f"{state_name}{name_sep}{c}" for c in orig_names]
        else:
            new_names = [f"{c}{name_sep}{state_name}" for c in orig_names]
        state_df = state_df.rename(columns=dict(zip(orig_names, new_names)))
    else:
        new_names = orig_names

    ## Return only columns containing ts_cols
    if ts_col is not None:
        if isinstance(ts_col, str):
            ts_col = [ts_col]

        # Match on the final name (as before) or on the original name, so the
        # selection also works when the state name is appended after the
        # column name (state_first=False), which previously matched nothing.
        selected_cols = []
        for wanted in ts_col:
            selected_cols.extend(
                new for orig, new in zip(orig_names, new_names)
                if new.endswith(wanted) or orig.endswith(wanted)
            )
        state_df = state_df[selected_cols]

    ## Visualize post-resampling
    if diagnose:
        pfig = px.line(state_df,title="post-Resampling")
        pfig.show()

    return state_df





### BEST VERSION YET! 09/12 ######
def get_state_df_ts_final(STATES, state_name,ts_col=None,group_col='state',# legacy compatible
                     freq='D', start_date='08-2020', agg_func='mean', #formerly sum
                    fill_nans=True, fill_method='interpolate',
                    rename_cols=True,name_sep=' - ',state_first=False,
                    remove_outliers=True,  n_diff=1, threshold_type='pct_change',
                      raw_thresh=100, pct_thresh=50, 
                          diagnose=True):
    """Extract one state's data, resample it, clean it and optionally rename it.

    Parameters:
        - STATES: dict keyed by state, a DataFrame indexed so that
          ``STATES.loc[state_name]`` yields the state's rows, or any other
          object supporting ``groupby(group_col).get_group(state_name)``.
        - state_name: state to extract.
        - ts_col: column name or list of names to keep (matched with
          ``str.endswith`` on the original names); ``None`` keeps all.
        - group_col: grouping key used only for the ``groupby`` fallback.
        - freq: resampling frequency.
        - start_date: rows before this label are dropped.
        - agg_func: aggregation applied after resampling; ``'as_freq'``
          re-indexes without aggregating and ``None`` skips resampling.
        - fill_nans: unused; kept for backward compatibility.
        - fill_method: ``'interpolate'``, ``None`` (leave nulls) or a method
          name passed to ``fillna(method=...)``.
        - rename_cols, name_sep, state_first: control whether and how the
          state name is added to each column name.
        - remove_outliers: run ``remove_outliers_ts`` once per entry of
          ``threshold_type``.
        - n_diff, threshold_type, raw_thresh, pct_thresh: forwarded to
          ``remove_outliers_ts``.
        - diagnose: show plotly line plots before and after processing.

    Note on order:
    1. make state_df
    2. resample and aggregate
    3. slice start_date
    4. Select columns
    5. Remove Outliers
    6. Fill Null values
    7. Rename columns

    Returns:
        The processed DataFrame for the state.
    """
    ## Slice state_df depending on which datatype was supplied
    if isinstance(STATES, dict):
        state_df = STATES[state_name].copy()
    elif isinstance(STATES, pd.DataFrame):
        state_df = STATES.loc[state_name].copy()
    else:
        state_df = STATES.groupby(group_col).get_group(state_name)

    ## visualize pre-resampling
    if diagnose:
        pfig = px.line(state_df,title='Pre-Resampling')
        pfig.show()

    ## Resampling and Aggregating
    if agg_func == 'as_freq':
        try:
            # Resampler.asfreq takes a fill value, not a frequency: passing
            # `freq` here used to fill every new row with the string itself.
            state_df = state_df.resample(freq).asfreq()
        except Exception as e:
            # Deliberate best-effort: fall back to THIS function's default
            # aggregation (the lookup used to read the legacy function's
            # signature, and ran on every call).
            import inspect
            agg_func = inspect.signature(get_state_df_ts_final).parameters['agg_func'].default
            state_df = state_df.resample(freq).agg(agg_func)
            print(f"[!] Erorr using agg_func='as_freq'; Using default agg_func ('{agg_func}') instead.")
            print('\tError message below:')
            print("\t",e)
    elif agg_func is not None:
        state_df = state_df.resample(freq).agg(agg_func)

    ## Slice out time period desired.
    state_df = state_df.loc[start_date:]

    ## Return only columns containing ts_cols
    if ts_col is not None:
        if isinstance(ts_col, str):
            ts_col = [ts_col]

        # find cols that end with the column name
        selected_cols = []
        for wanted in ts_col:
            selected_cols.extend(c for c in state_df.columns if c.endswith(wanted))
        state_df = state_df[selected_cols]

    ## Remove outliers, one pass per requested threshold type
    if remove_outliers:
        if isinstance(threshold_type, str):
            threshold_type = [threshold_type]

        for thresh_type in threshold_type:
            state_df = remove_outliers_ts(state_df,threshold_type=thresh_type,
                                         raw_thresh=raw_thresh,pct_thresh=pct_thresh,
                                         n_diff=n_diff,fill_method=fill_method)

    ## Deal with remaining null values
    if fill_method == 'interpolate':
        state_df = state_df.interpolate()
    elif fill_method is not None:
        state_df = state_df.fillna(method=fill_method)

    ## Rename columns with state name
    if rename_cols == True:
        if state_first == True:
            renamed = {c: f"{state_name}{name_sep}{c}" for c in state_df.columns}
        else:
            renamed = {c: f"{c}{name_sep}{state_name}" for c in state_df.columns}
        # rename() returns a new frame, so nothing is written into a slice.
        state_df = state_df.rename(columns=renamed)

    ## Visualize post-resampling
    if diagnose:
        pfig = px.line(state_df,title="post-Resampling")
        pfig.show()

    return state_df





def calc_perc_change(ts,periods=1,replace_inf_with_nan=True):
    """Return ``ts.pct_change(periods=periods)``, optionally without infinities.

    When ``replace_inf_with_nan`` is truthy, every +inf and -inf in the
    result is replaced with NaN; otherwise the result is returned untouched.
    """
    change = ts.pct_change(periods=periods)

    if not replace_inf_with_nan:
        return change

    return change.replace([np.inf, -np.inf], np.nan)


def remove_outliers_ts(ts_,threshold_type='pct_change',raw_thresh=100,
                       pct_thresh=50, n_diff=1,fill_method='interpolate'):
    """Replace outliers in a time series with NaN, then fill the gaps.

    Parameters:
        - ts_ - pd.Series or pd.DataFrame. A Series is converted to a
            one-column DataFrame. The input object is not modified.
        - threshold_type {'raw',('pct_change','%'),('zero','0')}
            - 'raw': outlier where abs(ts.diff(n_diff)) > raw_thresh
            - 'pct_change' / '%': outlier where
              abs(calc_perc_change(ts, periods=n_diff)) > pct_thresh
            - 'zero' / '0': outlier where the value equals 0
        - raw_thresh (100) - limit for the 'raw' rule
        - pct_thresh (50) - limit for the percent-change rule; compared
            directly with the output of calc_perc_change
        - n_diff (1) - period for .diff or .pct_change
        - fill_method ('interpolate') - 'interpolate', None (leave the NaNs),
            or one of 'ffill'/'pad'/'bfill'/'backfill'. The fill is applied to
            every NaN in the column, not only to the removed outliers.

    Returns:
        pd.DataFrame with the same index and columns as the input.

    Raises:
        ValueError: if threshold_type or fill_method is not one of the
            supported values (raised while processing the first column).
    """

    if isinstance(ts_, pd.Series):
        ts_df = ts_.to_frame(ts_.name)
    else:
        ts_df = ts_.copy()

    ## Separate copy to write results into; thresholds are always computed
    ## from the untouched ts_df.
    ts_out = ts_df.copy()

    for col in ts_df.columns:
        ts = ts_df[col]

        ## Build a boolean mask of the outlier rows
        if threshold_type == 'raw':
            idx_outliers = ts.diff(n_diff).abs() > raw_thresh

        elif threshold_type in ('%', 'pct_change'):
            idx_outliers = calc_perc_change(ts,periods=n_diff).abs() > pct_thresh

        elif threshold_type in ('zero', '0'):
            idx_outliers = ts == 0

        else:
            raise ValueError("Other threshold_kinds are not yet implemented.")

        ## Blank out the outliers. Series.mask returns a new (upcast if needed)
        ## Series, so this avoids chained `.loc` assignment, which is a silent
        ## no-op under pandas copy-on-write and cannot store NaN in an
        ## integer column.
        cleaned = ts.mask(idx_outliers)

        ## Deal with null values
        if fill_method == 'interpolate':
            cleaned = cleaned.interpolate()

        elif fill_method is None:
            pass

        ## fillna(method=...) is deprecated; call the dedicated methods instead
        elif fill_method in ('ffill', 'pad'):
            cleaned = cleaned.ffill()

        elif fill_method in ('bfill', 'backfill'):
            cleaned = cleaned.bfill()

        else:
            raise ValueError(f"Invalid fill_method: {fill_method!r}")

        ts_out[col] = cleaned

    return ts_out
     
    


In [24]:
## Call plot_map_columns with plot_map=False and return_map=True, keep the
## returned object, and echo it as the cell output.
map_ = plot_map_columns(df, plot_map=False, return_map=True)
map_

In [26]:
## Build the comparison table: one 'Cases-New' column per selected state.
## With rename_cols=True and state_first=True the resulting columns are named
## "<state> - <stat>" (see the output below, e.g. "FL - Cases-New").
## NOTE(review): remove_outliers=False here, so threshold_type=['0','%'] is
## presumably ignored -- confirm against get_states_to_plot.
states_to_compare=['FL','MD','SC']
stat_to_compare = 'Cases-New'
plot_df = fn.app_functions.get_states_to_plot(df,state_list=states_to_compare,
            plot_cols=stat_to_compare,
                            agg_func= 'mean',
                  rename_cols=True,fill_method='interpolate',
                  remove_outliers=False, state_first=True,
                  threshold_type=['0','%'], diagnose=False)
plot_df

Unnamed: 0_level_0,FL - Cases-New,MD - Cases-New,SC - Cases-New
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-08-01,9642.000000,1019.0,1583.000000
2020-08-02,7104.000000,909.0,1189.000000
2020-08-03,4752.000000,870.0,1163.000000
2020-08-04,5446.000000,710.0,1239.000000
2020-08-05,5409.000000,572.0,1282.000000
...,...,...,...
2021-12-26,44020.666667,25035.0,8918.000000
2021-12-27,28422.000000,5376.0,11152.333333
2021-12-28,29059.000000,6574.0,13386.666667
2021-12-29,46923.000000,10873.0,15621.000000


In [32]:
## Aggregate the daily values into weekly totals.
## NOTE: the first bin (2020-08-02) only sums 2020-08-01 and 2020-08-02, so it
## is a partial week and is not comparable with the full weeks that follow.
plot_df_W = plot_df.resample("W").sum()
plot_df_W

Unnamed: 0_level_0,FL - Cases-New,MD - Cases-New,SC - Cases-New
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-08-02,16746.000000,1928.0,2772.0
2020-08-09,45674.000000,5229.0,8647.0
2020-08-16,40610.000000,4709.0,6062.0
2020-08-23,27155.000000,3890.0,5491.0
2020-08-30,21015.000000,3689.0,6336.0
...,...,...,...
2021-12-05,69686.964286,9508.0,13512.5
2021-12-12,94368.142857,11018.0,11912.0
2021-12-19,171996.342857,21696.5,12385.0
2021-12-26,252864.300000,80903.5,28418.0


In [34]:
# pfig = px.line(plot_df_W,y="MD - Cases-New",animation_frame="Date")

### Notes re: lineplot

- [x] Would like to have a callback function (or modify `fn.app_functions.get_states_to_plot`) to optionally plot a rolling mean.
    - Added a checkbox
    
- Still need to add back normalization by population.

In [38]:
## Apply smoothing: plot a 7-row rolling mean of every column (rows are daily,
## so this is a 7-day average). The first 6 rows are NaN because rolling(7)
## requires a full window by default.
px.line(plot_df.rolling(7).mean())

## Adding Population

In [39]:
# from fn.legacy_data_acquisition
def download_world_pop(data_folder = "./reference_data/",load=True):
    """Download the world-population zip from Kaggle and optionally load it.

    Runs the `kaggle` command-line tool to download the
    `tanuprabhu/population-by-country-2020` dataset into `data_folder`.
    A failed download is reported but does not raise on its own, so a zip
    left by an earlier download can still be used.

    Parameters:
        - data_folder ("./reference_data/") - folder handed to kaggle's `-p`
            option; the zip is expected at
            `<data_folder>/population-by-country-2020.zip`
        - load (True) - if True, read the zip with `pd.read_csv` and return
            the DataFrame; otherwise return the path to the zip

    Returns:
        pd.DataFrame when `load` is True, else the zip path (str).

    Raises:
        FileNotFoundError: `load` is True and the zip is not at the expected
            path.
    """
    # Local import so the function does not depend on a file-level import.
    import subprocess

    ## Specify file and target folder
    file = 'population-by-country-2020.zip'
    target = os.path.join(data_folder,file)

    # Download kaggle dataset. The command is passed as an argument list (no
    # shell), so folders containing spaces or shell metacharacters are safe.
    cmd = ['kaggle', 'datasets', 'download',
           '-p', data_folder,
           '-d', 'tanuprabhu/population-by-country-2020']
    try:
        completed = subprocess.run(cmd, check=False)
    except OSError as err:
        # e.g. the kaggle executable is not installed / not on PATH
        print(f'Could not run the kaggle CLI: {err}')
    else:
        if completed.returncode != 0:
            print(f'kaggle download exited with status {completed.returncode}')

    ## Only report success when the zip is really there
    zip_exists = os.path.exists(target)
    if zip_exists:
        print(f'Population data saved to {target}')
    else:
        print(f'Population data NOT found at {target}')

    ## Load csv
    if load:
        if not zip_exists:
            raise FileNotFoundError(
                f'{target} does not exist; the kaggle download did not succeed.')
        df = pd.read_csv(target)
    else:
        df = target
    return df

In [41]:
## Download the Kaggle population zip into the default ./reference_data/ folder
## and load it as a DataFrame (load=True is the default).
pop_df = download_world_pop()
pop_df

Population data saved to ./reference_data/population-by-country-2020.zip


Unnamed: 0,Country (or dependency),Population (2020),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,China,1440297825,0.39 %,5540090,153,9388211,-348399.0,1.7,38,61 %,18.47 %
1,India,1382345085,0.99 %,13586631,464,2973190,-532687.0,2.2,28,35 %,17.70 %
2,United States,331341050,0.59 %,1937734,36,9147420,954806.0,1.8,38,83 %,4.25 %
3,Indonesia,274021604,1.07 %,2898047,151,1811570,-98955.0,2.3,30,56 %,3.51 %
4,Pakistan,221612785,2.00 %,4327022,287,770880,-233379.0,3.6,23,35 %,2.83 %
...,...,...,...,...,...,...,...,...,...,...,...
230,Montserrat,4993,0.06 %,3,50,100,,N.A.,N.A.,10 %,0.00 %
231,Falkland Islands,3497,3.05 %,103,0,12170,,N.A.,N.A.,66 %,0.00 %
232,Niue,1628,0.68 %,11,6,260,,N.A.,N.A.,46 %,0.00 %
233,Tokelau,1360,1.27 %,17,136,10,,N.A.,N.A.,0 %,0.00 %


### Potential US Pop Sources

- https://www.kaggle.com/zusmani/us-census-2020
    - `kaggle datasets download -d zusmani/us-census-2020`
    - Cons: the download is 12 GB!
      
- https://www.census.gov/data/datasets/2020/dec/2020-census-redistricting-summary-file-dataset.html

>- HOW TO!! https://towardsdatascience.com/accessing-census-data-with-python-3e2f2b56e20d

In [47]:
## Recursively collect every `.sas` file under the (machine-specific) Downloads
## folder, sorted by path.
folder = "/Users/codingdojo//Downloads/"
files_list = sorted(glob.glob(folder+'**/*.sas',recursive=True))
files_list

['/Users/codingdojo//Downloads/2020PL_SAS_import_scripts/pl_all_4_2020_notab_dar.sas',
 '/Users/codingdojo//Downloads/2020PL_SAS_import_scripts/pl_geohd_2020_notab_dar.sas',
 '/Users/codingdojo//Downloads/2020PL_SAS_import_scripts/pl_part1_2020_notab_dar.sas',
 '/Users/codingdojo//Downloads/2020PL_SAS_import_scripts/pl_part2_2020_notab_dar.sas',
 '/Users/codingdojo//Downloads/2020PL_SAS_import_scripts/pl_part3_2020_notab_dar.sas']

In [53]:
## NOTE: the globbed files have a `.sas` extension. pd.read_sas infers the
## format from the file extension and only reads .xpt / .sas7bdat data files,
## hence the ValueError below.
pd.read_sas(files_list[2],)

ValueError: unable to infer format of SAS file

In [51]:
# pd.__version__

### `censusdata`
- Source: https://towardsdatascience.com/accessing-census-data-with-python-3e2f2b56e20d

In [54]:
!pip install CensusData

Collecting CensusData
  Downloading CensusData-1.15.tar.gz (26.6 MB)
     |████████████████████████████████| 26.6 MB 1.1 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: CensusData
  Building wheel for CensusData (setup.py) ... [?25ldone
[?25h  Created wheel for CensusData: filename=CensusData-1.15-py3-none-any.whl size=28205515 sha256=8ca712a661a0a12a593b63f4ca56f0b567fe1f857c6b38a96639558f29b854f5
  Stored in directory: /Users/codingdojo/Library/Caches/pip/wheels/1f/06/6b/e0561383170648c354625bd161b5a2bdf0d5a79067ec66e80c
Successfully built CensusData
Installing collected packages: CensusData
Successfully installed CensusData-1.15


In [55]:
import censusdata
censusdata.download?

In [56]:
## NOTE: download() also requires the `geo` and `var` arguments
## (see the TypeError below).
res = censusdata.download('acs3',2020)

TypeError: download() missing 2 required positional arguments: 'geo' and 'var'

In [59]:
censusdata.censusgeo('')

### `census`


> https://github.com/datamade/census

In [61]:
!pip install census
!pip install us

Collecting census
  Downloading census-0.8.18-py3-none-any.whl (11 kB)
Installing collected packages: census
Successfully installed census-0.8.18
Collecting us
  Downloading us-2.0.2.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting jellyfish==0.6.1
  Downloading jellyfish-0.6.1.tar.gz (132 kB)
     |████████████████████████████████| 132 kB 3.0 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: us, jellyfish
  Building wheel for us (setup.py) ... [?25ldone
[?25h  Created wheel for us: filename=us-2.0.2-py3-none-any.whl size=11928 sha256=71b28922234e96aec2d72fb9133b2c33519e48032ccc4e2545b5742455d38d58
  Stored in directory: /Users/codingdojo/Library/Caches/pip/wheels/07/4c/fa/a65ed0f9c00360e785327fc370eba55ca575cc3508ab13ed8e
  Building wheel for jellyfish (setup.py) ... [?25ldone
[?25h  Created wheel for jellyfish: filename=jellyfish-0.6.1-py3-none-any.whl size=10379 sha256=43d74b819ccfc6b

- First, get yourself a Census API key: https://api.census.gov/data/key_signup.html

```python
from census import Census
from us import states

# Placeholder: substitute the Census API key obtained from the signup link above.
c = Census("MY_API_KEY")
# Request the NAME and B25034_010E fields through the client's `acs5` interface,
# restricted to Maryland via its FIPS code.
c.acs5.get(('NAME', 'B25034_010E'),
          {'for': 'state:{}'.format(states.MD.fips)})
```
