In [None]:
%matplotlib notebook
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import seaborn as sns
from datetime import date
from pprint import pprint

In [None]:
# Load the case table from CSV into a DataFrame.
# NOTE(review): this is an absolute path into one user's home directory, so the
# notebook will not run elsewhere as-is -- consider a configurable data directory.
covid_df = pd.read_csv('/home/welling/git/CMU-MS-DAS-Vis-S22/data/covid19cases_test.csv')

In [None]:
# Show the column labels of the loaded table (displayed as the cell output).
covid_df.columns

In [None]:
def date_to_offset(date_str):
    """Convert an ISO-format date string to days elapsed since 2020-02-01.

    Parameters
    ----------
    date_str : object
        A date in ISO format (e.g. ``'2020-03-15'``). Anything that is not a
        ``str`` (such as the NaN pandas uses for a missing cell) is treated
        as "no date".

    Returns
    -------
    int or float
        Signed whole number of days between ``date_str`` and 2020-02-01
        (negative for earlier dates), or ``math.nan`` when ``date_str`` is
        not a string.

    Raises
    ------
    ValueError
        Propagated from ``date.fromisoformat`` when the string is not a valid
        ISO date.
    """
    # Guard clause: nothing to parse for non-string input.
    if not isinstance(date_str, str):
        return math.nan

    origin = date.fromisoformat('2020-02-01')
    elapsed = date.fromisoformat(date_str) - origin
    return elapsed.days

In [None]:
# Add 'date_offset': days since 2020-02-01 for each row's 'date' (NaN where the
# date is not a string). Series.map applies the function element-wise to the
# single column; DataFrame.applymap is deprecated in recent pandas and was only
# being used on a one-column frame here.
covid_df['date_offset'] = covid_df['date'].map(date_to_offset)


In [None]:
# Work on a copy so the per-area processing below reads from copy_df rather than
# from covid_df directly.
copy_df = covid_df.copy()

In [None]:
# For every area, compute a 7-row trailing rolling mean of each column that is
# not an identifier/date/population column, and attach the result alongside the
# original rows as 'sm_<column>' columns. The first 6 rows of each area have no
# full window and therefore get NaN in the smoothed columns.
# NOTE(review): the rolling window follows row order within each area, so this
# presumably relies on the CSV already being date-ordered per area -- confirm.
merged_frames = []
for this_area, this_df in copy_df.groupby('area'):
    sm_df = this_df.drop(columns=['area', 'area_type',
                                  'date', 'date_offset',
                                  'population']).rolling(window=7).mean()
    sm_df = sm_df.add_prefix('sm_')
    # Both frames share this_df's index, so axis=1 concat lines rows up 1:1.
    merged_frames.append(pd.concat([this_df, sm_df], axis=1))
    print(this_area)

# Stack the per-area frames once at the end; concatenating inside the loop
# re-copies everything accumulated so far on every iteration.
# full_df stays None when there were no areas at all, as before.
full_df = pd.concat(merged_frames, axis=0) if merged_frames else None

In [None]:
# Line plot of the unsmoothed 'cases' column against days since 2020-02-01.
# NOTE(review): full_df holds rows for many areas at each date_offset, so seaborn
# presumably aggregates them per x value (its default estimator) -- confirm that
# an across-area aggregate is what is wanted here.
sns.relplot(data=full_df, x='date_offset', y='cases', kind='line')

In [None]:
# Same plot for 'sm_cases', the 7-row rolling mean of 'cases' computed per area.
sns.relplot(data=full_df, x='date_offset', y='sm_cases', kind='line')

In [None]:
# Locate, for every area, the date_offset at which smoothed cases peak inside
# each of four fixed windows ("zones"). The result is one record per area with
# keys 'area' and 'surge_<n>_offset'.
#
# Each zone is (zone number, exclusive lower bound, inclusive upper bound) on
# date_offset. Zone 4 starts at 690 rather than 675 to avoid a problem with
# Alpine's weak signal, so offsets in (675, 690] belong to no zone.
SURGE_ZONES = [
    (1, 10, 250),
    (2, 250, 500),
    (3, 500, 675),
    (4, 690, 750),
]

rslt_recs = []
for this_area, this_df in full_df.groupby('area'):
    print(this_area)
    rslt = {'area': this_area}
    try:
        for zone_num, lower, upper in SURGE_ZONES:
            zone_df = this_df.query(
                f'date_offset>{lower} & date_offset <= {upper}'
            )[['date_offset', 'sm_cases']]
            peak_idx = zone_df['sm_cases'].idxmax()
            # float() keeps the stored offset a float, matching what selecting
            # it from the mixed int/float row produced previously.
            rslt[f'surge_{zone_num}_offset'] = float(zone_df.loc[peak_idx, 'date_offset'])
    except (KeyError, ValueError) as e:
        # A zone with no usable smoothed values has no peak: depending on the
        # pandas version idxmax either returns NaN (making the .loc lookup
        # raise KeyError) or raises ValueError itself. Either way the whole
        # area is left out of the results.
        print(f'...skipped due to {e}')
        continue
    rslt_recs.append(rslt)

surge_offsets_df = pd.DataFrame(rslt_recs)

In [None]:
# Preview the first rows of the per-area surge offsets table.
surge_offsets_df.head()

In [None]:
def plot_county(area):
    """Plot one area's smoothed case curve with its surge offsets marked.

    Draws ``sm_cases`` against ``date_offset`` for the rows of ``full_df``
    belonging to ``area`` on a new figure, then adds one vertical line per
    ``*offset*`` column of ``surge_offsets_df``. Every vertical line runs from
    0 up to 1.1 times the smoothed case count at the surge-4 offset.

    Parameters
    ----------
    area : str
        Value of the ``'area'`` column identifying the area to plot.

    Raises
    ------
    ValueError
        If ``surge_offsets_df`` has no row for ``area`` (for example because
        the area was skipped during surge detection). Raised before any figure
        is created.
    """
    # Look the offsets up once with a boolean mask: unlike an f-string query,
    # this is safe for area names that contain quote characters.
    offsets_rows = surge_offsets_df[surge_offsets_df['area'] == area]
    if offsets_rows.empty:
        raise ValueError(f'no surge offsets recorded for area {area!r}')
    offsets = offsets_rows.iloc[0]

    area_df = full_df[full_df['area'] == area]

    # Extract scalars explicitly; int()/float() on a one-element Series is
    # deprecated in recent pandas.
    surge_4_date_offset = int(offsets['surge_4_offset'])
    peak_values = area_df.loc[area_df['date_offset'] == surge_4_date_offset, 'sm_cases']
    max_val = 1.1 * float(peak_values.iloc[0])

    fig, axes = plt.subplots()
    axes.set_title(area)
    sns.lineplot(data=area_df, x='date_offset', y='sm_cases', ax=axes)
    for col in [c for c in surge_offsets_df.columns if 'offset' in c]:
        loc = float(offsets[col])
        axes.plot([loc, loc], [0.0, max_val])


In [None]:
# Spot-check the detected surge offsets: two fixed areas plus a random sample.
plot_county('San Francisco')
plot_county('Alpine')
n_samps = 10
# Sample only from areas that actually have surge offsets (areas skipped during
# detection cannot be plotted), and without replacement so the same area is not
# plotted twice. The sample size is capped at the number of available areas.
candidate_areas = surge_offsets_df['area'].unique()
sample_size = min(n_samps, len(candidate_areas))
for area in np.random.choice(candidate_areas, size=sample_size, replace=False):
    plot_county(area)


In [None]:
# Save the per-area surge offsets as a tab-separated file. No index argument is
# passed, so the DataFrame index is written as well (an unnamed first column).
surge_offsets_df.to_csv('/tmp/covid_surge_offsets.tsv', sep='\t')