In [1]:
import numpy as np
import numpy.polynomial.polynomial as poly      # linear regression
import pandas as pd

from scipy.signal import savgol_filter          # fast smoothing of data

# opening external coordinates
import json
import pickle

# opening urls
from urllib.request import urlopen

# benchmarking
from time import time

# date/time operations
from datetime import datetime, timedelta
from pytz import timezone

import json

# plotting
import matplotlib.pyplot as plt

In [2]:
def optimize(df):
    '''
    Optimizes the data types in a pandas dataframe.

    Works on a copy, so the caller's frame is left untouched. Steps, in order:

    1. object columns are converted to datetime when the whole column parses;
    2. remaining object columns with fewer than half as many unique values
       as rows are converted to ``category``;
    3. int64 columns are downcast to the smallest integer dtype and float64
       columns to the smallest float dtype that can hold their values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and (possibly) smaller dtypes.
        A frame with no rows or no columns is returned as an unchanged copy.
    '''
    dft = df.copy()

    # Nothing to optimize, and the unique/row ratio below would divide by zero.
    if dft.empty:
        return dft

    def _to_datetime_if_possible(col):
        # Only object columns are candidates for datetime parsing.
        if col.dtypes != 'object':
            return col
        # `errors='ignore'` is deprecated in pandas; catching the parse
        # failure explicitly keeps the "leave the column alone" behaviour.
        try:
            return pd.to_datetime(col)
        except (ValueError, TypeError, OverflowError):
            return col

    # converts to datetime if possible
    dft = dft.apply(_to_datetime_if_possible)

    # if there are less than half as many unique values as there are rows, convert to category
    n_rows = len(dft)
    for col in dft.select_dtypes(include='object'):
        if len(dft[col].unique()) / n_rows < 0.5:
            dft[col] = dft[col].astype('category')

    # downcasts numeric columns if possible
    dft = dft.apply(lambda col:
        pd.to_numeric(col, downcast='integer') if col.dtypes == 'int64' else col)
    dft = dft.apply(lambda col:
        pd.to_numeric(col, downcast='float') if col.dtypes == 'float64' else col)

    return dft

In [3]:
# Download the NYT county-level CSV, reading `fips` as strings rather than
# numbers, and shrink the dtypes with optimize().
with urlopen('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv') as response:
    nyt_df_raw = optimize(pd.read_csv(response, dtype={'fips':'str'}))
    
# optimize() may have turned `fips` into a categorical; cast back to object so
# that labels not already among the categories can be assigned below.
nyt_df_raw['fips'] = nyt_df_raw['fips'].astype('object')
# Give the rows for these three named "counties" placeholder fips codes.
# NOTE(review): presumably these rows have no real county FIPS in the source
# data and the codes match entries in info_df -- confirm.
nyt_df_raw.loc[nyt_df_raw['county'] == 'New York City','fips'] = '36NYC'
nyt_df_raw.loc[nyt_df_raw['county'] == 'Kansas City','fips'] = '29KCM'
nyt_df_raw.loc[nyt_df_raw['county'] == 'Joplin','fips'] = '29JOP'
# Back to a categorical now that all labels are in place.
nyt_df_raw['fips'] = nyt_df_raw['fips'].astype('category')
print(nyt_df_raw.shape)
nyt_df_raw.tail()

(489196, 6)


Unnamed: 0,date,county,state,fips,cases,deaths
489191,2020-08-31,Sweetwater,Wyoming,56037,302,2
489192,2020-08-31,Teton,Wyoming,56039,430,1
489193,2020-08-31,Uinta,Wyoming,56041,298,2
489194,2020-08-31,Washakie,Wyoming,56043,107,5
489195,2020-08-31,Weston,Wyoming,56045,19,0


In [4]:
# Load the previously pickled per-county info table.
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# files that come from a trusted source.
with open('../data/processed/info_df.p', 'rb') as f:
    info_df = pickle.load(f)
print(info_df.shape)
info_df.head()

(3140, 229)


Unnamed: 0,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,...,per_pop_hispanic_male,per_pop_hispanic_female,per_pop_white,per_pop_black,per_pop_native,per_pop_asian,per_pop_pacific,per_pop_twoplus,per_pop_hispanic,per_votes
0,1,Alabama,Autauga,1001,55869,27092.0,28777.0,20138.0,21077.0,5171.0,...,0.015823,0.014087,0.737708,0.198643,0.004349,0.011563,0.000716,0.017111,0.029909,0.441408
1,1,Alabama,Baldwin,1003,223234,108247.0,114987.0,89845.0,95902.0,9308.0,...,0.024839,0.022349,0.832073,0.086076,0.006751,0.010509,0.000551,0.016852,0.047188,0.421486
2,1,Alabama,Barbour,1005,24686,13064.0,11622.0,5894.0,5341.0,6260.0,...,0.02548,0.019768,0.455116,0.478287,0.003848,0.004699,0.001256,0.011545,0.045248,0.420886
3,1,Alabama,Bibb,1007,22394,11929.0,10465.0,8482.0,8181.0,2912.0,...,0.015317,0.012503,0.744083,0.210726,0.004064,0.002054,0.000268,0.010985,0.02782,0.39064
4,1,Alabama,Blount,1009,57826,28472.0,29354.0,24494.0,25682.0,453.0,...,0.051015,0.045516,0.867707,0.01508,0.004877,0.002819,0.000363,0.012624,0.096531,0.438972


In [5]:
# Load the previously pickled, already-processed NYT table.
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# files that come from a trusted source.
with open('../data/processed/nyt_df.p', 'rb') as f:
    nyt_df = pickle.load(f)
print(nyt_df.shape)
nyt_df.tail()

(453097, 35)


Unnamed: 0,date,county,state,fips,cases,deaths,cases_per_100k,deaths_per_100k,new_cases,new_deaths,...,delta_new_deaths_15d,delta_new_cases_per_100k_15d,delta_new_deaths_per_100k_15d,delta_new_cases_15sg,delta_new_deaths_15sg,delta_new_cases_per_100k_15sg,delta_new_deaths_per_100k_15sg,days,mortality_rate,mortality_rate_15d
453092,2020-08-24,Sweetwater,Wyoming,56037,290,2,684.882979,4.723331,0,0,...,0.0,-4.0,0.0,-0.583333,0.0,-1.341667,0.0,216,0.006897,0.0
453093,2020-08-24,Teton,Wyoming,56039,408,1,1738.833958,4.261848,0,0,...,0.0,-17.0,0.0,-0.541667,0.0,-2.208333,0.0,216,0.002451,0.0
453094,2020-08-24,Uinta,Wyoming,56041,283,2,1399.189162,9.888263,0,0,...,0.0,-19.0,0.0,0.208333,-0.025,1.033333,-0.1,216,0.007067,0.111111
453095,2020-08-24,Washakie,Wyoming,56043,107,5,1370.916079,64.061499,0,0,...,0.0,-89.0,0.0,-0.016667,0.0,-0.133333,0.0,216,0.046729,0.0
453096,2020-08-24,Weston,Wyoming,56045,12,0,173.235167,0.0,0,0,...,0.0,0.0,0.0,-0.2,0.0,-2.8,0.0,216,0.0,0.0


In [8]:
# Cutoff date: 15 days before the latest date already present in nyt_df.
nyt_df['date'].max() - np.timedelta64(15, 'D')

Timestamp('2020-08-09 00:00:00')

In [10]:
# Keep only the raw rows dated strictly after the cutoff (latest processed
# date minus 15 days).
# NOTE(review): the 15-day overlap presumably supplies history for the 15-row
# windows computed later -- confirm.
nyt_df_temp = nyt_df_raw[nyt_df_raw['date'] > nyt_df['date'].max() - np.timedelta64(15, 'D')]
print(nyt_df_temp.shape)
nyt_df_temp.head()

(71022, 6)


Unnamed: 0,date,county,state,fips,cases,deaths
418174,2020-08-10,Autauga,Alabama,1001,1222,22
418175,2020-08-10,Baldwin,Alabama,1003,3714,28
418176,2020-08-10,Barbour,Alabama,1005,631,5
418177,2020-08-10,Bibb,Alabama,1007,450,5
418178,2020-08-10,Blount,Alabama,1009,947,4


In [11]:
# Attach each county's total population. merge() defaults to an inner join,
# so rows whose fips has no match in info_df are dropped (the recorded shapes
# go from 71022 rows before the merge to 68601 after it).
nyt_df_app = nyt_df_temp.merge(
    info_df[['fips', 'tot_pop']], 
    on='fips', 
    suffixes=('_x','')
)

# Express cumulative cases and deaths per 100,000 residents, then drop the
# population column, which was only needed for this division.
nyt_df_app[['cases_per_100k', 'deaths_per_100k']] = nyt_df_app[['cases', 'deaths']].div(nyt_df_app['tot_pop'], axis=0) * 100_000
nyt_df_app = nyt_df_app.drop(columns=['tot_pop'])
nyt_df_app = nyt_df_app.sort_values(by=['date', 'fips'])

print(nyt_df_app.shape)
nyt_df_app.head()

(68601, 8)


Unnamed: 0,date,county,state,fips,cases,deaths,cases_per_100k,deaths_per_100k
0,2020-08-10,Autauga,Alabama,1001,1222,22,2187.259482,39.37783
22,2020-08-10,Baldwin,Alabama,1003,3714,28,1663.72506,12.542892
44,2020-08-10,Barbour,Alabama,1005,631,5,2556.104675,20.254395
66,2020-08-10,Bibb,Alabama,1007,450,5,2009.466821,22.327409
88,2020-08-10,Blount,Alabama,1009,947,4,1637.671636,6.917304


In [12]:
# Cumulative columns that the derived change columns are built from.
cols = ['cases', 'deaths', 'cases_per_100k', 'deaths_per_100k']

def add_change_cols(df, cols, pre='new_', clip=False):
    '''
    Adds row-over-row change columns, computed separately for each fips.

    The frame is sorted by date then fips. For every column in `cols` a new
    column named `pre + column` holds the difference between a row and the
    previous row of the same fips. The first row of each fips has no
    predecessor and gets 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'date', 'fips' and every column in `cols`.
    cols : list of str
        Columns to difference.
    pre : str
        Prefix for the new column names.
    clip : bool
        When True, negative changes are replaced by 0.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The sorted frame with the new columns, and the new column names.
    '''
    ordered = df.sort_values(by=['date', 'fips'])
    new_cols = [pre + name for name in cols]

    # Previous row's values within each fips (NaN on a group's first row).
    previous = ordered.groupby(by='fips')[cols].shift()

    # NOTE(review): the int cast truncates fractional changes (e.g. for
    # per-100k columns) toward zero -- confirm this is intended.
    change = (ordered[cols] - previous).fillna(0).astype(int)
    if clip:
        change = change.clip(lower=0)

    change.columns = new_cols
    ordered[new_cols] = change
    return (ordered, new_cols)

def add_window_cols(df, cols, window=7):
    '''
    Adds trailing rolling-sum columns, computed separately for each fips.

    The frame is sorted by date then fips. For every column in `cols` a new
    column named `<column>_<window>d` holds the sum over the current row and
    the preceding rows of the same fips, `window` rows in total. Because
    `min_periods=0`, the first rows of each fips sum whatever rows are
    available instead of producing NaN.

    Note that an integer `window` counts rows, not calendar days, so the
    result only spans `window` days when each fips has one row per day.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'date', 'fips' and every column in `cols`.
    cols : list of str
        Columns to sum.
    window : int
        Number of rows in the rolling window.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The sorted frame with the new columns merged in on (fips, date),
        and the new column names.
    '''
    df = df.sort_values(by=['date', 'fips'])
    new_cols = [c + '_' + str(window) + 'd' for c in cols]
    col_dict = dict(zip(cols, new_cols))

    # Use the requested window size; it was previously hard-coded to 15,
    # so any other `window` produced mislabelled columns.
    rolled = (
        df.groupby('fips')
        .rolling(window, on='date', min_periods=0)[cols]
        .sum()
        .rename(columns=col_dict)
    )
    df = df.merge(rolled, on=['fips', 'date'])
    return (df, new_cols)

def add_savgol_cols(df, cols, window=7, clip=False):
    '''
    Adds Savitzky-Golay smoothed columns, computed separately for each fips.

    The frame is sorted by date then fips. For every column in `cols` a new
    column named `<column>_<window>sg` holds that column filtered with a
    first-order Savitzky-Golay filter. A fips with fewer rows than `window`
    is filtered with the largest odd window that fits. If that window would
    be 1 or less, the values are passed through unsmoothed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'date', 'fips' and every column in `cols`.
    cols : list of str
        Columns to smooth.
    window : int
        Filter window length, in rows.
    clip : bool
        When True, negative smoothed values are replaced by 0.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The sorted frame with the new columns, and the new column names.
    '''
    def smooth(values, width):
        n_points = len(values)
        if n_points >= width:
            return savgol_filter(values, width, 1)
        # Too few points for the requested window: fall back to the largest
        # odd length not exceeding the number of points.
        shrunk = int(np.ceil(n_points / 2) * 2 - 1)
        if shrunk > 1:
            return savgol_filter(values, shrunk, 1)
        return values

    ordered = df.sort_values(by=['date', 'fips'])
    suffix = '_' + str(window) + 'sg'
    new_cols = [name + suffix for name in cols]

    smoothed = ordered.groupby(by='fips')[cols].transform(
        lambda series: smooth(series, window)
    )
    if clip:
        smoothed = smoothed.clip(lower=0)

    ordered[new_cols] = smoothed
    return (ordered, new_cols)

In [14]:
# Build the derived columns step by step and time the whole pipeline.
tick = time()

# Per-fips changes of the cumulative columns (`new_*`), negatives clipped to 0.
nyt_df_app, new_cols = add_change_cols(nyt_df_app, cols, pre='new_', clip=True)
# Rolling sums of the `new_*` columns (`*_15d`).
nyt_df_app, cols_15d = add_window_cols(nyt_df_app, new_cols, window=15)
# Savitzky-Golay smoothed `new_*` columns (`*_15sg`), negatives clipped to 0.
nyt_df_app, new_cols_15sg = add_savgol_cols(nyt_df_app, new_cols, window=15, clip=True)
# Per-fips changes of the `new_*` columns (`delta_new_*`), not clipped.
nyt_df_app, delta_new_cols = add_change_cols(nyt_df_app, new_cols, pre='delta_')
# Rolling sums and smoothed versions of the `delta_new_*` columns.
nyt_df_app, delta_cols_15d = add_window_cols(nyt_df_app, delta_new_cols, window=15)
nyt_df_app, delta_new_cols_15sg = add_savgol_cols(nyt_df_app, delta_new_cols, window=15)

tock = time()
# Elapsed wall-clock seconds.
print(tock - tick)

21.489539623260498
