In [1]:
import numpy as np
import numpy.polynomial.polynomial as poly      # linear regression
import pandas as pd

from scipy.signal import savgol_filter          # fast smoothing of data

# opening external coordinates
import json
import pickle

# opening urls
from urllib.request import urlopen

# benchmarking
from time import time

# date/time operations
from datetime import datetime, timedelta
from pytz import timezone

import json

# plotting
import matplotlib.pyplot as plt

In [3]:
def optimize(df):
    '''
    Optimizes the data types in a pandas dataframe.
    '''
    dft = df.copy()
    # converts to datetime if possible
    dft = dft.apply(lambda col:
        pd.to_datetime(col, errors='ignore') if col.dtypes=='object' else col)
    
    # if there are less than half as many unique values as there are rows, convert to category
    for col in dft.select_dtypes(include='object'):
        if len(dft[col].unique()) / len(df[col]) < 0.5:
            dft[col] = dft[col].astype('category')
            
    # downcasts numeric columns if possible
    dft = dft.apply(lambda col: 
        pd.to_numeric(col, downcast='integer') if col.dtypes=='int64' else col)
    dft = dft.apply(lambda col: 
        pd.to_numeric(col, downcast='float') if col.dtypes=='float64' else col)
    
    return dft

In [7]:
with urlopen('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv') as response:
    nyt_df_raw = optimize(pd.read_csv(response, dtype={'fips':'str'}))
    
nyt_df_raw['fips'] = nyt_df_raw['fips'].astype('object')
nyt_df_raw.loc[nyt_df_raw['county'] == 'New York City','fips'] = '36NYC'
nyt_df_raw.loc[nyt_df_raw['county'] == 'Kansas City','fips'] = '29KCM'
nyt_df_raw.loc[nyt_df_raw['county'] == 'Joplin','fips'] = '29JOP'
nyt_df_raw['fips'] = nyt_df_raw['fips'].astype('category')
print(nyt_df_raw.shape)
nyt_df_raw.head()

(485962, 6)


Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0
1,2020-01-22,Snohomish,Washington,53061,1,0
2,2020-01-23,Snohomish,Washington,53061,1,0
3,2020-01-24,Cook,Illinois,17031,1,0
4,2020-01-24,Snohomish,Washington,53061,1,0


In [6]:
with open('../data/processed/info_df.p', 'rb') as f:
    info_df = pickle.load(f)
print(info_df.shape)
info_df.head()

(3140, 229)


Unnamed: 0,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,...,per_pop_hispanic_male,per_pop_hispanic_female,per_pop_white,per_pop_black,per_pop_native,per_pop_asian,per_pop_pacific,per_pop_twoplus,per_pop_hispanic,per_votes
0,1,Alabama,Autauga,1001,55869,27092.0,28777.0,20138.0,21077.0,5171.0,...,0.015823,0.014087,0.737708,0.198643,0.004349,0.011563,0.000716,0.017111,0.029909,0.441408
1,1,Alabama,Baldwin,1003,223234,108247.0,114987.0,89845.0,95902.0,9308.0,...,0.024839,0.022349,0.832073,0.086076,0.006751,0.010509,0.000551,0.016852,0.047188,0.421486
2,1,Alabama,Barbour,1005,24686,13064.0,11622.0,5894.0,5341.0,6260.0,...,0.02548,0.019768,0.455116,0.478287,0.003848,0.004699,0.001256,0.011545,0.045248,0.420886
3,1,Alabama,Bibb,1007,22394,11929.0,10465.0,8482.0,8181.0,2912.0,...,0.015317,0.012503,0.744083,0.210726,0.004064,0.002054,0.000268,0.010985,0.02782,0.39064
4,1,Alabama,Blount,1009,57826,28472.0,29354.0,24494.0,25682.0,453.0,...,0.051015,0.045516,0.867707,0.01508,0.004877,0.002819,0.000363,0.012624,0.096531,0.438972


In [8]:
with open('../data/processed/nyt_df.p', 'rb') as f:
    nyt_df = pickle.load(f)
print(nyt_df.shape)
nyt_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/nyt_df.p'

In [None]:
nyt_df = nyt_df_raw.merge(
    info_df[['fips', 'tot_pop']], 
    on='fips', 
    suffixes=('_x','')
)

# df_all = df_all.drop(['county_x', 'state_x'], axis=1)
nyt_df[['cases_per_100k', 'deaths_per_100k']] = nyt_df[['cases', 'deaths']].div(nyt_df['tot_pop'], axis=0) * 100_000
nyt_df = nyt_df.drop(columns=['tot_pop'])
nyt_df = nyt_df.sort_values(by=['date', 'fips'])

print(nyt_df.shape)
nyt_df.head()