<a href="https://colab.research.google.com/github/jydiw/nyt-covid-19-data/blob/master/eda_nytimes-data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Combining NYTimes data with county census and geoson data

In [1]:
import numpy as np
import numpy.polynomial.polynomial as poly      # linear regression
import pandas as pd

from scipy.signal import savgol_filter          # fast smoothing of data

# opening external coordinates
import json
import pickle

# opening urls
from urllib.request import urlopen

# benchmarking
from time import time

# date/time operations
from datetime import datetime, timedelta
from pytz import timezone

import json

# plotting
import matplotlib.pyplot as plt

# import nytimes data

New York Times data has a few caveats, including treating New York City, Kansas City, and Joplin as single entities rather than including them in their respective counties. Read their [README](https://github.com/nytimes/covid-19-data/blob/master/README.md) for more information.

In [2]:
def optimize(df):
    '''
    Optimizes the data types in a pandas dataframe.
    '''
    dft = df.copy()
    # converts to datetime if possible
    dft = dft.apply(lambda col:
        pd.to_datetime(col, errors='ignore') if col.dtypes=='object' else col)
    
    # if there are less than half as many unique values as there are rows, convert to category
    for col in dft.select_dtypes(include='object'):
        if len(dft[col].unique()) / len(df[col]) < 0.5:
            dft[col] = dft[col].astype('category')
            
    # downcasts numeric columns if possible
    dft = dft.apply(lambda col: 
        pd.to_numeric(col, downcast='integer') if col.dtypes=='int64' else col)
    dft = dft.apply(lambda col: 
        pd.to_numeric(col, downcast='float') if col.dtypes=='float64' else col)
    
    return dft

In [3]:
with urlopen('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv') as response:
    nyt_df_raw = optimize(pd.read_csv(response, dtype={'fips':'str'}))
nyt_df_raw.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0
1,2020-01-22,Snohomish,Washington,53061,1,0
2,2020-01-23,Snohomish,Washington,53061,1,0
3,2020-01-24,Cook,Illinois,17031,1,0
4,2020-01-24,Snohomish,Washington,53061,1,0


In [4]:
nyt_df_raw['fips'] = nyt_df_raw['fips'].astype('object')
nyt_df_raw.loc[nyt_df_raw['county'] == 'New York City','fips'] = '36NYC'
nyt_df_raw.loc[nyt_df_raw['county'] == 'Kansas City','fips'] = '29KCM'
nyt_df_raw.loc[nyt_df_raw['county'] == 'Joplin','fips'] = '29JOP'
nyt_df_raw['fips'] = nyt_df_raw['fips'].astype('category')

In [5]:
nyt_df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460109 entries, 0 to 460108
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   date    460109 non-null  datetime64[ns]
 1   county  460109 non-null  category      
 2   state   460109 non-null  category      
 3   fips    456055 non-null  category      
 4   cases   460109 non-null  int32         
 5   deaths  460109 non-null  int16         
dtypes: category(3), datetime64[ns](1), int16(1), int32(1)
memory usage: 8.6 MB


# import `info_df`

In [6]:
with open('../data/processed/info_df.p', 'rb') as f:
    info_df = pickle.load(f)
print(info_df.shape)
info_df.head()

(3140, 229)


Unnamed: 0,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,...,per_pop_hispanic_male,per_pop_hispanic_female,per_pop_white,per_pop_black,per_pop_native,per_pop_asian,per_pop_pacific,per_pop_twoplus,per_pop_hispanic,per_votes
0,1,Alabama,Autauga,1001,55869,27092.0,28777.0,20138.0,21077.0,5171.0,...,0.015823,0.014087,0.737708,0.198643,0.004349,0.011563,0.000716,0.017111,0.029909,0.441408
1,1,Alabama,Baldwin,1003,223234,108247.0,114987.0,89845.0,95902.0,9308.0,...,0.024839,0.022349,0.832073,0.086076,0.006751,0.010509,0.000551,0.016852,0.047188,0.421486
2,1,Alabama,Barbour,1005,24686,13064.0,11622.0,5894.0,5341.0,6260.0,...,0.02548,0.019768,0.455116,0.478287,0.003848,0.004699,0.001256,0.011545,0.045248,0.420886
3,1,Alabama,Bibb,1007,22394,11929.0,10465.0,8482.0,8181.0,2912.0,...,0.015317,0.012503,0.744083,0.210726,0.004064,0.002054,0.000268,0.010985,0.02782,0.39064
4,1,Alabama,Blount,1009,57826,28472.0,29354.0,24494.0,25682.0,453.0,...,0.051015,0.045516,0.867707,0.01508,0.004877,0.002819,0.000363,0.012624,0.096531,0.438972


In [7]:
with open('../data/processed/geo_json.json') as f:
    geo_json = json.load(f)

## engineer per capita columns

In [8]:
nyt_df = nyt_df_raw.merge(
    info_df[['fips', 'tot_pop']], 
    on='fips', 
    suffixes=('_x','')
)

# df_all = df_all.drop(['county_x', 'state_x'], axis=1)
nyt_df[['cases_per_100k', 'deaths_per_100k']] = nyt_df[['cases', 'deaths']].div(nyt_df['tot_pop'], axis=0) * 100_000
nyt_df = nyt_df.drop(columns=['tot_pop'])
nyt_df = nyt_df.sort_values(by=['date', 'fips'])

print(nyt_df.shape)
nyt_df.head()

(446859, 8)


Unnamed: 0,date,county,state,fips,cases,deaths,cases_per_100k,deaths_per_100k
0,2020-01-21,Snohomish,Washington,53061,1,0,0.121642,0.0
1,2020-01-22,Snohomish,Washington,53061,1,0,0.121642,0.0
2,2020-01-23,Snohomish,Washington,53061,1,0,0.121642,0.0
215,2020-01-24,Cook,Illinois,17031,1,0,0.019417,0.0
3,2020-01-24,Snohomish,Washington,53061,1,0,0.121642,0.0


## engineer change columns, window sum, and smooth

In [9]:
cols = ['cases', 'deaths', 'cases_per_100k', 'deaths_per_100k']

def add_change_cols(df, cols, pre='new_', clip=False):
    df = df.sort_values(by=['date', 'fips'])
    new_cols = [pre + c for c in cols]
    df[new_cols] = df[cols] - df.groupby(by='fips')[cols].shift()
    df[new_cols] = df[new_cols].fillna(0)
    df[new_cols] = df[new_cols].astype(int)
    if clip:
        df[new_cols] = df[new_cols].clip(lower=0)
    return (df, new_cols)

def add_window_cols(df, cols, window=7):
    df = df.sort_values(by=['date', 'fips'])
    new_cols = [c + '_' + str(window) + 'd' for c in cols]
    col_dict = dict(zip(cols, new_cols))
    df = (df.merge(df.sort_values(by=['date', 'fips'])
            .groupby('fips')
            .rolling(15, on='date', min_periods=0)[cols].sum()
            .rename(columns=col_dict), on=['fips', 'date']))
#     df[new_cols] = df[new_cols].astype(int)
    return (df, new_cols)

def add_savgol_cols(df, cols, window=7, clip=False):
    def my_savgol(x, w):
        if len(x) >= w:
            return savgol_filter(x, w, 1)
        else:
            new_window = int(np.ceil(len(x) / 2) * 2 - 1)
            if new_window <= 1:
                return x
            else:
                return savgol_filter(x, new_window, 1)
    df = df.sort_values(by=['date', 'fips'])
    new_cols = [c + '_' + str(window) + 'sg' for c in cols]
    df[new_cols] = df.groupby(by='fips')[cols].transform(lambda x: my_savgol(x, window))
    if clip:
        df[new_cols] = df[new_cols].clip(lower=0)
    return (df, new_cols)

In [10]:
nyt_df, new_cols = add_change_cols(nyt_df, cols, pre='new_', clip=True)
nyt_df, cols_15d = add_window_cols(nyt_df, new_cols, window=15)
# nyt_df, new_cols_7sg = add_savgol_cols(nyt_df, new_cols, clip=True)
nyt_df, new_cols_15sg = add_savgol_cols(nyt_df, new_cols, window=15, clip=True)

print(nyt_df.columns)
nyt_df.head()

Index(['date', 'county', 'state', 'fips', 'cases', 'deaths', 'cases_per_100k',
       'deaths_per_100k', 'new_cases', 'new_deaths', 'new_cases_per_100k',
       'new_deaths_per_100k', 'new_cases_15d', 'new_deaths_15d',
       'new_cases_per_100k_15d', 'new_deaths_per_100k_15d', 'new_cases_15sg',
       'new_deaths_15sg', 'new_cases_per_100k_15sg',
       'new_deaths_per_100k_15sg'],
      dtype='object')


Unnamed: 0,date,county,state,fips,cases,deaths,cases_per_100k,deaths_per_100k,new_cases,new_deaths,new_cases_per_100k,new_deaths_per_100k,new_cases_15d,new_deaths_15d,new_cases_per_100k_15d,new_deaths_per_100k_15d,new_cases_15sg,new_deaths_15sg,new_cases_per_100k_15sg,new_deaths_per_100k_15sg
0,2020-01-21,Snohomish,Washington,53061,1,0,0.121642,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-01-22,Snohomish,Washington,53061,1,0,0.121642,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-23,Snohomish,Washington,53061,1,0,0.121642,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-01-24,Cook,Illinois,17031,1,0,0.019417,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.091667,0.0,0.0,0.0
4,2020-01-24,Snohomish,Washington,53061,1,0,0.121642,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
nyt_df, delta_new_cols = add_change_cols(nyt_df, new_cols, pre='delta_')
nyt_df, delta_cols_15d = add_window_cols(nyt_df, delta_new_cols, window=15)
# nyt_df, delta_new_cols_7sg = add_savgol_cols(nyt_df, delta_new_cols)
nyt_df, delta_new_cols_15sg = add_savgol_cols(nyt_df, delta_new_cols, window=15)

print(nyt_df.columns)
nyt_df.head()

Index(['date', 'county', 'state', 'fips', 'cases', 'deaths', 'cases_per_100k',
       'deaths_per_100k', 'new_cases', 'new_deaths', 'new_cases_per_100k',
       'new_deaths_per_100k', 'new_cases_15d', 'new_deaths_15d',
       'new_cases_per_100k_15d', 'new_deaths_per_100k_15d', 'new_cases_15sg',
       'new_deaths_15sg', 'new_cases_per_100k_15sg',
       'new_deaths_per_100k_15sg', 'delta_new_cases', 'delta_new_deaths',
       'delta_new_cases_per_100k', 'delta_new_deaths_per_100k',
       'delta_new_cases_15d', 'delta_new_deaths_15d',
       'delta_new_cases_per_100k_15d', 'delta_new_deaths_per_100k_15d',
       'delta_new_cases_15sg', 'delta_new_deaths_15sg',
       'delta_new_cases_per_100k_15sg', 'delta_new_deaths_per_100k_15sg'],
      dtype='object')


Unnamed: 0,date,county,state,fips,cases,deaths,cases_per_100k,deaths_per_100k,new_cases,new_deaths,...,delta_new_cases_per_100k,delta_new_deaths_per_100k,delta_new_cases_15d,delta_new_deaths_15d,delta_new_cases_per_100k_15d,delta_new_deaths_per_100k_15d,delta_new_cases_15sg,delta_new_deaths_15sg,delta_new_cases_per_100k_15sg,delta_new_deaths_per_100k_15sg
0,2020-01-21,Snohomish,Washington,53061,1,0,0.121642,0.0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-01-22,Snohomish,Washington,53061,1,0,0.121642,0.0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-23,Snohomish,Washington,53061,1,0,0.121642,0.0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-01-24,Cook,Illinois,17031,1,0,0.019417,0.0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.025,0.0,0.0,0.0
4,2020-01-24,Snohomish,Washington,53061,1,0,0.121642,0.0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
nyt_df['days'] = ((nyt_df['date'] - nyt_df['date'].min()) / np.timedelta64(1, 'D')).astype('int')
nyt_df['mortality_rate'] = nyt_df['deaths'] / nyt_df['cases']
nyt_df['mortality_rate_15d'] = nyt_df['new_deaths_15d'] / nyt_df['new_cases_15d']

In [13]:
nyt_df['mortality_rate_15d'].describe()

count    4.009720e+05
mean              inf
std               NaN
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      2.941176e-02
max               inf
Name: mortality_rate_15d, dtype: float64

In [17]:
nyt_df.loc[(nyt_df['mortality_rate_15d'] > 1), 'mortality_rate_15d'] = 1

In [18]:
nyt_df['mortality_rate_15d'].describe()

count    400972.000000
mean          0.034366
std           0.097850
min           0.000000
25%           0.000000
50%           0.000000
75%           0.029412
max           1.000000
Name: mortality_rate_15d, dtype: float64

In [19]:
with open('../data/processed/nyt_df.p', 'wb') as file:
    pickle.dump(nyt_df, file, protocol=pickle.HIGHEST_PROTOCOL)

# making clustered `nyt_df_ac`

In [13]:
# import os
# import sys
# module_path = os.path.abspath(os.path.join('..'))
# if module_path not in sys.path:
#     sys.path.append(module_path)

In [14]:
with open('../data/processed/geo_json_ac.json') as f:
    geo_json_ac = json.load(f)

In [15]:
dem_df_ac = optimize(pd.read_csv('../data/processed/dem_df_ac.csv', 
                                 dtype={'fips':'str', 'cluster':'int'}))
print(dem_df_ac.shape)
dem_df_ac.head()

FileNotFoundError: [Errno 2] File ../data/processed/dem_df_ac.csv does not exist: '../data/processed/dem_df_ac.csv'

In [None]:
dem_df_ac.sort_values(by='total_pop').iloc[[0, -1]]

In [None]:
dem_df_ac.sort_values(by='pop_density').iloc[[0, -1]]

There are several things to consider:

- Raw numbers will show New York City and Los Angeles County with the highest numbers, all other factors being equal.
- Reporting cases as a percentage (or as we'll use later, a per 100k) would better reflect the severity of infections.
- However, a higher population density would likely make it easier to transmit the disease to another person. Looking at the day-to-day change in new cases could allow us to see how quickly the disease is spreading.

The NYTimes dataset deals with New York City, Kansas City, and Joplin as their own entities. As such, they do not have valid `fips` codes; we must add our custom `fips` from our earlier census EDA.

In [None]:
nyt_df['fips'] = nyt_df['fips'].astype('object')
nyt_df.loc[nyt_df['county'] == 'New York City','fips'] = '36NYC'
nyt_df.loc[nyt_df['county'] == 'Kansas City','fips'] = '29KCM'
nyt_df.loc[nyt_df['county'] == 'Joplin','fips'] = '29JOP'
nyt_df['fips'] = nyt_df['fips'].astype('category')

In [None]:
nyt_df[nyt_df['fips']=='36NYC']

# merge `nyt_df` and `pop_df` for feature engineering

In [None]:
dem_df_to_merge = optimize(pd.read_csv('../data/processed/dem_df_to_merge.csv', 
                                 dtype={'fips':'str', 'cluster':'int'}))
print(dem_df_to_merge.shape)
dem_df_to_merge.head()

To reduce the size of the resulting `csv`, we will only merge with the `tot_pop` column to engineer our per capita columns. We can merge with `pop_df` again when plotting our data.

In [None]:
df = nyt_df.merge(
    dem_df_to_merge[['fips', 'cluster', 'state', 'county']], 
    on='fips', 
    suffixes=('_x','')
)

df = df.drop(['county_x', 'state_x'], axis=1)
df = df.sort_values(by=['date', 'fips'])
print(df.shape)
df.head()

In [None]:
df_clustered = df.groupby(by=['state', 'cluster', 'date']).agg(
    cases=('cases', sum),
    deaths=('deaths', sum)
).dropna().reset_index().astype({'cases': 'int', 'deaths':'int'})

df_clustered.head()

In [None]:
df = df_clustered.merge(dem_df_ac, on=['state', 'cluster'], suffixes=('_x',''))
df[['cases_per_100k', 'deaths_per_100k']] = df[['cases', 'deaths']].div(df['total_pop'], axis=0) * 100_000
df = df.sort_values(by=['date', 'fips'])

print(df.shape)
df.head()

In [None]:
cols = ['cases', 'deaths', 'cases_per_100k', 'deaths_per_100k']

def add_change_cols(df, cols, pre='new_', clip=False):
    df = df.sort_values(by=['date', 'fips'])
    new_cols = [pre + c for c in cols]
    df[new_cols] = df[cols] - df.groupby(by='fips')[cols].shift()
    df[new_cols] = df[new_cols].fillna(0)
    if clip:
        df[new_cols] = df[new_cols].clip(lower=0)
    return (df, new_cols)

def add_savgol_cols(df, cols, window=7, clip=False):
    def my_savgol(x, w):
        if len(x) >= w:
            return savgol_filter(x, w, 1)
        else:
            new_window = int(np.ceil(len(x) / 2) * 2 - 1)
            if new_window <= 1:
                return x
            else:
                return savgol_filter(x, new_window, 1)
    df = df.sort_values(by=['date', 'fips'])
    cols_d = [c + '_' + str(window) + 'sg' for c in cols]
    df[cols_d] = df.groupby(by='fips')[cols].transform(lambda x: my_savgol(x, window))
    if clip:
        df[cols_d] = df[cols_d].clip(lower=0)
    return (df, cols_d)

df, new_cols = add_change_cols(df, cols, pre='new_', clip=True)
df, new_cols_7sg = add_savgol_cols(df, new_cols, clip=True)
df, new_cols_15sg = add_savgol_cols(df, new_cols, window=15, clip=True)
df, delta_new_cols = add_change_cols(df, new_cols, pre='delta_')
df, delta_new_cols_7sg = add_savgol_cols(df, delta_new_cols)
df, delta_new_cols_15sg = add_savgol_cols(df, delta_new_cols, window=15)

print(df.columns)
df.head()

In [None]:
county = 'Los Angeles'
plt.plot(np.arange(len(df[df['county'] == county])), 'new_cases_per_100k', data=df[df['county'] == county], color='gray')
plt.plot(np.arange(len(df[df['county'] == county])), 'new_cases_per_100k_7sg', data=df[df['county'] == county], color='red')
plt.plot(np.arange(len(df[df['county'] == county])), 'new_cases_per_100k_15sg', data=df[df['county'] == county], color='blue')

In [None]:
county = 'New York City'
plt.plot(np.arange(len(df[df['county'] == county])), 'new_cases_per_100k', data=df[df['county'] == county], color='gray')
plt.plot(np.arange(len(df[df['county'] == county])), 'new_cases_per_100k_7sg', data=df[df['county'] == county], color='red')
plt.plot(np.arange(len(df[df['county'] == county])), 'new_cases_per_100k_15sg', data=df[df['county'] == county], color='blue')