<a href="https://colab.research.google.com/github/jydiw/nyt-covid-19-data/blob/master/nyt_covid_data_county.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NYTimes COVID-19 EDA and `pandas` Optimization

In [1]:
import numpy as np
import pandas as pd

from urllib.request import urlopen
import json

import itertools

from time import time
from datetime import datetime, timedelta

from shapely.geometry import Polygon

# import nytimes data

New York Times data has a few caveats, including treating New York City, Kansas City, and Joplin as single entities rather than including them in their respective counties. Read their [README](https://github.com/nytimes/covid-19-data/blob/master/README.md) for more information.

In [2]:
with urlopen('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv') as response:
    nyt_df = pd.read_csv(response)
nyt_df.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0


In [3]:
nyt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319547 entries, 0 to 319546
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   date    319547 non-null  object 
 1   county  319547 non-null  object 
 2   state   319547 non-null  object 
 3   fips    316227 non-null  float64
 4   cases   319547 non-null  int64  
 5   deaths  319547 non-null  int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 14.6+ MB


In [4]:
for col in nyt_df.select_dtypes(include='object').columns:
    print(col)
    print(len(nyt_df[col].unique()))
    print()

date
173

county
1818

state
55



- `fips` should not be numeric. As we'll see down below, we will have to import this as an object type so that we don't convert `float` values to `str`.
- the max for both `cases` and `deaths` is well below the maximum value of `int64`, so we can downcast.
- by nature of the data, the `state`-`county` combinations (and therefore `fips`) are far fewer than the number of entries in our dataset. We can change `state`, `county`, and `fips` to the `category` data type.
- we should probably change `date` into a `datetime` data type. This gives us some flexibility later.

**Why `category`?** The memory of the `object` data type is proportional to its length, whereas for `category` it is proportional to the number of categories plus the length. Given that we have 300k+ rows and only 1814 unique counties, we save memory when converting that column to `category`.

https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#memory-usage

In [5]:
nyt_df['fips'] = nyt_df['fips'].astype('category')   # doesn't work as intended
nyt_df['cases'] = pd.to_numeric(nyt_df['cases'], downcast='integer')
nyt_df['deaths'] = pd.to_numeric(nyt_df['deaths'], downcast='integer')
nyt_df['state'] = nyt_df['state'].astype('category')
nyt_df['county'] = nyt_df['county'].astype('category')
nyt_df['date'] = pd.to_datetime(nyt_df['date'])
nyt_df.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0


In [6]:
nyt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319547 entries, 0 to 319546
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   date    319547 non-null  datetime64[ns]
 1   county  319547 non-null  category      
 2   state   319547 non-null  category      
 3   fips    316227 non-null  category      
 4   cases   319547 non-null  int32         
 5   deaths  319547 non-null  int16         
dtypes: category(3), datetime64[ns](1), int16(1), int32(1)
memory usage: 6.0 MB


We reduce our memory usage by over half. While we were not using an absolute large amount of memory prior to optimization, it's good practice to optimize your code.

Combining into one function:

In [7]:
def optimize(df):
    
    # converts to datetime if possible
    df = df.apply(lambda col: pd.to_datetime(col, errors='ignore') if col.dtypes=='object' else col)
    
    # if there are less than half as many unique values as there are rows, convert to category
    for col in df.select_dtypes(include='object'):
        if len(df[col].unique()) / len(df[col]) < 0.5:
            df[col] = df[col].astype('category')
    
    # downcasts numeric columns if possible
    df = df.apply(lambda col: pd.to_numeric(col, downcast='integer') if col.dtypes=='int64' else col)
    df = df.apply(lambda col: pd.to_numeric(col, downcast='float') if col.dtypes=='float64' else col)
    
    return df    

In [8]:
with urlopen('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv') as response:
    nyt_df = optimize(pd.read_csv(response, dtype={'fips':'str'}))
nyt_df.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0
1,2020-01-22,Snohomish,Washington,53061,1,0
2,2020-01-23,Snohomish,Washington,53061,1,0
3,2020-01-24,Cook,Illinois,17031,1,0
4,2020-01-24,Snohomish,Washington,53061,1,0


In [9]:
nyt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319547 entries, 0 to 319546
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   date    319547 non-null  datetime64[ns]
 1   county  319547 non-null  category      
 2   state   319547 non-null  category      
 3   fips    316227 non-null  category      
 4   cases   319547 non-null  int32         
 5   deaths  319547 non-null  int16         
dtypes: category(3), datetime64[ns](1), int16(1), int32(1)
memory usage: 6.0 MB


# import census data

Population data taken from [census.gov](https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/).

Since 2020 Census data have not been released yet, we will use 2019 population estimates.

Looking at the [data dictionary](https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.pdf), we only want the names and FIPS columns (eg. `STATE`, `STNAME`) and `POPESTIMATE2019`.

In [10]:
with urlopen('https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/co-est2019-alldata.csv') as response:
    pop_df = optimize(pd.read_csv(
        response, 
        usecols=['STATE', 'COUNTY', 'STNAME', 'CTYNAME', 'POPESTIMATE2019'], 
        encoding='latin-1',        # to avoid unicode error
        dtype={'STATE':'str',
               'COUNTY':'str'}
    ))
pop_df.head()

Unnamed: 0,STATE,COUNTY,STNAME,CTYNAME,POPESTIMATE2019
0,1,0,Alabama,Alabama,4903185
1,1,1,Alabama,Autauga County,55869
2,1,3,Alabama,Baldwin County,223234
3,1,5,Alabama,Barbour County,24686
4,1,7,Alabama,Bibb County,22394


In [11]:
pop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3193 entries, 0 to 3192
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   STATE            3193 non-null   category
 1   COUNTY           3193 non-null   category
 2   STNAME           3193 non-null   category
 3   CTYNAME          3193 non-null   object  
 4   POPESTIMATE2019  3193 non-null   int32   
dtypes: category(3), int32(1), object(1)
memory usage: 68.4+ KB


Notice that county names provided by the US census contain terms, such as 'County', whereas the NYTimes data does not.

In [12]:
len(list(set(nyt_df['county']) - set(pop_df['CTYNAME'])))

1736

In [13]:
pop_df = pop_df[pop_df['COUNTY'] != '000']
pop_df.rename(
    columns={
        'STATE':'statefips',
        'COUNTY':'countyfips',
        'STNAME':'state',
        'CTYNAME':'county',
        'POPESTIMATE2019':'population'
    }, inplace=True
)
# pop_df = pop_df[pop_df['countyfips'] != '000']
pop_df['fips'] = (pop_df['statefips'].astype('object') + pop_df['countyfips'].astype('object')).astype('category')
pop_df.drop(columns=['statefips', 'countyfips'], inplace=True)
county_terms = ['County', 'Parish', 'Municipality']
for term in county_terms:
    pop_df['county'] = pop_df['county'].str.replace(' ' + term, '')
# pop_df.sort_index(inplace=True)
pop_df.head()

Unnamed: 0,state,county,population,fips
1,Alabama,Autauga,55869,1001
2,Alabama,Baldwin,223234,1003
3,Alabama,Barbour,24686,1005
4,Alabama,Bibb,22394,1007
5,Alabama,Blount,57826,1009


In [14]:
pop_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3142 entries, 1 to 3192
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   state       3142 non-null   category
 1   county      3142 non-null   object  
 2   population  3142 non-null   int32   
 3   fips        3142 non-null   category
dtypes: category(2), int32(1), object(1)
memory usage: 178.0+ KB


We eventually need to merge `nyt_df` and `pop_df`, so let's see how they match with each other:

In [15]:
list(set(nyt_df['county']) - set(pop_df['county']))

['Kansas City', 'Unknown', 'Joplin', 'New York City']

The list is much smaller now. We will deal with 'Unknown' values later.

In [16]:
nyt_df = nyt_df[nyt_df['county'] != 'Unknown']
list(set(nyt_df['county']) - set(pop_df['county']))

['Kansas City', 'Joplin', 'New York City']

We need to add the population data for these three cities. Additional information taken from [census.gov quickfacts]('https://www.census.gov/quickfacts').

We'll use `'nyc'`, `'kc'`, and `'jm'` as our `fips` for these three cities.

In [17]:
pop_df_2 = pd.DataFrame(
    [['New York',
      'New York City',
      8_336_817,
      'nyc'],
     ['Missouri',
      'Kansas City',
      495_327 + 152_960,
      'kc'],
     ['Missouri',
      'Joplin',
      50_925,
      'jm']]
    , columns=pop_df.columns)
pop_df_2

Unnamed: 0,state,county,population,fips
0,New York,New York City,8336817,nyc
1,Missouri,Kansas City,648287,kc
2,Missouri,Joplin,50925,jm


In [18]:
pop_df = optimize(pop_df.append(pop_df_2, ignore_index=True))
pop_df[pop_df['fips'] == 'nyc']

Unnamed: 0,state,county,population,fips
3142,New York,New York City,8336817,nyc


In [19]:
nyt_df['fips'] = nyt_df['fips'].astype('object')
nyt_df.loc[nyt_df['county'] == 'New York City','fips'] = 'nyc'
nyt_df.loc[nyt_df['county'] == 'Kansas City','fips'] = 'kc'
nyt_df.loc[nyt_df['county'] == 'Joplin','fips'] = 'jm'
nyt_df['fips'] = nyt_df['fips'].astype('category')

# import geojson for boundaries and census areas

In [20]:
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    county_json = json.load(response)

In [21]:
county_json['features'][0]

{'type': 'Feature',
 'properties': {'GEO_ID': '0500000US01001',
  'STATE': '01',
  'COUNTY': '001',
  'NAME': 'Autauga',
  'LSAD': 'County',
  'CENSUSAREA': 594.436},
 'geometry': {'type': 'Polygon',
  'coordinates': [[[-86.496774, 32.344437],
    [-86.717897, 32.402814],
    [-86.814912, 32.340803],
    [-86.890581, 32.502974],
    [-86.917595, 32.664169],
    [-86.71339, 32.661732],
    [-86.714219, 32.705694],
    [-86.413116, 32.707386],
    [-86.411172, 32.409937],
    [-86.496774, 32.344437]]]},
 'id': '01001'}

In [22]:
fips_to_add_to_json = list(set(nyt_df['fips']) - set([f['id'] for f in county_json['features']]))
fips_to_add_to_json

['kc', 'nyc', '46102', '02158', 'jm']

The `plotly` county GeoJSON dataset is missing Kusilvak Census Area (`'02158'`) and Oglala Lakota County(`'46102'`), in addition to the three cities included in the NYTimes data (New York City, Kansas City, Joplin). GeoJSON data for these five areas compiled from [nomanatim](https://nominatim.openstreetmap.org/) and [polygons](http://polygons.openstreetmap.fr/).

- Search for the area at [nomanatim](https://nominatim.openstreetmap.org/).
- Select `details` from the relevant entry.
- Copy the numeric `code` under `OSM`, ignoring "relation". Eg. for New York City, copy `175905`.
- Search for the `code` at [polygons](http://polygons.openstreetmap.fr/).
- For our purposes, GeoJSONs were selected according to the following criteria: (1) sparsity of vertices (`NPoints`) and (2) accuracy of shape.

In [23]:
# new york city, ny
with urlopen('https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/nyc.txt') as response:
    nyc_json = json.load(response)

# kansas city, mo/ks
with urlopen('https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/kcm.txt') as response:
    kcm_json = json.load(response)
with urlopen('https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/kck.txt') as response:
    kck_json = json.load(response)
kc_json = dict(
    coordinates = kcm_json['coordinates'] + kck_json['coordinates']
)

# joplin, mo
with urlopen('https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/jm.txt') as response:
    jm_json = json.load(response)

# oglala lakota county, nd
with urlopen('https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/olsd.txt') as response:
    olsd_json = json.load(response)

# kusilvak census area, ak
with urlopen('https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/kca.txt') as response:
    kca_json = json.load(response)

In [24]:
# https://stackoverflow.com/questions/41271146/
def clean_coordinates(c):
    return [list(itertools.chain(*d)) for d in c]

In [25]:
add_to_json_dict = {
    '02158':{'area':17_081.43,
             'name':'Kusilvak Census Area',
             'coordinates':clean_coordinates(kca_json['coordinates'])}, 
    '46102':{'area':2_093.90,
             'name':'Oglala Lakota',
             'coordinates':clean_coordinates(olsd_json['coordinates'])},
    'jm':{'area':35.56,
          'name':'Joplin',
          'coordinates':clean_coordinates(jm_json['geometries'][0]['coordinates'])},
    'kc':{'area':124.81+314.95,
          'name':'Kansas City',
          'coordinates':clean_coordinates(kc_json['coordinates'])},
    'nyc':{'area':302.64,
           'name':'New York City',
           'coordinates':clean_coordinates(nyc_json['coordinates'])}
}

In [26]:
for fips in fips_to_add_to_json:
    county_json['features'].append(
        {
            'geometry': {'coordinates': add_to_json_dict[fips]['coordinates'],
                         'type': 'Polygon'},
            'id': fips,
            'properties': {'NAME': add_to_json_dict[fips]['name'],
                           'CENSUSAREA': add_to_json_dict[fips]['area']},
            'type': 'Feature'
        }
    )

In [27]:
with open('data/county_json.json', 'w') as f:
    json.dump(county_json, f)

In [28]:
def county_area(i, j=county_json):
    for d in j['features']:
        if d['id'] == i:
            return d['properties']['CENSUSAREA']

In [29]:
for fips in fips_to_add_to_json:
    print(county_area(fips))

439.76
302.64
2093.9
17081.43
35.56


# add latitude and longitude coordinates

In [54]:
def centroid(i, j=county_json):
    for d in j['features']:
        if d['id'] == i:
            shapes = np.array(d['geometry']['coordinates']).flatten()
            try:
                areas = [Polygon(shape).area for shape in shapes]
                p = Polygon(shapes[areas.index(max(areas))])
                lon, lat = p.centroid.coords[0]
            except:
                shapes = np.reshape(shapes, (-1, 2))
                p = Polygon(shapes)
                lon, lat = p.centroid.coords[0]
            return lon, lat

In [55]:
for d in county_json['features']:
    if d['id'] == fips_to_add_to_json[0]:
        shapes = d['geometry']['coordinates']

print(len(shapes))
areas = [Polygon(shape).area for shape in shapes]
areas

2


[0.0899984999999991, 0.035515830415977646]

In [56]:
for fips in fips_to_add_to_json:
    print(centroid(fips))

(-94.5544223903731, 39.1271954680726)
(-73.93936847673419, 40.66351613943744)
(-102.72778156028366, 43.18510744680853)
(-163.5144805328244, 62.15824463834583)
(-94.50560539941796, 37.07916084816835)


# add area and coordinates to `pop_df`

In [58]:
tick = time()
pop_df['area'] = pop_df['fips'].apply(county_area)
pop_df['lon'], pop_df['lat'] = zip(*pop_df['fips'].apply(centroid).to_list())
pop_df = optimize(pop_df)
tock = time()
print(tock - tick)

0.794874906539917


In [59]:
pop_df.head()

Unnamed: 0,state,county,population,fips,area,lon,lat
0,Alabama,Autauga,55869,1001,594.435974,-86.641197,32.536152
1,Alabama,Baldwin,223234,1003,1589.784058,-87.723953,30.725863
2,Alabama,Barbour,24686,1005,884.875977,-85.389244,31.867889
3,Alabama,Bibb,22394,1007,622.58197,-87.124962,32.996456
4,Alabama,Blount,57826,1009,644.776001,-86.569756,33.985249


In [60]:
pop_df.to_csv('data/pop_df.csv', index=False)

# merge `nyt_df` and `pop_df` for per capita and per capita * area feature engineering

In [62]:
pop_df.columns.difference(nyt_df.columns)

Index(['area', 'lat', 'lon', 'population'], dtype='object')

In [63]:
tick = time()

df = nyt_df.merge(pop_df[['fips', 'area', 'population', 'lat', 'lon']], on='fips')
df[['cases_per_100k', 'deaths_per_100k']] = df[['cases', 'deaths']].div(df['population'], axis=0) * 100_000
df[['case_density', 'death_density']] = df[['cases', 'deaths']].div(df['population'], axis=0).div(df['area'], axis=0) * 100_000
df = df.sort_values(by=['date', 'fips'])

tock = time()
print(tock - tick)

df.head()

0.2373647689819336


Unnamed: 0,date,county,state,fips,cases,deaths,area,population,lat,lon,cases_per_100k,deaths_per_100k,case_density,death_density
0,2020-01-21,Snohomish,Washington,53061,1,0,2087.273,822083,48.046917,-121.69278,0.121642,0.0,5.8e-05,0.0
1,2020-01-22,Snohomish,Washington,53061,1,0,2087.273,822083,48.046917,-121.69278,0.121642,0.0,5.8e-05,0.0
2,2020-01-23,Snohomish,Washington,53061,1,0,2087.273,822083,48.046917,-121.69278,0.121642,0.0,5.8e-05,0.0
173,2020-01-24,Cook,Illinois,17031,1,0,945.326,5150233,41.839622,-87.81743,0.019417,0.0,2.1e-05,0.0
3,2020-01-24,Snohomish,Washington,53061,1,0,2087.273,822083,48.046917,-121.69278,0.121642,0.0,5.8e-05,0.0


## `join` performance considerations

In [64]:
tick = time()

nyt_df1 = nyt_df.set_index(['fips', 'date'])
pop_df1 = pop_df.set_index('fips')

df1 = nyt_df1.join(pop_df1[['area', 'population', 'lat', 'lon']], on='fips', how='left')
df1[['cases_per_100k', 'deaths_per_100k']] = df1[['cases', 'deaths']].div(df1['population'], axis=0) * 100_000
df1[['case_density', 'death_density']] = df1[['cases', 'deaths']].div(df1['population'], axis=0).div(df1['area'], axis=0) * 100_000
df1 = df1.sort_values(by=['date', 'fips'])

tock = time()
print(tock - tick)

df1.head()

0.18151545524597168


Unnamed: 0_level_0,Unnamed: 1_level_0,county,state,cases,deaths,area,population,lat,lon,cases_per_100k,deaths_per_100k,case_density,death_density
fips,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
53061,2020-01-21,Snohomish,Washington,1,0,2087.273,822083,48.046917,-121.69278,0.121642,0.0,5.8e-05,0.0
53061,2020-01-22,Snohomish,Washington,1,0,2087.273,822083,48.046917,-121.69278,0.121642,0.0,5.8e-05,0.0
53061,2020-01-23,Snohomish,Washington,1,0,2087.273,822083,48.046917,-121.69278,0.121642,0.0,5.8e-05,0.0
17031,2020-01-24,Cook,Illinois,1,0,945.326,5150233,41.839622,-87.81743,0.019417,0.0,2.1e-05,0.0
53061,2020-01-24,Snohomish,Washington,1,0,2087.273,822083,48.046917,-121.69278,0.121642,0.0,5.8e-05,0.0


## `groupby` performance considerations

Naively, I looped through each column to make my engineered columns, but found it to be quite slow.

In [65]:
cols = ['cases', 'deaths', 'cases_per_100k', 'deaths_per_100k', 'case_density', 'death_density']

new_cols = ['new_' + c for c in cols]
delta_cols = ['delta_' + c for c in new_cols]
new_cols_7d = [c + '_7d' for c in new_cols]
delta_cols_7d = [c + '_7d' for c in delta_cols]

In [66]:
df2 = df.copy()

ding = time()

cols = ['cases', 'deaths', 'cases_per_100k', 'deaths_per_100k', 'case_density', 'death_density']

# not sure why this can't be more vectorized
for i, col in enumerate(cols):
    print(col)
    tick = time()
    
    # new cases
    df2[new_cols[i]] = df2.groupby(by='fips')[col].diff().fillna(0)
    df2.loc[df2[new_cols[i]] < 0, new_cols[i]] = 0                # some counties revise their numbers
    
    # change in new cases
    df2[delta_cols[i]] = df2.groupby(by='fips')[new_cols[i]].diff().fillna(0)
    
    # rolling average of new cases
    df2[new_cols_7d[i]] = df2.groupby(by='fips')[new_cols[i]].apply(lambda x: x.rolling(7, min_periods=1).mean())
    
    # rolling average of change in new cases
    df2[delta_cols_7d[i]] = df2.groupby(by='fips')[delta_cols[i]].apply(lambda x: x.rolling(7, min_periods=1).mean())
    
    tock = time()
    print(tock - tick)
    print()
    
dong = time()
print(dong - ding)

df2.head()

cases
3.211413860321045

deaths
3.1785025596618652

cases_per_100k
3.2921981811523438

deaths_per_100k
3.4098825454711914

case_density
3.38395357131958

death_density
3.3570241928100586

19.83696413040161


Unnamed: 0,date,county,state,fips,cases,deaths,area,population,lat,lon,...,new_deaths_per_100k_7d,delta_new_deaths_per_100k_7d,new_case_density,delta_new_case_density,new_case_density_7d,delta_new_case_density_7d,new_death_density,delta_new_death_density,new_death_density_7d,delta_new_death_density_7d
0,2020-01-21,Snohomish,Washington,53061,1,0,2087.273,822083,48.046917,-121.69278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-01-22,Snohomish,Washington,53061,1,0,2087.273,822083,48.046917,-121.69278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-23,Snohomish,Washington,53061,1,0,2087.273,822083,48.046917,-121.69278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
173,2020-01-24,Cook,Illinois,17031,1,0,945.326,5150233,41.839622,-87.81743,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-01-24,Snohomish,Washington,53061,1,0,2087.273,822083,48.046917,-121.69278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Dataframes can be passed when calculating rolling averages, so changing that bit of code significantly improved performance.

In [67]:
df3 = df.copy()

ding = time()

# we have to loop through the columns for groupby.diff
# the line below throws a NotImplementedError
# df1[new_cols] = grouped[cols].diff().fillna(0)

for i, col in enumerate(cols):
    print(col)
    tick = time()
    # new cases
    df3[new_cols[i]] = df3.groupby(by='fips')[col].diff().fillna(0)
    df3.loc[df3[new_cols[i]] < 0, new_cols[i]] = 0
    # change in new cases
    df3[delta_cols[i]] = df3.groupby(by='fips')[new_cols[i]].diff().fillna(0)
    tock = time()
    print(tock - tick)
    print()

# making new_cols_7d and delta_cols_7d
grouped = df3.groupby(by='fips')
df3[new_cols_7d] = grouped[new_cols].apply(lambda x: x.rolling(7, min_periods=1).mean())
df3[delta_cols_7d] = grouped[delta_cols].apply(lambda x: x.rolling(7, min_periods=1).mean())
    
dong = time()
print(dong - ding)
df1.head()

cases
1.0043156147003174

deaths
1.025259256362915

cases_per_100k
1.0701391696929932

deaths_per_100k
0.9873840808868408

case_density
1.0012993812561035

death_density
0.983370304107666

11.966008186340332


Unnamed: 0_level_0,Unnamed: 1_level_0,county,state,cases,deaths,area,population,lat,lon,cases_per_100k,deaths_per_100k,case_density,death_density
fips,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
53061,2020-01-21,Snohomish,Washington,1,0,2087.273,822083,48.046917,-121.69278,0.121642,0.0,5.8e-05,0.0
53061,2020-01-22,Snohomish,Washington,1,0,2087.273,822083,48.046917,-121.69278,0.121642,0.0,5.8e-05,0.0
53061,2020-01-23,Snohomish,Washington,1,0,2087.273,822083,48.046917,-121.69278,0.121642,0.0,5.8e-05,0.0
17031,2020-01-24,Cook,Illinois,1,0,945.326,5150233,41.839622,-87.81743,0.019417,0.0,2.1e-05,0.0
53061,2020-01-24,Snohomish,Washington,1,0,2087.273,822083,48.046917,-121.69278,0.121642,0.0,5.8e-05,0.0


`groupby.diff` performs rather slowly. The process is much faster when performing the tasks more explicitly. The following code was adapted from [this stackoverflow answer](https://stackoverflow.com/a/53159140).

In [68]:
df4 = df.copy()

ding = time()

# making new_cols
grouped = df4.groupby(by='fips')
df4[new_cols] = df4[cols] - grouped[cols].shift()
df4[new_cols] = df4[new_cols].fillna(0)
df4[new_cols] = df4[new_cols].clip(lower=0)

# making delta_cols
grouped = df4.groupby(by='fips')
df4[delta_cols] = df4[new_cols] - grouped[new_cols].shift()
df4[delta_cols] = df4[delta_cols].fillna(0)

# making new_cols_7d and delta_cols_7d
grouped = df4.groupby(by='fips')
df4[new_cols_7d] = grouped[new_cols].apply(lambda x: x.rolling(7, min_periods=1).mean())
df4[delta_cols_7d] = grouped[delta_cols].apply(lambda x: x.rolling(7, min_periods=1).mean())

dong = time()
print(dong - ding)
df4.head()

6.179479360580444


Unnamed: 0,date,county,state,fips,cases,deaths,area,population,lat,lon,...,new_cases_per_100k_7d,new_deaths_per_100k_7d,new_case_density_7d,new_death_density_7d,delta_new_cases_7d,delta_new_deaths_7d,delta_new_cases_per_100k_7d,delta_new_deaths_per_100k_7d,delta_new_case_density_7d,delta_new_death_density_7d
0,2020-01-21,Snohomish,Washington,53061,1,0,2087.273,822083,48.046917,-121.69278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-01-22,Snohomish,Washington,53061,1,0,2087.273,822083,48.046917,-121.69278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-23,Snohomish,Washington,53061,1,0,2087.273,822083,48.046917,-121.69278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
173,2020-01-24,Cook,Illinois,17031,1,0,945.326,5150233,41.839622,-87.81743,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-01-24,Snohomish,Washington,53061,1,0,2087.273,822083,48.046917,-121.69278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
df2.sort_index(axis=1).equals(df3.sort_index(axis=1))

True

In [70]:
df2.sort_index(axis=1).equals(df4.sort_index(axis=1))

True

In [73]:
df4.to_csv('data/df.csv', index=False)

## using multiindex df1

In [72]:
df5 = df1.copy()

ding = time()

# making new_cols
grouped = df5.groupby(level=0)
df5[new_cols] = df5[cols] - grouped[cols].shift()
df5[new_cols] = df5[new_cols].fillna(0)
df5[new_cols] = df5[new_cols].clip(lower=0)

# making delta_cols
grouped = df5.groupby(level=0)
df5[delta_cols] = df5[new_cols] - grouped[new_cols].shift()
df5[delta_cols] = df5[delta_cols].fillna(0)

# making new_cols_7d and delta_cols_7d
grouped = df5.groupby(level=0)
df5[new_cols_7d] = grouped[new_cols].apply(lambda x: x.rolling(7, min_periods=1).mean())
df5[delta_cols_7d] = grouped[delta_cols].apply(lambda x: x.rolling(7, min_periods=1).mean())

dong = time()
print(dong - ding)
df5.head()

21.007855892181396


Unnamed: 0_level_0,Unnamed: 1_level_0,county,state,cases,deaths,area,population,lat,lon,cases_per_100k,deaths_per_100k,...,new_cases_per_100k_7d,new_deaths_per_100k_7d,new_case_density_7d,new_death_density_7d,delta_new_cases_7d,delta_new_deaths_7d,delta_new_cases_per_100k_7d,delta_new_deaths_per_100k_7d,delta_new_case_density_7d,delta_new_death_density_7d
fips,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
53061,2020-01-21,Snohomish,Washington,1,0,2087.273,822083,48.046917,-121.69278,0.121642,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53061,2020-01-22,Snohomish,Washington,1,0,2087.273,822083,48.046917,-121.69278,0.121642,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53061,2020-01-23,Snohomish,Washington,1,0,2087.273,822083,48.046917,-121.69278,0.121642,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17031,2020-01-24,Cook,Illinois,1,0,945.326,5150233,41.839622,-87.81743,0.019417,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53061,2020-01-24,Snohomish,Washington,1,0,2087.273,822083,48.046917,-121.69278,0.121642,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


This process is slower using multiindex.

In [74]:
def optimize(df):
    # converts to datetime if possible
    df = df.apply(lambda col: pd.to_datetime(col, errors='ignore') if col.dtypes=='object' else col)
    # if there are less than half as many unique values as there are rows, convert to category
    for col in df.select_dtypes(include='object'):
        if len(df[col].unique()) / len(df[col]) < 0.5:
            df[col] = df[col].astype('category')
    # downcasts numeric columns if possible
    df = df.apply(lambda col: pd.to_numeric(col, downcast='integer') if col.dtypes=='int64' else col)
    df = df.apply(lambda col: pd.to_numeric(col, downcast='float') if col.dtypes=='float64' else col)
    return df   


with urlopen('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv') as response:
    nyt_df = optimize(pd.read_csv(response, dtype={'fips':'str'}))
    
    
pop_df = pd.read_csv('data/pop_df.csv')


df = nyt_df.merge(pop_df[['fips', 'area', 'population', 'lat', 'lon']], on='fips')
df[['cases_per_100k', 'deaths_per_100k']] = df[['cases', 'deaths']].div(df['population'], axis=0) * 100_000
df[['case_density', 'death_density']] = df[['cases', 'deaths']].div(df['population'], axis=0).div(df['area'], axis=0) * 100_000
df = df.sort_values(by=['date', 'fips'])

ding = time()
# making new_cols
df[new_cols] = df[cols] - df.groupby(by='fips')[cols].shift()
df[new_cols] = df[new_cols].fillna(0)
df[new_cols] = df[new_cols].clip(lower=0)

# making delta_cols
df[delta_cols] = df[new_cols] - df.groupby(by='fips')[new_cols].shift()
df[delta_cols] = df[delta_cols].fillna(0)

# making new_cols_7d and delta_cols_7d
df[new_cols_7d] = df.groupby(by='fips')[new_cols].apply(lambda x: x.rolling(7, min_periods=1).mean())
df[delta_cols_7d] = df.groupby(by='fips')[delta_cols].apply(lambda x: x.rolling(7, min_periods=1).mean())

dong = time()
print(dong - ding)

df.to_csv('data/df.csv', index=False)
df.head()

6.213387966156006


Unnamed: 0,date,county,state,fips,cases,deaths,area,population,lat,lon,...,new_cases_per_100k_7d,new_deaths_per_100k_7d,new_case_density_7d,new_death_density_7d,delta_new_cases_7d,delta_new_deaths_7d,delta_new_cases_per_100k_7d,delta_new_deaths_per_100k_7d,delta_new_case_density_7d,delta_new_death_density_7d
0,2020-01-21,Snohomish,Washington,53061,1,0,2087.273,822083,48.046917,-121.69278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-01-22,Snohomish,Washington,53061,1,0,2087.273,822083,48.046917,-121.69278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-23,Snohomish,Washington,53061,1,0,2087.273,822083,48.046917,-121.69278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
173,2020-01-24,Cook,Illinois,17031,1,0,945.326,5150233,41.839622,-87.81743,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-01-24,Snohomish,Washington,53061,1,0,2087.273,822083,48.046917,-121.69278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
