# `pandas` performance comparison

In [6]:
import numpy as np
import pandas as pd

from urllib.request import urlopen
import json

from time import time
from datetime import datetime, timedelta

In [4]:
def optimize(df):
    '''
    Optimizes the data types in a pandas dataframe.
    '''
    dft = df.copy()
    # converts to datetime if possible
    dft = dft.apply(lambda col: pd.to_datetime(col, errors='ignore') if col.dtypes=='object' else col)
    # if there are less than half as many unique values as there are rows, convert to category
    for col in dft.select_dtypes(include='object'):
        if len(dft[col].unique()) / len(df[col]) < 0.5:
            dft[col] = dft[col].astype('category')
    # downcasts numeric columns if possible
    dft = dft.apply(lambda col: pd.to_numeric(col, downcast='integer') if col.dtypes=='int64' else col)
    dft = dft.apply(lambda col: pd.to_numeric(col, downcast='float') if col.dtypes=='float64' else col)
    return dft

In [7]:
with urlopen('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv') as response:
    nyt_df = optimize(pd.read_csv(response, dtype={'fips':'str'}))

nyt_df['fips'] = nyt_df['fips'].astype('object')
nyt_df.loc[nyt_df['county'] == 'New York City','fips'] = 'nyc'
nyt_df.loc[nyt_df['county'] == 'Kansas City','fips'] = 'kc'
nyt_df.loc[nyt_df['county'] == 'Joplin','fips'] = 'jm'
nyt_df['fips'] = nyt_df['fips'].astype('category')

pop_df = pd.read_csv('data/pop_df.csv')

## `merge` and `join` performance considerations

In [8]:
comparisons = []

In [9]:
tick = time()

df = nyt_df.merge(pop_df, on='fips', suffixes=('_x','')).drop(['county_x', 'state_x'], axis=1)

# person density
df[['cases_per_100k', 'deaths_per_100k']] = df[['cases', 'deaths']].div(df['population'], axis=0) * 100_000

df = df.sort_values(by=['date', 'fips'])

tock = time()
print(tock - tick)

df.head()

0.26927900314331055


Unnamed: 0,date,fips,cases,deaths,state,county,population,area,lon,lat,pop_per_area,cases_per_100k,deaths_per_100k
0,2020-01-21,53061,1,0,Washington,Snohomish,822083,2087.273,-121.69278,48.046917,393.855054,0.121642,0.0
1,2020-01-22,53061,1,0,Washington,Snohomish,822083,2087.273,-121.69278,48.046917,393.855054,0.121642,0.0
2,2020-01-23,53061,1,0,Washington,Snohomish,822083,2087.273,-121.69278,48.046917,393.855054,0.121642,0.0
176,2020-01-24,17031,1,0,Illinois,Cook,5150233,945.326,-87.81743,41.839622,5448.102624,0.019417,0.0
3,2020-01-24,53061,1,0,Washington,Snohomish,822083,2087.273,-121.69278,48.046917,393.855054,0.121642,0.0


In [10]:
tick = time()

nyt_df1 = nyt_df.set_index(['fips', 'date'])
pop_df1 = pop_df.set_index('fips')

df1 = nyt_df1.join(pop_df1, on='fips', how='left', lsuffix='_x', rsuffix='').drop(['county_x', 'state_x'], axis=1)
df1[['cases_per_100k', 'deaths_per_100k']] = df1[['cases', 'deaths']].div(df1['population'], axis=0) * 100_000
df1 = df1.sort_values(by=['date', 'fips'])

tock = time()
print(tock - tick)

df1.head()

0.1874980926513672


Unnamed: 0_level_0,Unnamed: 1_level_0,cases,deaths,state,county,population,area,lon,lat,pop_per_area,cases_per_100k,deaths_per_100k
fips,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
53061,2020-01-21,1,0,Washington,Snohomish,822083.0,2087.273,-121.69278,48.046917,393.855054,0.121642,0.0
53061,2020-01-22,1,0,Washington,Snohomish,822083.0,2087.273,-121.69278,48.046917,393.855054,0.121642,0.0
53061,2020-01-23,1,0,Washington,Snohomish,822083.0,2087.273,-121.69278,48.046917,393.855054,0.121642,0.0
17031,2020-01-24,1,0,Illinois,Cook,5150233.0,945.326,-87.81743,41.839622,5448.102624,0.019417,0.0
53061,2020-01-24,1,0,Washington,Snohomish,822083.0,2087.273,-121.69278,48.046917,393.855054,0.121642,0.0


## `groupby` performance considerations

Naively, I looped through each column to make my engineered columns, but found it to be quite slow.

In [11]:
cols = ['cases', 'deaths', 'cases_per_100k', 'deaths_per_100k']

new_cols = ['new_' + c for c in cols]
delta_cols = ['delta_' + c for c in new_cols]
new_cols_7d = [c + '_7d' for c in new_cols]
delta_cols_7d = [c + '_7d' for c in delta_cols]

In [12]:
df2 = df.copy()

ding = time()

# not sure why this can't be more vectorized
for i, col in enumerate(cols):
    print(col)
    tick = time()
    
    # new cases
    df2[new_cols[i]] = df2.groupby(by='fips')[col].diff().fillna(0)
    df2.loc[df2[new_cols[i]] < 0, new_cols[i]] = 0                # some counties revise their numbers
    
    # change in new cases
    df2[delta_cols[i]] = df2.groupby(by='fips')[new_cols[i]].diff().fillna(0)
    
    # rolling average of new cases
    df2[new_cols_7d[i]] = df2.groupby(by='fips')[new_cols[i]].apply(lambda x: x.rolling(7, min_periods=1).mean())
    
    # rolling average of change in new cases
    df2[delta_cols_7d[i]] = df2.groupby(by='fips')[delta_cols[i]].apply(lambda x: x.rolling(7, min_periods=1).mean())
    
    tock = time()
    print(tock - tick)
    print()
    
dong = time()
print(dong - ding)

df2.head()

cases
3.232357978820801

deaths
3.240337371826172

cases_per_100k
3.2712535858154297

deaths_per_100k
3.2672932147979736

13.011242151260376


Unnamed: 0,date,fips,cases,deaths,state,county,population,area,lon,lat,...,new_deaths_7d,delta_new_deaths_7d,new_cases_per_100k,delta_new_cases_per_100k,new_cases_per_100k_7d,delta_new_cases_per_100k_7d,new_deaths_per_100k,delta_new_deaths_per_100k,new_deaths_per_100k_7d,delta_new_deaths_per_100k_7d
0,2020-01-21,53061,1,0,Washington,Snohomish,822083,2087.273,-121.69278,48.046917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-01-22,53061,1,0,Washington,Snohomish,822083,2087.273,-121.69278,48.046917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-23,53061,1,0,Washington,Snohomish,822083,2087.273,-121.69278,48.046917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
176,2020-01-24,17031,1,0,Illinois,Cook,5150233,945.326,-87.81743,41.839622,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-01-24,53061,1,0,Washington,Snohomish,822083,2087.273,-121.69278,48.046917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Dataframes can be passed when calculating rolling averages, so changing that bit of code significantly improved performance.

In [14]:
df3 = df.copy()

ding = time()

# we have to loop through the columns for groupby.diff
# the line below throws a NotImplementedError
# df1[new_cols] = grouped[cols].diff().fillna(0)

for i, col in enumerate(cols):
    print(col)
    tick = time()
    # new cases
    df3[new_cols[i]] = df3.groupby(by='fips')[col].diff().fillna(0)
    df3.loc[df3[new_cols[i]] < 0, new_cols[i]] = 0
    # change in new cases
    df3[delta_cols[i]] = df3.groupby(by='fips')[new_cols[i]].diff().fillna(0)
    tock = time()
    print(tock - tick)
    print()

# making new_cols_7d and delta_cols_7d
grouped = df3.groupby(by='fips')
df3[new_cols_7d] = grouped[new_cols].apply(lambda x: x.rolling(7, min_periods=1).mean())
df3[delta_cols_7d] = grouped[delta_cols].apply(lambda x: x.rolling(7, min_periods=1).mean())
    
dong = time()
print(dong - ding)
df3.head()

cases
0.9814045429229736

deaths
0.9664511680603027

cases_per_100k
1.0023140907287598

deaths_per_100k
0.9983029365539551

9.511599063873291


Unnamed: 0,date,fips,cases,deaths,state,county,population,area,lon,lat,...,new_deaths_per_100k,delta_new_deaths_per_100k,new_cases_7d,new_deaths_7d,new_cases_per_100k_7d,new_deaths_per_100k_7d,delta_new_cases_7d,delta_new_deaths_7d,delta_new_cases_per_100k_7d,delta_new_deaths_per_100k_7d
0,2020-01-21,53061,1,0,Washington,Snohomish,822083,2087.273,-121.69278,48.046917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-01-22,53061,1,0,Washington,Snohomish,822083,2087.273,-121.69278,48.046917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-23,53061,1,0,Washington,Snohomish,822083,2087.273,-121.69278,48.046917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
176,2020-01-24,17031,1,0,Illinois,Cook,5150233,945.326,-87.81743,41.839622,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-01-24,53061,1,0,Washington,Snohomish,822083,2087.273,-121.69278,48.046917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


`groupby.diff` performs rather slowly. The process is much faster when performing the tasks more explicitly. The following code was adapted from [this stackoverflow answer](https://stackoverflow.com/a/53159140).

In [17]:
df4 = df.copy()

ding = time()

# making new_cols
df4[new_cols] = df4[cols] - df4.groupby(by='fips')[cols].shift()
df4[new_cols] = df4[new_cols].fillna(0)
df4[new_cols] = df4[new_cols].clip(lower=0)

# making delta_cols
df4[delta_cols] = df4[new_cols] - df4.groupby(by='fips')[new_cols].shift()
df4[delta_cols] = df4[delta_cols].fillna(0)

# making new_cols_7d and delta_cols_7d
df4[new_cols_7d] = df4.groupby(by='fips')[new_cols].apply(lambda x: x.rolling(7, min_periods=1).mean())
df4[delta_cols_7d] = df4.groupby(by='fips')[delta_cols].apply(lambda x: x.rolling(7, min_periods=1).mean())

dong = time()
print(dong - ding)
df4.head()

5.727657079696655


Unnamed: 0,date,fips,cases,deaths,state,county,population,area,lon,lat,...,delta_new_cases_per_100k,delta_new_deaths_per_100k,new_cases_7d,new_deaths_7d,new_cases_per_100k_7d,new_deaths_per_100k_7d,delta_new_cases_7d,delta_new_deaths_7d,delta_new_cases_per_100k_7d,delta_new_deaths_per_100k_7d
0,2020-01-21,53061,1,0,Washington,Snohomish,822083,2087.273,-121.69278,48.046917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-01-22,53061,1,0,Washington,Snohomish,822083,2087.273,-121.69278,48.046917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-23,53061,1,0,Washington,Snohomish,822083,2087.273,-121.69278,48.046917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
176,2020-01-24,17031,1,0,Illinois,Cook,5150233,945.326,-87.81743,41.839622,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-01-24,53061,1,0,Washington,Snohomish,822083,2087.273,-121.69278,48.046917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
df2.sort_index(axis=1).equals(df3.sort_index(axis=1))

True

In [19]:
df2.sort_index(axis=1).equals(df4.sort_index(axis=1))

True

## using multi-indexed dataframe

In [20]:
df5 = df1.copy()
df5.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,cases,deaths,state,county,population,area,lon,lat,pop_per_area,cases_per_100k,deaths_per_100k
fips,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
53061,2020-01-21,1,0,Washington,Snohomish,822083.0,2087.273,-121.69278,48.046917,393.855054,0.121642,0.0
53061,2020-01-22,1,0,Washington,Snohomish,822083.0,2087.273,-121.69278,48.046917,393.855054,0.121642,0.0
53061,2020-01-23,1,0,Washington,Snohomish,822083.0,2087.273,-121.69278,48.046917,393.855054,0.121642,0.0
17031,2020-01-24,1,0,Illinois,Cook,5150233.0,945.326,-87.81743,41.839622,5448.102624,0.019417,0.0
53061,2020-01-24,1,0,Washington,Snohomish,822083.0,2087.273,-121.69278,48.046917,393.855054,0.121642,0.0


In [23]:
df5 = df1.copy()

ding = time()

# making new_cols
grouped = df5.groupby(level=0)
df5[new_cols] = df5[cols] - grouped[cols].shift()
df5[new_cols] = df5[new_cols].fillna(0)
df5[new_cols] = df5[new_cols].clip(lower=0)

# making delta_cols
grouped = df5.groupby(level=0)
df5[delta_cols] = df5[new_cols] - grouped[new_cols].shift()
df5[delta_cols] = df5[delta_cols].fillna(0)

# making new_cols_7d and delta_cols_7d
grouped = df5.groupby(level=0)
df5[new_cols_7d] = grouped[new_cols].apply(lambda x: x.rolling(7, min_periods=1).mean())
df5[delta_cols_7d] = grouped[delta_cols].apply(lambda x: x.rolling(7, min_periods=1).mean())

dong = time()
print(dong - ding)
df5.head()

ValueError: cannot handle a non-unique multi-index!

## rolling regression tests

In [28]:
df4['days'] = ((df4['date'] - df4['date'].min()) / np.timedelta64(1, 'D')).astype('int')

In [32]:
df4['new_cases']

0           0.0
1           0.0
2           0.0
176         0.0
3           0.0
          ...  
321835      0.0
321906      2.0
330706     25.0
101398     58.0
3933      316.0
Name: new_cases, Length: 331236, dtype: float64

In [33]:
df4.groupby(by='fips')[['new_cases', 'days']].apply(
    lambda x: x.rolling(7, min_periods=1).apply(
        lambda x: x['new_cases'].cov(x['days'])
    )
)

KeyError: 'new_cases'

In [27]:
df4.groupby(by='fips')['new_cases'].rolling(7, min_periods=1).var()

fips         
01001  145519            NaN
       145520       4.500000
       145521       2.333333
       145522       2.250000
       145523       2.000000
                    ...     
nyc    3929      2467.809524
       3930      1477.809524
       3931      1525.904762
       3932      1408.571429
       3933      1306.571429
Name: new_cases, Length: 331236, dtype: float64