## Data extraction

Extract only the required data from raw CSV and save to Pickle files.

In [1]:
import os
import pandas as pd
import numpy as np
import json
import pickle

from data_paths import RAW, OUT

Define the files to be read in.

In [2]:
# Target county. CBG_CODE is matched as a prefix of the 'cbg' column and
# COUNTY_NAME against the lower-cased 'county' column when rows are filtered.
CBG_CODE = '09009'
COUNTY_NAME = 'new haven county'

# Extraction specs, one per raw CSV:
#   file   - file name of the raw CSV
#   cols   - positional indices of the columns to keep
#   names  - labels given to the kept columns
#   dtypes - dtype per kept column, keyed by positional index

# --- census files ---
CBG_B01 = dict(
    file='cbg_b01.csv',
    cols=[0, 159, 160],
    names=['cbg', 'B01003e1', 'B01003m1'],
    dtypes={0: 'string', 159: np.int32, 160: np.int32},
)

CBG_B25 = dict(
    file='cbg_b25.csv',
    cols=[0, 187, 188],
    names=['cbg', 'B25010e1', 'B25010m1'],
    dtypes={0: 'string', 187: np.float32, 188: np.float32},
)

# --- patterns files (same column layout for both months) ---
PATTERNS_FEB = dict(
    file='feb2020_core_poi-patterns.csv',
    cols=[0, 25, 35],
    names=['placekey', 'visitor_cbg', 'cbg'],
    dtypes={0: 'string', 35: 'string', 25: 'string'},
)

PATTERNS_APR = dict(
    file='apr2020_core_poi-patterns.csv',
    cols=[0, 25, 35],
    names=['placekey', 'visitor_cbg', 'cbg'],
    dtypes={0: 'string', 35: 'string', 25: 'string'},
)

# --- Google mobility data ---
# 'date_cols' lists the columns to parse as dates; 'google' marks this spec
# as county-filtered rather than CBG-filtered.
GOOGLE_MOBILITY = dict(
    file='2020_US_Region_Mobility_Report.csv',
    cols=[3, 8, 9, 10, 11, 12, 13, 14],
    names=['county', 'date', 'retail_recreation', 'grocery_pharmacy', 'park',
           'transit', 'workplace', 'residential'],
    date_cols=['date'],
    dtypes={3: 'string'},
    google=True,
)

Define the function to read the data.

In [3]:
def read(data: dict) -> pd.DataFrame:
    """
    Read raw data from a csv file, keeping only the rows for the target county.

    The file is read in chunks of 1000 rows and each chunk is filtered before
    the pieces are concatenated, so the unfiltered file is never held in
    memory as a whole.

    :param data: contains info on the data to extract. Required keys are
        'file', 'cols', 'names' and 'dtypes'. Optional keys are 'date_cols'
        (columns to parse as dates) and 'google' (truthy for Google mobility
        data, which is filtered on 'county' rather than 'cbg'). The dict is
        not modified.
    :returns: data in a pandas data frame.
    """

    # Read the optional keys without writing defaults back into the caller's
    # dict; the specs are shared module-level constants.
    date_cols = data.get('date_cols', False)
    is_google = bool(data.get('google', False))

    # NOTE(review): RAW is concatenated directly with the file name, so it
    # presumably ends with a path separator - confirm in data_paths.
    iter_csv = pd.read_csv(f"{RAW}{data['file']}", usecols=data['cols'], dtype=data['dtypes'],
                           parse_dates=date_cols, header=0, names=data['names'], iterator=True,
                           chunksize=1000)

    def keep_target_rows(chunk: pd.DataFrame) -> pd.DataFrame:
        """Return the rows of one chunk that belong to the target county."""
        if is_google:
            # Google mobility data: case-insensitive match on the county name
            mask = chunk['county'].str.lower().eq(COUNTY_NAME)
        else:
            # census / patterns data: CBG ids of the county share a common prefix
            mask = chunk['cbg'].str.startswith(CBG_CODE, na=False)
        # missing values never match; convert to a plain boolean mask for indexing
        return chunk[mask.fillna(False).astype(bool)]

    return pd.concat([keep_target_rows(chunk) for chunk in iter_csv])

Read in the raw data.

In [4]:
%%time
    
# read each source from its raw CSV: Google mobility, the two patterns
# months, then the two census files
df_google, df_pat_feb, df_pat_apr, df_b01, df_b25 = map(
    read, (GOOGLE_MOBILITY, PATTERNS_FEB, PATTERNS_APR, CBG_B01, CBG_B25)
)

CPU times: user 26.5 s, sys: 17.2 s, total: 43.7 s
Wall time: 47.5 s


Save data to Pickle

In [5]:
%%time

# save to pickle
# Each extracted frame is written to its own .pkl file, named after the
# variable that holds it.
# NOTE(review): OUT is concatenated directly with the file name, so it
# presumably ends with a path separator - confirm in data_paths.
df_google.to_pickle(f'{OUT}df_google.pkl')
df_pat_feb.to_pickle(f'{OUT}df_pat_feb.pkl')
df_pat_apr.to_pickle(f'{OUT}df_pat_apr.pkl')
df_b01.to_pickle(f'{OUT}df_b01.pkl')
df_b25.to_pickle(f'{OUT}df_b25.pkl')

CPU times: user 51.1 ms, sys: 14.8 ms, total: 65.9 ms
Wall time: 69.8 ms
