## Data transformation

Perform some transformations on the extracted data to allow for the network
creation in the next step.

In [1]:
# to allow relative imports
import os
import sys

module = os.path.abspath(os.path.join('..'))
if module not in sys.path:
    sys.path.append(module)

import pandas as pd
import pickle
import json
from typing import Tuple, Dict
from functools import partial

from data_processing.data_paths import OUT

Define the months to be extracted (January=1, February=2, ...).

In [2]:
MONTHS = [2, 4]

Read in the pickled extracted data.

In [3]:
df_b01 = pd.read_pickle(f'{OUT}df_b01.pkl')
df_b25 = pd.read_pickle(f'{OUT}df_b25.pkl')
df_pat_pre = pd.read_pickle(f'{OUT}df_pat_pre.pkl')
df_pat_post = pd.read_pickle(f'{OUT}df_pat_post.pkl')
df_google = pd.read_pickle(f'{OUT}df_google.pkl')

Transformations of the census data.

In [4]:
# rename columns
B01_COL_NAMES = {'B01003e1': 'population', 'B01003m1': 'population_me'}
B25_COL_NAMES = {'B25010e1': 'household_size', 'B25010m1': 'household_size_me'}

df_b01.rename(columns=B01_COL_NAMES, inplace=True)
df_b25.rename(columns=B25_COL_NAMES, inplace=True)

# convert margin of error to standard error
Z_SCORE = 1.95996

df_b01['population_se'] = df_b01['population_me'] / Z_SCORE
df_b25['household_size_se'] = df_b25['household_size_me'] / Z_SCORE

# merge data frames on census block
df_merged = pd.merge(df_b01, df_b25, on='cbg', how='outer')

# add proportional population
df_merged['population_prop'] = df_merged['population'] / \
                               df_merged['population'].sum()

Check the data makes sense.

In [5]:
# sanity checks with data from
# https://censusreporter.org/profiles/05000US09009-new-haven-county-ct/

# population should be ~850,000 
print('Population:', df_merged.sum()['population'])

# mean household should be ~2.5
print('Household size:', df_merged.mean()['household_size'])

Population: 857513.0
Household size: 2.5217413902282715


In [6]:
# check the summary stats for plausibility
df_merged.describe()

Unnamed: 0,population,population_me,population_se,household_size,household_size_me,household_size_se,population_prop
count,628.0,628.0,628.0,626.0,626.0,626.0,628.0
mean,1365.466561,342.968153,174.987323,2.521741,0.448115,0.228635,0.001592
std,652.336689,137.22772,70.015572,0.476634,0.192414,0.098173,0.000761
min,0.0,12.0,6.122574,1.21,0.09,0.045919,0.0
25%,886.0,249.5,127.298516,2.19,0.32,0.163269,0.001033
50%,1253.0,325.0,165.819711,2.52,0.41,0.209188,0.001461
75%,1716.25,405.25,206.764424,2.86,0.54,0.275516,0.002001
max,4063.0,1047.0,534.194575,4.0,1.37,0.698994,0.004738


In [7]:
# check for NA values
df_merged.iloc[[x > 0 for x in df_merged.isna().sum(axis=1)], :]

Unnamed: 0,cbg,population,population_me,population_se,household_size,household_size_me,household_size_se,population_prop
622,90093614022,1476,218,111.22676,,,,0.001721
627,90099900000,0,12,6.122574,,,,0.0


In [8]:
# since we only have two rows with NA, we can impute them with the mean
df_merged.fillna(df_merged.mean(), inplace=True)

Save demographics data to pickled file.

In [9]:
# save demographics to pickle
demographics = df_merged.set_index('cbg').to_dict('index')
pickle.dump(demographics, open(f'{OUT}demographics.pkl', 'wb'))

Transform the extracted Patterns files.

In [10]:
# convert JSON data to python dict
df_pat_pre.visitor_cbg = df_pat_pre.visitor_cbg.apply(\
    lambda x: json.loads(x))
df_pat_post.visitor_cbg = df_pat_post.visitor_cbg.apply(\
    lambda x: json.loads(x))

# calculate total visitors
df_pat_pre['total_visitors'] = df_pat_pre.visitor_cbg.apply(\
    lambda x: sum(x.values()))
df_pat_post['total_visitors'] = df_pat_post.visitor_cbg.apply(\
    lambda x: sum(x.values()))

Tranform Google mobility data.

In [11]:
def _month_filter(keep: [], x):
    return x in keep

# keep only required months
month_filter = partial(_month_filter, MONTHS)

df_google = df_google[df_google.date.apply(\
    lambda x: month_filter(x.month))].set_index('date')

df_google.drop('county', axis=1, inplace=True)

# rebase
df_google += 100

Save Google mobility data to pickled file.

In [12]:
google_mobility_agg = {}

for month in [2, 4]:
    google_mobility_agg[month] = {}
    df_sub = df_google[df_google.index.month == month]
    for col in df_google.columns.tolist():
        google_mobility_agg[month][col] = df_sub[col].values.tolist()
        
pickle.dump(google_mobility_agg, open(f'{OUT}google_mobility_agg.pkl', 'wb'))

Count the number of trips from one CBG to another and total trips leaving
each CBG.

In [13]:
def create_count_hashmaps(df: pd.DataFrame, all_cbgs: set) -> Tuple[Dict, Dict]:
    """
    Create two hashmaps:
    - comb_counts: total counts of visits between two CBGs
    - trip_counts: total counts of all outgoing trips from each CBG
    :param df: pattern data frame
    :param all_cbgs: set containing all CBGs
    :returns: Tuple with two hashmaps
    """
    
    comb_counts = {}
    trip_counts = {}
    
    # iterate over POIs
    for i, row in df.iterrows():

        # CBG of the POI
        poi_cbg = row['cbg']

        for cbg, count in row['visitor_cbg'].items():
            
            # ignore visits from other counties
            if cbg not in all_cbgs or poi_cbg not in all_cbgs:
                continue
            
            # Combination from visitor CBG to POI CBG
            cbg_comb = (cbg, poi_cbg)
            
            # add count to combination counts
            if cbg_comb in comb_counts:
                comb_counts[cbg_comb] += count
            else:
                comb_counts[cbg_comb] = count
            
            # add count to trip counts
            if cbg in trip_counts:
                trip_counts[cbg] += count

            else:
                trip_counts[cbg] = count
                
    return comb_counts, trip_counts

In [14]:
%%time
comb_counts_pre, trip_counts_pre = create_count_hashmaps(
    df_pat_pre, set(df_merged.cbg.tolist()))

comb_counts_post, trip_counts_post = create_count_hashmaps(
    df_pat_post, set(df_merged.cbg.tolist()))

CPU times: user 3.05 s, sys: 6.25 ms, total: 3.05 s
Wall time: 3.05 s


Save the count files to pickled files.

In [15]:
pickle.dump(comb_counts_pre, open(f'{OUT}comb_counts_pre.pkl', 'wb'))
pickle.dump(comb_counts_post, open(f'{OUT}comb_counts_post.pkl', 'wb'))

pickle.dump(trip_counts_pre, open(f'{OUT}trip_counts_pre.pkl', 'wb'))
pickle.dump(trip_counts_post, open(f'{OUT}trip_counts_post.pkl', 'wb'))