### Data processing for network creation

Imports...

In [1]:
import os
from sys import getsizeof
import pandas as pd
import numpy as np
import json
import pickle

Parameters: choose whether to reload the raw CSV files or use the cached pickled data frames...

In [2]:
# Set to True to re-read the raw CSV files (slower) and re-write the pickled copies under data/.
# When False, the data frames are loaded from those pickled files instead.
from_csv = False

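Define the loading behaviour: the CBG prefix to keep, the columns to read from each CSV file, and a chunked reader that filters rows by that prefix...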

In [3]:
# prefix that a CBG id must start with for its row to be kept when reading the CSV files
cbg_group_identifier = '09009'

# Per-file read specifications. Each one lists the csv file name, the positions of the
# columns to read, the names given to those columns and the dtype requested per position.

# census file with the B01003 columns
cbg_b01 = dict(
    file='cbg_b01.csv',
    cols=[0, 159, 160],
    names=['cbg', 'B01003e1', 'B01003m1'],
    dtypes={0: 'string', 159: np.int32, 160: np.int32},
)

# census file with the B25010 columns
cbg_b25 = dict(
    file='cbg_b25.csv',
    cols=[0, 187, 188],
    names=['cbg', 'B25010e1', 'B25010m1'],
    dtypes={0: 'string', 187: np.float32, 188: np.float32},
)

# POI pattern file for February 2020
patterns_feb = dict(
    file='feb2020_core_poi-patterns.csv',
    cols=[0, 25, 35],
    names=['placekey', 'visitor_cbg', 'cbg'],
    dtypes={0: 'string', 25: 'string', 35: 'string'},
)

# POI pattern file for April 2020 (same layout as February)
patterns_apr = dict(
    file='apr2020_core_poi-patterns.csv',
    cols=[0, 25, 35],
    names=['placekey', 'visitor_cbg', 'cbg'],
    dtypes={0: 'string', 25: 'string', 35: 'string'},
)

def read(data: dict, prefix: str = None) -> pd.DataFrame:
    """
    Read raw data from a csv file, keeping only the rows whose 'cbg' value starts with a prefix.
    :param data: contains info on the data to extract; the keys used are 'file' (name of the
                 csv inside the data/ folder), 'cols', 'names' and 'dtypes'.
    :param prefix: prefix a 'cbg' value must start with for its row to be kept. Defaults to the
                   module-level cbg_group_identifier.
    :returns: data in a pandas data frame.
    """
    if prefix is None:
        prefix = cbg_group_identifier

    # read in chunks so that only the filtered rows of each chunk are kept in memory
    iter_csv = pd.read_csv(f"data/{data['file']}", usecols=data['cols'], dtype=data['dtypes'],
                           header=0, names=data['names'], iterator=True, chunksize=1000)

    # vectorised prefix test instead of a Python call per row;
    # na=False makes rows with a missing 'cbg' value count as "no match"
    df = pd.concat([chunk[chunk['cbg'].str.startswith(prefix, na=False)]
                    for chunk in iter_csv])

    return df


Load data ...

In [4]:
%%time

if not from_csv:

    # fast path: restore the previously pickled data frames
    df_b01 = pd.read_pickle('data/df_b01.pkl')
    df_b25 = pd.read_pickle('data/df_b25.pkl')
    df_pat_feb = pd.read_pickle('data/df_pat_feb.pkl')
    df_pat_apr = pd.read_pickle('data/df_pat_apr.pkl')

else:

    # slow path: parse and filter the raw csv files ...
    df_pat_feb = read(patterns_feb)
    df_pat_apr = read(patterns_apr)
    df_b01 = read(cbg_b01)
    df_b25 = read(cbg_b25)

    # ... then write each frame to the pickle file that the fast path reads
    df_pat_feb.to_pickle('data/df_pat_feb.pkl')
    df_pat_apr.to_pickle('data/df_pat_apr.pkl')
    df_b01.to_pickle('data/df_b01.pkl')
    df_b25.to_pickle('data/df_b25.pkl')

CPU times: user 17.9 s, sys: 8.66 s, total: 26.6 s
Wall time: 27 s


Transform the CBG files...

In [5]:
# rename columns (rename returns a new frame; the names are rebound instead of using inplace=True)
b01_col_names = {'B01003e1': 'population', 'B01003m1': 'population_me'}
b25_col_names = {'B25010e1': 'household_size', 'B25010m1': 'household_size_me'}

df_b01 = df_b01.rename(columns=b01_col_names)
df_b25 = df_b25.rename(columns=b25_col_names)

# convert margin of error to standard error
# The B01003 / B25010 columns are ACS estimates ('e') and margins of error ('m'). ACS margins
# of error are published at the 90% confidence level, so the matching z-score is 1.645.
# Dividing by 1.96 (the 95% value) would understate every standard error by about 16%.
z_score = 1.645

df_b01['population_se'] = df_b01['population_me'] / z_score
df_b25['household_size_se'] = df_b25['household_size_me'] / z_score

# merge data frames on census block; the outer join keeps CBGs that appear in only one file
df_merged = pd.merge(df_b01, df_b25, on='cbg', how='outer')

In [6]:
# sanity checks with data from https://censusreporter.org/profiles/05000US09009-new-haven-county-ct/

# Reduce only the column of interest: df_merged.sum() / df_merged.mean() would also try to
# reduce the string 'cbg' column, which is wasteful and raises a TypeError on pandas >= 2.0.

# population should be ~850,000
print('Population:', df_merged['population'].sum())

# mean household should be ~2.5
print('Household size:', df_merged['household_size'].mean())

Population: 857513.0
Household size: 2.5217413902282715


Save merged file...

In [None]:
# Write the merged census data frame to a pickle file in the data/ folder.
# NOTE(review): 'df_pate.pkl' does not follow the 'df_<frame>.pkl' naming used for the other
# pickles in this notebook — confirm the file name is intended.
df_merged.to_pickle('data/df_pate.pkl')

Transform pattern files ...

In [7]:
# convert JSON data to python dict
df_pat_feb.visitor_cbg = df_pat_feb.visitor_cbg.apply(lambda x: json.loads(x))
df_pat_apr.visitor_cbg = df_pat_apr.visitor_cbg.apply(lambda x: json.loads(x))

# calculate total visitors
df_pat_feb['total_visitors'] = df_pat_feb.visitor_cbg.apply(lambda x: sum(x.values()))
df_pat_apr['total_visitors'] = df_pat_apr.visitor_cbg.apply(lambda x: sum(x.values()))

Create a hashmap of total visit counts per `CBG-CBG` pair (keys are undirected: the two CBG ids are ordered and joined with `-`)...

In [8]:
def create_cbg_cpg_hashmap(df: pd.DataFrame) -> dict:
    """
    Create a hashmap that contains the total counts of visits between two CBGs.
    :param df: pattern data frame; the 'cbg' column holds the CBG of the row and the
               'visitor_cbg' column holds a dict mapping a visitor CBG to its count.
    :returns: hashmap with counts of visits, keyed by '<smaller cbg>-<larger cbg>'.
    """
    hashmap = {}
    # walk the two needed columns directly; iterrows() would build a full Series for every row
    for poi_cbg, visitors in zip(df['cbg'], df['visitor_cbg']):
        for cbg, count in visitors.items():
            # order the pair so that A-B and B-A accumulate under the same key
            cbg_cbg_id = f'{min(cbg, poi_cbg)}-{max(cbg, poi_cbg)}'
            hashmap[cbg_cbg_id] = hashmap.get(cbg_cbg_id, 0) + count
    return hashmap

def print_visit_count_info(title: str, visit_counts: dict, all_cbgs: list) -> None:
    """
    Write a short summary of a visit-count hashmap to stdout.
    :param title: heading printed before the summary.
    :param visit_counts: hashmap containing the count data.
    :param all_cbgs: list of all cbgs; only its length is used.
    """
    print(f'{title}:')

    # NOTE(review): the upper bound is len(all_cbgs) squared, while the keys built by
    # create_cbg_cpg_hashmap are unordered pairs — confirm this is the intended bound.
    n_edges = len(visit_counts)
    n_possible = len(all_cbgs) ** 2
    print(f"{n_edges} out of a possible {n_possible} edges.")

    # total of all counts stored in the hashmap, followed by a blank separator line
    n_visits = sum(visit_counts.values())
    print(f"{n_visits} unique visits.\n")

In [9]:
%%time
# aggregate the visit counts per CBG pair for each month
visit_counts_feb = create_cbg_cpg_hashmap(df_pat_feb)
visit_counts_apr = create_cbg_cpg_hashmap(df_pat_apr)

# list of all CBG ids, built once and shared by both reports
all_cbgs = df_merged.cbg.tolist()

print_visit_count_info("February", visit_counts_feb, all_cbgs)
print_visit_count_info("April", visit_counts_apr, all_cbgs)

Febuary:
88686 out of a possible 394384 edges.
1042364 unique visits.

April:
41335 out of a possible 394384 edges.
411954 unique visits.

CPU times: user 2.9 s, sys: 96.7 ms, total: 3 s
Wall time: 2.99 s


Save visit counts to file...

In [10]:
# write each hashmap inside a `with` block so the file is flushed and closed right away
# (a bare open(...) passed to pickle.dump leaves the handle open until garbage collection)
with open('data/visit_counts_feb.pkl', 'wb') as fh:
    pickle.dump(visit_counts_feb, fh)
with open('data/visit_counts_apr.pkl', 'wb') as fh:
    pickle.dump(visit_counts_apr, fh)

# read like this:
# with open('data/visit_counts_feb.pkl', 'rb') as fh:
#     visit_counts_feb = pickle.load(fh)
# with open('data/visit_counts_apr.pkl', 'rb') as fh:
#     visit_counts_apr = pickle.load(fh)