In [None]:
import glob
import json
import os
import pickle
from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive so the Drive-based paths used by the
# following cells resolve. Only works inside a Google Colab runtime.
from google.colab import drive as mountGoogleDrive
mountGoogleDrive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# in_dir: folder the pickle/CSV batches are read from.
# out_dir: folder create_mobility_matrix writes its per-week CSVs under.
# Both end with "/", so f-strings that append "/<subdir>" yield a double
# slash in the path (harmless on POSIX file systems).
in_dir = r'/content/drive/MyDrive/Research/Safegraph/scratch/20221205/'
out_dir = r'/content/drive/MyDrive/Research/Safegraph/scratch/20240415/'

In [None]:
def print_bad_files(in_dir):
    """Print the combined contents of every ``*.pickle`` file in ``in_dir``.

    Each pickle is expected to hold a list of names of JSON files that did
    not load. The lists are concatenated into one flat list, which is
    printed. Nothing is returned.

    Args:
        in_dir: Directory searched (non-recursively) for ``*.pickle`` files.
    """
    bad_files = []
    # glob returns paths in arbitrary, file-system dependent order; sorting
    # makes the printed list reproducible between runs.
    for picklepath in sorted(glob.glob(f'{in_dir}/*.pickle')):
        with open(picklepath, 'rb') as f:
            # NOTE: pickle.load can execute arbitrary code; only point this
            # at pickle files you produced yourself.
            bad_files.extend(pickle.load(f))
    print(bad_files)


def combine_batch_csv(dir):
    """Concatenate every ``*.csv`` file in ``dir`` into a single DataFrame.

    Each file is read with its first row as the header and its first column
    as the index. Rows with a missing ``node.safegraph_core.postal_code`` or
    ``poi_cbg`` are dropped before the frames are stacked.

    Args:
        dir: Directory searched (non-recursively) for ``*.csv`` files. The
            name shadows the ``dir`` builtin but is kept so existing callers
            keep working.

    Returns:
        pandas.DataFrame: The filtered rows of all files stacked vertically
        in sorted file-path order. Index values are kept as read, so they
        may repeat across files.

    Raises:
        ValueError: If ``dir`` contains no CSV files.
        KeyError: If a file lacks one of the two required columns.
    """
    required_cols = ['node.safegraph_core.postal_code', 'poi_cbg']

    # glob order is arbitrary; sort so the row order of the result is stable.
    csv_files = sorted(glob.glob(f'{dir}/*.csv'))
    if not csv_files:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
        raise ValueError(f"No CSV files found in {dir!r}")

    df_list = [
        pd.read_csv(filename, header=0, index_col=0).dropna(subset=required_cols)
        for filename in tqdm(csv_files)
    ]

    return pd.concat(df_list, axis=0)

In [None]:
# Print the file names recorded in the *.pickle files of the
# visitor_home_aggregation folder.
print_bad_files(f"{in_dir}/visitor_home_aggregation/")

['New York,NYfrom2021-06-14to2021-06-21PART51.json', 'New York,NYfrom2021-07-05to2021-07-12PART138.json', 'New York,NYfrom2021-07-12to2021-07-19PART7.json', 'New York,NYfrom2021-11-01to2021-11-08PART49.json', 'New York,NYfrom2021-11-08to2021-11-15PART70.json']


In [None]:
# Read and stack every CSV in the visitor_home_aggregation folder into one
# frame (the recorded run took about 1.5 minutes for 9 files).
df = combine_batch_csv(f"{in_dir}/visitor_home_aggregation/")

100%|██████████| 9/9 [01:23<00:00,  9.30s/it]


In [None]:
# Cast postal codes to int for the join against zip_code_map.ZipCode below.
# The cast cannot hit NaN because combine_batch_csv already dropped rows with
# a missing postal code.
# NOTE(review): presumably ZipCode is parsed as an integer column by read_csv - confirm.
df['node.safegraph_core.postal_code'] = df['node.safegraph_core.postal_code'].astype(int)

In [None]:
# ZIP code -> borough lookup, downloaded from GitHub on every run.
zip_code_map = pd.read_csv('https://raw.githubusercontent.com/erikgregorywebb/nyc-housing/master/Data/nyc-zip-codes.csv')

# Tract -> ZIP lookup read from Drive. Ids are kept as strings, only the first
# ZIP listed for each tract is retained, and the ZIP column is exposed as "ZCTA".
tract_zip_map = (
    pd.read_csv(
        r'/content/drive/MyDrive/Research/Safegraph/scratch/TRACT_ZIP_122021.csv',
        dtype={'tract': str, 'zip': str},
    )
    .drop_duplicates(subset='tract', keep="first")
    .loc[:, ['tract', 'zip']]
    .rename(columns={"zip": "ZCTA"})
)

In [None]:
# Attach the borough of each row's postal code. The left join keeps rows
# without a match; their Borough is NaN.
# NOTE(review): if zip_code_map lists a ZipCode more than once, this join duplicates rows - confirm it is unique.
df = df.merge(zip_code_map[['ZipCode','Borough']], left_on='node.safegraph_core.postal_code', right_on='ZipCode', how='left')

In [None]:
# Share of rows that received no borough from the ZIP-code join.
len(df[df['Borough'].isnull()]) / len(df)

0.01831489669050727

In [None]:
# Keep only the rows that were matched to a borough.
df = df.loc[df['Borough'].notnull()]

In [None]:
# Parse the week bounds as datetimes and turn poi_cbg into a string id.
# assign() builds a new frame instead of writing columns into the
# boolean-filtered frame from the previous cell, which avoids pandas'
# chained-assignment warning.
# 'int64' is spelled out because a bare int maps to a 32-bit integer on some
# platforms, which cannot hold 12-digit block-group ids.
df = df.assign(
    start=pd.to_datetime(df['start'], format='%Y-%m-%d'),
    end=pd.to_datetime(df['end'], format='%Y-%m-%d'),
    poi_cbg=df['poi_cbg'].astype('int64').astype(str),
)

In [None]:
import os

START_DATE = "12/31/2020"
END_DATE = "6/30/2021"

# Parse both bounds once instead of on every comparison.
window_start = pd.to_datetime(START_DATE)
window_end = pd.to_datetime(END_DATE)

# Distinct week-end dates that fall inside [START_DATE, END_DATE], ascending.
weeks = [
    week
    for week in np.unique(df['end'])
    if week <= window_end and week >= window_start
]

print(weeks)

[numpy.datetime64('2021-01-04T00:00:00.000000000'), numpy.datetime64('2021-01-11T00:00:00.000000000'), numpy.datetime64('2021-01-18T00:00:00.000000000'), numpy.datetime64('2021-01-25T00:00:00.000000000'), numpy.datetime64('2021-02-01T00:00:00.000000000'), numpy.datetime64('2021-02-08T00:00:00.000000000'), numpy.datetime64('2021-02-15T00:00:00.000000000'), numpy.datetime64('2021-02-22T00:00:00.000000000'), numpy.datetime64('2021-03-01T00:00:00.000000000'), numpy.datetime64('2021-03-08T00:00:00.000000000'), numpy.datetime64('2021-03-15T00:00:00.000000000'), numpy.datetime64('2021-03-22T00:00:00.000000000'), numpy.datetime64('2021-03-29T00:00:00.000000000'), numpy.datetime64('2021-04-05T00:00:00.000000000'), numpy.datetime64('2021-04-12T00:00:00.000000000'), numpy.datetime64('2021-04-19T00:00:00.000000000'), numpy.datetime64('2021-04-26T00:00:00.000000000'), numpy.datetime64('2021-05-03T00:00:00.000000000'), numpy.datetime64('2021-05-10T00:00:00.000000000'), numpy.datetime64('2021-05-17T0

In [None]:
# Split the weeks into consecutive chunks of at most `batchsize` so the slow
# matrix-building step can be run one chunk per cell.
batchsize = 10
batch_list = [
    weeks[offset:offset + batchsize]
    for offset in range(0, len(weeks), batchsize)
]
print(len(batch_list))

3


In [None]:
# Cast ZIP codes to str so they compare equal to the string ZCTA column of
# tract_zip_map when create_mobility_matrix joins the two.
# NOTE(review): this mutates zip_code_map in place. The earlier merge of df
# against zip_code_map used integer postal codes, so re-running that cell
# after this one would presumably no longer match - run cells top to bottom.
zip_code_map.ZipCode = zip_code_map.ZipCode.astype(str)

In [None]:
def create_mobility_matrix(weeks):
    """Write one borough-to-borough visit-count CSV per week.

    For every week in ``weeks``, the rows of the notebook-level ``df`` whose
    ``end`` equals that week are selected. Each row's
    ``visitor_home_aggregation`` string (a dict literal mapping a home tract
    to a count) is expanded into (tract, count, destination borough) records.
    Tracts are mapped to ZIP codes through ``tract_zip_map`` and ZIP codes to
    boroughs through ``zip_code_map``. Counts are then summed per
    (destination, origin) borough pair and written to
    ``{out_dir}/mobility/<YYYY-MM-DD>_mobility.csv``.

    Reads the notebook-level names ``df``, ``tract_zip_map``,
    ``zip_code_map`` and ``out_dir``. ``zip_code_map.ZipCode`` must hold
    strings, because it is joined against the string ``ZCTA`` column.
    Records whose ZCTA does not appear in ``zip_code_map`` are dropped by the
    inner join.

    Args:
        weeks: Iterable of week-end dates; each is compared against
            ``df['end']`` and used to name the output file.

    Returns:
        None. The result of each week is written to disk.
    """
    # Create the target folder up front; to_csv does not create directories.
    mobility_dir = f"{out_dir}/mobility"
    os.makedirs(mobility_dir, exist_ok=True)

    zip_to_borough = zip_code_map[['ZipCode', 'Borough']]

    for week in tqdm(weeks):
        week_df = df.loc[df['end'] == week, ['Borough', 'visitor_home_aggregation']]

        # The dicts are stored with single quotes; swap them for double quotes
        # so json can parse them. The result goes into its own Series rather
        # than back into the slice, which avoids a SettingWithCopyWarning.
        payloads = week_df['visitor_home_aggregation'].str.replace("'", '"', regex=False)

        # Collect all records first and build the frame once; concatenating a
        # small frame per row is quadratic in the number of rows.
        records = [
            (tract, count, borough)
            for borough, payload in zip(week_df['Borough'], payloads)
            for tract, count in json.loads(payload).items()
        ]
        mobility_df = pd.DataFrame(
            records, columns=['tract', 'visitor_home_aggregation', 'destination']
        )

        # tract -> ZCTA (left join), then ZCTA -> origin borough (inner join).
        mobility_df = mobility_df.merge(tract_zip_map, on='tract', how='left')
        mobility_df = mobility_df.merge(zip_to_borough, left_on='ZCTA', right_on='ZipCode')

        mobility_df = mobility_df[~mobility_df['Borough'].isnull()]

        mobility_df = (
            mobility_df
            .groupby(['destination', 'Borough'])['visitor_home_aggregation']
            .sum()
            .reset_index()
        )

        mobility_df['end'] = week
        mobility_df = mobility_df.rename(columns={"Borough": "origin"})

        mobility_df.to_csv(f"{mobility_dir}/{pd.to_datetime(week).date()}_mobility.csv", index=False)

In [None]:
# First batch of weeks (the recorded run took about 9 minutes).
create_mobility_matrix(batch_list[0])

100%|██████████| 10/10 [09:04<00:00, 54.43s/it]


In [None]:
# Second batch of weeks (the recorded run took about 12 minutes).
create_mobility_matrix(batch_list[1])

100%|██████████| 10/10 [11:46<00:00, 70.70s/it]


In [None]:
# Third and last batch of weeks (6 weeks; the recorded run took about 7 minutes).
create_mobility_matrix(batch_list[2])

100%|██████████| 6/6 [06:43<00:00, 67.28s/it]
