# SafeGraph foot traffic data processing (batch version)
Outputs foot_traffic_time_series.csv, a time series of the number of daily visits to each place,
and places_info.csv, which holds information about each place.

Requires Spark and pandas

SafeGraph information about their data:
https://docs.google.com/spreadsheets/u/1/d/1UNWvPzkUTTlXBZ6M6iGhM_7sr8h-MxsZdE7iOszkAmk/htmlview#

Open the latest week's file and keep only the rows whose zip code is listed in Zip_Codes__LA_County_.txt

In [5]:
from pyspark.sql import SparkSession
import os
import pandas as pd
import time
import datetime

# Local Spark session; "local[12]" runs Spark in local mode with 12 worker threads.
ss = (
    SparkSession.builder
    .appName("Foot_Traffic")
    .config("spark.master", "local[12]")
    .getOrCreate()
)

start_time = time.time()
file = "2020-05-18-weekly-patterns.csv"
main_df = ss.read.csv(file, header=True)

# Use a dedicated name for the handle so that `file` keeps naming the weekly
# CSV instead of being rebound to a closed file object.
with open("Zip_Codes__LA_County_.txt") as zip_file:
    # One zip code per line; strip stray whitespace and skip blank lines so
    # they cannot silently fail to match (or match empty postal codes).
    zip_codes = [line.strip() for line in zip_file if line.strip()]

# Keep only the rows located in the listed zip codes and cache the result,
# since later cells read main_df again.
main_df = main_df.filter(main_df.postal_code.isin(zip_codes)).persist()
main_df.show()

+--------------------+--------------------+--------------------+--------------+------+-----------+----------------+--------------------+-------------------+--------------------+--------------------+----------------+------------------+--------------------+--------------------+------------+--------------------+--------------------+-------------------------+--------------------+--------------------+--------------------+----------------------+-----------------------+--------------------+
|  safegraph_place_id|       location_name|      street_address|          city|region|postal_code|iso_country_code| safegraph_brand_ids|             brands|    date_range_start|      date_range_end|raw_visit_counts|raw_visitor_counts|       visits_by_day| visits_by_each_hour|     poi_cbg|   visitor_home_cbgs|visitor_daytime_cbgs|visitor_country_of_origin|  distance_from_home|        median_dwell|bucketed_dwell_times|related_same_day_brand|related_same_week_brand|         device_type|
+--------------------+

Selecting the columns of interest
Keep only safegraph_place_id and visits_by_day from main_df

In [6]:
# Narrow the latest week down to the place id and its per-day visit counts,
# and cache it because the following cells build on this frame.
this_week_df = main_df.select(
    'safegraph_place_id',
    'visits_by_day',
).persist()
this_week_df.show()

+--------------------+--------------------+
|  safegraph_place_id|       visits_by_day|
+--------------------+--------------------+
|sg:05e114ea81c04f...|[8,14,8,13,11,11,13]|
|sg:0ee32c6bd1444f...|     [2,2,2,0,1,1,0]|
|sg:0f41c833a6e244...|     [2,2,0,1,2,0,2]|
|sg:1565a84dcbc144...|   [4,2,5,9,7,25,11]|
|sg:186f5017a7ed40...|     [4,8,5,8,4,4,6]|
|sg:242ecc445fce41...|     [0,1,0,1,1,0,0]|
|sg:2acd40b4f01641...|     [3,0,5,6,3,4,1]|
|sg:32ab764b731f45...|     [7,2,1,5,4,5,3]|
|sg:33e59f728dfc46...|     [4,2,6,2,2,2,2]|
|sg:3cacf516735646...|     [0,0,1,1,0,0,0]|
|sg:3dbd76d1be5649...|     [0,2,2,0,2,2,0]|
|sg:3e6ebb6322cf41...|     [3,1,1,1,1,3,1]|
|sg:4191791d657d49...|     [2,2,2,2,1,2,1]|
|sg:431dc367d3814e...|     [8,5,7,2,7,3,9]|
|sg:43714b6d8e2543...|     [3,4,3,5,4,3,0]|
|sg:5319c8cee9ed48...|     [0,0,0,1,0,0,0]|
|sg:689efcfa6dca4a...|     [0,0,0,1,1,0,3]|
|sg:75c40b727cb14e...|     [2,1,2,1,3,1,2]|
|sg:7d1eec578b2143...|     [0,0,1,0,1,1,3]|
|sg:7e01f8184f0248...|[20,10,7,1

Function to join the previous week's data onto the current data. Call it in a loop for every weekly file

In [7]:
def join_last_week_data(last_week_file, this_week_df, zip_codes):
    """Prepend one earlier week's daily visits to the accumulated series.

    Args:
        last_week_file: Path of the weekly patterns CSV (with a header row)
            holding the earlier week.
        this_week_df: Spark DataFrame with a ``safegraph_place_id`` column and
            the visits string in either ``visits_by_day`` (first call) or
            ``visits_by_day_current`` (subsequent calls).
        zip_codes: Postal codes to keep from ``last_week_file``.

    Returns:
        A Spark DataFrame with columns ``safegraph_place_id`` and
        ``visits_by_day_current``, where the earlier week's bracketed list is
        placed in front of the accumulated one and the ``][`` seam is replaced
        by a comma. Only places present in both inputs are kept (inner join).
        A row whose visits string is null in either input yields a null
        ``visits_by_day_current``.
    """
    # Imported locally so the function does not depend on a file-level import.
    from pyspark.sql import functions as F

    last_week_df = ss.read.csv(last_week_file, header=True)
    last_week_df = last_week_df.filter(last_week_df.postal_code.isin(zip_codes)) \
        .select('safegraph_place_id', 'visits_by_day')

    # withColumnRenamed is a no-op when "visits_by_day" is absent, i.e. when
    # the frame already went through this function once.
    current_df = this_week_df.withColumnRenamed("visits_by_day", "visits_by_day_current")
    joined_df = current_df.join(last_week_df, how='inner', on='safegraph_place_id')

    # Build the merged string with native column expressions instead of an
    # RDD map + toDF round-trip: that avoids shipping every row through Python
    # and also works when the join result is empty (toDF with only column
    # names has to infer the schema from a first row).
    merged_visits = F.regexp_replace(
        F.concat(F.col("visits_by_day"), F.col("visits_by_day_current")),
        r"\]\[",
        ",",
    )
    return joined_df.select(
        "safegraph_place_id",
        merged_visits.alias("visits_by_day_current"),
    )

In [8]:
# Earlier weekly files, listed from newest to oldest: each join puts that
# week's visits in front of what has been accumulated so far.
last_week_file_list = [
    "2020-05-11-weekly-patterns.csv",
    "2020-05-04-weekly-patterns.csv",
    "2020-04-27-weekly-patterns.csv",
    "2020-04-20-weekly-patterns.csv",
]
for last_week_file in last_week_file_list:
    this_week_df = join_last_week_data(
        last_week_file,
        this_week_df,
        zip_codes,
    ).persist()
this_week_df.show()

+--------------------+---------------------+
|  safegraph_place_id|visits_by_day_current|
+--------------------+---------------------+
|sg:05c00ab60e2444...| [2,3,2,3,3,9,2,3,...|
|sg:08efd9dac9cf42...| [3,0,1,1,2,1,0,0,...|
|sg:090cf1c0f62e41...| [2,4,5,2,1,0,0,5,...|
|sg:0afb6633a4224e...| [1,2,0,0,0,0,0,0,...|
|sg:0c4e86749cd244...| [3,1,7,4,3,5,3,9,...|
|sg:0f1d0e10c73f41...| [1,1,3,1,2,0,0,4,...|
|sg:1059afe48a3945...| [1,0,0,2,5,2,4,7,...|
|sg:13023a144d7d45...| [0,0,1,0,0,3,4,4,...|
|sg:13c456f99f1d45...| [2,2,3,0,1,3,2,1,...|
|sg:15dc29d1952844...| [5,2,1,3,1,1,0,3,...|
|sg:17951e62dfb949...| [2,5,4,3,4,2,4,2,...|
|sg:193e08aca17e43...| [0,0,0,1,3,3,0,0,...|
|sg:1a4d75c19bf049...| [0,2,0,4,0,2,0,2,...|
|sg:209f866b4f654d...| [4,5,3,3,5,4,1,4,...|
|sg:25445cd5a95d4f...| [5,2,4,1,1,3,2,1,...|
|sg:289ac2aa272548...| [5,2,3,2,6,9,3,4,...|
|sg:2fb419debc154c...| [2,6,8,3,10,6,5,2...|
|sg:328d5e0fe5b546...| [0,3,0,1,2,1,0,0,...|
|sg:36302dbd8d9d46...| [2,2,0,0,0,0,0,1,...|
|sg:368f54

Convert spark df to pandas, remove brackets and split columns by comma

In [9]:
# Collect the joined Spark DataFrame (place id + combined visits string) into a pandas DataFrame
traffic_df = this_week_df.toPandas()

In [10]:
# Strip the surrounding brackets and split the comma-separated counts into one
# column per day. The pattern is a raw string so that "\[" and "\]" reach the
# regex engine as written instead of being invalid string escape sequences.
temp = traffic_df.visits_by_day_current.str.replace(r"[\[\]]", "", regex=True).str.split(",", expand=True)
traffic_df1 = pd.concat([traffic_df.safegraph_place_id, temp], axis=1)
# Wide -> long: one row per (place, day offset); "Day" holds the integer
# column label produced by the split above.
traffic_df1 = traffic_df1.melt(id_vars="safegraph_place_id",
        var_name="Day",
        value_name="Visits")
# Convert day to proper date
# NOTE(review): day1 is hard-coded and presumably has to match the first day
# of the oldest weekly file that was joined (2020-04-20) - confirm when the
# list of weekly files changes.
day1 = datetime.datetime(2020, 4, 20)
# Format each distinct day offset once and map through the lookup table,
# rather than formatting a date for every single row.
day_labels = {
    offset: (day1 + datetime.timedelta(days=int(offset))).strftime("%m/%d/%Y")
    for offset in temp.columns
}
traffic_df1.Day = traffic_df1.Day.map(day_labels)
traffic_df1

Unnamed: 0,safegraph_place_id,Day,Visits
0,sg:05c00ab60e24448a8c15304873a7833f,04/20/2020,2
1,sg:08efd9dac9cf425aad8f8fbac4e56a43,04/20/2020,3
2,sg:090cf1c0f62e4128825d72d9949fdbed,04/20/2020,2
3,sg:0afb6633a4224e2789661b49d2e7c4a0,04/20/2020,1
4,sg:0c4e86749cd244d2a574043f0c6f799b,04/20/2020,3
...,...,...,...
831350,sg:f33e0c3db6ed49529bfd9f3e81233922,05/24/2020,4
831351,sg:f4552203e5884b01a5cc47e8dac2cc09,05/24/2020,4
831352,sg:f5b503c9124e40a78879974d7e614d8c,05/24/2020,2
831353,sg:f8172d602e0343ad930ae1257860897f,05/24/2020,2


Adding zip code to the traffic data
Export to csv

In [11]:
# Only the zip code is needed from main_df, so narrow it to the two required
# columns before collecting: converting every column of the weekly patterns
# data to pandas would move far more data to the driver than the merge uses.
main_df_pd = main_df.select('safegraph_place_id', 'postal_code').toPandas()
# Attach the zip code to every (place, day) row and fix the column order.
traffic_df2 = traffic_df1.merge(main_df_pd, how="inner", on="safegraph_place_id") \
    [['safegraph_place_id', 'postal_code', 'Day', 'Visits']]
traffic_df2

Unnamed: 0,safegraph_place_id,postal_code,Day,Visits
0,sg:05c00ab60e24448a8c15304873a7833f,90029,04/20/2020,2
1,sg:05c00ab60e24448a8c15304873a7833f,90029,04/21/2020,3
2,sg:05c00ab60e24448a8c15304873a7833f,90029,04/22/2020,2
3,sg:05c00ab60e24448a8c15304873a7833f,90029,04/23/2020,3
4,sg:05c00ab60e24448a8c15304873a7833f,90029,04/24/2020,3
...,...,...,...,...
831350,sg:f81bd5555e0a438b8c587b2092d32c1e,90057,05/20/2020,3
831351,sg:f81bd5555e0a438b8c587b2092d32c1e,90057,05/21/2020,4
831352,sg:f81bd5555e0a438b8c587b2092d32c1e,90057,05/22/2020,4
831353,sg:f81bd5555e0a438b8c587b2092d32c1e,90057,05/23/2020,4


In [12]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs("output", exist_ok=True)
traffic_df2.to_csv(os.path.join("output", "foot_traffic_time_series.csv"), index=False)

Total Running time (s)

In [13]:
# Seconds elapsed since start_time was last set
time.time() - start_time

293.4430000782013

## Collecting the data for the places of interest
Read the core places dataset and combine the 5 parts, then filter by LA zip codes,
and keep only the places present in the traffic data df via an inner join on safegraph_place_id
https://docs.safegraph.com/docs#section-core-places

In [14]:
# Restart the timer for this section
start_time = time.time()

file1 = os.path.join("CorePlaces", "core_poi-part1.csv")
file_list = [os.path.join("CorePlaces", "core_poi-part%d.csv" % part) for part in range(2, 6)]

# DataFrameReader.csv accepts a list of paths, so all five parts are loaded in
# a single read instead of being read one by one and stitched together with
# positional unions.
places_df = ss.read.csv([file1] + file_list, header=True)

# Keep only the places located in the listed zip codes
places_df = places_df.filter(places_df.postal_code.isin(zip_codes))
places_df.count()

47000

In [15]:
# Inner join against the traffic frame: only places that also appear in
# this_week_df survive, and the joined frame carries this_week_df's
# visits_by_day_current column as well. Cached because it is counted here and
# exported afterwards.
places_df = places_df.join(
    this_week_df,
    on='safegraph_place_id',
    how='inner',
).persist()
places_df.count()

23753

Exporting to csv

In [16]:
# Collect the filtered places to pandas for a single-file CSV export
places_df_pd = places_df.toPandas()
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs("output", exist_ok=True)
places_df_pd.to_csv(os.path.join("output", "places_info.csv"), index=False)

Total Running time (s)

In [17]:
# Seconds elapsed since start_time was last set
time.time() - start_time

10.740000009536743