# SafeGraph foot traffic data processing

Read the most recent week's CSV from SafeGraph:
s3://sg-c19-response/weekly-patterns/
SafeGraph information about their data:
https://docs.google.com/spreadsheets/u/1/d/1UNWvPzkUTTlXBZ6M6iGhM_7sr8h-MxsZdE7iOszkAmk/htmlview#

In [1]:
from pyspark.sql import SparkSession
import time

# Wall-clock start; the last cell subtracts this to report total running time.
start_time = time.time()

# Get (or reuse) a Spark session named "Foot_Traffic" with spark.master set to "local[12]".
ss = (
    SparkSession.builder
    .appName("Foot_Traffic")
    .config("spark.master", "local[12]")
    .getOrCreate()
)

# Weekly patterns file; the first 10 characters of its name are the week's date stamp.
file = "2020-05-11-weekly-patterns.csv"
week = file[:10]

# Load the CSV using its header row for column names, then keep only rows whose
# postal_code lies in 90001..90089 (both ends inclusive).
main_df = ss.read.csv(file, header=True)
in_zip_range = main_df["postal_code"].between(90001, 90089)
main_df = main_df.filter(in_zip_range)
main_df.show()

+--------------------+--------------------+--------------------+--------------+------+-----------+----------------+--------------------+-------------------+--------------------+--------------------+----------------+------------------+--------------------+--------------------+------------+--------------------+--------------------+-------------------------+--------------------+--------------------+--------------------+----------------------+-----------------------+--------------------+
|  safegraph_place_id|       location_name|      street_address|          city|region|postal_code|iso_country_code| safegraph_brand_ids|             brands|    date_range_start|      date_range_end|raw_visit_counts|raw_visitor_counts|       visits_by_day| visits_by_each_hour|     poi_cbg|   visitor_home_cbgs|visitor_daytime_cbgs|visitor_country_of_origin|  distance_from_home|        median_dwell|bucketed_dwell_times|related_same_day_brand|related_same_week_brand|         device_type|
+--------------------+

Selecting the columns of interest

In [2]:
# Project the frame down to the place identity, visit counts, per-day visits
# and dwell columns.
columns_of_interest = [
    'safegraph_place_id',
    'location_name',
    'street_address',
    'city',
    'postal_code',
    'raw_visit_counts',
    'raw_visitor_counts',
    'visits_by_day',
    'median_dwell',
]
main_df = main_df.select(*columns_of_interest)
main_df.show()

+--------------------+--------------------+--------------------+--------------+-----------+----------------+------------------+--------------------+--------------------+
|  safegraph_place_id|       location_name|      street_address|          city|postal_code|raw_visit_counts|raw_visitor_counts|       visits_by_day|        median_dwell|
+--------------------+--------------------+--------------------+--------------+-----------+----------------+------------------+--------------------+--------------------+
|sg:05e114ea81c04f...|          McDonald's|4947 Huntington Dr N|   Los Angeles|      90032|              96|                82|[17,11,16,14,10,1...|  ""060375355012"":4|
|sg:0ee32c6bd1444f...|     Go Get Em Tiger|230 N Larchmont Blvd|   Los Angeles|      90004|               9|                 8|     [2,4,0,1,1,0,1]|  ""060371923003"":4|
|sg:0f41c833a6e244...| Bloomfield Creamery|  1427 Westwood Blvd|   Los Angeles|      90024|               2|                 2|     [1,0,0,0,0,0,1]|  

In [3]:
# This week's per-day visits, keyed by place id.
this_week_df = main_df.select('safegraph_place_id', 'visits_by_day')

# Last week's file: same header-based load, same inclusive postal-code range
# (90001..90089), reduced to the same two columns.
last_week_file = "2020-05-04-weekly-patterns.csv"
last_week_df = ss.read.csv(last_week_file, header=True)
last_week_df = (
    last_week_df
    .filter(last_week_df["postal_code"].between(90001, 90089))
    .select('safegraph_place_id', 'visits_by_day')
)
last_week_df.show()


+--------------------+--------------------+
|  safegraph_place_id|       visits_by_day|
+--------------------+--------------------+
|sg:05e114ea81c04f...|[12,21,7,10,19,20...|
|sg:0ee32c6bd1444f...|     [0,0,0,3,2,1,2]|
|sg:0f41c833a6e244...|     [0,0,0,0,0,0,2]|
|sg:1565a84dcbc144...| [7,10,5,8,10,11,11]|
|sg:186f5017a7ed40...|     [3,4,5,5,5,5,1]|
|sg:242ecc445fce41...|     [0,0,0,0,2,0,0]|
|sg:2acd40b4f01641...|     [3,0,1,0,1,4,1]|
|sg:32ab764b731f45...|     [0,3,1,1,1,7,5]|
|sg:33e59f728dfc46...|     [5,4,7,3,4,5,5]|
|sg:3dbd76d1be5649...|     [0,1,0,2,0,1,0]|
|sg:3e6ebb6322cf41...|     [1,1,2,0,1,5,2]|
|sg:4191791d657d49...|     [0,2,1,2,2,2,2]|
|sg:431dc367d3814e...|     [2,3,5,4,4,1,1]|
|sg:43714b6d8e2543...|     [3,4,3,2,3,2,5]|
|sg:4bede0c4b7a44f...|     [0,0,0,1,0,0,0]|
|sg:4e32601b03af49...|     [0,0,0,0,1,0,0]|
|sg:5319c8cee9ed48...|     [0,0,0,1,0,0,0]|
|sg:6208cc6b313840...|     [0,0,0,0,0,0,1]|
|sg:689efcfa6dca4a...|     [0,0,2,0,0,0,1]|
|sg:6ef287e037d046...|     [0,0,

Rename this week's `visits_by_day` column to `visits_by_day_current`,
then merge this week's data with last week's data on `safegraph_place_id`.

In [5]:
# Rename this week's visits column so it does not collide with last week's
# "visits_by_day", then inner-join the two weeks on the place id. Places missing
# from either week are dropped by the inner join.
this_week_df = (
    this_week_df
    .withColumnRenamed("visits_by_day", "visits_by_day_current")
    .join(last_week_df, on='safegraph_place_id', how='inner')
)
this_week_df.show()

+--------------------+---------------------+----------------+
|  safegraph_place_id|visits_by_day_current|   visits_by_day|
+--------------------+---------------------+----------------+
|sg:05c00ab60e2444...|      [0,1,2,0,0,2,0]| [2,1,0,0,1,1,0]|
|sg:08efd9dac9cf42...|      [3,2,5,4,4,1,0]| [1,1,3,3,1,1,1]|
|sg:090cf1c0f62e41...|      [4,8,5,4,7,0,0]| [4,3,4,4,4,4,0]|
|sg:0afb6633a4224e...|      [0,2,0,1,1,0,0]| [0,2,0,1,2,0,0]|
|sg:0c4e86749cd244...|     [3,7,4,4,5,11,3]| [1,3,3,3,4,4,8]|
|sg:0f1d0e10c73f41...|      [2,1,2,0,0,0,0]| [0,1,4,1,1,1,1]|
|sg:1059afe48a3945...|      [3,3,3,0,3,1,2]| [2,2,7,4,2,6,0]|
|sg:13023a144d7d45...|      [2,1,3,2,3,6,3]| [5,1,3,1,3,3,3]|
|sg:13c456f99f1d45...|      [4,3,4,4,7,4,3]|[2,1,2,0,2,10,3]|
|sg:15dc29d1952844...|      [3,0,4,1,2,4,0]| [5,1,1,5,2,4,2]|
|sg:17951e62dfb949...|      [3,3,2,4,7,1,1]| [3,5,2,4,2,0,3]|
|sg:193e08aca17e43...|      [0,0,1,0,2,1,1]| [0,0,0,1,3,0,4]|
|sg:1a4d75c19bf049...|      [0,0,0,0,2,0,0]| [0,1,1,3,0,0,1]|
|sg:209f

Combine last week's and this week's `visits_by_day` columns into a single list per place (last week's days first, then this week's)

In [None]:
from pyspark.sql import functions as F

# Build one bracketed list per place: last week's days followed by this week's.
# The two columns are bracketed strings such as "[2,1,0]" and "[0,1,2]";
# concatenating them gives "[2,1,0][0,1,2]", and replacing the inner "][" with ","
# yields "[2,1,0,0,1,2]".
#
# This uses native column expressions instead of an RDD map with a Python lambda:
# rows are not shipped through the Python worker, no extra job is needed to infer
# the schema, and a null in either column produces a null result rather than a
# TypeError from adding None to a string.
this_week_df = this_week_df.select(
    "safegraph_place_id",
    F.regexp_replace(
        F.concat(F.col("visits_by_day"), F.col("visits_by_day_current")),
        r"\]\[",  # literal "][" (brackets escaped for the regex engine)
        ",",
    ).alias("visits_by_day_current"),
)
this_week_df.show()

Total Running time (s)

In [None]:
# Seconds elapsed since start_time was recorded in the first cell; left as the
# cell's final expression so the value is displayed.
elapsed_seconds = time.time() - start_time
elapsed_seconds