# SafeGraph foot traffic data processing (fast version)

In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook; the master setting
# "local[12]" is passed through as a plain config entry.
ss = (
    SparkSession.builder
    .appName("Foot_Traffic")
    .config("spark.master", "local[12]")
    .getOrCreate()
)

# The file name begins with a 10-character date (YYYY-MM-DD); keep that prefix
# as the label for the week being processed.
file = "2020-05-11-weekly-patterns.csv"
week = file[:10]

# Load the weekly patterns CSV (first row is the header) and keep only rows
# whose postal_code lies in the inclusive range 90001..90089.
# NOTE(review): no schema is supplied, so this comparison presumably relies on
# Spark casting the postal_code text to a number -- confirm.
raw_df = ss.read.csv(file, header=True)
main_df = raw_df.filter(raw_df["postal_code"].between(90001, 90089))
main_df.show()

+--------------------+--------------------+--------------------+--------------+------+-----------+----------------+--------------------+-------------------+--------------------+--------------------+----------------+------------------+--------------------+--------------------+------------+--------------------+--------------------+-------------------------+--------------------+--------------------+--------------------+----------------------+-----------------------+--------------------+
|  safegraph_place_id|       location_name|      street_address|          city|region|postal_code|iso_country_code| safegraph_brand_ids|             brands|    date_range_start|      date_range_end|raw_visit_counts|raw_visitor_counts|       visits_by_day| visits_by_each_hour|     poi_cbg|   visitor_home_cbgs|visitor_daytime_cbgs|visitor_country_of_origin|  distance_from_home|        median_dwell|bucketed_dwell_times|related_same_day_brand|related_same_week_brand|         device_type|
+--------------------+

Selecting the columns of interest

In [4]:
# Columns carried forward for the rest of the notebook; every other column of
# the weekly patterns file is dropped here.
columns_of_interest = [
    'safegraph_place_id',
    'location_name',
    'street_address',
    'city',
    'postal_code',
    'raw_visit_counts',
    'raw_visitor_counts',
    'visits_by_day',
    'median_dwell',
]
main_df = main_df.select(*columns_of_interest)

# Narrow two-column view (place id + the week's visits_by_day string) that the
# later cells extend with earlier weeks.
this_week_df = main_df.select('safegraph_place_id', 'visits_by_day')
this_week_df.show()

+--------------------+--------------------+
|  safegraph_place_id|       visits_by_day|
+--------------------+--------------------+
|sg:05e114ea81c04f...|[17,11,16,14,10,1...|
|sg:0ee32c6bd1444f...|     [2,4,0,1,1,0,1]|
|sg:0f41c833a6e244...|     [1,0,0,0,0,0,1]|
|sg:1565a84dcbc144...|  [5,6,12,11,12,9,9]|
|sg:186f5017a7ed40...|     [3,4,6,4,6,6,1]|
|sg:242ecc445fce41...|     [0,1,0,1,1,0,0]|
|sg:2acd40b4f01641...|     [4,4,5,6,1,0,1]|
|sg:32ab764b731f45...|     [1,1,5,0,2,9,3]|
|sg:33d770a759f547...|     [0,1,0,0,0,0,0]|
|sg:33e59f728dfc46...|     [5,1,3,2,3,3,1]|
|sg:3cacf516735646...|     [0,0,1,1,0,0,0]|
|sg:3dbd76d1be5649...|     [0,0,0,0,0,1,0]|
|sg:3e6ebb6322cf41...|     [2,1,2,1,1,2,2]|
|sg:4191791d657d49...|     [0,0,0,2,1,1,2]|
|sg:431dc367d3814e...|    [5,4,9,1,6,4,10]|
|sg:43714b6d8e2543...|    [2,3,2,4,4,10,5]|
|sg:4bede0c4b7a44f...|     [0,1,0,0,0,0,0]|
|sg:5319c8cee9ed48...|     [0,1,0,0,1,0,0]|
|sg:6208cc6b313840...|     [0,1,0,0,0,0,0]|
|sg:689efcfa6dca4a...|     [1,0,

Join the previous weeks' data to the current week's data

In [5]:
def join_last_week_data(last_week_file, this_week_df):
    """Prepend one earlier week's daily visits to the running per-place series.

    Reads ``last_week_file`` (CSV with a header row), keeps the rows whose
    ``postal_code`` is in the inclusive range 90001..90089, and inner-joins
    them to ``this_week_df`` on ``safegraph_place_id``. The two bracketed
    series strings are concatenated with the earlier week first, and the
    ``"]["`` seam between them is replaced by ``","`` so the result is again a
    single bracketed list.

    The concatenation is done with native Spark column functions instead of a
    Python ``rdd.map(...).toDF(...)`` round trip. Compared with that approach
    this avoids shipping every row through Python, returns an empty DataFrame
    (rather than failing on schema inference) when the join matches nothing,
    and yields a null series (rather than raising) when either input is null.

    Parameters
    ----------
    last_week_file : str
        Path of the earlier weekly patterns CSV to read.
    this_week_df : pyspark.sql.DataFrame
        Frame with ``safegraph_place_id`` and either ``visits_by_day`` (first
        call) or ``visits_by_day_current`` (subsequent calls).

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``safegraph_place_id`` and ``visits_by_day_current``, one row
        per place id present in both inputs.
    """
    # Function-scope import so this cell does not depend on an import that
    # another cell may or may not have run.
    from pyspark.sql import functions as F

    last_week_df = ss.read.csv(last_week_file, header=True)
    last_week_df = last_week_df.filter(
        (last_week_df.postal_code >= 90001) & (last_week_df.postal_code <= 90089)
    ).select('safegraph_place_id', 'visits_by_day')

    # On the first call this renames the raw column; on later calls the column
    # already has the target name and the rename does nothing.
    joined_df = this_week_df \
        .withColumnRenamed("visits_by_day", "visits_by_day_current") \
        .join(last_week_df, how='inner', on='safegraph_place_id')

    # Earlier week first, then the accumulated series; "][" only occurs at the
    # seam between the two bracketed lists.
    merged_series = F.regexp_replace(
        F.concat(F.col('visits_by_day'), F.col('visits_by_day_current')),
        r"\]\[",
        ",",
    )
    return joined_df.select(
        'safegraph_place_id',
        merged_series.alias('visits_by_day_current'),
    )

In [6]:
# Earlier weekly files, newest first. join_last_week_data puts the file's
# series in front of the accumulated one, so walking backwards in time leaves
# the final series in chronological order.
last_week_file_list = [
    "2020-05-04-weekly-patterns.csv",
    "2020-04-27-weekly-patterns.csv",
    "2020-04-20-weekly-patterns.csv",
]

# Rebuild the history from main_df on every run instead of feeding
# this_week_df back into itself: otherwise re-running this cell would prepend
# the same three weeks a second time without any error.
history_df = main_df.select('safegraph_place_id', 'visits_by_day')
for last_week_file in last_week_file_list:
    history_df = join_last_week_data(last_week_file, history_df)

# Downstream cells read the combined series from this_week_df.
this_week_df = history_df
this_week_df.show()

+--------------------+---------------------+
|  safegraph_place_id|visits_by_day_current|
+--------------------+---------------------+
|sg:05c00ab60e2444...| [2,3,2,3,3,9,2,3,...|
|sg:08efd9dac9cf42...| [3,0,1,1,2,1,0,0,...|
|sg:090cf1c0f62e41...| [2,4,5,2,1,0,0,5,...|
|sg:0afb6633a4224e...| [1,2,0,0,0,0,0,0,...|
|sg:0c4e86749cd244...| [3,1,7,4,3,5,3,9,...|
|sg:0f1d0e10c73f41...| [1,1,3,1,2,0,0,4,...|
|sg:1059afe48a3945...| [1,0,0,2,5,2,4,7,...|
|sg:13023a144d7d45...| [0,0,1,0,0,3,4,4,...|
|sg:13c456f99f1d45...| [2,2,3,0,1,3,2,1,...|
|sg:15dc29d1952844...| [5,2,1,3,1,1,0,3,...|
|sg:17951e62dfb949...| [2,5,4,3,4,2,4,2,...|
|sg:193e08aca17e43...| [0,0,0,1,3,3,0,0,...|
|sg:1a4d75c19bf049...| [0,2,0,4,0,2,0,2,...|
|sg:209f866b4f654d...| [4,5,3,3,5,4,1,4,...|
|sg:25445cd5a95d4f...| [5,2,4,1,1,3,2,1,...|
|sg:289ac2aa272548...| [5,2,3,2,6,9,3,4,...|
|sg:2fb419debc154c...| [2,6,8,3,10,6,5,2...|
|sg:328d5e0fe5b546...| [0,3,0,1,2,1,0,0,...|
|sg:36302dbd8d9d46...| [2,2,0,0,0,0,0,1,...|
|sg:368f54

In [8]:
# pandas is not referenced by name in this cell; it needs to be installed for
# the toPandas() call below to work.
import pandas
# Convert the Spark result to a pandas DataFrame for local processing.
# NOTE(review): toPandas() brings every row to the driver -- fine for this
# postal-code subset, but confirm memory headroom before widening the filter.
traffic_df = this_week_df.toPandas()

safegraph_place_id       object
visits_by_day_current    object
dtype: object