# Top 10 locations by month


####  Import packages and start the session

In [1]:
%idle_timeout 2880
%glue_version 3.0
%worker_type G.1X
%number_of_workers 2
%extra_py_files s3://pedestrian-analysis-working-bucket/glue-job-scripts/util.py

import sys, io, zipfile, pandas as pd, util
from datetime import datetime

from pyspark.sql.functions import sum, col, rank, desc, lit, month
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

from pyspark.context import SparkContext
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
)

from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job

sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 0.37.0 
Current idle_timeout is 2800 minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 3.0
Previous worker type: G.1X
Setting new worker type to: G.1X
Previous number of workers: 5
Setting new number of workers to: 2
Extra py files to be included:
s3://pedestrian-analysis-working-bucket/glue-job-scripts/util.py
Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::632753217422:role/pedestrians-analysis-notebook-role
Trying to create a Glue session for the kernel.
Worker Type: G.1X
Number of Workers: 2
Session ID: f9bb013e-c03d-4a89-b755-99aca0dfc9a1
Job Type: glueetl
Applying the following

####  Create output glue table if it doesn't already exist
##### The results of this notebook will be loaded into this table

In [2]:
BUCKET_NAME = 'pedestrian-analysis-working-bucket'
DATABASE_NAME = 'pedestrian_analysis_report'
OUTPUT_TABLE_NAME = 'report_top_10_locations_by_month'

schema = StructType([
    StructField("month",StringType(),True),
    StructField("rank",IntegerType(),True),
    StructField("sensor_id",IntegerType(),True),
    StructField("location_name",StringType(),True),
    StructField("monthly_count",IntegerType(),True),
])

s3_path = f"s3://{BUCKET_NAME}/report/{OUTPUT_TABLE_NAME}/"
util.create_glue_catalog_table(DATABASE_NAME, OUTPUT_TABLE_NAME, schema, s3_path)

Table pedestrian_analysis_report.report_top_10_locations_by_month already exists in the Glue Data Catalog


####  Load sensor_counts_by_day

In [3]:
sensor_count_by_day_df = glueContext.create_dynamic_frame.from_catalog(
    database="pedestrian_analysis_report",
    table_name="report_top_10_locations_by_day"
).toDF()

sensor_count_by_day_df.show(10, truncate=False)

+----------+----+---------+--------------------------+-----------+
|date      |rank|sensor_id|location_name             |daily_count|
+----------+----+---------+--------------------------+-----------+
|2014-09-14|8   |15       |null                      |17467      |
|2010-05-23|7   |15       |null                      |12199      |
|2010-08-12|2   |2        |Bourke Street Mall (South)|25877      |
|2011-11-17|1   |4        |Town Hall (West)          |40499      |
|2016-04-08|2   |4        |Town Hall (West)          |50777      |
|2014-03-14|8   |2        |Bourke Street Mall (South)|31466      |
|2011-06-24|8   |15       |null                      |21495      |
|2017-12-27|5   |5        |Princes Bridge            |35427      |
|2009-12-23|2   |2        |Bourke Street Mall (South)|40892      |
|2011-07-19|8   |5        |Princes Bridge            |15870      |
+----------+----+---------+--------------------------+-----------+
only showing top 10 rows


####  Load sensor_reference_data

In [14]:
sensor_reference_df = glueContext.create_dynamic_frame.from_catalog(
    database="pedestrian_analysis_raw",
    table_name="sensor_reference_data"
).toDF()

sensor_reference_df.show(10, truncate=False)

+-----------+-----------+-----------------+------------+------------------------------------------+-----------+-------------+------------+---------------------------------------------+---------------------------------+-----------+------+
|direction_1|direction_2|installation_date|latitude    |location                                  |location_id|location_type|longitude   |note                                         |sensor_description               |sensor_name|status|
+-----------+-----------+-----------------+------------+------------------------------------------+-----------+-------------+------------+---------------------------------------------+---------------------------------+-----------+------+
|North      |South      |2009-03-25       |-37.81101524|{lon -> 144.96429485, lat -> -37.81101524}|3          |Outdoor      |144.96429485|null                                         |Melbourne Central                |Swa295_T   |A     |
|North      |South      |2009-03-23       |-37.8

####  'date' is currently a full date string
####  This extracts the year and month

In [5]:
sensor_count_by_day_df = sensor_count_by_day_df \
    .withColumn("year_month", col("date").substr(1, 7))

sensor_count_by_day_df.show(10, truncate=False)

+----------+----+---------+--------------------------+-----------+----------+
|date      |rank|sensor_id|location_name             |daily_count|year_month|
+----------+----+---------+--------------------------+-----------+----------+
|2014-09-14|8   |15       |null                      |17467      |2014-09   |
|2010-05-23|7   |15       |null                      |12199      |2010-05   |
|2010-08-12|2   |2        |Bourke Street Mall (South)|25877      |2010-08   |
|2011-11-17|1   |4        |Town Hall (West)          |40499      |2011-11   |
|2016-04-08|2   |4        |Town Hall (West)          |50777      |2016-04   |
|2014-03-14|8   |2        |Bourke Street Mall (South)|31466      |2014-03   |
|2011-06-24|8   |15       |null                      |21495      |2011-06   |
|2017-12-27|5   |5        |Princes Bridge            |35427      |2017-12   |
|2009-12-23|2   |2        |Bourke Street Mall (South)|40892      |2009-12   |
|2011-07-19|8   |5        |Princes Bridge            |15870     

#### Group by 'year_month' and 'sensor_id' and sum the 'daily_count' per group

In [6]:
grouped_sensor_count_df = sensor_count_by_day_df \
    .groupBy("year_month", "sensor_id") \
    .agg(sum("daily_count").alias("monthly_count"))

grouped_sensor_count_df.show(10, truncate=False)

+----------+---------+-------------+
|year_month|sensor_id|monthly_count|
+----------+---------+-------------+
|2014-09   |15       |316922       |
|2013-05   |5        |673187       |
|2009-06   |4        |1009857      |
|2014-06   |5        |655945       |
|2021-08   |4        |198056       |
|2013-04   |4        |1134511      |
|2018-12   |4        |1225432      |
|2015-09   |5        |821938       |
|2018-08   |5        |928151       |
|2010-03   |4        |1145962      |
+----------+---------+-------------+
only showing top 10 rows


#### Add a new column 'rank' that ranks the rows within each partition based on their monthly_count

In [7]:
window_spec = Window.partitionBy("year_month") \
    .orderBy(desc("monthly_count"))
ranked_sensor_count_df = grouped_sensor_count_df \
    .withColumn("rank", row_number().over(window_spec))

ranked_sensor_count_df.show(10, truncate=False)

+----------+---------+-------------+----+
|year_month|sensor_id|monthly_count|rank|
+----------+---------+-------------+----+
|2010-03   |4        |1145962      |1   |
|2010-03   |3        |908894       |2   |
|2010-03   |5        |823435       |3   |
|2010-03   |2        |819131       |4   |
|2010-03   |6        |778368       |5   |
|2010-03   |15       |622935       |6   |
|2010-03   |1        |620191       |7   |
|2010-03   |13       |570661       |8   |
|2010-03   |16       |535246       |9   |
|2010-03   |17       |311616       |10  |
+----------+---------+-------------+----+
only showing top 10 rows


#### Filter the rows where rank <= 10 to get the top 10 sensor_ids for each month

In [8]:
top_10_sensors_by_month_df = ranked_sensor_count_df.filter(col("rank") <= 10) \
    .orderBy(col("year_month").desc(), col("rank"))

top_10_sensors_by_month_df.show(10)

+----------+---------+-------------+----+
|year_month|sensor_id|monthly_count|rank|
+----------+---------+-------------+----+
|   2022-10|       41|      1179043|   1|
|   2022-10|        4|       969871|   2|
|   2022-10|       35|       953917|   3|
|   2022-10|       84|       764000|   4|
|   2022-10|        3|       712606|   5|
|   2022-10|       66|       669021|   6|
|   2022-10|        1|       666886|   7|
|   2022-10|        6|       630154|   8|
|   2022-10|       25|       545899|   9|
|   2022-10|       47|       433171|  10|
+----------+---------+-------------+----+
only showing top 10 rows


#### Left join the reference data in to obtain the correct sensor_description

In [9]:
top_10_sensors_by_month_df = top_10_sensors_by_month_df.join(
    sensor_reference_df,
    col("sensor_id") == col("location_id"),
    "left"
)




#### Select and format final report

In [13]:
top_10_sensors_by_month_df = top_10_sensors_by_month_df.select(
    col('year_month').cast('string'),
    col('rank'),
    col('sensor_id'),
    col('sensor_description').alias('location_name'),
    col('monthly_count').cast('int').alias('monthly_count')
)

top_10_sensors_by_month_df.show(100, truncate=False)

+----------+----+---------+---------------------------------+-------------+
|year_month|rank|sensor_id|location_name                    |monthly_count|
+----------+----+---------+---------------------------------+-------------+
|2010-03   |1   |4        |Town Hall (West)                 |1145962      |
|2010-03   |2   |3        |Melbourne Central                |908894       |
|2010-03   |3   |5        |Princes Bridge                   |823435       |
|2010-03   |4   |2        |Bourke Street Mall (South)       |819131       |
|2010-03   |5   |6        |Flinders Street Station Underpass|778368       |
|2010-03   |6   |15       |null                             |622935       |
|2010-03   |7   |1        |Bourke Street Mall (North)       |620191       |
|2010-03   |8   |13       |null                             |570661       |
|2010-03   |9   |16       |null                             |535246       |
|2010-03   |10  |17       |Collins Place (South)            |311616       |
|2010-05   |

#### Upload to S3

In [15]:
util.upload_to_s3(glueContext, top_10_sensors_by_month_df, s3_path)

Successfully Uploaded to s3 in path: s3://pedestrian-analysis-working-bucket/report/report_top_10_locations_by_month/
