In [63]:
import os

import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, udf, lag, lit, when
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    DoubleType,
    TimestampType,
    IntegerType,
    ArrayType,
)
from pyspark.sql.window import Window

In [64]:
# Input / output locations. Each one can be overridden with an environment
# variable so the notebook is not tied to a single machine's directory layout;
# the defaults are the values that were previously hard-coded here.
INPUT_PATH = os.environ.get(
    "PORT_EVENTS_INPUT_PATH",
    "/Users/harshdeepsingh/ASU/Lab_V2/GSN/data/anl_port_events_000000000000.csv",
)
OUTPUT_PATH = os.environ.get("PORT_EVENTS_OUTPUT_PATH", "temp")

# Value passed to Spark as `spark.driver.memory`.
# NOTE(review): the default "2000g" (2 TB) is kept from the original cell but
# looks far larger than a workstation's RAM -- confirm, or override through
# PORT_EVENTS_DRIVER_MEMORY.
DRIVER_MEMORY = os.environ.get("PORT_EVENTS_DRIVER_MEMORY", "2000g")

# Local-mode session ("local[*]"). `getOrCreate` hands back an already-running
# session if one exists in this kernel instead of building a new one.
spark = (
    SparkSession.builder.appName("MyApp")
    .config("spark.driver.memory", DRIVER_MEMORY)
    .master("local[*]")
    .getOrCreate()
)

In [65]:
# Read the port-events CSV, taking column names from its header row. No schema
# is supplied or inferred, so every column is loaded as a string.
df = spark.read.option("header", True).csv(INPUT_PATH)
df.printSchema()

root
 |-- mmsi: string (nullable = true)
 |-- imo: string (nullable = true)
 |-- AOI_Name: string (nullable = true)
 |-- ATA: string (nullable = true)
 |-- collection_type: string (nullable = true)
 |-- speed_first_stop: string (nullable = true)
 |-- extract_time: string (nullable = true)
 |-- last_inport_timestamp: string (nullable = true)
 |-- ATD: string (nullable = true)
 |-- in_port_position_count: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- updated_at: string (nullable = true)
 |-- port_locode: string (nullable = true)
 |-- polygon_id: string (nullable = true)
 |-- polygon_type: string (nullable = true)
 |-- draft_first_stop: string (nullable = true)
 |-- draft_after_left: string (nullable = true)
 |-- UUID: string (nullable = true)



In [66]:
# Add a "Date" column: the ATA string is cast to a timestamp and rendered as a
# "yyyy-MM-dd" string. The cast happens inline, so no helper column is left on
# the frame.
df = df.withColumn(
    "Date",
    F.date_format(col("ATA").cast(TimestampType()), "yyyy-MM-dd"),
)

In [67]:
# Per-Date count of distinct `mmsi` values, exposed as "ship_count".
# The column is referenced with the exact casing shown in the loaded schema
# ("mmsi") so the lookup does not rely on Spark's case-insensitive column
# resolution being enabled.
ship_count_df = df.groupBy("Date").agg(
    F.countDistinct("mmsi").alias("ship_count")
)

In [68]:
# Number of event rows for every (Date, port_locode) pair, stored in a column
# named "count".
grouped_df = df.groupBy(["Date", "port_locode"]).agg(
    F.count(lit(1)).alias("count")
)

In [69]:
# Ranking window: within each Date, rows are ordered by descending "count".
# `port_locode` is a secondary sort key so that ports tied on count are ranked
# in a stable (alphabetical) order; ordering on count alone would let
# row_number() pick an arbitrary port among ties, making the result differ
# from run to run.
window_spec = Window.partitionBy("Date").orderBy(
    F.desc("count"),
    F.asc("port_locode"),
)


In [70]:
# Keep only the first-ranked (Date, port) row of each Date partition and expose
# it as Date / popular_port / popular_port_count.
popular_port_df = (
    grouped_df.withColumn("row_num", F.row_number().over(window_spec))
    .where(col("row_num") == 1)
    .select(
        col("Date"),
        col("port_locode").alias("popular_port"),
        col("count").alias("popular_port_count"),
    )
)

In [71]:
# Attach each Date's top port to its ship count. A left join keeps every row of
# ship_count_df even when no matching row exists in popular_port_df.
final_df = ship_count_df.join(
    popular_port_df,
    ["Date"],
    "left",
)

In [72]:
# Write the result as CSV with a header row. coalesce(1) reduces the frame to a
# single partition so one part file is produced under OUTPUT_PATH.
# The save mode is "overwrite": with the default mode the write raises if
# OUTPUT_PATH already exists, which made the notebook fail on every re-run.
final_df.coalesce(1).write.mode("overwrite").csv(OUTPUT_PATH, header=True)

                                                                                

+----------+----------+------------+------------------+
|      Date|ship_count|popular_port|popular_port_count|
+----------+----------+------------+------------------+
|2022-10-05|       610|       SGSIN|                16|
|2021-11-03|         4|       ISREY|                 1|
|2021-12-23|        11|       CNCGM|                 1|
|2022-10-07|       593|       SGSIN|                22|
|2022-05-17|       600|       SGSIN|                19|
|2022-03-30|       622|       SGSIN|                23|
|2021-10-25|         2|       ESCAR|                 1|
|2021-11-15|         4|       GRPIR|                 1|
|2022-07-04|       575|       NLRTM|                16|
|2022-01-20|       569|       NLRTM|                21|
|2022-07-08|       642|       SGSIN|                26|
|2022-09-03|       546|       NLRTM|                18|
|2022-07-30|       540|       USMSY|                22|
|2022-10-24|       648|       NLRTM|                22|
|2022-03-21|       556|       USMSY|            