
- <b>Author:</b> Juan David Escobar Escobar
- <b>Linkedin:</b> <a href="https://www.linkedin.com/in/jdescobar/">https://www.linkedin.com/in/jdescobar/</a>


1. Build a string of flights .i.e. next flights based on a turn of the aircraft
2. Build a dataset to know in every 15 minutes, how many flights coming in and out of an airport in 2 hours before


# Solution

In [0]:
from pyspark.sql.functions import col, to_timestamp, expr, to_date
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from datetime import datetime, timedelta
from pyspark.sql import functions as F


# schema
_flight_schema = StructType([
    StructField("orig", StringType(), True),
    StructField("dest", StringType(), True),
    StructField("id", StringType(), True),
    StructField("actl_dep_lcl_tms", TimestampType(), True),
    StructField("actl_arr_lcl_tms", TimestampType(), True),
    StructField("flight_num", StringType(), True),
    StructField("flights", StringType(), True),
    StructField("acft_regs_cde", StringType(), True),
    StructField("airborne_lcl_tms", TimestampType(), True),
    StructField("landing_lcl_tms", TimestampType(), True),
    StructField("next_flight_id", StringType(), True)
])

# mock data
_flight_data = [
    ('YYZ','YVR','1','2022-12-31T20:36:00','2022-12-31T22:28:00','127','1','737','2022-12-31T21:02:00','2022-12-31T22:17:00',''),
    ('YYZ','YVR','2','2022-12-31T19:39:00','2022-12-31T21:22:00','185','1','451','2022-12-31T20:05:00','2022-12-31T21:14:00',None),
    ('YYZ','YVR','3','2022-12-31T18:53:00','2022-12-31T20:33:00','123','1','843','2022-12-31T19:10:00','2022-12-31T20:22:00',''),
    ('YYZ','YVR','4','2022-12-31T17:27:00','2022-12-31T19:00:00','121','1','747','2022-12-31T17:43:00','2022-12-31T18:53:00',''),
    ('YYZ','YVR','5','2022-12-31T16:44:00','2022-12-31T18:31:00','119','1','464','2022-12-31T16:56:00','2022-12-31T18:23:00',''),
    ('YYZ','YVR','6','2022-12-31T14:35:00','2022-12-31T16:43:00','113','1','743','2022-12-31T15:04:00','2022-12-31T16:35:00',''),
    ('YYZ','YVR','7','2022-12-31T13:08:00','2022-12-31T14:53:00','111','1','462','2022-12-31T13:20:00','2022-12-31T14:46:00',''),
    ('YYZ','YVR','8','2022-12-31T11:23:00','2022-12-31T13:03:00','105','1','735','2022-12-31T11:40:00','2022-12-31T12:54:00',''),
    ('YYZ','YVR','9','2022-12-31T10:18:00','2022-12-31T12:27:00','107','1','457','2022-12-31T10:39:00','2022-12-31T12:08:00',''),
    ('YYZ','YVR','10','2022-12-31T08:50:00','2022-12-31T10:38:00','103','1','451','2022-12-31T09:13:00','2022-12-31T10:28:00','16'),
    ('YVR','YYZ','12','2022-12-31T16:02:00','2022-12-31T23:20:00','120','1','462','2022-12-31T16:15:00','2022-12-31T23:10:00',''),
    ('YVR','YYZ','13','2022-12-31T14:58:00','2022-12-31T22:20:00','116','1','735','2022-12-31T15:16:00','2022-12-31T22:00:00',''),
    ('YVR','YYZ','14','2022-12-31T14:02:00','2022-12-31T21:28:00','118','1','457','2022-12-31T14:13:00','2022-12-31T21:15:00',''),
    ('YVR','YYZ','15','2022-12-31T12:47:00','2022-12-31T19:59:00','114','1','738','2022-12-31T13:05:00','2022-12-31T19:48:00',''),
    ('YVR','YYZ','16','2022-12-31T11:55:00','2022-12-31T19:10:00','106','1','451','2022-12-31T12:10:00','2022-12-31T18:55:00','2'),
    ('YVR','YYZ','17','2022-12-31T10:08:00','2022-12-31T17:27:00','110','1','737','2022-12-31T10:23:00','2022-12-31T17:15:00',''),
    ('YVR','YYZ','18','2022-12-31T09:48:00','2022-12-31T16:58:00','108','1','747','2022-12-31T10:02:00','2022-12-31T16:50:00',''),
    ('YVR','YYZ','19','2022-12-31T08:07:00','2022-12-31T15:19:00','104','1','843','2022-12-31T08:24:00','2022-12-31T15:10:00',''),
    ('YVR','YYZ','20','2022-12-31T06:04:00','2022-12-31T13:24:00','100','1','743','2022-12-31T06:23:00','2022-12-31T13:16:00',''),
    ('YVR','YYZ','21','2022-12-31T00:29:00','2022-12-31T08:27:00','128','1','451','2022-12-31T01:01:00','2022-12-31T08:11:00','10')
]


def print_flight_sequence(flight_num: int, flights_df) -> None:
    """
    Print the sequence of flights following a specific flight.

    Args:
        flight_num (str): The flight number for which the sequence is to be found.
        flights_df (DataFrame): The flight DataFrame.

    Returns:
        None
    """

    # filter flights based on the target flight number and order by actual departure time
    target_flight = flights_df.filter(col('flight_num') == flight_num).first()
    target_arrival_time = target_flight['actl_arr_lcl_tms'] 
    target_dest = target_flight['dest']
    next_flight_str = f'Next filghts after flight_num ({flight_num}): \n\n'

    # get next flight
    for row in flights_df.collect():   
        current_next_flight = (flights_df.filter((col('actl_dep_lcl_tms') > target_arrival_time) & (col('orig') == target_dest))
                                         .orderBy(col('actl_dep_lcl_tms'))
                                         .first())

        if current_next_flight:        
            target_arrival_time = current_next_flight['actl_arr_lcl_tms'] 
            target_dest = current_next_flight['dest'] 
            next_flight_str = next_flight_str + f"Next flight: {current_next_flight['flight_num']}" + '\n'

    print(next_flight_str[:-2])

def calculate_time_series_df(flight_dates) -> DataFrame:
  """
  Calculate a time series DataFrame for 15-minute intervals.

  Args:
      flight_dates (DataFrame): DataFrame containing flight dates.

  Returns:
      DataFrame: Time series DataFrame with 15-minute intervals.
  """
  result_data = []
  for row in flight_dates.collect():
    airport_code = row['airport_code_fd']
    tms = row['timestamp']
    result_data.append((airport_code, tms))

    for x in range(0, 95):      
      tms = tms + timedelta(minutes=15)
      result_data.append((airport_code, tms))

  time_series_df = spark.createDataFrame(result_data, ["airport_code_fd", "tms"])
  return time_series_df

def get_ds_fifteen_minutes_flights(flights_df) -> None:
  """
  Generate a dataset to count incoming and outgoing flights for every 15-minute interval, two hours before.

  Args:
      flights_df (DataFrame): The flight DataFrame.

  Returns:
      None
  """
  flights_df = flights_df.withColumn("dep_date", to_date(col("actl_dep_lcl_tms")))
  flights_df = flights_df.withColumn("arr_date", to_date(col("actl_arr_lcl_tms")))
  
  arrivals_df = (flights_df.select("dest", "actl_arr_lcl_tms")
                           .distinct()
                           .withColumnRenamed("dest", "airport_code")
                           .withColumn("date", to_date(col("actl_arr_lcl_tms"))))
  
  departures_df = (flights_df.select("orig", "actl_dep_lcl_tms") 
                             .distinct() 
                             .withColumnRenamed("orig", "airport_code") 
                             .withColumn("date", to_date(col("actl_dep_lcl_tms"))))
  
  flight_dates = arrivals_df.union(departures_df).select("airport_code", "date").distinct()
  flight_dates = flight_dates.withColumnRenamed("airport_code", "airport_code_fd")
  flight_dates = flight_dates.withColumn("timestamp", to_timestamp(col("date"), "%Y-%m-%d %H:%M:%S"))   

  time_series_df = calculate_time_series_df(flight_dates)
  time_series_df = time_series_df.withColumn("tms_minus_2_hours", expr("tms - INTERVAL 2 HOURS"))

  # count out
  time_series_df = time_series_df.join(
    departures_df,
    (departures_df.airport_code == time_series_df.airport_code_fd) &
    (departures_df.actl_dep_lcl_tms <= time_series_df.tms) &
    (departures_df.actl_dep_lcl_tms > time_series_df.tms_minus_2_hours),
    "left"
  ).groupBy("airport_code_fd", "tms", "tms_minus_2_hours").agg(F.count("actl_dep_lcl_tms").alias("out"))

  # count in
  time_series_df = time_series_df.join(
    arrivals_df,
    (arrivals_df.airport_code == time_series_df.airport_code_fd) &
    (arrivals_df.actl_arr_lcl_tms <= time_series_df.tms) &
    (arrivals_df.actl_arr_lcl_tms > time_series_df.tms_minus_2_hours),
    "left"
  ).groupBy("airport_code_fd", "tms", "out").agg(F.count("actl_arr_lcl_tms").alias("in"))


  display(time_series_df)

def main():
    print('TASK 1')
    print('String of flights .i.e. next flights based on a turn of the aircraft: \n')
    
    # Format timestamps
    flight_data = [
        (orig,
        dest, 
        id, 
        datetime.strptime(actl_dep_lcl_tms, "%Y-%m-%dT%H:%M:%S"), 
        datetime.strptime(actl_arr_lcl_tms, "%Y-%m-%dT%H:%M:%S"), 
        flight_num, flights, acft_regs_cde, 
        datetime.strptime(airborne_lcl_tms, "%Y-%m-%dT%H:%M:%S"), 
        datetime.strptime(landing_lcl_tms, "%Y-%m-%dT%H:%M:%S"), 
        next_flight_id)
        for orig, dest, id, actl_dep_lcl_tms, actl_arr_lcl_tms, flight_num, flights, acft_regs_cde, airborne_lcl_tms, landing_lcl_tms, next_flight_id in _flight_data
    ]

    # Crearte DF sorted by departure
    flights_df = spark.createDataFrame(flight_data, _flight_schema).orderBy('actl_dep_lcl_tms')

    # get first flight
    target_flight_number = '128'

    # Task 1: print next flights sequence
    print_flight_sequence(target_flight_number, flights_df)

    print('\n')
    print('TASK 2')
    print('Dataset to know in every 15 minutes, how many flights coming in and out of an airport in 2 hours before: \n')
    
    # Task 2: get_ds_fifteen_minutes_flights
    get_ds_fifteen_minutes_flights(flights_df)

main()

TASK 1
String of flights .i.e. next flights based on a turn of the aircraft: 

Next filghts after flight_num (128): 

Next flight: 103
Next flight: 106
Next flight: 18


TASK 2
Dataset to know in every 15 minutes, how many flights coming in and out of an airport in 2 hours before: 



airport_code_fd,tms,out,in
YYZ,2022-12-31T00:00:00.000+0000,0,0
YYZ,2022-12-31T00:15:00.000+0000,0,0
YYZ,2022-12-31T00:30:00.000+0000,0,0
YYZ,2022-12-31T00:45:00.000+0000,0,0
YYZ,2022-12-31T01:00:00.000+0000,0,0
YYZ,2022-12-31T01:15:00.000+0000,0,0
YYZ,2022-12-31T01:30:00.000+0000,0,0
YYZ,2022-12-31T01:45:00.000+0000,0,0
YYZ,2022-12-31T02:00:00.000+0000,0,0
YYZ,2022-12-31T02:15:00.000+0000,0,0
