In [None]:
import pyspark
from pyspark.sql import functions as F

# Spark settings applied to the session builder, in the order listed.
spark_settings = (
    ("spark.driver.memory", "4g"),
    ("spark.executor.memory", "4g"),
    ("spark.sql.shuffle.partitions", "100"),
)

# Configure the builder step by step, then create the session
# (getOrCreate returns an already-running session if there is one).
session_builder = pyspark.sql.SparkSession.builder.appName("Project 4")
for setting_name, setting_value in spark_settings:
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

In [17]:
# Preview the raw flight CSV. The first row supplies the column names;
# no schema is passed or inferred in this call.
raw_csv_path = './data/2009.csv'
df = spark.read.csv(raw_csv_path, header=True)
df.show()


+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+-----------+
|   FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|DEST|CRS_DEP_TIME|DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|ARR_DELAY|CANCELLED|CANCELLATION_CODE|DIVERTED|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|DISTANCE|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|Unnamed: 27|
+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+-----------+
|

In [None]:
from pathlib import Path

data_path = "./output/2009.parquet"
data_file = Path(data_path)

# Build the Parquet copy only when it is not on disk yet; later runs skip
# the CSV entirely and go straight to the read below.
parquet_missing = not data_file.exists()
if parquet_missing:
    df = spark.read.csv('./data/2009.csv', header=True)
    print(f"Total flights: {df.count()}")

    # Keep only rows whose FL_DATE falls in 2009.
    flown_in_2009 = F.year(F.col("FL_DATE")) == 2009
    df_2009 = df.where(flown_in_2009)
    print(f"2009 flights: {df_2009.count()}")

    # One output directory per FL_DATE value; any existing output is replaced.
    parquet_writer = df_2009.write.partitionBy("FL_DATE").mode("overwrite")
    parquet_writer.parquet(data_path)

# Always work from the stored copy, whether it was just written or already existed.
df_flights = spark.read.load(data_path)
df_flights.show()
df_flights.count()

+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+-----------+----------+
|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|DEST|CRS_DEP_TIME|DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|ARR_DELAY|CANCELLED|CANCELLATION_CODE|DIVERTED|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|DISTANCE|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|Unnamed: 27|   FL_DATE|
+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+-----------+----------+
|

6429338

In [20]:

# (raw column name, new CamelCase name) pairs, applied in this order.
column_renames = (
    ("FL_DATE", "FlightDate"),
    ("OP_CARRIER", "UniqueCarrier"),
    ("OP_CARRIER_FL_NUM", "FlightNum"),
    ("ORIGIN", "Origin"),
    ("DEST", "Destination"),
    ("CRS_DEP_TIME", "ScheduledDepartureTime"),
    ("DEP_TIME", "DepartureTime"),
    ("DEP_DELAY", "DepartureDelay"),
    ("TAXI_OUT", "TaxiOut"),
    ("WHEELS_OFF", "WheelsOff"),
    ("WHEELS_ON", "WheelsOn"),
    ("TAXI_IN", "TaxiIn"),
    ("CRS_ARR_TIME", "ScheduledArrivalTime"),
    ("ARR_TIME", "ArrivalTime"),
    ("ARR_DELAY", "ArrivalDelay"),
    ("CANCELLED", "Cancelled"),
    ("CANCELLATION_CODE", "CancellationCode"),
    ("DIVERTED", "Diverted"),
    ("CRS_ELAPSED_TIME", "ScheduledElapsedTime"),
    ("ACTUAL_ELAPSED_TIME", "ActualElapsedTime"),
    ("AIR_TIME", "AirTime"),
    ("DISTANCE", "Distance"),
    ("CARRIER_DELAY", "CarrierDelay"),
    ("WEATHER_DELAY", "WeatherDelay"),
    ("NAS_DELAY", "NASDelay"),
    ("SECURITY_DELAY", "SecurityDelay"),
    ("LATE_AIRCRAFT_DELAY", "LateAircraftDelay"),
)

# Start from df_flights on every run so re-executing the cell gives the same result.
df_renamed = df_flights
for raw_name, new_name in column_renames:
    df_renamed = df_renamed.withColumnRenamed(raw_name, new_name)

# The "Unnamed: 27" column is removed rather than renamed.
df_renamed = df_renamed.drop("Unnamed: 27")

df_renamed.show()

+-------------+---------+------+-----------+----------------------+-------------+--------------+-------+---------+--------+------+--------------------+-----------+------------+---------+----------------+--------+--------------------+-----------------+-------+--------+------------+------------+--------+-------------+-----------------+----------+
|UniqueCarrier|FlightNum|Origin|Destination|ScheduledDepartureTime|DepartureTime|DepartureDelay|TaxiOut|WheelsOff|WheelsOn|TaxiIn|ScheduledArrivalTime|ArrivalTime|ArrivalDelay|Cancelled|CancellationCode|Diverted|ScheduledElapsedTime|ActualElapsedTime|AirTime|Distance|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|FlightDate|
+-------------+---------+------+-----------+----------------------+-------------+--------------+-------+---------+--------+------+--------------------+-----------+------------+---------+----------------+--------+--------------------+-----------------+-------+--------+------------+------------+--------+---

In [22]:
# Calendar features derived from the flight date.
flight_date = F.col("FlightDate")
day_of_week = F.dayofweek(flight_date)  # Spark numbers days 1 (Sunday) to 7 (Saturday)
month_of_year = F.month(flight_date)

df_with_features = (
    df_renamed
    .withColumn("DayOfWeek", day_of_week)
    .withColumn("Month", month_of_year)
)

df_with_features.show()

+-------------+---------+------+-----------+----------------------+-------------+--------------+-------+---------+--------+------+--------------------+-----------+------------+---------+----------------+--------+--------------------+-----------------+-------+--------+------------+------------+--------+-------------+-----------------+----------+---------+-----+
|UniqueCarrier|FlightNum|Origin|Destination|ScheduledDepartureTime|DepartureTime|DepartureDelay|TaxiOut|WheelsOff|WheelsOn|TaxiIn|ScheduledArrivalTime|ArrivalTime|ArrivalDelay|Cancelled|CancellationCode|Diverted|ScheduledElapsedTime|ActualElapsedTime|AirTime|Distance|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|FlightDate|DayOfWeek|Month|
+-------------+---------+------+-----------+----------------------+-------------+--------------+-------+---------+--------+------+--------------------+-----------+------------+---------+----------------+--------+--------------------+-----------------+-------+--------+------

In [23]:

# Find every column that contains at least one null.
#
# All per-column null counts are computed in ONE aggregation (a single Spark
# job) instead of launching a separate filter().count() job per column.
# count() only counts non-null values, and when(cond, 1) without an
# otherwise() yields null for rows where cond is false, so
# count(when(col.isNull(), 1)) is the number of null rows in that column
# (0 for an empty DataFrame).
null_counts = df_with_features.select(
    [
        F.count(F.when(F.col(c).isNull(), 1)).alias(c)
        for c in df_with_features.columns
    ]
).first()

# Iterate over the DataFrame's own column order so the resulting list is
# ordered exactly as a per-column scan would produce it.
null_columns = [c for c in df_with_features.columns if null_counts[c] > 0]
null_columns


['DepartureTime',
 'DepartureDelay',
 'TaxiOut',
 'WheelsOff',
 'WheelsOn',
 'TaxiIn',
 'ArrivalTime',
 'ArrivalDelay',
 'CancellationCode',
 'ActualElapsedTime',
 'AirTime',
 'CarrierDelay',
 'WeatherDelay',
 'NASDelay',
 'SecurityDelay',
 'LateAircraftDelay']