In [69]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from operator import add
import sys
import operator
from pyspark.sql.functions import broadcast
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql.window import Window
from pyspark.sql.utils import AnalysisException

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [36]:
# Dataset-size suffix that selects which flights extract is loaded.
size = "massive"

flights_file_path = "s3://usyddata3404/ontimeperformance_flights_{}.csv".format(size)
country_file_path = "s3://air-traffic-dataset/ontimeperformance_airlines.csv"

# Both inputs are CSV files with a header row; no schema is supplied.
al = spark.read.csv(country_file_path, header=True)
fd = spark.read.csv(flights_file_path, header=True)

# Join airlines to flights on carrier code (keeping the airline-side key),
# keep only flights with a recorded actual departure time, then count the
# remaining flights per (airline name, carrier, tail number, country).
airlines_all_counts = (
    al.join(fd, on=(al.carrier_code == fd.carrier_code), how="inner")
    .drop(fd.carrier_code)
    .where(F.col("actual_departure_time").isNotNull())
    .groupBy("name", "carrier_code", "tail_number", "country")
    .count()
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [38]:
output_file_path = "s3://air-traffic-dataset/data-partitioned/ontimeperformance_flights_{}_T3".format(size)

# Upper bound on the number of records per written file; set on the session
# configuration, so it stays in effect for later writes in this session too.
maxRow = 800
spark.conf.set("spark.sql.files.maxRecordsPerFile", maxRow)

# Write the per-aircraft counts as headered CSV, one `country=<value>`
# directory per country.
# NOTE(review): no save mode is set, so Spark's default applies — confirm that
# re-running this cell against an existing output path behaves as intended.
(
    airlines_all_counts
    .repartition("country")
    .write
    .partitionBy("country")
    .csv(output_file_path, header=True)
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Inspect the column names and types of the aggregated per-aircraft counts.
airlines_all_counts.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- name: string (nullable = true)
 |-- carrier_code: string (nullable = true)
 |-- tail_number: string (nullable = true)
 |-- country: string (nullable = true)
 |-- count: long (nullable = false)

In [32]:
# Number of (name, carrier_code, tail_number, country) groups produced by the aggregation.
airlines_all_counts.count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

9337

In [31]:
# Row count of the tail_number column. select() does not de-duplicate, so this
# counts rows (not distinct tail numbers) and matches the total row count.
airlines_all_counts.select("tail_number").count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

9337

In [26]:
# How many aggregated rows each (country, carrier) pair contributes; show up to 200 pairs.
airlines_all_counts.groupBy("country", "carrier_code").count().show(n=200)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------+------------+-----+
|      country|carrier_code|count|
+-------------+------------+-----+
|United States|          DL| 1187|
|United States|          HA|   75|
|United States|          NK|  135|
|United States|          FL|  146|
|United States|          G4|  120|
|United States|          CO|  431|
|United States|          B6|  340|
|United States|          AA| 2325|
|United States|          MQ|  658|
|United States|          EV|  488|
|United States|          XE|  275|
|United States|          OH|  662|
|United States|          US|  766|
|United States|          F9|  197|
|United States|          VX|   68|
|United States|          NW|  402|
|United States|          OO|  603|
|United States|          WN| 1329|
|United States|          9E|  523|
|United States|          UA| 1016|
|     Tanzania|          TZ|   30|
|United States|          AS|  314|
|United States|          YV|  268|
|United States|          AQ|   30|
+-------------+------------+-----+

In [33]:
# Number of airline records per country in the raw airlines file (first 20 rows shown).
al.groupBy("country").count().show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-----+
|             country|count|
+--------------------+-----+
|              Russia|    6|
|              Sweden|    5|
|   Republic of Korea|    1|
|         Philippines|    2|
|           Singapore|    1|
|            Malaysia|    1|
|                Iraq|    1|
|             Germany|    4|
|            Maldives|    1|
|         Ivory Coast|    1|
|               Sudan|    1|
|              France|    4|
|              Greece|    1|
|              Taiwan|    1|
|                null|    1|
|           Argentina|    2|
|             Belgium|    3|
|             Ecuador|    1|
|São Tomé and Prín...|    1|
|             Finland|    1|
+--------------------+-----+
only showing top 20 rows

In [72]:
def parsePopularModels(airlines_model_ranking):
    """Print one line per airline listing its ranked aircraft models.

    The rows are collected to the driver, ordered by column 0 and then
    column 4, and grouped by column 0. For each group a line of the form
    ``"<col0> \\t [<col1> <col2>, <col1> <col2>, ...]"`` is written to stdout.

    Args:
        airlines_model_ranking: Object exposing ``collect()`` that returns a
            list of indexable rows. As called from ``airline_top_five`` the
            columns are (name, manufacturer, model, numFlights, rank).

    Returns:
        None. Nothing is printed when there are no rows.
    """
    results = airlines_model_ranking.collect()

    if len(results) == 0:
        return

    # Order by airline, then by rank within the airline.
    results.sort(key=operator.itemgetter(0, 4))

    def _emit(airline, models):
        # One output line per airline; models are comma-separated.
        print("{} \t [{}]".format(airline, ", ".join(models)), file=sys.stdout)

    current_airline = results[0][0]
    current_models = []
    for row in results:
        if row[0] != current_airline:
            _emit(current_airline, current_models)
            current_airline = row[0]
            current_models = []
        current_models.append("{} {}".format(row[1], row[2]))

    # Always flush the final group. Previously it was only printed when the
    # result held a single airline, so the last airline was silently dropped.
    _emit(current_airline, current_models)


def airline_top_five(spark, flights_file_path, other_files_path, country):
    """Show and print the top-ranked aircraft models per airline for one country.

    Reads the aircraft reference CSV and the pre-aggregated per-aircraft flight
    counts for ``country``, sums the counts per (airline name, manufacturer,
    model), ranks models within each airline by flight count and keeps ranks
    1 to 5. The ranking is displayed with ``show()`` and then printed one line
    per airline by ``parsePopularModels``.

    Args:
        spark: Active SparkSession used for reading.
        flights_file_path: Root of the partitioned counts dataset; the data is
            read from ``<flights_file_path>/country=<country>``.
        other_files_path: Directory containing ``ontimeperformance_aircrafts.csv``.
        country: Value of the ``country=`` partition directory to load.

    Returns:
        None. If loading the country partition raises ``AnalysisException``,
        an error line is printed and the function returns early.
    """
    aircraft_df = (
        spark.read.format("csv")
        .options(header="true")
        .load("{}/ontimeperformance_aircrafts.csv".format(other_files_path))
    )

    try:
        country_counts = (
            spark.read.format("csv")
            .options(header="true")
            .load(
                "{}/country={}".format(
                    flights_file_path, country,
                )
            )
        )
    except AnalysisException:
        print("ERROR: Analysis Exception is caught. Either file was not found or country not in file.", file=sys.stdout)
        return

    aircraft_col = [
        "manufacturer",
        "model",
        "modelName",
        "tailnum",
    ]

    # Raw string so the regex escape is not treated as a (deprecated/invalid)
    # Python string escape sequence.
    aircraft_modelName = (
        aircraft_df.filter(F.col("manufacturer").isNotNull())
        .filter(F.col("model").isNotNull())
        .withColumn("modelName", F.regexp_extract(F.col("model"), r"\d{3}", 0))
        .select(aircraft_col)
    )

    # The counts come back from CSV, so cast them before summing.
    country_counts = country_counts.withColumn(
        "count", country_counts["count"].cast(IntegerType())
    )

    # Left join followed by the null filters drops aircraft without a match
    # in the reference table. Cached because both show() and collect() below
    # evaluate this aggregation.
    model_flight_counts = (
        country_counts.join(
            aircraft_modelName,
            country_counts.tail_number == aircraft_modelName.tailnum,
            how="left",
        )
        .filter(F.col("manufacturer").isNotNull())
        .filter(F.col("model").isNotNull())
        .groupBy("name", "manufacturer", "model")
        .sum("count")
        .withColumnRenamed("sum(count)", "numFlights")
        .cache()
    )

    try:
        window = Window.partitionBy(model_flight_counts["name"]).orderBy(
            model_flight_counts["numFlights"].desc()
        )

        # rank() assigns equal ranks to ties, so an airline can yield more
        # than five rows when flight counts tie.
        airlines_model_ranking = (
            model_flight_counts.select("*", F.rank().over(window).alias("rank"))
            .filter(F.col("rank") <= 5)
            .sort(["name", "rank"], ascending=True)
        )

        airlines_model_ranking.show()

        parsePopularModels(airlines_model_ranking)
    finally:
        # Release the cached aggregation once both actions have run.
        model_flight_counts.unpersist()


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [73]:
# Dataset-size suffix used to build the partitioned-counts path below.
size="large"
# NOTE(review): "bob" is presumably a deliberately invalid country used to
# exercise the AnalysisException path in airline_top_five — confirm.
country="bob"
# Root of the partitioned counts dataset (same path pattern as the write cell).
flights_file_path="s3://air-traffic-dataset/data-partitioned/ontimeperformance_flights_{}_T3".format(size)
# Directory expected to contain ontimeperformance_aircrafts.csv.
other_files_path="s3://air-traffic-dataset"

airline_top_five(spark, flights_file_path, other_files_path, country)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

ERROR: Analysis Exception is caught. Either file was not found or