In [1]:
# Import Spark modules
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Set up the Spark session for this exploration.
# getOrCreate() returns the already-running session if there is one, so this
# cell is safe to re-run. (The app name literal was previously missing its
# closing quote, which made the whole cell a SyntaxError.)
spark = SparkSession.builder.appName("Explore_Flight_Number").getOrCreate()

In [2]:
# Load the ft_i94 parquet dataset into a DataFrame
df_ft_i94 = spark.read.parquet('s3/ft_i94')

# Register the DataFrame as the temp view 'ft_i94' so it can be queried with Spark SQL
df_ft_i94.createOrReplaceTempView('ft_i94')

# Print a small banner naming the dataset, then preview its first rows
for banner_line in ('--------', 'ft_i94', '--------'):
    print(banner_line)

ft_i94_preview = spark.sql("""
    SELECT *
    FROM ft_i94
""")
ft_i94_preview.show(5)


--------
ft_i94
--------
+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+---------------+-----+--------+
|    cicid| i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|         admnum|fltno|visatype|
+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+---------------+-----+--------+
|4815887.0|2016.0|  11.0| 110.0| 110.0|    MIA|20783.0|    1.0|     FL|20793.0|  35.0|    1.0|  1.0|20161125|    null| null|      O|      O|   null|      M| 1981.0|02222017|     F|  null|     AA|1.4567885985E10|   39|      WB|
|4815888.0|2016.0|  11.0| 209.0| 209.0|    HHW|20783.0|    1.0|    

In [4]:
# Preview only the flight-number and airline-code columns of ft_i94
fltno_airline_preview = spark.sql("""
    SELECT fltno, airline
    FROM ft_i94
""")
fltno_airline_preview.show(5)

+-----+-------+
|fltno|airline|
+-----+-------+
|   39|     AA|
|  186|     NH|
|   39|     AA|
|  456|     LH|
|  601|     KL|
+-----+-------+
only showing top 5 rows



In [9]:
# Some airline codes are listed more than once in Airlines.dat.
# Count how many ft_i94 rows carry each of those ambiguous codes, keeping only
# codes that occur more than once, largest count first, to see whether the
# ambiguity actually affects this dataset.
ambiguous_code_counts = spark.sql("""
    SELECT COUNT(airline), airline
    FROM ft_i94
    WHERE airline IN ('G3','RA','VY','5D','C3','8Q','ZA','WA','8M','LH','CC','TL','JL','ZB','CP','1I','CO','I9','SQ')
    GROUP BY airline
    HAVING COUNT(airline) > 1
    ORDER BY 1 DESC
""")
ambiguous_code_counts.show(100)

+--------------+-------+
|count(airline)|airline|
+--------------+-------+
|       1414865|     LH|
|        907182|     JL|
|        191746|     SQ|
|         41518|     CP|
|          3944|     G3|
|            67|     CC|
|            44|     TL|
|            22|     WA|
|            16|     CO|
|            15|     5D|
|             5|     I9|
|             3|     ZB|
|             2|     VY|
+--------------+-------+



In [None]:
"""
For the ft_i94 results above, and according to the data in Airlines.dat:
LH = Lufthansa (can be assumed to be non-cargo only, since this dataset covers persons) - So, can be safely clubbed
JL = Japan Airlines and Japan Airlines (J-Bird). Same airline with variation - So, can be safely clubbed
SQ = Singapore Airlines and SINGCARGO variations, but part of the same airline - So, can be safely clubbed
CP = Canadian Airlines OR Compass Airlines, which was a 'regional' airline and is no longer active - So, CP can be assumed to be Canadian Airlines only.
G3 = City Connexion Airlines (based in Burundi) OR Sky Express (Greece) OR Gol Transportes Aereos (Brazil)
    City Connexion Airlines is no longer active; Sky Express (Greece) is domestic only
    Gol Transportes Aereos (Brazil), 3rd largest international airline in Brazil - So, G3 can be assumed to be Gol Transportes Aereos
The rest have insignificant counts, so they are ignored unless needed for specific observations
"""