In [14]:
import json
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import udf, from_json
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql import SQLContext

In [15]:
spark = SparkSession \
        .builder \
        .appName("FilterFlightInfo") \
        .getOrCreate()


In [16]:

raw_events = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "planes1") \
    .option("startingOffsets", "earliest") \
    .option("endingOffsets", "latest") \
    .load()

In [17]:
raw_events.cache()

DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

In [18]:
events = raw_events.select(raw_events.value.cast('string'))

In [19]:
#explode the Json into base dataframe
from pyspark.sql.functions import from_json, col
json_schema = spark.read.json(events.rdd.map(lambda row: row.value)).schema
flight_info = events.rdd.map(lambda x: json.loads(x.value)).toDF(schema=json_schema)
flight_info.show()

+-------------+--------+------------+------+------------+--------+---------+---------+--------------+---------------+-----+----------+-------------+----------+--------+-------------+
|baro_altitude|callsign|geo_altitude|icao24|last_contact|latitude|longitude|on_ground|origin_country|position_source|  spi|      time|time_position|true_track|velocity|vertical_rate|
+-------------+--------+------------+------+------------+--------+---------+---------+--------------+---------------+-----+----------+-------------+----------+--------+-------------+
|       701.04|72150   |      655.32|ae1fa5|  1626911540| 36.2166|-115.2025|    false| United States|              0|false|1626911540|   1626911540|    108.43|    6.51|        -0.98|
|      1310.64|N1540M  |     1379.22|a0db6f|  1626911539| 41.5771| -93.5474|    false| United States|              0|false|1626911540|   1626911538|      null|    66.5|        -0.33|
|      9631.68|LPE2006 |      4838.7|e8027e|  1626911496|-12.8591| -75.2564|    false

In [20]:
flight_info_on_ground=flight_info.filter(flight_info.on_ground ==True)
flight_info_on_ground.show()

+-------------+--------+------------+------+------------+--------+---------+---------+--------------+---------------+-----+----------+-------------+----------+--------+-------------+
|baro_altitude|callsign|geo_altitude|icao24|last_contact|latitude|longitude|on_ground|origin_country|position_source|  spi|      time|time_position|true_track|velocity|vertical_rate|
+-------------+--------+------------+------+------------+--------+---------+---------+--------------+---------------+-----+----------+-------------+----------+--------+-------------+
|         null|        |        null|ade18c|  1626911523|  39.873| -75.2434|     true| United States|              0|false|1626911540|   1626911523|     75.94|    0.26|         null|
|         null|SWR154F |        null|4b1800|  1626911522| 47.4419|   8.5643|     true|   Switzerland|              0|false|1626911540|   1626911516|    146.25|    0.06|         null|
|         null|JST228  |        null|7c6b31|  1626911531|-43.4877| 172.5369|     true

In [21]:
flight_info_on_ground.write.parquet("/tmp/flight_info_on_ground")

AnalysisException: 'path hdfs://cloudera/tmp/flight_info_on_ground already exists.;'

In [22]:
flight_info_off_ground=flight_info.filter(flight_info.on_ground !=True)
flight_info_off_ground.write.parquet("/tmp/flight_info_off_ground")

AnalysisException: 'path hdfs://cloudera/tmp/flight_info_off_ground already exists.;'

In [23]:
flight_info_domestic=flight_info.filter(flight_info.origin_country =='United States')
flight_info_domestic.show()

+-------------+--------+------------+------+------------+--------+---------+---------+--------------+---------------+-----+----------+-------------+----------+--------+-------------+
|baro_altitude|callsign|geo_altitude|icao24|last_contact|latitude|longitude|on_ground|origin_country|position_source|  spi|      time|time_position|true_track|velocity|vertical_rate|
+-------------+--------+------------+------+------------+--------+---------+---------+--------------+---------------+-----+----------+-------------+----------+--------+-------------+
|       701.04|72150   |      655.32|ae1fa5|  1626911540| 36.2166|-115.2025|    false| United States|              0|false|1626911540|   1626911540|    108.43|    6.51|        -0.98|
|      1310.64|N1540M  |     1379.22|a0db6f|  1626911539| 41.5771| -93.5474|    false| United States|              0|false|1626911540|   1626911538|      null|    66.5|        -0.33|
|      5280.66|UAL1626 |     5311.14|aa56db|  1626911539| 29.4873| -95.5204|    false

In [24]:
flight_info_domestic.write.parquet("/tmp/flight_info_domestic")

In [25]:
flight_info_international=flight_info.filter(flight_info.origin_country !='United States')
flight_info_domestic.write.parquet("/tmp/flight_info_international")
