# Load data through PySpark

In [7]:
from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
import geopandas as gpd

# Build (or reuse, via getOrCreate) a Spark session that runs locally on all
# available cores. Kryo serialization is enabled with Sedona's registrator, and
# the Sedona Python adapter plus the GeoTools wrapper are requested through
# spark.jars.packages.
spark = (
    SparkSession.builder
    .appName('Load Data')
    .master('local[*]')
    .config("spark.serializer", KryoSerializer.getName)
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
    .config(
        'spark.jars.packages',
        'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,'
        'org.datasyslab:geotools-wrapper:geotools-24.1',
    )
    .getOrCreate()
)

# Register Sedona's geometry types and ST_* SQL functions on this session.
# Left as the last expression so the call's return value is displayed.
SedonaRegistrator.registerAll(spark)

21/12/07 16:46:03 WARN UDTRegistration: Cannot register UDT for org.locationtech.jts.geom.Geometry, which is already registered.
21/12/07 16:46:03 WARN UDTRegistration: Cannot register UDT for org.locationtech.jts.index.SpatialIndex, which is already registered.
21/12/07 16:46:03 WARN SimpleFunctionRegistry: The function st_pointfromtext replaced a previously registered function.
21/12/07 16:46:03 WARN SimpleFunctionRegistry: The function st_polygonfromtext replaced a previously registered function.
21/12/07 16:46:03 WARN SimpleFunctionRegistry: The function st_linestringfromtext replaced a previously registered function.
21/12/07 16:46:03 WARN SimpleFunctionRegistry: The function st_geomfromtext replaced a previously registered function.
21/12/07 16:46:03 WARN SimpleFunctionRegistry: The function st_geomfromwkt replaced a previously registered function.
21/12/07 16:46:03 WARN SimpleFunctionRegistry: The function st_geomfromwkb replaced a previously registered function.
21/12/07 16:46:

True

In [8]:
# Read the tab-delimited, header-less state boundary file and name its two
# columns. The Polygon column is kept as plain text here (the values appear
# to be WKT polygons -- confirm before parsing them as geometries).
states_tsv = (
    spark.read
    .option("delimiter", "\t")
    .option("header", "false")
    .csv("clean_data/boundary-each-state.tsv")
    .toDF("State", "Polygon")
)
# Preview the first rows of the loaded frame.
states_tsv.show()

+-------------+--------------------+
|        State|             Polygon|
+-------------+--------------------+
|       Alaska|POLYGON((-141.020...|
|      Alabama|POLYGON((-88.1955...|
|     Arkansas|POLYGON((-94.0416...|
|      Arizona|POLYGON((-112.598...|
|   California|POLYGON((-124.400...|
|     Colorado|POLYGON((-109.044...|
|  Connecticut|POLYGON((-73.4875...|
|     Delaware|POLYGON((-75.7919...|
|      Florida|POLYGON((-87.6050...|
|      Georgia|POLYGON((-85.6082...|
|       Hawaii|POLYGON((-154.628...|
|         Iowa|POLYGON((-95.7623...|
|        Idaho|POLYGON((-117.031...|
|     Illinois|POLYGON((-90.6290...|
|      Indiana|POLYGON((-87.5253...|
|       Kansas|POLYGON((-102.050...|
|     Kentucky|POLYGON((-89.5372...|
|    Louisiana|POLYGON((-94.0430...|
|Massachusetts|POLYGON((-72.7789...|
|     Maryland|POLYGON((-79.4778...|
+-------------+--------------------+
only showing top 20 rows



In [9]:
# Read the comma-delimited, header-less airports file and assign its 14 column
# names. No schema is supplied or inferred, so every column is loaded as the
# CSV reader's default type.
airports_dat = (
    spark.read
    .option("delimiter", ",")
    .option("header", "false")
    .csv("raw_data/airports.dat")
    .toDF(
        "Airport ID",
        "Name",
        "City",
        "Country",
        "IATA",
        "ICAO",
        "Latitude",
        "Longitude",
        "Altitude",
        "Timezone",
        "DST",
        "Tz database timezone",
        "Type",
        "Source",
    )
)
# Preview the first rows of the loaded frame.
airports_dat.show()

+----------+--------------------+--------------+----------------+----+----+------------------+-------------------+--------+--------+---+--------------------+-------+-----------+
|Airport ID|                Name|          City|         Country|IATA|ICAO|          Latitude|          Longitude|Altitude|Timezone|DST|Tz database timezone|   Type|     Source|
+----------+--------------------+--------------+----------------+----+----+------------------+-------------------+--------+--------+---+--------------------+-------+-----------+
|         1|      Goroka Airport|        Goroka|Papua New Guinea| GKA|AYGA|-6.081689834590001|      145.391998291|    5282|      10|  U|Pacific/Port_Moresby|airport|OurAirports|
|         2|      Madang Airport|        Madang|Papua New Guinea| MAG|AYMD|    -5.20707988739|      145.789001465|      20|      10|  U|Pacific/Port_Moresby|airport|OurAirports|
|         3|Mount Hagen Kagam...|   Mount Hagen|Papua New Guinea| HGU|AYMH|-5.826789855957031| 144.29600524902

In [10]:
# Read the comma-delimited, header-less airlines file and assign its 8 column
# names.
# NOTE(review): the captured output shows literal `\N` placeholders surviving
# as ordinary strings next to real nulls; this cell does not normalise them.
# Confirm whether downstream steps expect that before changing it.
airlines_dat = (
    spark.read
    .option("delimiter", ",")
    .option("header", "false")
    .csv("raw_data/airlines.dat")
    .toDF(
        "Airline ID",
        "Name",
        "Alias",
        "IATA",
        "ICAO",
        "Callsign",
        "Country",
        "Active",
    )
)
# Preview the first rows of the loaded frame.
airlines_dat.show()

+----------+--------------------+-----+----+----+--------------+--------------+------+
|Airline ID|                Name|Alias|IATA|ICAO|      Callsign|       Country|Active|
+----------+--------------------+-----+----+----+--------------+--------------+------+
|        -1|             Unknown|   \N|   -| N/A|            \N|            \N|     Y|
|         1|      Private flight|   \N|   -| N/A|          null|          null|     Y|
|         2|         135 Airways|   \N|null| GNL|       GENERAL| United States|     N|
|         3|       1Time Airline|   \N|  1T| RNX|       NEXTIME|  South Africa|     Y|
|         4|2 Sqn No 1 Elemen...|   \N|null| WYT|          null|United Kingdom|     N|
|         5|     213 Flight Unit|   \N|null| TFU|          null|        Russia|     N|
|         6|223 Flight Unit S...|   \N|null| CHD|CHKALOVSK-AVIA|        Russia|     N|
|         7|   224th Flight Unit|   \N|null| TTF|    CARGO UNIT|        Russia|     N|
|         8|         247 Jet Ltd|   \N|null

In [11]:
# Read the comma-delimited, header-less routes file and assign its 9 column
# names. Each row pairs a source airport with a destination airport for a
# given airline, per the column names below.
routes_dat = (
    spark.read
    .option("delimiter", ",")
    .option("header", "false")
    .csv("raw_data/routes.dat")
    .toDF(
        "Airline",
        "Airline ID",
        "Source airport",
        "Source airport ID",
        "Destination airport",
        "Destination airport ID",
        "Codeshare",
        "Stops",
        "Equipment",
    )
)
# Preview the first rows of the loaded frame.
routes_dat.show()

+-------+----------+--------------+-----------------+-------------------+----------------------+---------+-----+---------+
|Airline|Airline ID|Source airport|Source airport ID|Destination airport|Destination airport ID|Codeshare|Stops|Equipment|
+-------+----------+--------------+-----------------+-------------------+----------------------+---------+-----+---------+
|     2B|       410|           AER|             2965|                KZN|                  2990|     null|    0|      CR2|
|     2B|       410|           ASF|             2966|                KZN|                  2990|     null|    0|      CR2|
|     2B|       410|           ASF|             2966|                MRV|                  2962|     null|    0|      CR2|
|     2B|       410|           CEK|             2968|                KZN|                  2990|     null|    0|      CR2|
|     2B|       410|           CEK|             2968|                OVB|                  4078|     null|    0|      CR2|
|     2B|       

In [12]:
# Read the comma-delimited, header-less cities file and name its two columns.
# c_loc is kept as a single text field (the displayed values look like
# "<longitude>_<latitude>" -- confirm before splitting).
cities_csv = (
    spark.read
    .option("delimiter", ",")
    .option("header", "false")
    .csv("raw_data/cities.csv")
    .toDF("c_name", "c_loc")
)
# Preview the loaded rows.
cities_csv.show()

+-------+--------------------+
| c_name|               c_loc|
+-------+--------------------+
|Pullman|-117.167126_46.73...|
|Phoenix|-112.092128_33.50...|
+-------+--------------------+

