In [1]:
import pandas as pd
import re
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import types as t
from pyspark.sql.functions import to_date, to_timestamp
from datetime import datetime, timedelta

from all_etl import *
from hurdat_etl import *
from sql_queries import *

In [2]:
spark_verify()

3.1656


In [3]:
print("START ETL pipeline process")
results_all = []

START ETL pipeline process


In [4]:
path_d = define_paths()

In [5]:
# Create Spark session for the pipeline.
spark = create_spark_session()

Create Spark session


In [6]:
# Process all input files
hurdat_df_spark = process_hurdat_data(spark, path_d)
hurdat_storms_df_spark = hurdat_df_spark[0]
hurdat_tracks_df_spark = hurdat_df_spark[1]


Processing HURDAT data
Reading HURDAT to Spark
HURDAT processing complete


In [7]:
hurdat_table = process_joined_hurdat_data( spark, path_d, hurdat_storms_df_spark, hurdat_tracks_df_spark)

Creating table
HURDAT schema:
root
 |-- storm_id: string (nullable = false)
 |-- storm_name: string (nullable = false)
 |-- sample_count: string (nullable = false)
 |-- datetime: date (nullable = false)
 |-- s2_cell_id: string (nullable = false)
 |-- s2_region: string (nullable = false)
 |-- record_id: string (nullable = false)
 |-- system_status: string (nullable = false)
 |-- latitude: string (nullable = false)
 |-- longitude: string (nullable = false)
 |-- max_sust_wind: string (nullable = false)
 |-- max_pressure: string (nullable = false)
 |-- NE34: string (nullable = false)
 |-- SE34: string (nullable = false)
 |-- SW34: string (nullable = false)
 |-- NW34: string (nullable = false)
 |-- NE50: string (nullable = false)
 |-- SE50: string (nullable = false)
 |-- SW50: string (nullable = false)
 |-- NW50: string (nullable = false)
 |-- NE64: string (nullable = false)
 |-- SE64: string (nullable = false)
 |-- SW64: string (nullable = false)
 |-- NW64: string (nullable = false)

HURDA

In [8]:
results = check_data_quality( spark, hurdat_table)
results_all.append(results)

print(results_all)

Checking HURDAT table...
NULLS:
+--------+
|count(1)|
+--------+
|       0|
+--------+

ROWS:
+--------+
|count(1)|
+--------+
|   51840|
+--------+

Checking data quality complete
[{'hurdat_count': 51840, 'hurdat': 'OK'}]


In [9]:
print("ETL pipeline complete")

ETL pipeline complete


In [10]:
hurdattime_table_createquery = """
    SELECT DISTINCT  dt             AS datetime, 
                     year(dt)       AS year,
                     month(dt)      AS month,
                     day(dt)        AS day, 
                     hour(dt)       AS hour,                      
                     minute(dt)     AS minute
    FROM time_table_DF
"""

In [11]:
hurdat_tracks_df_spark = hurdat_tracks_df_spark.withColumn("dt", to_timestamp(hurdat_tracks_df_spark.Tdatetime))

hurdat_tracks_df_spark.createOrReplaceTempView("time_table_DF")

time_table = spark.sql(hurdattime_table_createquery)

In [12]:
time_table.show()

+-------------------+----+-----+---+----+------+
|           datetime|year|month|day|hour|minute|
+-------------------+----+-----+---+----+------+
|1862-11-24 00:00:00|1862|   11| 24|   0|     0|
|1865-09-28 00:00:00|1865|    9| 28|   0|     0|
|1867-06-21 00:00:00|1867|    6| 21|   0|     0|
|1867-09-01 00:00:00|1867|    9|  1|   0|     0|
|1879-08-22 00:00:00|1879|    8| 22|   0|     0|
|1887-06-14 00:00:00|1887|    6| 14|   0|     0|
|1889-09-08 00:00:00|1889|    9|  8|   0|     0|
|1893-11-06 00:00:00|1893|   11|  6|   0|     0|
|1894-09-29 00:00:00|1894|    9| 29|   0|     0|
|1908-09-09 00:00:00|1908|    9|  9|   0|     0|
|1908-10-02 00:00:00|1908|   10|  2|   0|     0|
|1908-10-19 00:00:00|1908|   10| 19|   0|     0|
|1921-09-11 00:00:00|1921|    9| 11|   0|     0|
|1926-08-03 00:00:00|1926|    8|  3|   0|     0|
|1934-08-22 00:00:00|1934|    8| 22|   0|     0|
|1935-11-12 00:00:00|1935|   11| 12|   0|     0|
|1936-12-14 00:00:00|1936|   12| 14|   0|     0|
|1939-09-24 00:00:00

In [13]:
#hurdatspace_table_createquery = """
#    SELECT DISTINCT  S2CellID     AS s2_cell_id, 
#                     S2Region      AS s2_region
#    FROM space_table_DF
#"""

In [14]:
#hurdat_tracks_df_spark.createOrReplaceTempView("space_table_DF")
#space_table = spark.sql(hurdatspace_table_createquery)

In [32]:
hurdat_tracks_df = hurdat_df_spark[2]
space_table = hurdat_tracks_df[['S2CellID','S2Region']].copy()

for s2level in range(29,0,-1):
    colname = 'S2_L' + str(s2level).zfill(2)
    space_table[colname] = [z.parent(s2level) for z in space_table['S2CellID']]
    
space_table

Unnamed: 0,S2CellID,S2Region,S2_L29,S2_L28,S2_L27,S2_L26,S2_L25,S2_L24,S2_L23,S2_L22,...,S2_L10,S2_L09,S2_L08,S2_L07,S2_L06,S2_L05,S2_L04,S2_L03,S2_L02,S2_L01
2,CellId: 86152368a918e181,CellId: 8615230000000000,CellId: 86152368a918e184,CellId: 86152368a918e190,CellId: 86152368a918e1c0,CellId: 86152368a918e100,CellId: 86152368a918e400,CellId: 86152368a918f000,CellId: 86152368a918c000,CellId: 86152368a9190000,...,CellId: 8615230000000000,CellId: 8615240000000000,CellId: 8615300000000000,CellId: 8615400000000000,CellId: 8615000000000000,CellId: 8614000000000000,CellId: 8610000000000000,CellId: 8640000000000000,CellId: 8700000000000000,CellId: 8400000000000000
3,CellId: 866ae7fbf0357351,CellId: 866ae70000000000,CellId: 866ae7fbf0357354,CellId: 866ae7fbf0357350,CellId: 866ae7fbf0357340,CellId: 866ae7fbf0357300,CellId: 866ae7fbf0357400,CellId: 866ae7fbf0357000,CellId: 866ae7fbf0354000,CellId: 866ae7fbf0350000,...,CellId: 866ae70000000000,CellId: 866ae40000000000,CellId: 866af00000000000,CellId: 866ac00000000000,CellId: 866b000000000000,CellId: 866c000000000000,CellId: 8670000000000000,CellId: 8640000000000000,CellId: 8700000000000000,CellId: 8400000000000000
4,CellId: 866a461410edca43,CellId: 866a470000000000,CellId: 866a461410edca44,CellId: 866a461410edca50,CellId: 866a461410edca40,CellId: 866a461410edcb00,CellId: 866a461410edcc00,CellId: 866a461410edd000,CellId: 866a461410edc000,CellId: 866a461410ed0000,...,CellId: 866a470000000000,CellId: 866a440000000000,CellId: 866a500000000000,CellId: 866a400000000000,CellId: 866b000000000000,CellId: 866c000000000000,CellId: 8670000000000000,CellId: 8640000000000000,CellId: 8700000000000000,CellId: 8400000000000000
5,CellId: 86698c876c05db93,CellId: 86698d0000000000,CellId: 86698c876c05db94,CellId: 86698c876c05db90,CellId: 86698c876c05dbc0,CellId: 86698c876c05db00,CellId: 86698c876c05dc00,CellId: 86698c876c05d000,CellId: 86698c876c05c000,CellId: 86698c876c050000,...,CellId: 86698d0000000000,CellId: 86698c0000000000,CellId: 8669900000000000,CellId: 8669c00000000000,CellId: 8669000000000000,CellId: 866c000000000000,CellId: 8670000000000000,CellId: 8640000000000000,CellId: 8700000000000000,CellId: 8400000000000000
6,CellId: 8669c10cd9e94ef3,CellId: 8669c10000000000,CellId: 8669c10cd9e94ef4,CellId: 8669c10cd9e94ef0,CellId: 8669c10cd9e94ec0,CellId: 8669c10cd9e94f00,CellId: 8669c10cd9e94c00,CellId: 8669c10cd9e95000,CellId: 8669c10cd9e94000,CellId: 8669c10cd9e90000,...,CellId: 8669c10000000000,CellId: 8669c40000000000,CellId: 8669d00000000000,CellId: 8669c00000000000,CellId: 8669000000000000,CellId: 866c000000000000,CellId: 8670000000000000,CellId: 8640000000000000,CellId: 8700000000000000,CellId: 8400000000000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53729,CellId: 484528fae276a51d,CellId: 4845290000000000,CellId: 484528fae276a51c,CellId: 484528fae276a510,CellId: 484528fae276a540,CellId: 484528fae276a500,CellId: 484528fae276a400,CellId: 484528fae276b000,CellId: 484528fae276c000,CellId: 484528fae2770000,...,CellId: 4845290000000000,CellId: 48452c0000000000,CellId: 4845300000000000,CellId: 4845400000000000,CellId: 4845000000000000,CellId: 4844000000000000,CellId: 4850000000000000,CellId: 4840000000000000,CellId: 4900000000000000,CellId: 4c00000000000000
53730,CellId: 4844cf56e1e86175,CellId: 4844cf0000000000,CellId: 4844cf56e1e86174,CellId: 4844cf56e1e86170,CellId: 4844cf56e1e86140,CellId: 4844cf56e1e86100,CellId: 4844cf56e1e86400,CellId: 4844cf56e1e87000,CellId: 4844cf56e1e84000,CellId: 4844cf56e1e90000,...,CellId: 4844cf0000000000,CellId: 4844cc0000000000,CellId: 4844d00000000000,CellId: 4844c00000000000,CellId: 4845000000000000,CellId: 4844000000000000,CellId: 4850000000000000,CellId: 4840000000000000,CellId: 4900000000000000,CellId: 4c00000000000000
53731,CellId: 48437e1f689d6943,CellId: 48437f0000000000,CellId: 48437e1f689d6944,CellId: 48437e1f689d6950,CellId: 48437e1f689d6940,CellId: 48437e1f689d6900,CellId: 48437e1f689d6c00,CellId: 48437e1f689d7000,CellId: 48437e1f689d4000,CellId: 48437e1f689d0000,...,CellId: 48437f0000000000,CellId: 48437c0000000000,CellId: 4843700000000000,CellId: 4843400000000000,CellId: 4843000000000000,CellId: 4844000000000000,CellId: 4850000000000000,CellId: 4840000000000000,CellId: 4900000000000000,CellId: 4c00000000000000
53732,CellId: 4869edaa6db6fdff,CellId: 4869ed0000000000,CellId: 4869edaa6db6fdfc,CellId: 4869edaa6db6fdf0,CellId: 4869edaa6db6fdc0,CellId: 4869edaa6db6fd00,CellId: 4869edaa6db6fc00,CellId: 4869edaa6db6f000,CellId: 4869edaa6db6c000,CellId: 4869edaa6db70000,...,CellId: 4869ed0000000000,CellId: 4869ec0000000000,CellId: 4869f00000000000,CellId: 4869c00000000000,CellId: 4869000000000000,CellId: 486c000000000000,CellId: 4870000000000000,CellId: 4840000000000000,CellId: 4900000000000000,CellId: 4c00000000000000
