In [1]:
import pandas as pd
import re
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import types as t
from pyspark.sql.functions import to_date, to_timestamp
from datetime import datetime, timedelta

from all_etl import *
from hurdat_etl import *
from sql_queries import *

In [2]:
# Smoke-test the Spark setup before starting the pipeline.
# NOTE(review): spark_verify arrives through one of the star-imports above; the
# recorded output is a single number (3.1484), presumably a pi estimate - confirm.
spark_verify()

3.1484


In [3]:
# Accumulator for the data-quality reports appended later in the run.
results_all = list()
print("START ETL pipeline process")

START ETL pipeline process


In [4]:
# Resolve the pipeline's path configuration once; path_d is handed to the
# processing steps below.
# NOTE(review): its structure is defined by define_paths (presumably a dict,
# given the _d suffix) - confirm in the helper module.
path_d = define_paths()

In [5]:
# Create the Spark session shared by every later step of the pipeline
# (passed to the process_* helpers and used directly for spark.sql).
spark = create_spark_session()

Create Spark session


In [6]:
# Run the HURDAT ingestion step. The helper's result is indexable: slot 0 is
# kept as the storms data and slot 1 as the tracks data.
hurdat_df_spark = process_hurdat_data(spark, path_d)
hurdat_storms_df_spark, hurdat_tracks_df_spark = (
    hurdat_df_spark[0],
    hurdat_df_spark[1],
)

Processing HURDAT data
Reading HURDAT to Spark
HURDAT processing complete


In [7]:
# Combine the storms and tracks data into the single HURDAT table; the helper
# prints the resulting schema as it runs.
hurdat_table = process_joined_hurdat_data(
    spark,
    path_d,
    hurdat_storms_df_spark,
    hurdat_tracks_df_spark,
)

Creating table
HURDAT schema:
root
 |-- storm_id: string (nullable = false)
 |-- storm_name: string (nullable = false)
 |-- sample_count: string (nullable = false)
 |-- datetime: date (nullable = false)
 |-- record_id: string (nullable = false)
 |-- system_status: string (nullable = false)
 |-- latitude: string (nullable = false)
 |-- longitude: string (nullable = false)
 |-- max_sust_wind: string (nullable = false)
 |-- max_pressure: string (nullable = false)
 |-- NE34: string (nullable = false)
 |-- SE34: string (nullable = false)
 |-- SW34: string (nullable = false)
 |-- NW34: string (nullable = false)
 |-- NE50: string (nullable = false)
 |-- SE50: string (nullable = false)
 |-- SW50: string (nullable = false)
 |-- NW50: string (nullable = false)
 |-- NE64: string (nullable = false)
 |-- SE64: string (nullable = false)
 |-- SW64: string (nullable = false)
 |-- NW64: string (nullable = false)

HURDAT table complete


In [8]:
# Run the data-quality checks on the HURDAT table and record the report
# alongside any earlier ones before echoing the full list.
results = check_data_quality(spark, hurdat_table)
results_all += [results]
print(results_all)

Checking HURDAT table...
NULLS:
+--------+
|count(1)|
+--------+
|       0|
+--------+

ROWS:
+--------+
|count(1)|
+--------+
|   51840|
+--------+

Checking data quality complete
[{'hurdat_count': 51840, 'hurdat': 'OK'}]


In [9]:
# Mark the end of the main ETL run. This prints unconditionally; it does not
# inspect the data-quality results collected in results_all.
print("ETL pipeline complete")

ETL pipeline complete


In [22]:
# Spark SQL for the time table: one distinct row per `dt` value found in the
# time_table_DF temp view, with year/month/day/hour/minute split out.
hurdattime_table_createquery = """
    SELECT DISTINCT  dt           AS datetime, 
                     year(dt)       AS year,
                     month(dt)      AS month,
                     day(dt)        AS day, 
                     hour(dt)       AS hour,                      
                     minute(dt)     AS minute
    FROM time_table_DF
"""

In [23]:
# Add a timestamp column "dt" parsed from Tdatetime, expose the tracks data to
# Spark SQL under the view name the query expects, then build the time table.
# NOTE(review): the rows displayed afterwards all show hour=0 and minute=0 -
# confirm whether Tdatetime actually carries a time of day.
hurdat_tracks_df_spark = hurdat_tracks_df_spark.withColumn(
    "dt",
    to_timestamp(hurdat_tracks_df_spark["Tdatetime"]),
)
hurdat_tracks_df_spark.createOrReplaceTempView("time_table_DF")
time_table = spark.sql(hurdattime_table_createquery)

In [24]:
# Preview the generated time table (show() prints only the first rows, not the
# whole result).
time_table.show()

+-------------------+----+-----+---+----+------+
|           datetime|year|month|day|hour|minute|
+-------------------+----+-----+---+----+------+
|1862-11-24 00:00:00|1862|   11| 24|   0|     0|
|1865-09-28 00:00:00|1865|    9| 28|   0|     0|
|1867-06-21 00:00:00|1867|    6| 21|   0|     0|
|1867-09-01 00:00:00|1867|    9|  1|   0|     0|
|1879-08-22 00:00:00|1879|    8| 22|   0|     0|
|1887-06-14 00:00:00|1887|    6| 14|   0|     0|
|1889-09-08 00:00:00|1889|    9|  8|   0|     0|
|1893-11-06 00:00:00|1893|   11|  6|   0|     0|
|1894-09-29 00:00:00|1894|    9| 29|   0|     0|
|1908-09-09 00:00:00|1908|    9|  9|   0|     0|
|1908-10-02 00:00:00|1908|   10|  2|   0|     0|
|1908-10-19 00:00:00|1908|   10| 19|   0|     0|
|1921-09-11 00:00:00|1921|    9| 11|   0|     0|
|1926-08-03 00:00:00|1926|    8|  3|   0|     0|
|1934-08-22 00:00:00|1934|    8| 22|   0|     0|
|1935-11-12 00:00:00|1935|   11| 12|   0|     0|
|1936-12-14 00:00:00|1936|   12| 14|   0|     0|
|1939-09-24 00:00:00