In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.types import TimestampType, StructType, StringType, IntegerType, FloatType, StructField
from pyspark.sql.functions import lit, concat, col, regexp_replace, collect_list

In [11]:
# Get (or reuse) the Spark session for this notebook: local master,
# application name "us_import_sample".
ss = (
    SparkSession.builder
    .master("local[*]")
    .appName("us_import_sample")
    .getOrCreate()
)

In [12]:
# Load the cargo-description CSV: the first row is the header, '"' is the
# escape character, and column types are inferred from the data.
cargo_desc = ss.read.csv(
    './ams/2020/202001201500/ams__cargodesc_2020__202001201500.csv',
    header=True,
    escape='"',
    inferSchema=True,
)

In [13]:
# Show the inferred column names and types of the cargo-description frame.
cargo_desc.printSchema()

root
 |-- identifier: long (nullable = true)
 |-- container_number: string (nullable = true)
 |-- description_sequence_number: integer (nullable = true)
 |-- piece_count: integer (nullable = true)
 |-- description_text: string (nullable = true)



In [14]:
# Register the frame as temp view "cargo_desc" so SQL queries can reference it by name.
cargo_desc.createOrReplaceTempView("cargo_desc")

In [15]:
# Load the hazmat CSV: the first row is the header, '"' is the escape
# character, and column types are inferred from the data.
hazmat = ss.read.csv(
    './ams/2020/202001201500/ams__hazmat_2020__202001201500.csv',
    header=True,
    escape='"',
    inferSchema=True,
)

In [16]:
# Show the inferred column names and types of the hazmat frame.
hazmat.printSchema()

root
 |-- identifier: long (nullable = true)
 |-- container_number: string (nullable = true)
 |-- hazmat_sequence_number: integer (nullable = true)
 |-- hazmat_code: string (nullable = true)
 |-- hazmat_class: string (nullable = true)
 |-- hazmat_code_qualifier: string (nullable = true)
 |-- hazmat_contact: string (nullable = true)
 |-- hazmat_page_number: string (nullable = true)
 |-- hazmat_flash_point_temperature: string (nullable = true)
 |-- hazmat_flash_point_temperature_negative_ind: string (nullable = true)
 |-- hazmat_flash_point_temperature_unit: string (nullable = true)
 |-- hazmat_description: string (nullable = true)



In [17]:
# Register the frame as temp view "hazmat" so SQL queries can reference it by name.
hazmat.createOrReplaceTempView("hazmat")

In [18]:
# Load the hazmat-class CSV: the first row is the header, '"' is the escape
# character, and column types are inferred from the data.
hazmat_class = ss.read.csv(
    './ams/2020/202001201500/ams__hazmatclass_2020__202001201500.csv',
    header=True,
    escape='"',
    inferSchema=True,
)

In [19]:
# Show the inferred column names and types of the hazmat-class frame.
hazmat_class.printSchema()

root
 |-- identifier: long (nullable = true)
 |-- container_number: string (nullable = true)
 |-- hazmat_sequence_number: integer (nullable = true)
 |-- hazmat_classification: string (nullable = true)



In [20]:
# Register the frame as temp view "hazmat_class" so SQL queries can reference it by name.
hazmat_class.createOrReplaceTempView("hazmat_class")

In [21]:
# Enrich each cargo_desc row with hazmat details from the "hazmat" and
# "hazmat_class" temp views.
# Both joins are LEFT JOINs on (identifier, container_number, sequence number),
# so cargo rows without a hazmat match are kept with NULL hazmat columns.
# The output hazmat_class column prefers hazmat_class.hazmat_classification and
# falls back to hazmat.hazmat_class when that classification is NULL.
# NOTE(review): the joins equate description_sequence_number with
# hazmat_sequence_number — confirm the two sequences are meant to line up.
# NOTE(review): if hazmat or hazmat_class contain several rows per join key,
# the cargo row will be repeated — verify key uniqueness.
cargo_table = ss.sql("""
    SELECT 
        c.identifier,
        c.container_number,
        c.description_sequence_number AS sequence_number,
        c.piece_count,
        c.description_text AS description,
        h.hazmat_code,
        (CASE 
            WHEN (hc.hazmat_classification IS NOT NULL) THEN hc.hazmat_classification
            ELSE h.hazmat_class
        END) AS hazmat_class,
        h.hazmat_code_qualifier,
        h.hazmat_contact,
        h.hazmat_page_number,
        h.hazmat_flash_point_temperature,
        h.hazmat_flash_point_temperature_negative_ind,
        h.hazmat_flash_point_temperature_unit,
        h.hazmat_description
    FROM cargo_desc AS c
    LEFT JOIN hazmat AS h
    ON 
        c.identifier = h.identifier AND 
        c.container_number = h.container_number AND 
        c.description_sequence_number = h.hazmat_sequence_number
    LEFT JOIN hazmat_class AS hc
    ON
        c.identifier = hc.identifier AND 
        c.container_number = hc.container_number AND 
        c.description_sequence_number = hc.hazmat_sequence_number
""")

In [22]:
# Show the column names and types of the joined cargo table.
cargo_table.printSchema()

root
 |-- identifier: long (nullable = true)
 |-- container_number: string (nullable = true)
 |-- sequence_number: integer (nullable = true)
 |-- piece_count: integer (nullable = true)
 |-- description: string (nullable = true)
 |-- hazmat_code: string (nullable = true)
 |-- hazmat_class: string (nullable = true)
 |-- hazmat_code_qualifier: string (nullable = true)
 |-- hazmat_contact: string (nullable = true)
 |-- hazmat_page_number: string (nullable = true)
 |-- hazmat_flash_point_temperature: string (nullable = true)
 |-- hazmat_flash_point_temperature_negative_ind: string (nullable = true)
 |-- hazmat_flash_point_temperature_unit: string (nullable = true)
 |-- hazmat_description: string (nullable = true)

