In [17]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnull, when, count, col

In [4]:
# Reuse the active Spark session if there is one; otherwise start a local
# session named "us_import_sample" with master "local[*]".
ss = (
    SparkSession.builder
    .master("local[*]")
    .appName("us_import_sample")
    .getOrCreate()
)

In [5]:
# Columns projected from the raw header CSV. The order here is the column
# order of the selected frame.
kept_header_cols = [
    "identifier", "carrier_code",
    "vessel_country_code", "vessel_name",
    "port_of_unlading", "estimated_arrival_date",
    "foreign_port_of_lading_qualifier", "foreign_port_of_lading",
    "manifest_quantity", "manifest_unit",
    "weight", "weight_unit",
    "record_status_indicator", "place_of_receipt",
    "port_of_destination",
    "foreign_port_of_destination_qualifier", "foreign_port_of_destination",
    "conveyance_id_qualifier", "conveyance_id",
    "mode_of_transportation", "actual_arrival_date",
]

In [6]:
# Load the raw header CSV: the first line supplies column names, '"' is the
# escape character, and column types are inferred from the data.
header = (
    ss.read
    .options(header=True, escape='"', inferSchema=True)
    .csv('./ams/2020/202001201500/ams__header_2020__202001201500.csv')
)

In [8]:
# Keep only the selected header columns and drop rows whose identifier is null.
new_header = (
    header
    .select(kept_header_cols)
    .where(col('identifier').isNotNull())
)

In [9]:
# Columns projected from the raw bill CSV. "identifier" is the key used to
# join these rows onto the header rows.
kept_bill_cols = [
    "identifier",
    "master_bol_number", "house_bol_number", "sub_house_bol_number",
    "voyage_number", "bill_type_code", "manifest_number",
    "trade_update_date", "run_date",
]

In [10]:
# Load the raw bill CSV with the same reader settings as the header file:
# header row present, '"' as escape character, inferred column types.
bill = (
    ss.read
    .options(header=True, escape='"', inferSchema=True)
    .csv('./ams/2020/202001201500/ams__billgen_2020__202001201500.csv')
)

In [11]:
# Keep only the selected bill columns.
new_bill = bill.select(kept_bill_cols)

In [12]:
# Left join: every header row is kept; bill columns are null where no bill
# row shares the identifier.
header_full = new_header.join(
    new_bill,
    on=['identifier'],
    how='left',
)

In [13]:
# Print the column names and inferred types of the joined header/bill frame.
header_full.printSchema() # fact table

root
 |-- identifier: long (nullable = true)
 |-- carrier_code: string (nullable = true)
 |-- vessel_country_code: string (nullable = true)
 |-- vessel_name: string (nullable = true)
 |-- port_of_unlading: string (nullable = true)
 |-- estimated_arrival_date: timestamp (nullable = true)
 |-- foreign_port_of_lading_qualifier: string (nullable = true)
 |-- foreign_port_of_lading: string (nullable = true)
 |-- manifest_quantity: integer (nullable = true)
 |-- manifest_unit: string (nullable = true)
 |-- weight: long (nullable = true)
 |-- weight_unit: string (nullable = true)
 |-- record_status_indicator: string (nullable = true)
 |-- place_of_receipt: string (nullable = true)
 |-- port_of_destination: string (nullable = true)
 |-- foreign_port_of_destination_qualifier: string (nullable = true)
 |-- foreign_port_of_destination: string (nullable = true)
 |-- conveyance_id_qualifier: string (nullable = true)
 |-- conveyance_id: string (nullable = true)
 |-- mode_of_transportation: string (nullable = true)

In [14]:
# Number of rows in the joined header/bill frame.
header_full.count()

986077

In [19]:
# Count the rows of the joined frame whose identifier is null.
#
# The THEN branch of when() must be a non-null value. count() skips nulls, so
# returning the column itself (which is null whenever the condition is true)
# would make this check report 0 no matter what the data contains. A literal 1
# is counted once per null identifier; non-matching rows fall through to the
# implicit NULL "otherwise" and are ignored by count().
row = header_full.select([
    count(when(isnull(col('identifier')), 1)).alias('identifier_null_count')
]).collect()[0]

In [20]:
# Display the collected Row holding identifier_null_count.
row

Row(identifier_null_count=0)