In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

In [2]:
# Start (or reuse) a Spark session for this notebook, using the local master.
ss = (
    SparkSession.builder
    .master("local[*]")
    .appName("us_import_sample")
    .getOrCreate()
)

In [3]:
# Columns of interest from the AMS header file, all typed as strings.
# NOTE(review): the raw CSV also carries `measurement` and `measurement_unit`
# (visible in the printSchema output of the loaded file), which are not listed
# here — confirm that leaving them out is intentional.
header_schema = StructType([
    StructField(column_name, StringType())
    for column_name in (
        "identifier",
        "carrier_code",
        "vessel_country_code",
        "vessel_name",
        "port_of_unlading",
        "estimated_arrival_date",
        "foreign_port_of_lading_qualifier",
        "foreign_port_of_lading",
        "manifest_quantity",
        "manifest_unit",
        "weight",
        "weight_unit",
        "record_status_indicator",
        "place_of_receipt",
        "port_of_destination",
        "foreign_port_of_destination_qualifier",
        "foreign_port_of_destination",
        "conveyance_id_qualifier",
        "conveyance_id",
        "mode_of_transportation",
        "actual_arrival_date",
    )
])

In [4]:
# Ordered list of the header column names declared in header_schema.
header_field_names = [field.name for field in header_schema.fields]

In [5]:
# Load the raw header CSV: the first row supplies the column names, and the
# double quote is used as the escape character inside quoted values.
header = (
    ss.read
    .options(header=True, escape='"')
    .csv('./ams/2020/202001201500/ams__header_2020__202001201500.csv')
)

In [6]:
# Print the column names and types Spark read from the header CSV.
header.printSchema()

root
 |-- identifier: string (nullable = true)
 |-- carrier_code: string (nullable = true)
 |-- vessel_country_code: string (nullable = true)
 |-- vessel_name: string (nullable = true)
 |-- port_of_unlading: string (nullable = true)
 |-- estimated_arrival_date: string (nullable = true)
 |-- foreign_port_of_lading_qualifier: string (nullable = true)
 |-- foreign_port_of_lading: string (nullable = true)
 |-- manifest_quantity: string (nullable = true)
 |-- manifest_unit: string (nullable = true)
 |-- weight: string (nullable = true)
 |-- weight_unit: string (nullable = true)
 |-- measurement: string (nullable = true)
 |-- measurement_unit: string (nullable = true)
 |-- record_status_indicator: string (nullable = true)
 |-- place_of_receipt: string (nullable = true)
 |-- port_of_destination: string (nullable = true)
 |-- foreign_port_of_destination_qualifier: string (nullable = true)
 |-- foreign_port_of_destination: string (nullable = true)
 |-- conveyance_id_qualifier: string (nullable = true)

In [7]:
# Keep only the header columns named in header_field_names, in that order.
new_header = header.select(header_field_names)

In [8]:
# Columns of interest from the AMS bill file, all typed as strings.
bill_schema = StructType([
    StructField(column_name, StringType())
    for column_name in (
        "identifier",
        "master_bol_number",
        "house_bol_number",
        "sub_house_bol_number",
        "voyage_number",
        "bill_type_code",
        "manifest_number",
        "trade_update_date",
        "run_date",
    )
])

In [9]:
# Ordered list of the bill column names declared in bill_schema.
bill_field_names = [field.name for field in bill_schema.fields]

In [10]:
# Load the raw bill CSV: the first row supplies the column names, and the
# double quote is used as the escape character inside quoted values.
bill = (
    ss.read
    .options(header=True, escape='"')
    .csv('./ams/2020/202001201500/ams__billgen_2020__202001201500.csv')
)

In [11]:
# Keep only the bill columns named in bill_field_names, in that order.
new_bill = bill.select(bill_field_names)

In [12]:
# Attach bill columns to each header row by identifier; the left join keeps
# header rows that have no matching bill.
header_full = new_header.join(
    new_bill,
    on=['identifier'],
    how='left',
)

In [13]:
# Print the schema of the joined header/bill DataFrame.
header_full.printSchema()

root
 |-- identifier: string (nullable = true)
 |-- carrier_code: string (nullable = true)
 |-- vessel_country_code: string (nullable = true)
 |-- vessel_name: string (nullable = true)
 |-- port_of_unlading: string (nullable = true)
 |-- estimated_arrival_date: string (nullable = true)
 |-- foreign_port_of_lading_qualifier: string (nullable = true)
 |-- foreign_port_of_lading: string (nullable = true)
 |-- manifest_quantity: string (nullable = true)
 |-- manifest_unit: string (nullable = true)
 |-- weight: string (nullable = true)
 |-- weight_unit: string (nullable = true)
 |-- record_status_indicator: string (nullable = true)
 |-- place_of_receipt: string (nullable = true)
 |-- port_of_destination: string (nullable = true)
 |-- foreign_port_of_destination_qualifier: string (nullable = true)
 |-- foreign_port_of_destination: string (nullable = true)
 |-- conveyance_id_qualifier: string (nullable = true)
 |-- conveyance_id: string (nullable = true)
 |-- mode_of_transportation: string (n