In [87]:
import findspark

In [88]:
# Locate the Spark installation. Honor SPARK_HOME when it is set so the notebook
# runs on machines other than the original author's; otherwise fall back to the
# original local install path.
import os

findspark.init(
    os.environ.get("SPARK_HOME", "/Users/jackyho/my_path_to_legend/spark-3.0.1")
)

In [89]:
import random
from pyspark.sql import SparkSession
from pyspark.sql.types import TimestampType, StructType, StringType, IntegerType, FloatType, StructField
from pyspark.sql.functions import lit

In [90]:
# Start (or reuse) a Spark session named "us_import_sample" using the local[*] master.
ss = (
    SparkSession.builder
    .master("local[*]")
    .appName("us_import_sample")
    .getOrCreate()
)

In [91]:
# Schema applied to each contact CSV read below: every column is declared as a
# string, in the order listed here.
contact_schema = StructType([
    StructField(column_name, StringType())
    for column_name in (
        "identifier",
        "name",
        "address_1",
        "address_2",
        "address_3",
        "address_4",
        "city",
        "state_province",
        "zip_code",
        "country_code",
        "contact_name",
        "comm_number_qualifier",
        "comm_number",
    )
])

In [92]:
# Read the consignee CSV (header row present, '"' as the escape character)
# using the shared contact schema, then tag every row with its contact type.
consignee = (
    ss.read
    .option("header", True)
    .option("escape", '"')
    .csv(
        './ams/2020/202001201500/ams__consignee_2020__202001201500.csv',
        schema=contact_schema,
    )
    .withColumn('contact_type', lit('consignee'))
)

In [93]:
# Sanity check: print the column names and types of the consignee frame,
# including the contact_type column added above.
consignee.printSchema()

root
 |-- identifier: string (nullable = true)
 |-- name: string (nullable = true)
 |-- address_1: string (nullable = true)
 |-- address_2: string (nullable = true)
 |-- address_3: string (nullable = true)
 |-- address_4: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state_province: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- contact_name: string (nullable = true)
 |-- comm_number_qualifier: string (nullable = true)
 |-- comm_number: string (nullable = true)
 |-- contact_type: string (nullable = false)



In [94]:
# Read the notify-party CSV (header row present, '"' as the escape character)
# using the shared contact schema, then tag every row with its contact type.
notified_party = (
    ss.read
    .option("header", True)
    .option("escape", '"')
    .csv(
        './ams/2020/202001201500/ams__notifyparty_2020__202001201500.csv',
        schema=contact_schema,
    )
    .withColumn('contact_type', lit('notified_party'))
)

In [96]:
# Read the shipper CSV (header row present, '"' as the escape character)
# using the shared contact schema, then tag every row with its contact type.
shipper = (
    ss.read
    .option("header", True)
    .option("escape", '"')
    .csv(
        './ams/2020/202001201500/ams__shipper_2020__202001201500.csv',
        schema=contact_schema,
    )
    .withColumn('contact_type', lit('shipper'))
)

In [97]:
# Stack the three contact sources into a single frame. unionByName matches
# columns by name rather than by position, so a source whose column order ever
# differs cannot silently land values under the wrong header.
contact = consignee.unionByName(shipper).unionByName(notified_party)

In [98]:
# Sanity check: print the column names and types of the combined contact frame.
contact.printSchema()

root
 |-- identifier: string (nullable = true)
 |-- name: string (nullable = true)
 |-- address_1: string (nullable = true)
 |-- address_2: string (nullable = true)
 |-- address_3: string (nullable = true)
 |-- address_4: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state_province: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- contact_name: string (nullable = true)
 |-- comm_number_qualifier: string (nullable = true)
 |-- comm_number: string (nullable = true)
 |-- contact_type: string (nullable = false)



In [99]:
# Export the combined contacts as CSV with a header row. The frame is coalesced
# to a single partition before writing, and any existing output at the target
# path is overwritten.
(
    contact.coalesce(1)
    .write
    .mode('overwrite')
    .format("csv")
    .option("header", True)
    .option("escape", '"')
    .save("./ams/contact")
)