In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

In [2]:
# Start a Spark session for this notebook, or reuse one that already exists.
# "local[*]" runs Spark in-process on the local machine.
ss = (
    SparkSession.builder
    .master("local[*]")
    .appName("us_import_sample")
    .getOrCreate()
)

In [3]:
# Columns of interest in the container CSV. Each is declared as a string
# field; the order below is the order the fields appear in the schema.
container_schema = StructType([
    StructField(column_name, StringType())
    for column_name in (
        "identifier",
        "container_number",
        "equipment_description_code",
        "container_length",
        "container_height",
        "container_width",
        "container_type",
        "load_status",
        "type_of_service",
    )
])

In [5]:
# Load the container CSV and keep only the columns listed in container_schema.
# The schema is used purely as a list of column names here: it is not handed
# to the reader, so it does not influence how the file is parsed.
container = (
    ss.read
    .option("header", True)  # take column names from the first line
    .option("escape", '"')   # use the double quote as the escape character
    .csv('./ams/2020/202001201500/ams__container_2020__202001201500.csv')
    .select(*container_schema.fieldNames())
)

In [6]:
# Preview the first 20 values of the type_of_service column.
container.select("type_of_service").show(n=20)

+---------------+
|type_of_service|
+---------------+
|   Pier to Pier|
|   Pier to Pier|
| House to House|
| House to House|
| House to House|
|   Pier to Pier|
|   Pier to Pier|
|   Pier to Pier|
|   Pier to Pier|
|   Pier to Pier|
|  Pier to House|
| Container Yard|
|           null|
|           null|
|           null|
|           null|
|           null|
|   Pier to Pier|
|   Pier to Pier|
|   Pier to Pier|
+---------------+
only showing top 20 rows



In [7]:
# Load the marks-and-numbers CSV, keeping every column in the file.
# The chain is wrapped in parentheses rather than continued with backslashes,
# so the statement ends exactly at the closing parenthesis and cannot absorb
# whatever line follows it.
marker = (
    ss.read
    .option("header", True)  # take column names from the first line
    .option("escape", '"')   # use the double quote as the escape character
    .csv('./ams/2020/202001201500/ams__marksnumbers_2020__202001201500.csv')
)

In [10]:
# Preview the first 20 values of the marks_and_numbers_1 column.
marker.select("marks_and_numbers_1").show(n=20)

+-------------------+
|marks_and_numbers_1|
+-------------------+
|           NO MARKS|
|           NO MARKS|
|              SKU#:|
|               QTY:|
|               DES:|
|        PHILIPPINES|
|       WEIGHT: 3.75|
|          BABY BEAR|
|               BARN|
|                KGS|
|        20991407PBR|
|       FRANCISCO CA|
|            9285948|
|        MADE IN THE|
|         335 OF 409|
|          MOUSE/GFT|
|               BARN|
|               KGS.|
|                ART|
|            WEIGHT:|
+-------------------+
only showing top 20 rows



In [11]:
# Print marker's column names, types and nullability as a tree.
marker.printSchema()

root
 |-- identifier: string (nullable = true)
 |-- container_number: string (nullable = true)
 |-- marks_and_numbers_1: string (nullable = true)
 |-- marks_and_numbers_2: string (nullable = true)
 |-- marks_and_numbers_3: string (nullable = true)
 |-- marks_and_numbers_4: string (nullable = true)
 |-- marks_and_numbers_5: string (nullable = true)
 |-- marks_and_numbers_6: string (nullable = true)
 |-- marks_and_numbers_7: string (nullable = true)
 |-- marks_and_numbers_8: string (nullable = true)



In [15]:
# Attach the marks-and-numbers columns to each container row, matching on
# identifier and container_number. A left join keeps every container row,
# including those with no matching marker row.
# NOTE(review): if marker holds several rows per (identifier, container_number),
# each container row is repeated once per match; confirm that is intended.
container_full = container.join(
    marker,
    on=["identifier", "container_number"],
    how="left",
)