In [59]:
from pyspark.sql import SparkSession
from pyspark.sql.types import TimestampType, StructType, StringType, IntegerType, FloatType, StructField
from pyspark.sql.functions import lit, concat, col, regexp_replace

In [31]:
# Create (or reuse) the Spark session for this notebook, running locally.
ss = (
    SparkSession.builder
    .master("local[*]")
    .appName("us_import_sample")
    .getOrCreate()
)

In [32]:
# Explicit schema for the cargo-description CSV: column name -> Spark type,
# listed in file column order. Fields use StructField's default nullability.
cargo_desc_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("identifier", StringType()),
        ("name", StringType()),
        ("description_sequence_number", IntegerType()),
        ("piece_count", IntegerType()),
        ("description_text", StringType()),
    )
])

In [33]:
# Load the cargo-description CSV with the explicit schema. The first row is a
# header, and a double quote is used as the escape character.
cargo_desc = ss.read.csv(
    './ams/2020/202001201500/ams__cargodesc_2020__202001201500.csv',
    schema=cargo_desc_schema,
    header=True,
    escape='"',
)

In [34]:
# Print the DataFrame schema to confirm the explicit column types were applied.
cargo_desc.printSchema()

root
 |-- identifier: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description_sequence_number: integer (nullable = true)
 |-- piece_count: integer (nullable = true)
 |-- description_text: string (nullable = true)



In [35]:
# Peek at the first 10 piece counts (nulls are expected for some rows).
cargo_desc.select(col('piece_count')).show(n=10)

+-----------+
|piece_count|
+-----------+
|         18|
|         16|
|       null|
|          9|
|         19|
|       null|
|        242|
|        317|
|        666|
|          0|
+-----------+
only showing top 10 rows



In [50]:
# Add a string copy of the integer piece count so it can be concatenated
# with the description text.
cargo_desc = cargo_desc.withColumn(
    "piece_count_str",
    cargo_desc["piece_count"].cast(StringType()),
)

In [51]:
# Check the string-typed piece counts (first 10 rows).
cargo_desc.select(col('piece_count_str')).show(n=10)

+---------------+
|piece_count_str|
+---------------+
|             18|
|             16|
|           null|
|              9|
|             19|
|           null|
|            242|
|            317|
|            666|
|              0|
+---------------+
only showing top 10 rows



In [52]:
# Append a single separator space to the piece count.
# The value is rebuilt from the integer `piece_count` column instead of from
# `piece_count_str` itself, so re-running this cell cannot stack additional
# trailing spaces. Rows with a null piece count stay null here, because
# concat() returns null when any of its inputs is null.
cargo_desc = cargo_desc.withColumn(
    "piece_count_str",
    concat(col("piece_count").cast("string"), lit(" ")),
)

In [53]:
# Verify the trailing space was appended; null rows should still be null.
cargo_desc.select(col('piece_count_str')).show(n=10)

+---------------+
|piece_count_str|
+---------------+
|            18 |
|            16 |
|           null|
|             9 |
|            19 |
|           null|
|           242 |
|           317 |
|           666 |
|             0 |
+---------------+
only showing top 10 rows



In [54]:
# Replace null piece-count strings with an empty string so the later concat
# does not produce null for rows that lack a piece count.
cargo_desc = cargo_desc.fillna('', subset=['piece_count_str'])

In [55]:
# Verify the former null rows are now empty strings.
cargo_desc.select(col('piece_count_str')).show(n=10)

+---------------+
|piece_count_str|
+---------------+
|            18 |
|            16 |
|               |
|             9 |
|            19 |
|               |
|           242 |
|           317 |
|           666 |
|             0 |
+---------------+
only showing top 10 rows



In [56]:
# Prefix each description with its piece-count string (empty when the count
# was missing).
# NOTE(review): concat() returns null if description_text is null, so such rows
# get a null full_description_text even when a piece count exists — confirm
# that this is the intended behaviour.
piece_prefix = col("piece_count_str")
cargo_desc = cargo_desc.withColumn(
    "full_description_text",
    concat(piece_prefix, col("description_text")),
)

In [57]:
# Inspect the combined "<piece count> <description>" text (first 10 rows).
cargo_desc.select(col('full_description_text')).show(n=10)

+---------------------+
|full_description_text|
+---------------------+
|           18 ANTENNA|
|           16 ANTENNA|
| DECORATIVE TERRAC...|
| 9 CHRISTMAS ORNAM...|
| 19 DECORATIVE HAN...|
|            WOOD BOWL|
| 242 WOOD BARSTOOL...|
| 317 HAMPER SEAGRA...|
|         666 PU BELTS|
| 0 ENGINEERED FLOO...|
+---------------------+
only showing top 10 rows



In [60]:
# Strip leading and trailing whitespace from the combined description.
# The pattern is a raw string: in a normal string literal `\s` is an invalid
# Python escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
# The regex passed to Spark is unchanged.
cargo_desc = cargo_desc.withColumn(
    "full_description_text",
    regexp_replace(col('full_description_text'), r'^\s+|\s+$', '')
)

In [61]:
# Inspect the trimmed descriptions (first 10 rows). show() right-aligns the
# values, so the padding in the printed table is not part of the data.
cargo_desc.select(col('full_description_text')).show(n=10)

+---------------------+
|full_description_text|
+---------------------+
|           18 ANTENNA|
|           16 ANTENNA|
| DECORATIVE TERRAC...|
| 9 CHRISTMAS ORNAM...|
| 19 DECORATIVE HAN...|
|            WOOD BOWL|
| 242 WOOD BARSTOOL...|
| 317 HAMPER SEAGRA...|
|         666 PU BELTS|
| 0 ENGINEERED FLOO...|
+---------------------+
only showing top 10 rows

