In [6]:
from pyspark import sql
from lib import common_functions
from lib import configuration
import uuid

In [2]:
# Obtain the Spark session from the project helper, passing 'dp203'
# (presumably the application name — confirm in lib.common_functions).
spark = common_functions.get_spark_session('dp203')
# Last expression of the cell, so Jupyter renders whatever active() returns.
spark.active()

Variables

In [7]:
# Per-year input files: the configured input folder plus a fixed file name.
twenty_n_path, twenty_t_path, twenty_to_path = (
    configuration.dp203_input_path + file_name
    for file_name in ('/2019.csv', '/2020.csv', '/2021.csv')
)

# Random UUID that serves as a unique folder name.
folderName = uuid.uuid4()

# Show the generated identifier as the cell output.
folderName

UUID('4e605fd1-4786-41c3-b1f7-86a45b497c3d')

Dataframes

In [None]:
# Import only the type names this cell uses. The previous wildcard imports
# (especially `from pyspark.sql.functions import *`) silently replaced Python
# builtins such as sum/min/max/round for every later cell.
from pyspark.sql.types import (
    DateType,
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Explicit schema for the sales-order CSV files, so Spark does not have to
# infer column types from the data.
# NOTE(review): FloatType is single precision; DoubleType or DecimalType may be
# a better fit for UnitPrice/Tax. Left unchanged here because it would alter
# the schema of the Parquet output written later in the notebook.
orderSchema = StructType([
    StructField("SalesOrderNumber", StringType()),
    StructField("SalesOrderLineNumber", IntegerType()),
    StructField("OrderDate", DateType()),
    StructField("CustomerName", StringType()),
    StructField("Email", StringType()),
    StructField("Item", StringType()),
    StructField("Quantity", IntegerType()),
    StructField("UnitPrice", FloatType()),
    StructField("Tax", FloatType())
    ])

# Read every CSV file under the input folder into one DataFrame; the first
# line of each file is treated as a header row.
df = (
    spark.read
    .option("header", True)
    .load(f'{configuration.dp203_input_path}/*.csv', format='csv', schema=orderSchema)
)

# Preview the first rows (printed to stdout) ...
df.show()
# ... then leave the row count as the cell's last expression so it is actually
# displayed. Previously count() ran a full Spark job and its result was dropped.
df.count()

+----------------+--------------------+----------+-------------------+--------------------+--------------------+--------+---------+--------+
|SalesOrderNumber|SalesOrderLineNumber| OrderDate|       CustomerName|               Email|                Item|Quantity|UnitPrice|     Tax|
+----------------+--------------------+----------+-------------------+--------------------+--------------------+--------+---------+--------+
|         SO49171|                   1|2021-01-01|      Mariah Foster|mariah21@adventur...|  Road-250 Black, 48|       1|2181.5625| 174.525|
|         SO49172|                   1|2021-01-01|       Brian Howard|brian23@adventure...|    Road-250 Red, 44|       1|  2443.35| 195.468|
|         SO49173|                   1|2021-01-01|      Linda Alvarez|linda19@adventure...|Mountain-200 Silv...|       1|2071.4197|165.7136|
|         SO49174|                   1|2021-01-01|     Gina Hernandez|gina4@adventure-w...|Mountain-200 Silv...|       1|2071.4197|165.7136|
|         SO4

32718

In [5]:
# Print the DataFrame's column names, types and nullability as a tree.
df.printSchema()

root
 |-- SalesOrderNumber: string (nullable = true)
 |-- SalesOrderLineNumber: integer (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: float (nullable = true)
 |-- Tax: float (nullable = true)



In [10]:
from pyspark.sql.functions import split, col

# Column expression holding CustomerName broken up on single spaces.
# NOTE(review): only the first two tokens are used below; confirm how names
# with middle names or a single word should be handled.
name_parts = split(col("CustomerName"), " ")

# Derive FirstName / LastName from the split name, then discard the
# original CustomerName column.
transformed_df = (
    df
    .withColumn("FirstName", name_parts.getItem(0))
    .withColumn("LastName", name_parts.getItem(1))
    .drop("CustomerName")
)

# Preview the first five transformed rows.
transformed_df.show(n=5)

+----------------+--------------------+----------+--------------------+--------------------+--------+---------+--------+---------+---------+
|SalesOrderNumber|SalesOrderLineNumber| OrderDate|               Email|                Item|Quantity|UnitPrice|     Tax|FirstName| LastName|
+----------------+--------------------+----------+--------------------+--------------------+--------+---------+--------+---------+---------+
|         SO49171|                   1|2021-01-01|mariah21@adventur...|  Road-250 Black, 48|       1|2181.5625| 174.525|   Mariah|   Foster|
|         SO49172|                   1|2021-01-01|brian23@adventure...|    Road-250 Red, 44|       1|  2443.35| 195.468|    Brian|   Howard|
|         SO49173|                   1|2021-01-01|linda19@adventure...|Mountain-200 Silv...|       1|2071.4197|165.7136|    Linda|  Alvarez|
|         SO49174|                   1|2021-01-01|gina4@adventure-w...|Mountain-200 Silv...|       1|2071.4197|165.7136|     Gina|Hernandez|
|         SO4

In [11]:
# Build the destination with a single f-string. The previous form mixed an
# f-string with the % operator, so any literal '%' in the configured output
# path would have been parsed as a format specifier and raised (or mangled
# the path).
output_path = f'{configuration.dp203_output_path}/10/{folderName}'

# Write the transformed rows as Parquet, replacing anything already at that path.
transformed_df.write.mode("overwrite").parquet(output_path)
print(f"Transformed data saved in {folderName}!")

Transformed data saved in 4e605fd1-4786-41c3-b1f7-86a45b497c3d!
