In [0]:
from pyspark.sql import SparkSession, DataFrame

# Build (or reuse) the Spark session for this notebook. The spark-excel package
# is requested through spark.jars.packages so the Excel data source used by the
# reader below can be resolved.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "com.crealytics:spark-excel_2.12:0.13.5")
    .appName("badStructuredDataCleaning")
    .getOrCreate()
)

In [0]:
# Read the 'Dirty 1' sheet starting at cell A2. The first row of that range is
# used as the header, and schema inference is disabled so every value is kept
# as a string.
excel_reader = spark.read.format("com.crealytics.spark.excel").options(
    inferSchema="false",
    header="true",
    dataAddress="'Dirty 1'!A2",
)
df = excel_reader.load("/FileStore/tables/1__Badly_Structured_Sales_Data_1.xlsx")

In [0]:
# Preview the freshly loaded frame to inspect the raw column layout.
display(df)

Ship Mode>>,First Class1,Same Day2,Second Class3,Standard Class4,First Class5,Same Day6,Second Class7,Standard Class8,First Class9,Same Day10,Second Class11,Standard Class12
Order ID,,,,,,,,,,,,
CA-2011-100293,,,,,,,,,,,,91.056
CA-2011-100706,,,129.44,,,,,,,,,
CA-2011-100895,,,,605.47,,,,,,,,
CA-2011-100916,,,,,,,,788.86,,,,
CA-2011-101266,,,13.36,,,,,,,,,
CA-2011-101560,,,,,,,542.34,,,,,
CA-2011-101770,,,,,,,,1.869,,,,
CA-2011-102274,,,,,,,,865.5,,,,
CA-2011-102673,,,,,,,,1044.44,,,,


In [0]:
# Print column names and types; with inferSchema disabled at load time,
# every column comes back as a string.
df.printSchema()

root
 |-- Ship Mode>>: string (nullable = true)
 |-- First Class1: string (nullable = true)
 |-- Same Day2: string (nullable = true)
 |-- Second Class3: string (nullable = true)
 |-- Standard Class4: string (nullable = true)
 |-- First Class5: string (nullable = true)
 |-- Same Day6: string (nullable = true)
 |-- Second Class7: string (nullable = true)
 |-- Standard Class8: string (nullable = true)
 |-- First Class9: string (nullable = true)
 |-- Same Day10: string (nullable = true)
 |-- Second Class11: string (nullable = true)
 |-- Standard Class12: string (nullable = true)



In [0]:
# Raw header -> clean header. The raw headers are a ship mode followed by a
# positional suffix; the clean ones read "<Segment> <Ship Mode>". The first
# column holds the order identifiers, so it becomes "Order ID".
renamed_dict = dict([
    ("Ship Mode>>", "Order ID"),
    ("First Class1", "Consumer First Class"),
    ("Same Day2", "Consumer Same Day"),
    ("Second Class3", "Consumer Second Class"),
    ("Standard Class4", "Consumer Standard Class"),
    ("First Class5", "Corporate First Class"),
    ("Same Day6", "Corporate Same Day"),
    ("Second Class7", "Corporate Second Class"),
    ("Standard Class8", "Corporate Standard Class"),
    ("First Class9", "Home_Office First Class"),
    ("Same Day10", "Home_Office Same Day"),
    ("Second Class11", "Home_Office Second Class"),
    ("Standard Class12", "Home_Office Standard Class"),
])

# Apply the renames one column at a time, rebinding df after each step.
for raw_header in renamed_dict:
    df = df.withColumnRenamed(raw_header, renamed_dict[raw_header])


In [0]:
# Confirm the columns now carry the "<Segment> <Ship Mode>" names.
df.show()

+--------------+--------------------+-----------------+---------------------+-----------------------+---------------------+------------------+----------------------+------------------------+-----------------------+--------------------+------------------------+--------------------------+
|      Order ID|Consumer First Class|Consumer Same Day|Consumer Second Class|Consumer Standard Class|Corporate First Class|Corporate Same Day|Corporate Second Class|Corporate Standard Class|Home_Office First Class|Home_Office Same Day|Home_Office Second Class|Home_Office Standard Class|
+--------------+--------------------+-----------------+---------------------+-----------------------+---------------------+------------------+----------------------+------------------------+-----------------------+--------------------+------------------------+--------------------------+
|      Order ID|                null|             null|                 null|                   null|                 null|             

In [0]:
# Alternative: Koalas `melt` (noted as applicable to versions >3.2 and <3.4); left commented out for reference.
# df.to_koalas() \
#     .melt(id_vars=['Order ID'], value_vars=["Consumer First Class", "Consumer Same Day", "Consumer Second Class", "Consumer Standard Class", "Corporate First Class", "Corporate Same Day", "Corporate Second Class", "Corporate Standard Class", "Home_Office First Class", "Home_Office Same Day", "Home_Office Second Class", "Home_Office Standard Class"], var_name="Segment and Ship Mode", value_name="Sales") \
#     .to_spark() \
#     .show()

In [0]:
from typing import Iterable
from pyspark.sql import DataFrame
from pyspark.sql.functions import array, explode, lit, col, struct

In [0]:
def melt(df: DataFrame, id_vars: Iterable[str], value_vars: Iterable[str], var_name: str="variable", value_name: str="value") -> DataFrame:
    """Convert a :class:`DataFrame` from wide to long format.

    Each column named in ``value_vars`` is turned into rows: for every input
    row, one output row is produced per value column, carrying that column's
    name in ``var_name`` and its value in ``value_name``.

    Args:
        df: Wide-format DataFrame to unpivot.
        id_vars: Column name(s) kept unchanged on every output row. Any
            iterable of names is accepted; a single string is treated as one
            column name.
        value_vars: Column name(s) to unpivot. Any iterable of names is
            accepted; a single string is treated as one column name.
        var_name: Name of the output column holding the source column name.
        value_name: Name of the output column holding the source column value.

    Returns:
        A DataFrame with the ``id_vars`` columns followed by the ``var_name``
        and ``value_name`` columns.
    """
    # Normalise both arguments to lists. The original concatenated id_vars
    # directly with a list, which fails for tuples/generators, and a bare
    # string would otherwise be iterated character by character.
    id_cols = [id_vars] if isinstance(id_vars, str) else list(id_vars)
    value_cols = [value_vars] if isinstance(value_vars, str) else list(value_vars)

    # Build array<struct<var_name, value_name>>: one struct per value column,
    # pairing the column's name (as a literal) with its value.
    # NOTE(review): array() presumably needs all value columns to share a
    # compatible type - confirm before using on mixed-type columns.
    _vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_cols
    ))

    # explode() emits one row per struct in the array.
    _tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))

    # Keep the id columns and pull the two struct fields out as top-level columns.
    cols = id_cols + [
        col("_vars_and_vals")[x].alias(x) for x in (var_name, value_name)
    ]
    return _tmp.select(*cols)  # * unpacks the contents of cols

In [0]:
# Unpivot every "<Segment> <Ship Mode>" column into a (label, Sales) pair,
# keeping "Order ID" as the identifier on each resulting row.
sales_columns = [
    "Consumer First Class",
    "Consumer Same Day",
    "Consumer Second Class",
    "Consumer Standard Class",
    "Corporate First Class",
    "Corporate Same Day",
    "Corporate Second Class",
    "Corporate Standard Class",
    "Home_Office First Class",
    "Home_Office Same Day",
    "Home_Office Second Class",
    "Home_Office Standard Class",
]
melted_df = melt(
    df,
    id_vars=["Order ID"],
    value_vars=sales_columns,
    var_name="Segment and Ship Mode",
    value_name="Sales",
)

In [0]:
# Discard the rows produced by the unpivot that carry no Sales value.
melted_df = melted_df.na.drop(subset=["Sales"])

In [0]:
# Inspect the long-format result: one row per (order, segment/ship-mode label)
# pair that has a Sales value.
melted_df.show()

+--------------+---------------------+-------+
|      Order ID|Segment and Ship Mode|  Sales|
+--------------+---------------------+-------+
|CA-2011-100293| Home_Office Stand...| 91.056|
|CA-2011-100706| Consumer Second C...| 129.44|
|CA-2011-100895| Consumer Standard...| 605.47|
|CA-2011-100916| Corporate Standar...| 788.86|
|CA-2011-101266| Consumer Second C...|  13.36|
|CA-2011-101560| Corporate Second ...| 542.34|
|CA-2011-101770| Corporate Standar...|  1.869|
|CA-2011-102274| Corporate Standar...|  865.5|
|CA-2011-102673| Corporate Standar...|1044.44|
|CA-2011-102988| Corporate Second ...|4251.92|
|CA-2011-103317| Corporate First C...|242.546|
|CA-2011-103366| Consumer First Class| 149.95|
|CA-2011-103807| Corporate Standar...|  21.19|
|CA-2011-103989| Corporate First C...|590.762|
|CA-2011-104283| Consumer Standard...| 616.14|
|CA-2011-106054| Corporate First C...|  12.78|
|CA-2011-106810| Corporate Standar...| 310.88|
|CA-2011-107573| Consumer Standard...| 23.472|
|CA-2011-1078

In [0]:
from pyspark.sql.functions import split

In [0]:
# Split the combined label once, at the first space only (limit of 2 parts),
# so multi-word ship modes such as "Standard Class" stay intact in part two.
label_parts = split(melted_df["Segment and Ship Mode"], " ", 2)
melted_df = (
    melted_df
    .withColumn("Segment", label_parts[0])
    .withColumn("Ship Mode", label_parts[1])
)

In [0]:
# The combined label has been split into its own columns, so remove it.
combined_label_column = "Segment and Ship Mode"
melted_df = melted_df.drop(combined_label_column)

In [0]:
# Verify the combined label was replaced by separate Segment and Ship Mode columns.
melted_df.show()

+--------------+-------+-----------+--------------+
|      Order ID|  Sales|    Segment|     Ship Mode|
+--------------+-------+-----------+--------------+
|CA-2011-100293| 91.056|Home_Office|Standard Class|
|CA-2011-100706| 129.44|   Consumer|  Second Class|
|CA-2011-100895| 605.47|   Consumer|Standard Class|
|CA-2011-100916| 788.86|  Corporate|Standard Class|
|CA-2011-101266|  13.36|   Consumer|  Second Class|
|CA-2011-101560| 542.34|  Corporate|  Second Class|
|CA-2011-101770|  1.869|  Corporate|Standard Class|
|CA-2011-102274|  865.5|  Corporate|Standard Class|
|CA-2011-102673|1044.44|  Corporate|Standard Class|
|CA-2011-102988|4251.92|  Corporate|  Second Class|
|CA-2011-103317|242.546|  Corporate|   First Class|
|CA-2011-103366| 149.95|   Consumer|   First Class|
|CA-2011-103807|  21.19|  Corporate|Standard Class|
|CA-2011-103989|590.762|  Corporate|   First Class|
|CA-2011-104283| 616.14|   Consumer|Standard Class|
|CA-2011-106054|  12.78|  Corporate|   First Class|
|CA-2011-106

In [0]:
from pyspark.sql.functions import regexp_replace, round

In [0]:
# Replace underscores in the segment name with spaces ("Home_Office" -> "Home Office").
segment_with_spaces = regexp_replace(melted_df["Segment"], "_", " ")
melted_df = melted_df.withColumn("Segment", segment_with_spaces)

In [0]:
# Round Sales to 2 decimals and keep the result instead of only printing it.
# Previously the rounded frame was shown and then discarded.
# `round` here is pyspark.sql.functions.round (imported in the cell above),
# which shadows Python's builtin round.
# Sales was loaded as a string (schema inference is off), so cast it to double
# explicitly rather than relying on an implicit conversion inside round().
# melted_df itself is left unchanged; the rounded data lives under a new name.
sales_rounded_df = melted_df.withColumn(
    "Sales", round(melted_df["Sales"].cast("double"), 2)
)
sales_rounded_df.show()

+--------------+-------+-----------+--------------+
|      Order ID|  Sales|    Segment|     Ship Mode|
+--------------+-------+-----------+--------------+
|CA-2011-100293|  91.06|Home Office|Standard Class|
|CA-2011-100706| 129.44|   Consumer|  Second Class|
|CA-2011-100895| 605.47|   Consumer|Standard Class|
|CA-2011-100916| 788.86|  Corporate|Standard Class|
|CA-2011-101266|  13.36|   Consumer|  Second Class|
|CA-2011-101560| 542.34|  Corporate|  Second Class|
|CA-2011-101770|   1.87|  Corporate|Standard Class|
|CA-2011-102274|  865.5|  Corporate|Standard Class|
|CA-2011-102673|1044.44|  Corporate|Standard Class|
|CA-2011-102988|4251.92|  Corporate|  Second Class|
|CA-2011-103317| 242.55|  Corporate|   First Class|
|CA-2011-103366| 149.95|   Consumer|   First Class|
|CA-2011-103807|  21.19|  Corporate|Standard Class|
|CA-2011-103989| 590.76|  Corporate|   First Class|
|CA-2011-104283| 616.14|   Consumer|Standard Class|
|CA-2011-106054|  12.78|  Corporate|   First Class|
|CA-2011-106