In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .config("spark.jars.packages", "com.crealytics:spark-excel_2.12:0.13.5") \
    .getOrCreate()

In [0]:
from pyspark.sql.types import StructField, StructType, StringType

In [0]:
df = spark.read.format("com.crealytics.spark.excel") \
    .option("inferSchema", True) \
    .option("header", True) \
    .option("dataAddress", "'Dirty 3'!A2") \
    .option("treatEmptyValuesAsNulls", "false")\
    .load("/FileStore/tables/3__Badly_Structured_Sales_Data_3-1.xlsx")

display(df)

Segment,Consumer1,Corporate2,Home Office3,Consumer4,Corporate5,Home Office6,Consumer7,Corporate8,Home Office9,Consumer10,Corporate11,Home Office12
Order Date,,,,,,,,,,,,
14-Mar-13,,,,,,,,,,,,91.056
16-Dec-13,,,,,,,129.44,,,,,
2-Jun-13,,,,,,,,,,605.47,,
21-Oct-13,,,,,,,,,,,788.86,
27-Aug-13,,,,,,,13.36,,,,,
28-Nov-13,,,,,,,,542.3399999999999,,,,
31-Mar-13,,,,,,,,,,,1.8690000000000004,
21-Nov-13,,,,,,,,,,,865.5,
1-Nov-13,,,,,,,,,,,1044.4399999999998,


In [0]:
df.printSchema()

root
 |-- Segment: string (nullable = true)
 |-- Consumer1: string (nullable = true)
 |-- Corporate2: string (nullable = true)
 |-- Home Office3: string (nullable = true)
 |-- Consumer4: string (nullable = true)
 |-- Corporate5: string (nullable = true)
 |-- Home Office6: string (nullable = true)
 |-- Consumer7: double (nullable = true)
 |-- Corporate8: double (nullable = true)
 |-- Home Office9: string (nullable = true)
 |-- Consumer10: double (nullable = true)
 |-- Corporate11: double (nullable = true)
 |-- Home Office12: double (nullable = true)



In [0]:
renamed_dict = {
    "Segment": "Order Date",
    "Consumer1": "First Class Consumer",
    "Corporate2": "First Class Corporate",
    "Home Office3": "First Class Home_Office",
    "Consumer4": "Same Day Consumer",
    "Corporate5": "Same Day Corporate",
    "Home Office6": "Same Day Home_Office",
    "Consumer7": "Second Class Consumer",
    "Corporate8": "Second Class Corporate",
    "Home Office9": "Second Class Home_Office",
    "Consumer10": "Standard Class Consumer",
    "Corporate11": "Standard Class Corporate",
    "Home Office12": "Standard Class Home_Office",
}

for old_name, new_name in renamed_dict.items():
    df = df.withColumnRenamed(old_name, new_name)

In [0]:
display(df)

Order Date,First Class Consumer,First Class Corporate,First Class Home_Office,Same Day Consumer,Same Day Corporate,Same Day Home_Office,Second Class Consumer,Second Class Corporate,Second Class Home_Office,Standard Class Consumer,Standard Class Corporate,Standard Class Home_Office
Order Date,,,,,,,,,,,,
14-Mar-13,,,,,,,,,,,,91.056
16-Dec-13,,,,,,,129.44,,,,,
2-Jun-13,,,,,,,,,,605.47,,
21-Oct-13,,,,,,,,,,,788.86,
27-Aug-13,,,,,,,13.36,,,,,
28-Nov-13,,,,,,,,542.3399999999999,,,,
31-Mar-13,,,,,,,,,,,1.8690000000000004,
21-Nov-13,,,,,,,,,,,865.5,
1-Nov-13,,,,,,,,,,,1044.4399999999998,


In [0]:
from pyspark.sql import functions as F

In [0]:
melted_df = (
    df.select(
        "Order Date",
        F.lit("First Class Consumer").alias("Ship Mode and Segment"),
        F.col("First Class Consumer").alias("Total")
    ).union(
        df.select(
            "Order Date",
            F.lit("First Class Corporate").alias("Ship Mode and Segment"),
            F.col("First Class Corporate").alias("Total")
        )
    ).union(
        df.select(
            "Order Date",
            F.lit("First Class Home_Office").alias("Ship Mode and Segment"),
            F.col("First Class Home_Office").alias("Total")
        )
    ).union(
        df.select(
            "Order Date",
            F.lit("Second Class Consumer").alias("Ship Mode and Segment"),
            F.col("Second Class Consumer").alias("Total")
        )
    ).union(
        df.select(
            "Order Date",
            F.lit("Second Class Corporate").alias("Ship Mode and Segment"),
            F.col("Second Class Corporate").alias("Total")
        )
    ).union(
        df.select(
            "Order Date",
            F.lit("Second Class Home_Office").alias("Ship Mode and Segment"),
            F.col("Second Class Home_Office").alias("Total")
        )
    ).union(
        df.select(
            "Order Date",
            F.lit("Same Day Consumer").alias("Ship Mode and Segment"),
            F.col("Same Day Consumer").alias("Total")
        )
    ).union(
        df.select(
            "Order Date",
            F.lit("Same Day Corporate").alias("Ship Mode and Segment"),
            F.col("Same Day Corporate").alias("Total")
        )
    ).union(
        df.select(
            "Order Date",
            F.lit("Same Day Home_Office").alias("Ship Mode and Segment"),
            F.col("Same Day Home_Office").alias("Total")
        )
    ).union(
        df.select(
            "Order Date",
            F.lit("Standard Class Consumer").alias("Ship Mode and Segment"),
            F.col("Standard Class Consumer").alias("Total")
        )
    ).union(
        df.select(
            "Order Date",
            F.lit("Standard Class Corporate").alias("Ship Mode and Segment"),
            F.col("Standard Class Corporate").alias("Total")
        )
    ).union(
        df.select(
            "Order Date",
            F.lit("Standard Class Home_Office").alias("Ship Mode and Segment"),
            F.col("Standard Class Home_Office").alias("Total")
        )
    )
)

In [0]:
melted_df.filter(melted_df["Ship Mode and Segment"] == "Second Class Consumer")

melted_df.show(10)

+----------+---------------------+-----+
|Order Date|Ship Mode and Segment|Total|
+----------+---------------------+-----+
|Order Date| First Class Consumer| null|
| 14-Mar-13| First Class Consumer| null|
| 16-Dec-13| First Class Consumer| null|
|  2-Jun-13| First Class Consumer| null|
| 21-Oct-13| First Class Consumer| null|
| 27-Aug-13| First Class Consumer| null|
| 28-Nov-13| First Class Consumer| null|
| 31-Mar-13| First Class Consumer| null|
| 21-Nov-13| First Class Consumer| null|
|  1-Nov-13| First Class Consumer| null|
+----------+---------------------+-----+
only showing top 10 rows



In [0]:
first_row = melted_df.limit(1)
melted_df = melted_df.subtract(first_row)

In [0]:
melted_df = melted_df.dropna(subset=["Total"])

In [0]:
melted_df.printSchema()

root
 |-- Order Date: string (nullable = true)
 |-- Ship Mode and Segment: string (nullable = false)
 |-- Total: string (nullable = true)



In [0]:
melted_df.show()

+----------+---------------------+-------+
|Order Date|Ship Mode and Segment|  Total|
+----------+---------------------+-------+
| 24-Dec-13| First Class Consumer|  9.568|
|  5-Sep-13| First Class Consumer| 31.984|
| 15-Aug-13| First Class Consumer|  243.6|
|  7-Apr-13| First Class Consumer|   8.96|
| 15-Jan-13| First Class Consumer| 149.95|
| 19-May-13| First Class Consumer|   34.2|
| 23-Mar-13| First Class Consumer|  9.912|
| 12-Aug-13| First Class Consumer| 286.65|
| 18-Apr-14| First Class Consumer|  106.5|
| 30-Apr-13| First Class Consumer|1000.95|
| 23-Mar-14| First Class Consumer|  59.48|
| 23-Dec-14| First Class Consumer| 194.32|
|  5-Jul-13| First Class Consumer| 514.03|
| 21-Nov-14| First Class Consumer| 18.176|
| 30-Dec-13| First Class Consumer| 39.128|
| 22-Nov-14| First Class Consumer| 32.985|
| 30-Nov-14| First Class Consumer|440.144|
|  2-Nov-14| First Class Consumer| 197.72|
| 11-Dec-14| First Class Consumer| 196.62|
| 10-Dec-15| First Class Consumer|1056.86|
+----------

In [0]:
melted_df = melted_df.withColumn('Order Date', F.to_date(melted_df['Order Date'], 'd-MMM-yy'))

In [0]:
from pyspark.sql.functions import split,expr

In [0]:
melted_df = melted_df.withColumn("split_array", split(melted_df["Ship Mode and Segment"], " "))

In [0]:
melted_df = melted_df.withColumn("Segment", expr("split_array[size(split_array) - 1]"))

In [0]:
melted_df = melted_df.withColumn("Ship Mode", expr("concat_ws(' ', slice(split_array, 1, size(split_array)-1) )"))

In [0]:
melted_df.show(10)

+----------+---------------------+-------+--------------------+--------+-----------+
|Order Date|Ship Mode and Segment|  Total|         split_array| Segment|  Ship Mode|
+----------+---------------------+-------+--------------------+--------+-----------+
|2013-12-24| First Class Consumer|  9.568|[First, Class, Co...|Consumer|First Class|
|2013-09-05| First Class Consumer| 31.984|[First, Class, Co...|Consumer|First Class|
|2013-08-15| First Class Consumer|  243.6|[First, Class, Co...|Consumer|First Class|
|2013-04-07| First Class Consumer|   8.96|[First, Class, Co...|Consumer|First Class|
|2013-01-15| First Class Consumer| 149.95|[First, Class, Co...|Consumer|First Class|
|2013-05-19| First Class Consumer|   34.2|[First, Class, Co...|Consumer|First Class|
|2013-03-23| First Class Consumer|  9.912|[First, Class, Co...|Consumer|First Class|
|2013-08-12| First Class Consumer| 286.65|[First, Class, Co...|Consumer|First Class|
|2014-04-18| First Class Consumer|  106.5|[First, Class, Co...|Co

In [0]:
melted_df = melted_df.drop('Ship Mode and Segment','split_array')

In [0]:
melted_df.show()

+----------+-------+--------+-----------+
|Order Date|  Total| Segment|  Ship Mode|
+----------+-------+--------+-----------+
|2013-12-24|  9.568|Consumer|First Class|
|2013-09-05| 31.984|Consumer|First Class|
|2013-08-15|  243.6|Consumer|First Class|
|2013-04-07|   8.96|Consumer|First Class|
|2013-01-15| 149.95|Consumer|First Class|
|2013-05-19|   34.2|Consumer|First Class|
|2013-03-23|  9.912|Consumer|First Class|
|2013-08-12| 286.65|Consumer|First Class|
|2014-04-18|  106.5|Consumer|First Class|
|2013-04-30|1000.95|Consumer|First Class|
|2014-03-23|  59.48|Consumer|First Class|
|2014-12-23| 194.32|Consumer|First Class|
|2013-07-05| 514.03|Consumer|First Class|
|2014-11-21| 18.176|Consumer|First Class|
|2013-12-30| 39.128|Consumer|First Class|
|2014-11-22| 32.985|Consumer|First Class|
|2014-11-30|440.144|Consumer|First Class|
|2014-11-02| 197.72|Consumer|First Class|
|2014-12-11| 196.62|Consumer|First Class|
|2015-12-10|1056.86|Consumer|First Class|
+----------+-------+--------+-----

In [0]:
melted_df[melted_df["Total"] == 149.95].show()

+----------+------+--------+-----------+
|Order Date| Total| Segment|  Ship Mode|
+----------+------+--------+-----------+
|2013-01-15|149.95|Consumer|First Class|
+----------+------+--------+-----------+

