In [0]:
# Load the sales CSV into a Spark DataFrame and preview the first five rows.
file_location = "/FileStore/tables/sales.csv"
file_type = "csv"

# Reader settings: infer column types from the data, treat the first line
# as the header row, and split fields on commas.
read_options = {'inferschema': 'true', 'header': 'true', 'sep': ','}

sales_df = (
    spark.read
    .format(file_type)
    .options(**read_options)
    .load(file_location)
)
display(sales_df.take(5))

ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,PRODUCTLINE,MSRP,PRODUCTCODE,DEALSIZE,CUSTOMERID
10107,30,95.7,2,2871.0,2/24/2003 0:00,Shipped,1,2,2003,Motorcycles,95,S10_1678,Small,1
10121,34,81.35,5,2765.9,5/7/2003 0:00,Shipped,2,5,2003,Motorcycles,95,S10_1678,Small,2
10134,41,94.74,2,3884.34,7/1/2003 0:00,Shipped,3,7,2003,Motorcycles,95,S10_1678,Medium,3
10145,45,83.26,6,3746.7,8/25/2003 0:00,Shipped,3,8,2003,Motorcycles,95,S10_1678,Medium,4
10159,49,100.0,14,5205.27,10/10/2003 0:00,Shipped,4,10,2003,Motorcycles,95,S10_1678,Medium,5


### Rename columns to lower case

In [0]:
# Lower-case every column name of sales_df.
# `columns` keeps the original names and `columns_to_lower` the
# (original, lower-cased) pairs, in the DataFrame's column order.
columns = sales_df.columns
columns_to_lower = [(name, name.lower()) for name in columns]

# Rename all columns in a single positional toDF() call rather than one
# withColumnRenamed() per column: this adds one projection to the plan
# instead of one per column, and does not depend on name resolution.
sales_df = sales_df.toDF(*[lowered for _, lowered in columns_to_lower])

sales_df.printSchema()

root
 |-- ordernumber: integer (nullable = true)
 |-- quantityordered: integer (nullable = true)
 |-- priceeach: double (nullable = true)
 |-- orderlinenumber: integer (nullable = true)
 |-- sales: double (nullable = true)
 |-- orderdate: string (nullable = true)
 |-- status: string (nullable = true)
 |-- qtr_id: integer (nullable = true)
 |-- month_id: integer (nullable = true)
 |-- year_id: integer (nullable = true)
 |-- productline: string (nullable = true)
 |-- msrp: integer (nullable = true)
 |-- productcode: string (nullable = true)
 |-- dealsize: string (nullable = true)
 |-- customerid: integer (nullable = true)



### Change datatype of Sales column to decimal

In [0]:
# Replace the `sales` column with the same values cast to decimal(9,3).
sales_as_decimal = sales_df['sales'].cast('decimal(9,3)')
sales_df = sales_df.withColumn('sales', sales_as_decimal)

### Pivot total sales by year grouped by productline

In [0]:
# Sum `sales` per product line, pivoted on `year_id` (no explicit pivot
# values are given here).
sales_by_productline = sales_df.groupby('productline')
yearly_sales_totals = sales_by_productline.pivot('year_id').sum('sales')
display(yearly_sales_totals)

productline,2003,2004,2005
Motorcycles,370895.58,560545.23,234947.53
Vintage Cars,650987.76,911423.77,340739.31
Ships,244821.09,341437.97,128178.07
Trucks and Buses,420429.93,529302.89,178057.02
Classic Cars,1484785.29,1762257.09,672573.28
Trains,72802.29,116523.85,36917.33
Planes,272257.6,502671.8,200074.17


### Explicitly pass pivot column values

In [0]:
from pyspark.sql.functions import lit
# Count rows per year for product lines whose name contains 'Cars'.
# The pivot values are passed explicitly, so only 2003 and 2004 become columns.
is_car_line = sales_df['productline'].contains('Cars')
car_sales = sales_df.filter(is_car_line)
car_counts_by_year = (
    car_sales
    .groupby('productline')
    .pivot('year_id', [2003, 2004])
    .count()
)
display(car_counts_by_year)

productline,2003,2004
Vintage Cars,221,284
Classic Cars,366,442
