# Total Invoice Amount Distribution

In [0]:
#Create SparkSession for using Spark Structured APIs
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Build (or reuse) the Spark session through the fluent builder API.
spark = (
    SparkSession.builder
    .appName("DataBricks Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Load the source table into the DataFrame used by the cells below.
df = spark.sql("SELECT * FROM online_retail_ii_csv")

In [0]:
# Invoice amount = sum over the invoice's rows of quantity * price.
# Summing price alone would ignore how many units each row represents.

# SQL version, stored under its own name so it is not overwritten by the
# DataFrame version below.
total_invoice_sql = spark.sql(
    "SELECT invoice, SUM(quantity * price) AS Total "
    "FROM online_retail_ii_csv GROUP BY invoice"
)

# DataFrame API version: one row per invoice with columns (invoice, Total).
total_invoice = (
    df.withColumn("Amount", col("quantity") * col("price"))
    .groupBy("invoice")
    .sum("Amount")
    .withColumnRenamed("sum(Amount)", "Total")
)

# Preview the first raw rows of the source DataFrame.
display(df.take(10))

Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer_ID,Country
489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01T07:45:00.000+0000,6.95,13085,United Kingdom
489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01T07:45:00.000+0000,6.75,13085,United Kingdom
489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01T07:45:00.000+0000,6.75,13085,United Kingdom
489434,22041,"""RECORD FRAME 7"""" SINGLE SIZE """,48,2009-12-01T07:45:00.000+0000,2.1,13085,United Kingdom
489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01T07:45:00.000+0000,1.25,13085,United Kingdom
489434,22064,PINK DOUGHNUT TRINKET POT,24,2009-12-01T07:45:00.000+0000,1.65,13085,United Kingdom
489434,21871,SAVE THE PLANET MUG,24,2009-12-01T07:45:00.000+0000,1.25,13085,United Kingdom
489434,21523,FANCY FONT HOME SWEET HOME DOORMAT,10,2009-12-01T07:45:00.000+0000,5.95,13085,United Kingdom
489435,22350,CAT BOWL,12,2009-12-01T07:46:00.000+0000,2.55,13085,United Kingdom
489435,22349,"DOG BOWL , CHASING BALL DESIGN",12,2009-12-01T07:46:00.000+0000,3.75,13085,United Kingdom


# Monthly Placed and Canceled Orders

In [0]:
# Month key built from the leading year and month characters of invoicedate
# (yields values such as "2009-12").
month_key = concat(col('invoicedate').substr(0, 5), col('invoicedate').substr(6, 2))

# All invoices per month: collapse to one row per (month, invoice) first,
# then count those rows per month.
monthly_placed_orders = (
    df.withColumn('Monthly', month_key)
    .groupBy('Monthly', 'invoice').count()
    .groupBy('Monthly').count()
    .withColumnRenamed('count', 'PlacedBefore')
)

monthly_placed_orders.show()

# Cancelled invoices per month: same counting, restricted to invoice
# numbers that start with "C".
monthly_cancelled_orders = (
    df.withColumn('Monthly', month_key)
    .filter(col('invoice').startswith("C"))
    .groupBy('invoice', 'Monthly').count()
    .groupBy('Monthly').count()
    .orderBy('Monthly')
    .withColumnRenamed('count', 'Cancelled')
)

monthly_cancelled_orders.show()

# Placed = all invoices - 2 * Cancelled.
# NOTE(review): formula kept from the original cell; presumably one
# subtraction removes the "C" invoice itself and the other the order it
# cancels - confirm against the dataset description.
#
# A left join keeps months that have no cancelled invoices at all (an inner
# join would silently drop them); their missing Cancelled count becomes 0.
monthly_orders = (
    monthly_placed_orders
    .join(monthly_cancelled_orders, on='Monthly', how='left')
    .fillna(0, subset=['Cancelled'])
    .withColumn('Placed', col('PlacedBefore') - (2 * col('Cancelled')))
    .orderBy('Monthly')
)

# Select desired columns to show
display(monthly_orders.select('Monthly', 'Placed', 'Cancelled'))



Monthly,Placed,Cancelled
2010-01,65,17
2009-12,1528,401


# Monthly Sales

In [0]:
# Monthly sales: tag each row with its month and its line amount
# (quantity * price), total the amounts per month, then round to 2 decimals.
sales_month = concat(col('invoicedate').substr(0,5), col('invoicedate').substr(6,2))
line_amount = col('quantity') * col('price')

rows_with_amount = df.withColumn('Monthly', sales_month).withColumn('Amount', line_amount)

monthly_sales = (
    rows_with_amount
    .groupBy('Monthly')
    .sum('Amount')
    .orderBy('Monthly')
    .withColumnRenamed('sum(Amount)', 'Total_Amount')
)

# `round` here is the Spark column function brought in by the wildcard import.
monthly_sales = monthly_sales.withColumn('Total_Amount', round(col('Total_Amount'), 2))
monthly_sales.show()

# Monthly Active Users

In [0]:
# Active users per month: reduce to one row per (month, customer), then
# count those rows per month.
# NOTE(review): rows with a null customer_id are not filtered out here, so
# they would form their own group and be counted as a user - confirm intended.
active_month = concat(col('invoicedate').substr(0,5), col('invoicedate').substr(6,2)).cast("DATE")

customers_per_month = (
    df.withColumn('Monthly', active_month)
    .groupBy('Monthly', "customer_id")
    .count()
)

monthly_active_users = (
    customers_per_month
    .groupBy('Monthly')
    .count()
    .withColumnRenamed('count', 'active_users')
)

monthly_active_users.show()

# New and Existing Users

In [0]:
# Add a Monthly column (cast to DATE) to the df
monthly_df = df.withColumn('Monthly', concat(col('invoicedate').substr(0,5), col('invoicedate').substr(6,2)).cast("DATE"))

# New users per month: take each customer's earliest month, then count
# customers per earliest month.
# `min` here is the Spark aggregate brought in by the wildcard import.
# NOTE(review): null customer_id values are not filtered here either -
# confirm whether they should count as a user.
first_month_per_customer = monthly_df.groupBy('customer_id').agg(min('Monthly').alias('Monthly'))
monthly_users_joined_date_df = (
    first_month_per_customer
    .groupBy('Monthly')
    .count()
    .orderBy('Monthly')
    .withColumnRenamed('count', 'new_users')
)
monthly_users_joined_date_df.show()

# Existing users = active users - new users, per month.
# A left join keeps months in which no customer appeared for the first time
# (an inner join would drop those months); their new_users count becomes 0.
monthly_users = (
    monthly_active_users
    .join(monthly_users_joined_date_df, on='Monthly', how='left')
    .fillna(0, subset=['new_users'])
    .withColumn('existing_users', col('active_users') - col('new_users'))
)
monthly_users = monthly_users.select('Monthly', 'new_users', 'existing_users').orderBy('Monthly')

monthly_users.show()

# RFM Segmentation

In [0]:
# --- Frequency: number of distinct invoices per (non-null) customer ---
invoices_per_customer = df.groupBy('customer_id', 'invoice').count()
frequency = (
    invoices_per_customer
    .groupBy('customer_id')
    .count()
    .orderBy('customer_id')
    .where(col('customer_id').isNotNull())
    .withColumnRenamed('count', 'frequency')
)

# --- Monetary: total of price * quantity per (non-null) customer, 2 decimals ---
rows_with_amount = df.withColumn('Amount', col('price') * col('quantity'))
monetary = (
    rows_with_amount
    .groupBy('customer_id')
    .sum('Amount')
    .withColumnRenamed('sum(Amount)', 'monetary')
    .withColumn('monetary', round('monetary', 2))
    .where(col('customer_id').isNotNull())
    .orderBy('customer_id')
)

# --- Recency: whole months between today and the customer's latest Monthly value ---
# `max` and `round` are the Spark functions brought in by the wildcard import.
invoice_month = concat(col('invoicedate').substr(0,5), col('invoicedate').substr(6,2)).cast("DATE")
latest_month_per_customer = (
    df.withColumn('Monthly', invoice_month)
    .groupBy('customer_id')
    .agg(max('Monthly'))
)
months_since_latest = months_between(current_date() , col('max(Monthly)'))
recency = latest_month_per_customer.withColumn('recency', round(months_since_latest, 0))

# --- Merge recency, frequency and monetary on customer_id ---
recency_frequency = recency.join(
    frequency,
    frequency['customer_id'] == recency['customer_id'],
    "inner",
)
rfm = recency_frequency.drop(recency['customer_id']).drop('max(Monthly)')

rfm_monetary = rfm.join(
    monetary,
    monetary['customer_id'] == rfm['customer_id'],
    "inner",
)
# `rfm` on the right-hand side still refers to the pre-join frame, so this
# drops that frame's customer_id and keeps the one coming from `monetary`.
rfm = rfm_monetary.drop(rfm['customer_id'])

display(rfm.select('customer_id', 'recency','frequency','monetary').take(20))


customer_id,recency,frequency,monetary
13623,144.0,2,634.09
17679,144.0,4,451.0
17389,144.0,1,908.04
18051,143.0,3,1010.56
13289,144.0,1,307.95
15967,144.0,2,134.31
15254,144.0,1,-14.95
12471,144.0,12,3148.67
17172,144.0,1,336.82
14514,144.0,2,852.53
