# Spark Initialization

In [1]:
# Import findspark and initialise it before any pyspark import is attempted.
# NOTE(review): findspark.init() is expected to locate the local Spark
# installation and make pyspark importable - confirm Spark is installed and
# discoverable (e.g. via SPARK_HOME) on the machine running this notebook.
import findspark
findspark.init()

In [2]:
# Entry point for the DataFrame API
from pyspark.sql import SparkSession

# Get the existing session, or create one, under the application name "Online Retail"
spark = (
    SparkSession.builder
    .appName("Online Retail")
    .getOrCreate()
)

# Print the session object as a quick check that it exists
print(spark)

<pyspark.sql.session.SparkSession object at 0x000001A47C6E75C0>


# Pre-processing Data

In [3]:
# Import pandas to read excel
import pandas

In [4]:
# Dataset could be downloaded at https://www.kaggle.com/puneetbhaya/online-retail/
# The workbook location can be overridden with the ONLINE_RETAIL_XLSX environment
# variable so the notebook runs on other machines; when the variable is unset the
# original local path is used unchanged.
import os

DATA_PATH = os.environ.get("ONLINE_RETAIL_XLSX", "D://kuliah//bigdata//online_retail.xlsx")

# Load the whole workbook into a pandas DataFrame
datafile = pandas.read_excel(DATA_PATH)

In [5]:
# Summarise the loaded pandas frame: column names, non-null counts, dtypes and memory usage
datafile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo      541909 non-null object
StockCode      541909 non-null object
Description    540455 non-null object
Quantity       541909 non-null int64
InvoiceDate    541909 non-null datetime64[ns]
UnitPrice      541909 non-null float64
CustomerID     406829 non-null float64
Country        541909 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [6]:
# Create the Spark DataFrame from the pandas input using an explicit schema.
# The type names are imported explicitly instead of with a wildcard, so it is
# clear where each name comes from. ArrayType is not used in this cell; it is
# imported so that other cells relying on it keep working.
from pyspark.sql.types import (
    ArrayType,
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

# Fields are listed in the same order as the pandas columns reported by
# datafile.info(); every field is declared nullable.
dataSchema = StructType([
    StructField("InvoiceNo", StringType(), True),
    StructField("StockCode", StringType(), True),
    StructField("Description", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("InvoiceDate", TimestampType(), True),
    StructField("UnitPrice", FloatType(), True),
    StructField("CustomerID", FloatType(), True),
    StructField("Country", StringType(), True),
])

df = spark.createDataFrame(datafile, schema=dataSchema)

In [7]:
# Print the column names, types and nullability of the Spark DataFrame
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: float (nullable = true)
 |-- CustomerID: float (nullable = true)
 |-- Country: string (nullable = true)



In [8]:
# Preview the first 20 rows; long cell values are truncated for display
df.show(n=20, truncate=True)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 01:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 01:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 01:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 01:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 01:26:00|     3.39|   17850.0|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 01:26:00|     7.65|   17850.0|United Kingdom|
|   536365|    21730|GLASS S

In [9]:
# Keep only the two columns needed for basket analysis:
# the invoice number becomes ID and the stock code becomes ITEMS
data = df.select(
    df.InvoiceNo.alias('ID'),
    df.StockCode.alias('ITEMS'),
)

In [10]:
# Preview the two-column (ID, ITEMS) data
data.show(n=20, truncate=True)

+------+------+
|    ID| ITEMS|
+------+------+
|536365|85123A|
|536365| 71053|
|536365|84406B|
|536365|84029G|
|536365|84029E|
|536365| 22752|
|536365| 21730|
|536366| 22633|
|536366| 22632|
|536367| 84879|
|536367| 22745|
|536367| 22748|
|536367| 22749|
|536367| 22310|
|536367| 84969|
|536367| 22623|
|536367| 22622|
|536367| 21754|
|536367| 21755|
|536367| 21777|
+------+------+
only showing top 20 rows



# Processing Data

In [11]:
# Import library for grouping
from pyspark.sql.functions import collect_list

In [12]:
# Gather every ITEMS value of an invoice into one array column, one row per ID
items_per_invoice = collect_list('ITEMS').alias('ITEMS')
data_grouped = data.groupBy("ID").agg(items_per_invoice)

In [13]:
# Preview the per-invoice item lists
data_grouped.show(n=20, truncate=True)

+------+--------------------+
|    ID|               ITEMS|
+------+--------------------+
|536596|[21624, 22900, 22...|
|536938|[22386, 85099C, 2...|
|537252|             [22197]|
|537691|[22791, 22171, 82...|
|538041|             [22145]|
|538184|[22585, 21481, 22...|
|538517|[22491, 21232, 21...|
|538879|[84819, 22150, 21...|
|539275|[22909, 22423, 22...|
|539630|[21484, 85099B, 2...|
|540499|[21868, 22697, 22...|
|540540|[21877, 21868, 21...|
|540976|[22394, 21890, 22...|
|541432|[21485, 22457, 84...|
|541518|[21880, 21881, 21...|
|541783|[22423, 22854, 22...|
|542026|[21754, 82600, 22...|
|542375|[21731, 22367, 22...|
|543641|[85123A, 21833, 2...|
|544303|[22660, 48138, 48...|
+------+--------------------+
only showing top 20 rows



In [14]:
# Build one de-duplicated basket (array of distinct ITEMS) per invoice ID.
# collect_set removes duplicates inside Spark while aggregating, so no Python
# UDF is needed to turn each list into a set. FPGrowth requires the items of a
# transaction to be unique, which is why duplicates are removed here.
# The order of items inside each array is not significant.
from pyspark.sql.functions import collect_set, udf  # udf stays importable for other cells

data_grouped_distinct = data.groupBy("ID").agg(collect_set("ITEMS").alias("ITEMS"))

In [15]:
# Preview the de-duplicated per-invoice item lists
data_grouped_distinct.show(n=20, truncate=True)

+------+--------------------+
|    ID|               ITEMS|
+------+--------------------+
|536596|[84926A, 21624, 2...|
|536938|[21479, 84997B, 2...|
|537252|             [22197]|
|537691|[20975, 22149, 21...|
|538041|             [22145]|
|538184|[22492, 22561, 48...|
|538517|[22197, 22844, 22...|
|538879|[22593, 22983, 22...|
|539275|[22423, 21914, 22...|
|539630|[22988, 84347, 22...|
|540499|[21755, 84978, 22...|
|540540|[22555, 22551, 22...|
|540976|[22207, 21110, 84...|
|541432|[22113, 22457, 21...|
|541518|[20724, 21982, 20...|
|541783|[22197, 84978, 22...|
|542026|[22197, 22398, 22...|
|542375|[22367, 22629, 21...|
|543641|[22371, 44265, 21...|
|544303|[20856, 22197, 20...|
+------+--------------------+
only showing top 20 rows



In [16]:
# Import library for calculating frequently co-occurring items
from pyspark.ml.fpm import FPGrowth

In [17]:
# First attempt: mine frequent itemsets with minimum support 0.4
# and derive rules with minimum confidence 0.8
fpgrowth1 = FPGrowth(
    itemsCol="ITEMS",
    minSupport=0.4,
    minConfidence=0.8,
)
model1 = fpgrowth1.fit(data_grouped_distinct)

In [18]:
# Display the frequent itemsets found by model1
model1.freqItemsets.show(n=20, truncate=True)

+-----+----+
|items|freq|
+-----+----+
+-----+----+



In [19]:
# The previous thresholds produced no itemsets, so relax them:
# minimum support 0.05 and minimum confidence 0.7
fpgrowth2 = FPGrowth(
    itemsCol="ITEMS",
    minSupport=0.05,
    minConfidence=0.7,
)
model2 = fpgrowth2.fit(data_grouped_distinct)

In [20]:
# Display the frequent itemsets found by model2
model2.freqItemsets.show(n=20, truncate=True)

+--------+----+
|   items|freq|
+--------+----+
|[85123A]|2246|
| [22423]|2172|
|[85099B]|2135|
| [47566]|1706|
| [20725]|1608|
| [84879]|1468|
| [22720]|1462|
| [22197]|1442|
| [21212]|1334|
| [22383]|1306|
| [20727]|1295|
+--------+----+



In [21]:
# Display the association rules derived by model2
model2.associationRules.show(n=20, truncate=True)

+----------+----------+----------+
|antecedent|consequent|confidence|
+----------+----------+----------+
+----------+----------+----------+



In [22]:
# Relax the thresholds once more: minimum support 0.02 and minimum confidence 0.6
fpgrowth3 = FPGrowth(
    itemsCol="ITEMS",
    minSupport=0.02,
    minConfidence=0.6,
)
model3 = fpgrowth3.fit(data_grouped_distinct)

In [23]:
# Display the frequent itemsets found by model3
model3.freqItemsets.show(n=20, truncate=True)

+---------------+----+
|          items|freq|
+---------------+----+
|       [85123A]|2246|
|        [22423]|2172|
|       [85099B]|2135|
|        [47566]|1706|
|        [20725]|1608|
|[20725, 85099B]| 588|
|        [84879]|1468|
|        [22720]|1462|
|        [22197]|1442|
|        [21212]|1334|
|        [22383]|1306|
| [22383, 20725]| 663|
|        [20727]|1295|
| [20727, 20725]| 648|
| [20727, 22383]| 587|
|        [22457]|1266|
|         [POST]|1254|
|        [23203]|1249|
|[23203, 85099B]| 582|
|        [22386]|1231|
+---------------+----+
only showing top 20 rows



In [24]:
# Display the association rules derived by model3
model3.associationRules.show(n=20, truncate=True)

+--------------+----------+------------------+
|    antecedent|consequent|        confidence|
+--------------+----------+------------------+
|       [22699]|   [22697]|               0.7|
|       [22386]|  [85099B]|0.6766856214459789|
|       [22910]|   [22086]|0.6670673076923077|
|       [22630]|   [22629]|0.6255813953488372|
|       [22726]|   [22727]|0.6440677966101694|
|[22698, 22697]|   [22699]|0.8524844720496895|
|       [21931]|  [85099B]|0.6103247293921732|
|      [85099F]|  [85099B]|0.6566265060240963|
|       [20712]|  [85099B]|0.6169724770642202|
|       [22698]|   [22697]|0.8029925187032418|
|       [22698]|   [22699]|0.7655860349127181|
|       [22697]|   [22699]|0.7417218543046358|
|       [22697]|   [22698]| 0.609271523178808|
|       [21928]|  [85099B]|0.6691176470588235|
|[22698, 22699]|   [22697]|0.8941368078175895|
|      [85099C]|  [85099B]|0.6261879619852164|
|[22697, 22699]|   [22698]|0.7002551020408163|
|       [22356]|   [20724]|0.6921052631578948|
|       [2330

In [25]:
# Apply model3's association rules to every invoice basket and preview
# the resulting prediction column
basket_predictions = model3.transform(data_grouped_distinct)
basket_predictions.show(n=20, truncate=True)

+------+--------------------+--------------+
|    ID|               ITEMS|    prediction|
+------+--------------------+--------------+
|536596|[84926A, 21624, 2...|            []|
|536938|[21479, 84997B, 2...|      [85099B]|
|537252|             [22197]|            []|
|537691|[20975, 22149, 21...|            []|
|538041|             [22145]|            []|
|538184|[22492, 22561, 48...|            []|
|538517|[22197, 22844, 22...|            []|
|538879|[22593, 22983, 22...|            []|
|539275|[22423, 21914, 22...|            []|
|539630|[22988, 84347, 22...|            []|
|540499|[21755, 84978, 22...|[22698, 20724]|
|540540|[22555, 22551, 22...|            []|
|540976|[22207, 21110, 84...|            []|
|541432|[22113, 22457, 21...|            []|
|541518|[20724, 21982, 20...|            []|
|541783|[22197, 84978, 22...|       [22698]|
|542026|[22197, 22398, 22...|            []|
|542375|[22367, 22629, 21...|            []|
|543641|[22371, 44265, 21...|            []|
|544303|[2

# Testing

- First test: predict which items are likely to be bought together with item 22698

In [26]:
# Wrap item 22698 in a one-row (ID, ITEMS) DataFrame to use as model input
single_item_basket = [('0', ['22698'])]
df_predict1 = spark.createDataFrame(single_item_basket, ['ID', 'ITEMS'])
df_predict1.show(n=20, truncate=True)

+---+-------+
| ID|  ITEMS|
+---+-------+
|  0|[22698]|
+---+-------+



In [27]:
# Run the single-item basket through model3 and display the predicted items
prediction1 = model3.transform(df_predict1)
prediction1.show(n=20, truncate=True)

+---+-------+--------------+
| ID|  ITEMS|    prediction|
+---+-------+--------------+
|  0|[22698]|[22697, 22699]|
+---+-------+--------------+



In [28]:
# Get the description of the item used as prediction input (22698).
# Filter on StockCode before projecting Description, and fetch only the first
# matching row instead of collecting every matching row to the driver.
first_row = df.filter(df.StockCode == '22698').select('Description').first()
# first() returns None when no row matches; show None rather than raising
first_row[0] if first_row is not None else None

'PINK REGENCY TEACUP AND SAUCER'

In [29]:
# Get the description of predicted item 22697.
# Filter on StockCode before projecting Description, and fetch only the first
# matching row instead of collecting every matching row to the driver.
first_row = df.filter(df.StockCode == '22697').select('Description').first()
# first() returns None when no row matches; show None rather than raising
first_row[0] if first_row is not None else None

'GREEN REGENCY TEACUP AND SAUCER'

In [30]:
# Get the description of predicted item 22699.
# Filter on StockCode before projecting Description, and fetch only the first
# matching row instead of collecting every matching row to the driver.
first_row = df.filter(df.StockCode == '22699').select('Description').first()
# first() returns None when no row matches; show None rather than raising
first_row[0] if first_row is not None else None

'ROSES REGENCY TEACUP AND SAUCER '

- Second test: predict which items are likely to be bought together with item 22386

In [31]:
# Wrap item 22386 in a one-row (ID, ITEMS) DataFrame to use as model input
single_item_basket = [('0', ['22386'])]
df_predict2 = spark.createDataFrame(single_item_basket, ['ID', 'ITEMS'])
df_predict2.show(n=20, truncate=True)

+---+-------+
| ID|  ITEMS|
+---+-------+
|  0|[22386]|
+---+-------+



In [32]:
# Run the single-item basket through model3 and display the predicted items
prediction2 = model3.transform(df_predict2)
prediction2.show(n=20, truncate=True)

+---+-------+----------+
| ID|  ITEMS|prediction|
+---+-------+----------+
|  0|[22386]|  [85099B]|
+---+-------+----------+



In [33]:
# Get the description of the item used as prediction input (22386).
# Filter on StockCode before projecting Description, and fetch only the first
# matching row instead of collecting every matching row to the driver.
first_row = df.filter(df.StockCode == '22386').select('Description').first()
# first() returns None when no row matches; show None rather than raising
first_row[0] if first_row is not None else None

'JUMBO BAG PINK POLKADOT'

In [34]:
# Get the description of predicted item 85099B.
# Filter on StockCode before projecting Description, and fetch only the first
# matching row instead of collecting every matching row to the driver.
first_row = df.filter(df.StockCode == '85099B').select('Description').first()
# first() returns None when no row matches; show None rather than raising
first_row[0] if first_row is not None else None

'JUMBO BAG RED RETROSPOT'

# Results

From predictions above we could conclude that:
- Customers who buy item 22698 (PINK REGENCY TEACUP AND SAUCER) are likely to also buy item 22697 (GREEN REGENCY TEACUP AND SAUCER) and/or 22699 (ROSES REGENCY TEACUP AND SAUCER); the corresponding rules have confidence of about 0.80 and 0.77 respectively
- Customers who buy item 22386 (JUMBO BAG PINK POLKADOT) are likely to also buy item 85099B (JUMBO BAG RED RETROSPOT); the corresponding rule has confidence of about 0.68

# References

- Read Excel using Python - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html
- Remove duplicates from PySpark array column - https://stackoverflow.com/questions/54185710/remove-duplicates-from-pyspark-array-column
- Convert spark DataFrame column to python list - https://stackoverflow.com/questions/38610559/convert-spark-dataframe-column-to-python-list
