In [1]:
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
import findspark
findspark.init()

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_list, col, count, collect_set
from pyspark.sql.types import StringType

from pyspark.ml.fpm import FPGrowth
from pyspark.ml.evaluation import RegressionEvaluator

In [5]:
spark = SparkSession.builder.appName('bakery').getOrCreate()

In [6]:
# Load data
data = spark.read.csv("../../Data/75000/75000i.csv", header=False, inferSchema=True)

In [7]:
data.show()

+---+---+---+
|_c0|_c1|_c2|
+---+---+---+
|  1|  1| 21|
|  1|  5| 11|
|  2|  1|  7|
|  2|  3| 11|
|  2|  4| 37|
|  2|  3| 45|
|  3|  5|  3|
|  3|  3| 42|
|  3|  3| 33|
|  4|  1|  5|
|  4|  2| 12|
|  4|  1| 47|
|  4|  4| 17|
|  5|  2| 42|
|  5|  5|  6|
|  5|  3| 18|
|  6|  4|  2|
|  6|  3|  4|
|  6|  4| 34|
|  7|  1| 15|
+---+---+---+
only showing top 20 rows



In [8]:
data = data.withColumnRenamed("_c0","OrderID").withColumnRenamed("_c1","Rating").withColumnRenamed("_c2","ProductID")

In [9]:
data.createOrReplaceTempView('order_products_train')

In [10]:
raw_data = spark.sql('SELECT * FROM order_products_train')
grouping_data = raw_data.groupby('OrderID').agg(collect_set('ProductID').alias('items'))
grouping_data.createOrReplaceTempView('baskets')

In [11]:
grouping_data.show(5)

+-------+--------------------+
|OrderID|               items|
+-------+--------------------+
|    148|[33, 27, 9, 46, 2...|
|    463|            [17, 14]|
|    471|     [9, 37, 34, 20]|
|    496|     [15, 6, 47, 26]|
|    833|         [12, 5, 21]|
+-------+--------------------+
only showing top 5 rows



In [12]:
fpGrowth = FPGrowth(itemsCol='items', minSupport=0.005, minConfidence=0.005)
#model = fpGrowth.fit(grouping_data)

In [13]:
goods = spark.read.csv('../../Data/75000/goods.csv', header=True, inferSchema=True)
goods.show(10)

+---+------------+--------+-----+------+
| Id|      Flavor|    Food|Price|  Type|
+---+------------+--------+-----+------+
|  0| 'Chocolate'|  'Cake'| 8.95|'Food'|
|  1|     'Lemon'|  'Cake'| 8.95|'Food'|
|  2|    'Casino'|  'Cake'|15.95|'Food'|
|  3|     'Opera'|  'Cake'|15.95|'Food'|
|  4|'Strawberry'|  'Cake'|11.95|'Food'|
|  5|   'Truffle'|  'Cake'|15.95|'Food'|
|  6| 'Chocolate'|'Eclair'| 3.25|'Food'|
|  7|    'Coffee'|'Eclair'|  3.5|'Food'|
|  8|   'Vanilla'|'Eclair'| 3.25|'Food'|
|  9|  'Napoleon'|  'Cake'|13.49|'Food'|
+---+------------+--------+-----+------+
only showing top 10 rows



In [14]:
goods.createOrReplaceTempView('goods')

In [15]:
rawdata_1 = spark.sql('''SELECT g.Flavor, g.Food, o.OrderID 
                         FROM goods g 
                         INNER JOIN order_products_train o WHERE o.ProductID = g.Id''')

In [16]:
baskets = rawdata_1.groupBy('OrderID').agg(collect_set('Food').alias('items'))
baskets.show(truncate=False)

+-------+-----------------------------------------------------+
|OrderID|items                                                |
+-------+-----------------------------------------------------+
|148    |['Cookie', 'Cake', 'Coffee', 'Croissant']            |
|463    |['Tart']                                             |
|471    |['Tart', 'Cake', 'Croissant', 'Twist']               |
|496    |['Tart', 'Frappuccino', 'Eclair', 'Meringue']        |
|833    |['Cookie', 'Tart', 'Cake']                           |
|1088   |['Cookie', 'Tart', 'Cake', 'Danish', 'Lemonade']     |
|1238   |['Tart', 'Croissant']                                |
|1342   |['Tart', 'Espresso', 'Eclair']                       |
|1580   |['Tart', 'Danish', 'Croissant']                      |
|1591   |['Tart', 'Cake']                                     |
|1645   |['Tart', 'Espresso', 'Eclair']                       |
|1829   |['Cookie', 'Tart', 'Espresso', 'Eclair', 'Bear Claw']|
|1959   |['Cookie', 'Tart', 'Cake']     

In [17]:
baskets.createOrReplaceTempView('baskets')

In [18]:
model = fpGrowth.fit(baskets)

In [19]:
model.freqItemsets.show(truncate=False)

+---------------------------------------+-----+
|items                                  |freq |
+---------------------------------------+-----+
|['Tart']                               |41111|
|['Cake']                               |32605|
|['Cake', 'Tart']                       |16944|
|['Cookie']                             |29198|
|['Cookie', 'Cake']                     |11789|
|['Cookie', 'Cake', 'Tart']             |5345 |
|['Cookie', 'Tart']                     |13648|
|['Croissant']                          |21668|
|['Croissant', 'Cookie']                |6769 |
|['Croissant', 'Cookie', 'Cake']        |2725 |
|['Croissant', 'Cookie', 'Cake', 'Tart']|1338 |
|['Croissant', 'Cookie', 'Tart']        |3447 |
|['Croissant', 'Cake']                  |7123 |
|['Croissant', 'Cake', 'Tart']          |3721 |
|['Croissant', 'Tart']                  |12706|
|['Danish']                             |14597|
|['Danish', 'Cookie']                   |4139 |
|['Danish', 'Cookie', 'Cake']           

In [20]:
# Transform examines the input items agaist all the association rules and summary consquents as predict
mostPopularItem = model.transform(baskets)

In [21]:
mostPopularItem.head(3)

[Row(OrderID=148, items=["'Cookie'", "'Cake'", "'Coffee'", "'Croissant'"], prediction=["'Tart'", "'Danish'", "'Eclair'", "'Lemonade'", "'Juice'", "'Meringue'", "'Frappuccino'", "'Twist'", "'Pie'", "'Water'", "'Espresso'", "'Tea'", "'Soda'", "'Bear Claw'"]),
 Row(OrderID=463, items=["'Tart'"], prediction=["'Cake'", "'Cookie'", "'Croissant'", "'Danish'", "'Eclair'", "'Coffee'", "'Lemonade'", "'Juice'", "'Meringue'", "'Frappuccino'", "'Twist'", "'Pie'", "'Water'", "'Espresso'", "'Tea'", "'Soda'", "'Bear Claw'"]),
 Row(OrderID=471, items=["'Tart'", "'Cake'", "'Croissant'", "'Twist'"], prediction=["'Cookie'", "'Eclair'", "'Coffee'", "'Danish'", "'Juice'", "'Lemonade'", "'Meringue'", "'Frappuccino'", "'Pie'", "'Water'", "'Espresso'", "'Tea'", "'Soda'", "'Bear Claw'"])]

In [22]:
mostPopularItem.createOrReplaceTempView('popular_items')

In [23]:
DF_cast = mostPopularItem.select('OrderID', mostPopularItem.items.cast(StringType()), mostPopularItem.prediction.cast(StringType()))
DF_cast.printSchema()

root
 |-- OrderID: integer (nullable = true)
 |-- items: string (nullable = false)
 |-- prediction: string (nullable = true)



In [24]:
DF_cast.head(3)

[Row(OrderID=148, items="['Cookie', 'Cake', 'Coffee', 'Croissant']", prediction="['Tart', 'Danish', 'Eclair', 'Lemonade', 'Juice', 'Meringue', 'Frappuccino', 'Twist', 'Pie', 'Water', 'Espresso', 'Tea', 'Soda', 'Bear Claw']"),
 Row(OrderID=463, items="['Tart']", prediction="['Cake', 'Cookie', 'Croissant', 'Danish', 'Eclair', 'Coffee', 'Lemonade', 'Juice', 'Meringue', 'Frappuccino', 'Twist', 'Pie', 'Water', 'Espresso', 'Tea', 'Soda', 'Bear Claw']"),
 Row(OrderID=471, items="['Tart', 'Cake', 'Croissant', 'Twist']", prediction="['Cookie', 'Eclair', 'Coffee', 'Danish', 'Juice', 'Lemonade', 'Meringue', 'Frappuccino', 'Pie', 'Water', 'Espresso', 'Tea', 'Soda', 'Bear Claw']")]