In [0]:
import pandas as pd
import numpy as np
import collections

In [0]:
%scala
import org.apache.spark.ml.fpm.FPGrowth

In [0]:
# Import data: read the groceries CSV from the DBFS FileStore path into a pandas DataFrame
grocery = pd.read_csv('file:/dbfs/FileStore/tables/groceries.csv')

In [0]:
# Preview the first five rows to see how each basket is laid out across the item columns
grocery.head()

Unnamed: 0,Item(s),Item 1,Item 2,Item 3,Item 4,Item 5,Item 6,Item 7,Item 8,Item 9,...,Item 23,Item 24,Item 25,Item 26,Item 27,Item 28,Item 29,Item 30,Item 31,Item 32
0,4,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,...,,,,,,,,,,
1,3,tropical fruit,yogurt,coffee,,,,,,,...,,,,,,,,,,
2,1,whole milk,,,,,,,,,...,,,,,,,,,,
3,4,pip fruit,yogurt,cream cheese,meat spreads,,,,,,...,,,,,,,,,,
4,4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,...,,,,,,,,,,


In [0]:
# Check the table dimensions as (rows, columns)
grocery.shape

Out[23]: (9835, 33)

In [0]:
# Drop the 'Item(s)' column: it appears to hold the number of items in the basket,
# not an item itself, so it must not end up in the transactions.
# Reassigning (instead of inplace=True) and ignoring an already-missing column makes
# this cell safe to re-run; the in-place version raised KeyError on a second run.
grocery = grocery.drop(columns=['Item(s)'], errors='ignore')

In [0]:
# Preview again to confirm that only the item columns remain after dropping 'Item(s)'
grocery.head()

Unnamed: 0,Item 1,Item 2,Item 3,Item 4,Item 5,Item 6,Item 7,Item 8,Item 9,Item 10,...,Item 23,Item 24,Item 25,Item 26,Item 27,Item 28,Item 29,Item 30,Item 31,Item 32
0,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,,...,,,,,,,,,,
1,tropical fruit,yogurt,coffee,,,,,,,,...,,,,,,,,,,
2,whole milk,,,,,,,,,,...,,,,,,,,,,
3,pip fruit,yogurt,cream cheese,meat spreads,,,,,,,...,,,,,,,,,,
4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,,...,,,,,,,,,,


In [0]:
# Make transaction dataframe: one row per basket with its Id and the array of items bought.
# Rows are collected in a plain list and the DataFrame is built once at the end;
# assigning cell-by-cell with `.at` enlarges the frame on every iteration, which is slow.
transactions_list = []

for idx, row in grocery.iterrows():
    # Shorter baskets are padded with NaN; drop the padding and convert to a numpy array
    items_arr = np.array([item for item in row if pd.notna(item)])
    transactions_list.append({'Id': idx, 'Items': items_arr})

# Passing `columns` keeps the expected schema even when `grocery` has no rows
transactions = pd.DataFrame(transactions_list, columns=['Id', 'Items'])

In [0]:
# Convert pandas df to pyspark and create temp view

# NOTE(review): `spark` is not defined in this notebook - presumably the session injected by the runtime
basket = spark.createDataFrame(transactions) 
# Register the DataFrame under the name 'basket' so the %scala cell can query it with spark.sql
basket.createOrReplaceTempView('basket')

In [0]:
# Render the Spark DataFrame to check the Id and Items columns after conversion
display(basket)

Id,Items
0,"List(citrus fruit, semi-finished bread, margarine, ready soups)"
1,"List(tropical fruit, yogurt, coffee)"
2,List(whole milk)
3,"List(pip fruit, yogurt, cream cheese, meat spreads)"
4,"List(other vegetables, whole milk, condensed milk, long life bakery product)"
5,"List(whole milk, butter, yogurt, rice, abrasive cleaner)"
6,List(rolls/buns)
7,"List(other vegetables, UHT-milk, rolls/buns, bottled beer, liquor (appetizer))"
8,List(potted plants)
9,"List(whole milk, cereals)"


In [0]:
%scala
// Read the item arrays back from the `basket` temp view as a single-column
// DataFrame named "items", which is the column FPGrowth is told to use below
val basket_items = spark
  .sql("select Items from basket")
  .as[Array[String]]
  .toDF("items")

// Configure FP-Growth:
//   minSupport 0.01   - keep itemsets found in at least 1% of the baskets
//   minConfidence 0.2 - keep association rules with confidence of at least 20%
val fpgrowth = new FPGrowth()
  .setItemsCol("items")
  .setMinSupport(0.01)
  .setMinConfidence(0.2)

// Mine the frequent itemsets from the baskets
val model = fpgrowth.fit(basket_items)

In [0]:
%scala
// Get frequent itemsets and convert to temp view
// `freqItemsets` is read from the fitted FPGrowth model; registering it as a temp view
// makes it queryable by name ("popularItemSets") from SQL
val popularItemSets = model.freqItemsets
popularItemSets.createOrReplaceTempView("popularItemSets")

In [0]:
%scala
// Association rules: for each antecedent (items already in the basket), how likely
// the buyer is to also purchase the consequent item
// Registered as the temp view "rules" so the %sql cell can select from it
val rules = model.associationRules
rules.createOrReplaceTempView("rules")

In [0]:
%sql
-- Top 20 association rules, highest confidence first
SELECT
  antecedent AS `antecedent (if)`,
  consequent AS `consequent (then)`,
  confidence
FROM rules
ORDER BY confidence DESC
LIMIT 20

antecedent (if),consequent (then),confidence
"List(citrus fruit, root vegetables)",List(other vegetables),0.5862068965517241
"List(tropical fruit, root vegetables)",List(other vegetables),0.5845410628019324
"List(curd, yogurt)",List(whole milk),0.5823529411764706
"List(butter, other vegetables)",List(whole milk),0.5736040609137056
"List(tropical fruit, root vegetables)",List(whole milk),0.5700483091787439
"List(root vegetables, yogurt)",List(whole milk),0.562992125984252
"List(domestic eggs, other vegetables)",List(whole milk),0.5525114155251142
"List(whipped/sour cream, yogurt)",List(whole milk),0.5245098039215687
"List(root vegetables, rolls/buns)",List(whole milk),0.5230125523012552
"List(pip fruit, other vegetables)",List(whole milk),0.5175097276264592
