## Data Analysis (Cont)

### Model build

#### Import Libraries

In [None]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Setting up PySpark

In [None]:
# install java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# install spark 
!wget -q https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz

# unzip the spark file to the current folder
!tar xf spark-3.2.1-bin-hadoop3.2.tgz

# set your spark folder to your system path environment. 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"


# install findspark using pip
!pip install -q findspark

In [None]:
# locate Spark in the system
import findspark
findspark.init()

In [None]:
findspark.find()

'/content/spark-3.2.1-bin-hadoop3.2'

In [None]:
# create a SparkSession, which is the entry point to Spark
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [None]:
spark

#### Loading data

In [None]:
path = '/content/drive/MyDrive/Y4_T2/Data Mining/Market Basket Analysis/'

In [None]:
temp = pd.read_csv(path+'basket_sets.csv', index_col='Trans No') 

In [None]:
products = []
for i in range(0, temp.shape[0]):
  products.append([str(temp.columns[j]) for j in range(0,temp.shape[1]) if temp.values[i,j] == 1])

In [None]:
len(products)

2525

In [None]:
df = pd.DataFrame(data=list(zip(temp.index, products)), columns=['Trans No','Products'])

In [None]:
df.head()

Unnamed: 0,Trans No,Products
0,1,"[7 up pet 2ltr, bakers inn white loaf, bakers ..."
1,2,"[12inch butter cream cake, 20x20 butter cream ..."
2,3,"[12inch butter cream cake, 12inch rect fresh c..."
3,4,"[12inch butter cream cake, 20x20 butter cream ..."
4,5,"[12inch butter cream cake, 12inch rect fresh c..."


In [None]:
df.shape

(2525, 2)

In [None]:
basket_sets = spark.createDataFrame(df) 

In [None]:
basket_sets.show(5, False)

+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
basket_sets.printSchema()

root
 |-- Trans No: long (nullable = true)
 |-- Products: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [None]:
print('Rows:',basket_sets.count())

Rows: 2525


#### Fit the Model

In [None]:
from pyspark.ml.fpm import FPGrowth

In [128]:
# use sample, dataset too large
sample = basket_sets.sample(0.02)

In [129]:
print('Sample Rows:',sample.count())

Sample Rows: 53


**Hyperparameters**

* minSupport - The minimum support of an item to be considered in a frequent itemset.
* minConfidence - The minimum confidence for generating an association rule from an itemset.
* numPartitions - The number of partitions used to distribute the work. This is Spark-specific.

The default number of partitions is the number of partitions for the input dataset.

In [130]:
fpGrowth = FPGrowth(itemsCol='Products', minSupport=0.3, minConfidence=0.3, predictionCol='prediction')
model = fpGrowth.fit(sample)

## Data Interpretation

In [131]:
# View a subset of the frequent itemset. 
model.freqItemsets.show(10, False)



+------------------------------------------------------------------------------------+----+
|items                                                                               |freq|
+------------------------------------------------------------------------------------+----+
|[colcom pork pie 100g]                                                              |19  |
|[colcom pork pie 100g, garfunkels family polony/kg]                                 |16  |
|[colcom pork pie 100g, garfunkels family polony/kg, pepsi pet 500ml]                |16  |
|[colcom pork pie 100g, garfunkels family polony/kg, pepsi pet 500ml, fw plastic bag]|16  |
|[colcom pork pie 100g, garfunkels family polony/kg, fw plastic bag]                 |16  |
|[colcom pork pie 100g, chibuku super 1.25l]                                         |16  |
|[colcom pork pie 100g, chibuku super 1.25l, fw plastic bag]                         |16  |
|[colcom pork pie 100g, huletts brown sugar 2kg]                                

In [132]:
# Display generated association rules.
model.associationRules.show()



+--------------------+--------------------+----------+------------------+------------------+
|          antecedent|          consequent|confidence|              lift|           support|
+--------------------+--------------------+----------+------------------+------------------+
|[willards things ...|[britelite green ...|       1.0|2.9444444444444446|0.3018867924528302|
|[willards things ...|[chibuku super 1....|       1.0|2.2083333333333335|0.3018867924528302|
|[fruitade orange ...|       [fw taxi bag]|       1.0|2.0384615384615383|0.3018867924528302|
|[fruitade orange ...|[cascade orange j...|       1.0|2.5238095238095237|0.3018867924528302|
|[fruitade orange ...|[hot cross bun each]|       1.0|              2.65|0.3018867924528302|
|[dzl super lacto ...|[lobels whole whe...|       1.0|2.9444444444444446|0.3018867924528302|
|[dzl super lacto ...|       [fw taxi bag]|       1.0|2.0384615384615383|0.3018867924528302|
|[dzl super lacto ...|[bakers inn white...|       1.0|2.20833333333333

### Frequently being bought together

* Which items are frequently being bought together ?

In [133]:
# Showing basket of items with the highest support ratio in descending order.
model.associationRules.orderBy("support", ascending=False).show()



+--------------------+--------------------+------------------+------------------+-------------------+
|          antecedent|          consequent|        confidence|              lift|            support|
+--------------------+--------------------+------------------+------------------+-------------------+
|    [fw plastic bag]|   [pepsi pet 500ml]|0.7142857142857143|1.1830357142857144| 0.5660377358490566|
|   [pepsi pet 500ml]|    [fw plastic bag]|            0.9375|1.1830357142857142| 0.5660377358490566|
|[huletts brown su...|    [fw plastic bag]|0.9615384615384616|1.2133699633699633| 0.4716981132075472|
|    [fw plastic bag]|       [fw taxi bag]|0.5952380952380952|1.2133699633699633| 0.4716981132075472|
|       [fw taxi bag]|    [fw plastic bag]|0.9615384615384616|1.2133699633699633| 0.4716981132075472|
|    [fw plastic bag]|[huletts brown su...|0.5952380952380952|1.2133699633699633| 0.4716981132075472|
|    [fw plastic bag]|[garfunkels famil...|0.5714285714285714|1.1648351648351647| 

### Highest lift ratio

* Using the principle of association rules in the form of A -> B, show the transactions or rules where A and B are 3 times more likely to be bought together than B to be bought alone.

In [134]:
# Use filter to show baskets where the lift values are >=3 in descending order..
model.associationRules.filter(model.associationRules.lift>=3).orderBy("lift", ascending=False).show()



+--------------------+--------------------+----------+------+------------------+
|          antecedent|          consequent|confidence|  lift|           support|
+--------------------+--------------------+----------+------+------------------+
|[lobels whole whe...|[revive dairy ble...|       1.0|3.3125|0.3018867924528302|
|[plain sadza/kg, ...|[revive dairy ble...|       1.0|3.3125|0.3018867924528302|
|[fw white bread, ...|[revive dairy ble...|       1.0|3.3125|0.3018867924528302|
|[dzl super lacto ...|[revive dairy ble...|       1.0|3.3125|0.3018867924528302|
|[dzl super lacto ...|[revive dairy ble...|       1.0|3.3125|0.3018867924528302|
|[colcom mnandi po...|[fanta orange pet...|       1.0|3.3125|0.3018867924528302|
|[fw white bread, ...|[revive dairy ble...|       1.0|3.3125|0.3018867924528302|
|[fw white bread, ...|[revive dairy ble...|       1.0|3.3125|0.3018867924528302|
|[fw white bread, ...|[revive dairy ble...|       1.0|3.3125|0.3018867924528302|
|[lobels whole whe...|[reviv

### Highest confidence values

* Using the principle of association rules in the form of A->B, show the rules where you have 100% confidence that if A is bought then B will be bought also

In [135]:
# Use filter to show transactions or rules with highest confidence values(100%).
model.associationRules.filter(model.associationRules.confidence==1.0).show()



+--------------------+--------------------+----------+------------------+------------------+
|          antecedent|          consequent|confidence|              lift|           support|
+--------------------+--------------------+----------+------------------+------------------+
|[willards things ...|[britelite green ...|       1.0|2.9444444444444446|0.3018867924528302|
|[willards things ...|[chibuku super 1....|       1.0|2.2083333333333335|0.3018867924528302|
|[fruitade orange ...|       [fw taxi bag]|       1.0|2.0384615384615383|0.3018867924528302|
|[fruitade orange ...|[cascade orange j...|       1.0|2.5238095238095237|0.3018867924528302|
|[fruitade orange ...|[hot cross bun each]|       1.0|              2.65|0.3018867924528302|
|[dzl super lacto ...|[lobels whole whe...|       1.0|2.9444444444444446|0.3018867924528302|
|[dzl super lacto ...|       [fw taxi bag]|       1.0|2.0384615384615383|0.3018867924528302|
|[dzl super lacto ...|[bakers inn white...|       1.0|2.20833333333333

### Items for the same shelf

* Finally which items can be put on the same shelf so that it becomes easy for our customers to pick items which they frequently buy together ?

In [142]:
model.associationRules.filter((model.associationRules.support>=0.3) & (model.associationRules.lift>=3) & (model.associationRules.confidence==1.0)).select('antecedent','consequent').show(20, False)



+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------+
|antecedent                                                                                                                                                           |consequent                        |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------+
|[lobels whole wheat loaf bread, hot cross bun each, bakers inn white loaf, fw taxi bag, fw plastic bag]                                                              |[tomatoes loose/kg]               |
|[star cane pet 200ml, hot cross bun each, pepsi pet 500ml]                                                                                                           |[tomatoes loose/kg]  

In [147]:
# transform examines the input items against all the association rules and summarize the consequents as prediction
model.transform(sample).show()

+--------+--------------------+--------------------+
|Trans No|            Products|          prediction|
+--------+--------------------+--------------------+
|      12|[12inch butter cr...|                  []|
|      18|[12inch butter cr...|                  []|
|      21|[12inch butter cr...|                  []|
|      28|[12inch butter cr...|                  []|
|      96|[12inch butter cr...|                  []|
|     159|[12inch butter cr...|                  []|
|     248|[12inch butter cr...|                  []|
|     275|[12inch butter cr...|                  []|
|     367|[12inch butter cr...|                  []|
|     417|[12inch butter cr...|                  []|
|     487|[20x20 butter cre...|[zimgold cooking ...|
|     488|[20x20 butter cre...|[pfuko butter mil...|
|     499|[25x25 butter cre...|[castle lite nrb ...|
|     580|[20x20 butter cre...|[charhons loose b...|
|     767|[aqualite still w...|[colcom pork pie ...|
|     788|[bakers inn white...|[sadza beef veg

In [176]:
# Finding items which can be put on the same shelf from rules generated.
preds = model.transform(sample).select('prediction').toPandas()

In [177]:
preds = [x for x in preds.prediction if len(x) > 0]

In [178]:
len(preds)

40

In [None]:
preds = pd.DataFrame(list(zip(preds)))

In [183]:
pd.set_option("display.max_rows", None, "display.max_columns", None,'display.max_colwidth', -1)

  """Entry point for launching an IPython kernel.


In [184]:
preds

Unnamed: 0,0
0,"[zimgold cooking oil 2l, king kurls chicken 100g, pfuko traditional maheu 500ml, mackerel fish/kg, fruitade lemon&lime 2l]"
1,"[pfuko butter milk maheu 500ml, fruitade lemon&lime 2l]"
2,"[castle lite nrb dumpy 340ml, star cane pet 200ml, swan bath soap perfumed 200g]"
3,"[charhons loose biscuits 500g, lemon slice, nestle cerevita corn&wheat 500g, sun mixed fruit jam 500g, pfuko traditional maheu 500ml, bananas loose, chimombe uht full cream 500ml, yummy yoghurt strawberry 150ml, crystal top ten sweets 280g, fw rolls, sona beauty soap pink 250g/300g, eversharp assorted pens., fanta orange pet 350ml, twist bread, coke pet 500ml, pork shoulder/kg, plain chips mini, zb zesa token, nurse 1ply 4s, fruitade lemon&lime 2l, black label 440ml, mazoe blackberry 2l, swan bath soap perfumed 200g, vivon purified water 500ml]"
4,"[colcom pork pie 100g, sadza beef veg 2 piece, tomatoes loose/kg, sawa sour milk 500ml, mirinda green apple pet 500ml, icing bun, value beef kg, charhons loose biscuits 500g, lemon slice, zlg purified water 2l, proton white loaf, mazoe orange crush 2lt, colgate cdc 100g/150g, mirinda orange pet 500ml, zimgold cooking oil 2l, cascade orange juice 400ml, hot cross bun each, fw white bread, probrands masi 500ml, colcom mnandi polony /kg, plain sadza/kg, chimombe uht full cream 500ml, cake delight, pepsi pet 500ml, revive dairy blend mango 1l, fw plastic cover., yummy yoghurt strawberry 150ml, willards things 150g, crystal top ten sweets 280g, eversharp assorted pens., star cane pet 200ml, fruitade orange juice 2l, pork shoulder/kg, proton wholemeal loaf, pascal chocolate mallow 30g, twist bread, fw tea loaf, plain chips mini, zb zesa token, nurse 1ply 4s, fanta orange pet 350ml, coke pet 500ml, black label 440ml, mazoe blackberry 2l, swan bath soap perfumed 200g]"
5,"[sadza beef veg 2 piece, plain bun each, sawa sour milk 500ml, mirinda green apple pet 500ml, pfuko butter milk maheu 500ml, sugar doughnut, lobels prime white loaf, icing bun, britelite green bar 1kg, value beef kg, charhons loose biscuits 500g, nestle cerevita corn&wheat 500g, mirinda fruity pet 500ml, castle lite nrb dumpy 340ml, zlg purified water 2l, proton white loaf, sun mixed fruit jam 500g, chimombe uht full cream 1l, colgate cdc 100g/150g, mirinda orange pet 500ml, zimgold cooking oil 2l, cascade orange juice 400ml, colcom mnandi polony /kg, zlg purified water 500ml, mega long grain value rice 2kg, empty bottle deposit 750ml/660ml, willards things 150g, crystal top ten sweets 280g, fw rolls, sona beauty soap pink 250g/300g, fruitade orange juice 2l, mega fine salt 1kg, proton wholemeal loaf, pork shoulder/kg, pascal chocolate mallow 30g, fw tea loaf, willards jupiters 150g, zb zesa token, nurse 1ply 4s, fanta orange pet 350ml, coke pet 500ml, black label 440ml, mazoe blackberry 2l, swan bath soap perfumed 200g, vivon purified water 500ml]"
6,"[garfunkels family polony/kg, colcom mnandi polony /kg, colcom pork pie 100g, sadza beef veg 2 piece, plain bun each, sawa sour milk 500ml, sugar doughnut, icing bun, britelite green bar 1kg, value beef kg, charhons loose biscuits 500g, lemon slice, nestle cerevita corn&wheat 500g, king kurls chicken 100g, dzl super lacto 500ml, castle lite nrb dumpy 340ml, raha soya cooking oil 2l, mazoe orange crush 2lt, bananas loose, fw white bread, plain sadza/kg, cake delight, chibuku super 1.25l, empty bottle deposit 750ml/660ml, yummy yoghurt strawberry 150ml, fw plastic cover., willards things 150g, revive dairy blend mango 1l, fw rolls, mega long grain value rice 2kg, sona beauty soap pink 250g/300g, fruitade orange juice 2l, proton wholemeal loaf, coke pet 500ml, pascal chocolate mallow 30g, mackerel fish/kg, mega fine salt 1kg, fw tea loaf, willards jupiters 150g, plain chips mini, zb zesa token, nurse 1ply 4s, fanta orange pet 350ml, fruitade lemon&lime 2l, black label 440ml, mazoe blackberry 2l, swan bath soap perfumed 200g, vivon purified water 500ml]"
7,"[huletts brown sugar 2kg, colcom pork pie 100g, sadza beef veg 2 piece, lobels whole wheat loaf bread, tomatoes loose/kg, mirinda green apple pet 500ml, sugar doughnut, icing bun, value beef kg, charhons loose biscuits 500g, nestle cerevita corn&wheat 500g, king kurls chicken 100g, mirinda fruity pet 500ml, dzl super lacto 500ml, zlg purified water 2l, proton white loaf, sun mixed fruit jam 500g, pfuko traditional maheu 500ml, raha soya cooking oil 2l, mazoe orange crush 2lt, colgate cdc 100g/150g, mirinda orange pet 500ml, zimgold cooking oil 2l, cascade orange juice 400ml, fw white bread, colcom mnandi polony /kg, zlg purified water 500ml, dendairy maas 500ml, chimombe uht full cream 500ml, cake delight, garfunkels family polony/kg, fw taxi bag, fw plastic cover., revive dairy blend mango 1l, empty bottle deposit 750ml/660ml, eversharp assorted pens., star cane pet 200ml, fruitade orange juice 2l, mega long grain value rice 2kg, yummy yoghurt strawberry 150ml, fw rolls, plain chips mini, zb zesa token, nurse 1ply 4s, pork shoulder/kg, fw tea loaf, fanta orange pet 350ml, proton wholemeal loaf, crystal top ten sweets 280g, pascal chocolate mallow 30g, sona beauty soap pink 250g/300g, coke pet 500ml, black label 440ml, mackerel fish/kg, swan bath soap perfumed 200g, mega fine salt 1kg, twist bread, willards jupiters 150g]"
8,"[bakers inn white loaf, dendairy maas 500ml, lobels whole wheat loaf bread, britelite green bar 1kg, revive dairy blend mango 1l, mega long grain value rice 2kg, dzl super lacto 500ml, fw white bread, plain sadza/kg, plain bun each, probrands masi 500ml, mirinda fruity pet 500ml, zlg purified water 500ml, colcom pork pie 100g, mirinda green apple pet 500ml, sugar doughnut, willards things 150g, crystal top ten sweets 280g, nestle cerevita corn&wheat 500g, fw rolls, zlg purified water 2l, sun mixed fruit jam 500g, pfuko traditional maheu 500ml, sona beauty soap pink 250g/300g, mazoe orange crush 2lt, mirinda orange pet 500ml, eversharp assorted pens., chimombe uht full cream 500ml, cake delight, mega fine salt 1kg, mackerel fish/kg, sawa sour milk 500ml, lemon slice, pork shoulder/kg, proton wholemeal loaf, king kurls chicken 100g, pascal chocolate mallow 30g, twist bread, willards jupiters 150g, value beef kg, nurse 1ply 4s, plain chips mini, zb zesa token, fw tea loaf, fruitade lemon&lime 2l, castle lite nrb dumpy 340ml, coke pet 500ml, black label 440ml, mazoe blackberry 2l, vivon purified water 500ml]"
9,"[lobels whole wheat loaf bread, lobels prime white loaf, garfunkels family polony/kg, huletts brown sugar 2kg, pepsi pet 500ml, cascade orange juice 400ml, fw taxi bag, bakers inn white loaf, britelite green bar 1kg, chimombe uht full cream 1l, hot cross bun each, fw plastic cover., willards things 150g, revive dairy blend mango 1l, dzl super lacto 500ml, mazoe orange crush 2lt, fw white bread, probrands masi 500ml, colgate cdc 100g/150g, mega long grain value rice 2kg, tomatoes loose/kg, mirinda green apple pet 500ml, mirinda fruity pet 500ml, zlg purified water 2l, zimgold cooking oil 2l, colcom mnandi polony /kg, raha soya cooking oil 2l, mirinda orange pet 500ml, cake delight, empty bottle deposit 750ml/660ml, proton wholemeal loaf, king kurls chicken 100g, sun mixed fruit jam 500g, bananas loose, mega fine salt 1kg, zlg purified water 500ml, twist bread, chimombe uht full cream 500ml, sadza beef veg 2 piece, plain bun each, yummy yoghurt strawberry 150ml, icing bun, nurse 1ply 4s, fanta orange pet 350ml, plain chips mini, zb zesa token, sawa sour milk 500ml, pork shoulder/kg, sugar doughnut, value beef kg, crystal top ten sweets 280g, lemon slice, nestle cerevita corn&wheat 500g, castle lite nrb dumpy 340ml, pascal chocolate mallow 30g, pfuko traditional maheu 500ml, sona beauty soap pink 250g/300g, coke pet 500ml, black label 440ml, mackerel fish/kg, mazoe blackberry 2l, eversharp assorted pens., vivon purified water 500ml, willards jupiters 150g]"
