### AIT 614 - Big Data Essentials <br>
#### Lab 5.1 - Market Basket Analysis and Recommendation
<hr>
<b>Machine Learning using Spark MLlib on Databricks</b><br>

Course Section #: AIT614-Sect2<br>
Student's Full Name: Khanh Nguyen<br>

In [0]:
from pyspark.ml.fpm import FPGrowth

# Three toy transactions: each row pairs a transaction id with the item ids it contains.
df = spark.createDataFrame(
    [
        (0, [1, 2, 5]),
        (1, [1, 2, 3, 5]),
        (2, [1, 2]),
    ],
    schema=["id", "items"],
)

# FP-Growth reads the baskets from the "items" column, using a minimum support
# of 0.5 for itemsets and a minimum confidence of 0.6 for rules.
fpGrowth = FPGrowth(minSupport=0.5, minConfidence=0.6, itemsCol="items")
model = fpGrowth.fit(df)

# Frequent itemsets found by the fitted model.
demo_itemsets = model.freqItemsets
demo_itemsets.show()

# Association rules generated from those itemsets.
demo_rules = model.associationRules
demo_rules.show()

# transform() examines each row's items against all the association rules and
# summarizes the matching consequents as the prediction.
demo_predictions = model.transform(df)
demo_predictions.show()

In [0]:
# Create the training set: 16 shopping baskets, one DataFrame row per basket.

from pyspark.ml.fpm import FPGrowth

# Items bought in each basket. The position in this list is used as the basket id,
# so the ids run 0..15 in the order written here.
train_baskets = [
    ["apple", "orange", "eggs", "milk"],
    ["apple", "orange", "bread", "milk"],
    ["apple", "orange"],
    ["eggs", "butter", "bread"],
    ["rice", "bread", "orange", "mushroom", "tomato"],
    ["sausage", "milk", "mushroom", "tomato"],
    ["apple", "orange", "mushroom", "tomato"],
    ["butter", "bread", "milk", "mushroom", "tomato"],
    ["sausage", "bread", "butter"],
    ["butter", "bread", "apple", "sausage", "tomato"],
    ["orange", "bread", "sausage"],
    ["apple", "butter", "mushroom", "bread"],
    ["rice", "bread", "mushroom", "eggs", "orange"],
    ["eggs", "rice", "milk"],
    ["rice", "sausage"],
    ["eggs", "sausage", "rice", "apple", "orange"],
]

# Pair every basket with its index to get (id, items) rows.
df_train = spark.createDataFrame(
    list(enumerate(train_baskets)),
    ["id", "items"],
)

# Render the training set as a notebook table.
df_train.display()

id,items
0,"List(apple, orange, eggs, milk)"
1,"List(apple, orange, bread, milk)"
2,"List(apple, orange)"
3,"List(eggs, butter, bread)"
4,"List(rice, bread, orange, mushroom, tomato)"
5,"List(sausage, milk, mushroom, tomato)"
6,"List(apple, orange, mushroom, tomato)"
7,"List(butter, bread, milk, mushroom, tomato)"
8,"List(sausage, bread, butter)"
9,"List(butter, bread, apple, sausage, tomato)"


In [0]:
# Build an FP-Growth model on the training baskets. minSupport and minConfidence
# are the two knobs to vary when experimenting with different thresholds.
fpGrowth1 = FPGrowth(
    minSupport=0.2,
    minConfidence=0.65,
    itemsCol="items",
)
model1 = fpGrowth1.fit(df_train)

# Frequent itemsets that met the minSupport threshold.
train_itemsets = model1.freqItemsets
train_itemsets.show()


In [0]:

# Display the association rules generated by model1.
model1.associationRules.show()

# transform() examines each basket's items against all the association rules and
# summarizes the matching consequents as the prediction.
# truncate=False: show() cuts every cell to 20 characters by default, which would
# chop the longer item lists (e.g. five-item baskets) and make the table unreadable.
model1.transform(df_train).show(truncate=False)

In [0]:
# Create the test set: five baskets to score with the fitted model.

# Items in each test basket. The list position is used as the basket id (0..4).
test_baskets = [
    ["apple", "eggs", "milk"],
    ["bread", "milk", "mushroom"],
    ["orange", "tomato"],
    ["apple", "orange", "bread"],
    ["apple", "bread", "tomato"],
]

# Pair every basket with its index to get (id, items) rows.
df_test = spark.createDataFrame(
    list(enumerate(test_baskets)),
    ["id", "items"],
)

# Render the test set as a notebook table.
df_test.display()

id,items
0,"List(apple, eggs, milk)"
1,"List(bread, milk, mushroom)"
2,"List(orange, tomato)"
3,"List(apple, orange, bread)"
4,"List(apple, bread, tomato)"


In [0]:
# Make recommendations/predictions for the test baskets using the rules learned by model1.
# truncate=False: show() cuts every cell to 20 characters by default, which would
# chop baskets such as [bread, milk, mushroom] and hide what was actually scored.
model1.transform(df_test).show(truncate=False)

### Citation/Reference <br>

Apache Spark. (2022). Frequent pattern mining. Retrieved April 6, 2022, from https://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html

Han, J., Pei, J., & Yin, Y. (2000). Mining frequent patterns without candidate generation. ACM SIGMOD Record, 29(2), 1–12. Association for Computing Machinery. Retrieved April 6, 2022, from https://dl.acm.org/doi/10.1145/335191.335372
<br>

In [0]:
# PrefixSpan: a sequential pattern mining algorithm

from pyspark.ml.fpm import PrefixSpan
from pyspark.sql import Row, SparkSession

# Each row holds one sequence: an ordered list of itemsets (lists of item ids).
# Built with spark.createDataFrame, like the other cells in this notebook, so the
# cell does not depend on the `sc` SparkContext global or the RDD API.
# NOTE: this rebinds `df`, which the FP-Growth demo cell at the top also assigns.
df = spark.createDataFrame([
    Row(sequence=[[1, 2], [3]]),
    Row(sequence=[[1], [3, 2], [1, 2]]),
    Row(sequence=[[1, 2], [5]]),
    Row(sequence=[[6]]),
])

# Configure PrefixSpan with a minimum support of 0.5, a maximum pattern length
# of 5, and a maximum local projected database size of 32,000,000.
prefixSpan = PrefixSpan(
    minSupport=0.5,
    maxPatternLength=5,
    maxLocalProjDBSize=32000000,
)

# Find frequent sequential patterns.
prefixSpan.findFrequentSequentialPatterns(df).show()