In [24]:
# ! apt-get install openjdk-8-jdk-headless -qq > /dev/null
# ! wget -q https://archive.apache.org/dist/spark/spark-3.5.4/spark-3.5.4-bin-hadoop3.tgz
# ! tar xf spark-3.5.4-bin-hadoop3.tgz
# ! pip install -q findspark

In [None]:
import os
from itertools import combinations

import findspark
os.environ["SPARK_HOME"] = "spark-3.5.4-bin-hadoop3"
findspark.init()

import pyspark as spark
from pyspark import SparkContext

from pyspark.sql import (
    SparkSession,
    DataFrame,
    functions as f
)
from pyspark.sql.functions import (
    collect_list,
    explode,
    col,
    size,
)
from pyspark.sql.types import (
    ArrayType,
    IntegerType,
    StringType,
)

In [54]:
spark = SparkSession \
        .builder \
        .appName("task02c") \
        .getOrCreate()
sc = spark.sparkContext
spark.sparkContext.setLogLevel("ERROR") 

In [55]:
file_path = "baskets.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)
df = df.dropna()

In [None]:
basket_df = df.groupBy("Member_number", "Date") \
              .agg(collect_list("itemDescription"))

basket_df = basket_df.toDF("id", "date", "item_list")
basket_df.show(10)

+----+----------+--------------------+
|  id|      date|           item_list|
+----+----------+--------------------+
|1038|04/10/2015|[other vegetables...|
|1042|02/12/2015|     [beef, dessert]|
|1081|02/11/2014| [beverages, yogurt]|
|1182|21/01/2015| [beef, butter milk]|
|1217|12/03/2014|[citrus fruit, gr...|
|1233|04/07/2014|    [UHT-milk, curd]|
|1263|19/10/2014|[frozen meals, bo...|
|1294|07/05/2014|[hygiene articles...|
|1305|24/05/2015|[citrus fruit, tr...|
|1342|14/02/2014|[napkins, white b...|
+----+----------+--------------------+
only showing top 10 rows



In [57]:
from pcy import PCY, PCYModel

pcy_analyzer = PCY(min_support=100, min_confidence=0.1)

pcy_model = pcy_analyzer.fit(basket_df)

print("\n--- Results from PCYModel ---")
pcy_model.show_frequent_pairs(n=10)
pcy_model.show_association_rules(n=10)


--- Results from PCYModel ---
Top 10 frequent pairs found:
+----------------+----------+-------+
|item1           |item2     |support|
+----------------+----------+-------+
|other vegetables|whole milk|243    |
|rolls/buns      |whole milk|227    |
|soda            |whole milk|199    |
|whole milk      |yogurt    |183    |
|other vegetables|rolls/buns|182    |
|other vegetables|soda      |160    |
|whole milk      |whole milk|148    |
|sausage         |whole milk|147    |
|tropical fruit  |whole milk|136    |
|other vegetables|yogurt    |131    |
+----------------+----------+-------+
only showing top 10 rows

Top 10 association rules found (ordered by confidence):
+--------------+----------+-------+-------------------+
|item1         |item2     |support|conf               |
+--------------+----------+-------+-------------------+
|bottled beer  |whole milk|113    |0.16448326055312956|
|sausage       |whole milk|147    |0.1590909090909091 |
|pip fruit     |whole milk|110    |0.147849462