In [9]:
%pip install mlxtend





In [44]:
%pip install PyMuPDF

Note: you may need to restart the kernel to use updated packages.


In [112]:
import fitz  
def read_pdf(file_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        str: The ``page.get_text()`` output of each page joined together with
        no extra separator between pages.
    """
    # Opening the document as a context manager guarantees it is closed even
    # if text extraction fails part-way through.
    with fitz.open(file_path) as doc:
        # join() builds the result once instead of re-copying a growing string.
        return "".join(page.get_text() for page in doc)
    
# Pull the raw text out of the two transaction PDFs; both paths are relative
# to the notebook's working directory.
sports_text = read_pdf(file_path="sports.pdf")
space_text = read_pdf(file_path="space.pdf")

In [114]:
def parse(text):
    """Turn comma-separated transaction text into a list of item lists.

    The first line of ``text`` is treated as a header and skipped. On every
    remaining line the first comma-separated field (the transaction ID) is
    dropped and the other fields are whitespace-stripped.

    Blank lines and empty fields (for example from a trailing comma) are
    ignored, so they neither create phantom empty transactions, which would
    lower every support value, nor an empty-string item.

    Args:
        text: Newline-separated text, one transaction per line after the header.

    Returns:
        list[list[str]]: One list of item names per non-blank data line.
    """
    lines = text.strip().split("\n")
    transactions = []
    for line in lines[1:]:  # skip header line
        if not line.strip():
            # Blank line in the extracted text, not a transaction.
            continue
        fields = (field.strip() for field in line.split(",")[1:])  # skip transaction ID
        transactions.append([field for field in fields if field])
    return transactions
# Convert each dataset's raw text into a list of per-transaction item lists.
sports_transactions = parse(text=sports_text)
space_transactions = parse(text=space_text)

In [116]:

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

def encode(transactions):
    """Encode transactions with ``TransactionEncoder`` and wrap them in a DataFrame.

    Args:
        transactions: Iterable of item lists, one per transaction.

    Returns:
        pandas.DataFrame: The encoder's transformed array, with the encoder's
        learned ``columns_`` as column labels.
    """
    encoder = TransactionEncoder()
    encoder.fit(transactions)
    encoded = encoder.transform(transactions)
    return pd.DataFrame(data=encoded, columns=encoder.columns_)

# Build the encoded item matrix for each dataset.
sports_df = encode(transactions=sports_transactions)
space_df = encode(transactions=space_transactions)

In [118]:
from mlxtend.frequent_patterns import apriori, association_rules

def apply_apriori(df, min_support=0.2, min_confidence=0.6):
    """Mine frequent itemsets with Apriori and derive association rules.

    Args:
        df: Encoded transaction DataFrame (as produced by ``encode`` in this
            notebook).
        min_support: Minimum support passed to ``apriori``.
        min_confidence: Minimum confidence a rule must reach to be kept.
            Defaults to 0.6, the value that was previously hard-coded.

    Returns:
        tuple: ``(freq_items, rules)`` — the frequent-itemset DataFrame and the
        association-rule DataFrame.
    """
    freq_items = apriori(df, min_support=min_support, use_colnames=True)
    # NOTE(review): association_rules is believed to raise ValueError when
    # freq_items is empty — confirm against the installed mlxtend version.
    rules = association_rules(freq_items, metric="confidence", min_threshold=min_confidence)
    return freq_items, rules

# Run Apriori on both datasets using the function's default thresholds.
sports_freq, sports_rules = apply_apriori(df=sports_df)
space_freq, space_rules = apply_apriori(df=space_df)

# Only the sports results are printed in this cell.
print("Sports - Frequent Itemsets:\n", sports_freq)
print(
    "\nSports - Association Rules:\n",
    sports_rules.loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']],
)


Sports - Frequent Itemsets:
    support        itemsets
0     0.36  (cricket ball)
1     0.40   (cricket bat)
2     0.44      (football)
3     0.36        (gloves)
4     0.26     (ice cream)
5     0.42         (juice)
6     0.28  (water bottle)

Sports - Association Rules:
 Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []


In [120]:
from mlxtend.frequent_patterns import fpgrowth

def apply_fpgrowth(df, min_support=0.2, min_confidence=0.6):
    """Mine frequent itemsets with FP-Growth and derive association rules.

    Args:
        df: Encoded transaction DataFrame (as produced by ``encode`` in this
            notebook).
        min_support: Minimum support passed to ``fpgrowth``.
        min_confidence: Minimum confidence a rule must reach to be kept.
            Defaults to 0.6, the value that was previously hard-coded.

    Returns:
        tuple: ``(freq_items, rules)`` — the frequent-itemset DataFrame and the
        association-rule DataFrame.
    """
    freq_items = fpgrowth(df, min_support=min_support, use_colnames=True)
    # NOTE(review): association_rules is believed to raise ValueError when
    # freq_items is empty — confirm against the installed mlxtend version.
    rules = association_rules(freq_items, metric="confidence", min_threshold=min_confidence)
    return freq_items, rules

# Run FP-Growth on both datasets using the function's default thresholds.
# The Apriori results (sports_rules, space_rules) come from the Apriori cell
# and are not recomputed here.
sports_fp_freq, sports_fp_rules = apply_fpgrowth(sports_df)
space_fp_freq, space_fp_rules = apply_fpgrowth(space_df)

print("Sports - FP-Growth Frequent Itemsets:\n", sports_fp_freq)
print("\nSports - FP-Growth Association Rules:\n", sports_fp_rules[['antecedents','consequents','support','confidence','lift']])


Sports - FP-Growth Frequent Itemsets:
    support        itemsets
0     0.44      (football)
1     0.36        (gloves)
2     0.36  (cricket ball)
3     0.42         (juice)
4     0.40   (cricket bat)
5     0.28  (water bottle)
6     0.26     (ice cream)

Sports - FP-Growth Association Rules:
 Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []


In [122]:
from mlxtend.frequent_patterns import fpgrowth

# NOTE(review): apply_fpgrowth is already defined in an earlier cell of this
# notebook; this later definition shadows it. Consider deleting one of them.
def apply_fpgrowth(df, min_support=0.2, min_confidence=0.6):
    """Mine frequent itemsets with FP-Growth and derive association rules.

    Args:
        df: Encoded transaction DataFrame (as produced by ``encode`` in this
            notebook).
        min_support: Minimum support passed to ``fpgrowth``.
        min_confidence: Minimum confidence a rule must reach to be kept.
            Defaults to 0.6, the value that was previously hard-coded.

    Returns:
        tuple: ``(freq_items, rules)`` — the frequent-itemset DataFrame and the
        association-rule DataFrame.
    """
    freq_items = fpgrowth(df, min_support=min_support, use_colnames=True)
    rules = association_rules(freq_items, metric="confidence", min_threshold=min_confidence)
    return freq_items, rules

# Run FP-Growth on both datasets using the function's default thresholds.
sports_fp_freq, sports_fp_rules = apply_fpgrowth(df=sports_df)
space_fp_freq, space_fp_rules = apply_fpgrowth(df=space_df)

# Only the sports results are printed in this cell.
print("Sports - FP-Growth Frequent Itemsets:\n", sports_fp_freq)
print(
    "\nSports - FP-Growth Association Rules:\n",
    sports_fp_rules.loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']],
)



Sports - FP-Growth Frequent Itemsets:
    support        itemsets
0     0.44      (football)
1     0.36        (gloves)
2     0.36  (cricket ball)
3     0.42         (juice)
4     0.40   (cricket bat)
5     0.28  (water bottle)
6     0.26     (ice cream)

Sports - FP-Growth Association Rules:
 Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []


In [124]:
def compare_results(name, apriori_rules, fp_rules):
    """Print how many rules each algorithm produced for one dataset.

    Args:
        name: Label shown in the header line.
        apriori_rules: Sized collection of rules from Apriori.
        fp_rules: Sized collection of rules from FP-Growth.

    Prints a header, one count line per algorithm, then a blank line.
    """
    print(f"---- {name} ----")
    count_lines = (
        f"Apriori Rules Count: {len(apriori_rules)}",
        f"FP-Growth Rules Count: {len(fp_rules)}",
    )
    print(*count_lines, sep="\n")
    print()

# Report the Apriori vs. FP-Growth rule counts for each dataset.
compare_results(name="Sports", apriori_rules=sports_rules, fp_rules=sports_fp_rules)
compare_results(name="Space", apriori_rules=space_rules, fp_rules=space_fp_rules)


---- Sports ----
Apriori Rules Count: 0
FP-Growth Rules Count: 0

---- Space ----
Apriori Rules Count: 0
FP-Growth Rules Count: 0

