# Practical 3

In [None]:
''' 
Aim:
    Run Apriori algorithm to find frequent item sets and association rules on 2 real datasets and use
    appropriate evaluation measures to compute correctness of obtained patterns
        a) Use minimum support as 50% and minimum confidence as 75%
        b) Use minimum support as 60% and minimum confidence as 60%
'''

In [1]:
# Apriori on 2 real-world datasets + evaluation
# Transactional retail dataset 1: Groceries (common toy dataset)
# Transactional retail dataset 2: Online Retail or any market-basket CSV of transactions

In [2]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Load transactions from a text file in which each line is one transaction
# and the items of that transaction are separated by commas.
tr = []
with open('transactions.csv') as f:
    for line in f:
        items = [i.strip() for i in line.strip().split(',') if i.strip()]
        # A blank (or separator-only) line yields no items. Recording it as an
        # empty transaction would inflate the transaction count and thereby
        # lower the support of every itemset, so such lines are skipped.
        if items:
            tr.append(items)

# One-hot encode the transactions: one row per transaction, one boolean
# column per distinct item.
te = TransactionEncoder()
te_ary = te.fit(tr).transform(tr)
df_tr = pd.DataFrame(te_ary, columns=te.columns_)

def run_apriori(min_support, min_confidence):
    """Mine frequent itemsets from ``df_tr`` and derive association rules.

    Args:
        min_support: Minimum support threshold passed to ``apriori``.
        min_confidence: Minimum confidence threshold passed to
            ``association_rules`` (the rules are filtered on the
            ``"confidence"`` metric).

    Returns:
        A ``(frequent_itemsets, rules)`` tuple. ``rules`` is ordered by
        confidence and then lift, both descending.
    """
    itemsets = apriori(df_tr, min_support=min_support, use_colnames=True)
    candidate_rules = association_rules(
        itemsets,
        metric="confidence",
        min_threshold=min_confidence,
    )
    # No extra metric columns are computed here; the rules are only ranked,
    # strongest (highest confidence, then highest lift) first.
    ranked_rules = candidate_rules.sort_values(
        by=['confidence', 'lift'], ascending=False
    )
    return itemsets, ranked_rules

# Experiment (a): minimum support 50%, minimum confidence 75%.
fi_50, rules_50_75 = run_apriori(min_support=0.5, min_confidence=0.75)

# Experiment (b): minimum support 60%, minimum confidence 60%.
fi_60, rules_60_60 = run_apriori(min_support=0.6, min_confidence=0.6)

# Summary: number of frequent itemsets for (a) and number of rules for both.
print("Frequent (50%) count:", len(fi_50))
print("Rules (50/75):", len(rules_50_75))
print("Rules (60/60):", len(rules_60_60))


FileNotFoundError: [Errno 2] No such file or directory: 'transactions.csv'

In [None]:
# Rule correctness test on a holdout
# You can split transactions into train/test, 
# learn rules on train, then compute empirical rule accuracy on test:
from sklearn.model_selection import train_test_split

# Hold out 30% of the transactions for evaluation; the fixed seed keeps the
# split reproducible between runs.
tr_train, tr_test = train_test_split(tr, test_size=0.3, random_state=42)

# Fit the encoder on the training transactions only, then encode both splits
# with it so that train and test share the same item columns.
te = TransactionEncoder()
te.fit(tr_train)
te_ary_train = te.transform(tr_train)
te_ary_test = te.transform(tr_test)
df_train = pd.DataFrame(te_ary_train, columns=te.columns_)
df_test = pd.DataFrame(te_ary_test, columns=te.columns_)

# Learn itemsets and rules from the training split
# (minimum support 50%, minimum confidence 75%).
fi = apriori(df_train, min_support=0.5, use_colnames=True)
rules = association_rules(fi, metric='confidence', min_threshold=0.75)

# Evaluate each rule on test:
def rule_accuracy_on_transactions(rule, df_test):
    """Return the empirical accuracy of one association rule on ``df_test``.

    The accuracy is the fraction of rows containing every antecedent item
    that also contain every consequent item.

    Args:
        rule: Mapping-like object (for example a row of a rules DataFrame)
            whose ``'antecedents'`` and ``'consequents'`` entries are
            iterables of column labels of ``df_test``.
        df_test: DataFrame with one row per transaction and one truthy/falsy
            column per item.

    Returns:
        The fraction described above, or NaN when no row of ``df_test``
        contains all antecedent items (the rule cannot be evaluated).
    """
    antecedents = list(rule['antecedents'])
    consequents = list(rule['consequents'])
    # Rows in which every antecedent item is present.
    antecedent_mask = df_test[antecedents].all(axis=1)
    n_antecedent = antecedent_mask.sum()
    if n_antecedent == 0:
        # Use the built-in float NaN: the original referenced ``np.nan``
        # although numpy is not imported in this file, which raised NameError.
        return float('nan')
    # Rows in which every consequent item is present.
    consequent_mask = df_test[consequents].all(axis=1)
    return (antecedent_mask & consequent_mask).sum() / n_antecedent

# Score every learned rule against the held-out transactions, in the same
# order as the rows of ``rules``.
accuracies = [
    rule_accuracy_on_transactions(rule_row, df_test)
    for _, rule_row in rules.iterrows()
]

# Attach the holdout score and show it next to the training-side metrics.
rules['test_accuracy'] = accuracies
print(rules[['antecedents','consequents','support','confidence','lift','test_accuracy']])
