# Section 2: Data Mining
## Task 3 Part B: Association Rule Mining (10 Marks)
This stand-alone notebook generates synthetic transactional data, runs Apriori (mlxtend if available, else a minimal fallback), extracts rules with specified thresholds, and provides analytical commentary.

In [None]:
# 1. Imports & Configuration
import numpy as np
import pandas as pd
from pathlib import Path
import random, json

# Every exported file (CSV, JSON) lands in this folder, relative to the
# notebook's working directory; create it up front so later cells can write.
ARTIFACT_DIR = Path('artifacts')
ARTIFACT_DIR.mkdir(exist_ok=True)

# One seed drives both the stdlib and the legacy NumPy global generators so
# that a fresh "Restart & Run All" reproduces the same results.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

print('Configuration initialized.')

In [None]:
# 2. Synthetic Transaction Generation
# Catalogue of grocery items that random baskets are sampled from.
item_pool = [
    'milk', 'bread', 'butter', 'cheese', 'eggs',
    'beer', 'diapers', 'apples', 'bananas', 'cereal',
    'chicken', 'rice', 'pasta', 'tomatoes', 'onions',
    'yogurt', 'chips', 'soda', 'coffee', 'tea',
]
n_transactions = 45  # within required 20-50 range

# Planted co-occurrences: (probability of injection, items forced into the basket).
# The order matters because one random draw is consumed per entry, per basket.
planted_pairs = [
    (0.45, ('milk', 'bread')),     # bread & milk synergy
    (0.30, ('beer', 'diapers')),   # beer & diapers classic example
    (0.35, ('coffee', 'tea')),     # coffee & tea pairing
]

rng = np.random.default_rng(RANDOM_SEED)
transactions = []
for _ in range(n_transactions):
    # Random base basket of 3-8 distinct items (upper bound of integers() is exclusive).
    basket_size = rng.integers(3, 9)
    basket_items = set(rng.choice(item_pool, size=basket_size, replace=False).tolist())
    # Independently decide, pattern by pattern, whether to add the planted pair.
    for inject_prob, pair in planted_pairs:
        if rng.random() < inject_prob:
            basket_items.update(pair)
    # Store each basket as a sorted list of unique items.
    transactions.append(sorted(basket_items))

print('First 5 transactions sample:')
for sample_basket in transactions[:5]:
    print(sample_basket)
print('Total transactions:', len(transactions))

In [None]:
# 3. Apriori Mining (mlxtend preferred, fallback otherwise)
MIN_SUPPORT = 0.2
MIN_CONF = 0.5
# Single column layout produced by BOTH code paths, so later cells (sorting,
# CSV export, metadata) never depend on which implementation ran.
RULE_COLUMNS = ['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage', 'conviction']

# mlxtend is an optional dependency: import it lazily and degrade gracefully.
try:
    from mlxtend.frequent_patterns import apriori, association_rules
    from mlxtend.preprocessing import TransactionEncoder
    use_mlxtend = True
    print('Using mlxtend Apriori implementation.')
except ImportError:
    use_mlxtend = False
    print('mlxtend not installed; using simplified fallback Apriori (pairs only).')


def _fallback_pair_rules(transactions, min_support, min_conf):
    """Mine single-antecedent / single-consequent rules without mlxtend.

    Only frequent 1-itemsets and 2-itemsets are considered, so every rule has
    the form {a} -> {b}.

    Parameters
    ----------
    transactions : iterable of iterables of hashable items
        One inner iterable per basket.
    min_support : float
        Minimum fraction of baskets an itemset must appear in.
    min_conf : float
        Minimum confidence a rule must reach to be kept.

    Returns
    -------
    pandas.DataFrame
        One row per rule with the columns listed in ``RULE_COLUMNS``. The frame
        is empty (but still has those columns) when nothing qualifies.
    """
    baskets = [set(t) for t in transactions]
    n_baskets = len(baskets)
    if n_baskets == 0:
        return pd.DataFrame(columns=RULE_COLUMNS)

    def support(itemset):
        # Fraction of baskets that contain every item of `itemset`.
        return sum(1 for basket in baskets if itemset <= basket) / n_baskets

    # Sorted item order keeps rule order independent of set/hash iteration order.
    all_items = sorted(set().union(*baskets))
    item_support = {item: support({item}) for item in all_items}
    frequent_items = [item for item in all_items if item_support[item] >= min_support]

    rows = []
    for idx, a in enumerate(frequent_items):
        for b in frequent_items[idx + 1:]:
            pair_support = support({a, b})
            if pair_support < min_support:
                continue
            # Each frequent pair can yield a rule in either direction.
            for ante, cons in ((a, b), (b, a)):
                sup_ante, sup_cons = item_support[ante], item_support[cons]
                confidence = pair_support / sup_ante
                if confidence < min_conf:
                    continue
                rows.append({
                    # frozenset matches the type mlxtend uses for these columns.
                    'antecedents': frozenset([ante]),
                    'consequents': frozenset([cons]),
                    'support': pair_support,
                    'confidence': confidence,
                    'lift': confidence / sup_cons,
                    'leverage': pair_support - sup_ante * sup_cons,
                    # Conviction is unbounded for a rule that never fails.
                    'conviction': (1 - sup_cons) / (1 - confidence) if confidence < 1 else np.inf,
                })
    # Passing `columns` guarantees the schema even when `rows` is empty.
    return pd.DataFrame(rows, columns=RULE_COLUMNS)


if use_mlxtend:
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    df_tx = pd.DataFrame(te_ary, columns=te.columns_)
    frequent = apriori(df_tx, min_support=MIN_SUPPORT, use_colnames=True)
    if frequent.empty:
        # association_rules() rejects an empty itemset frame, so short-circuit
        # to an empty result with the expected columns instead of crashing.
        rules = pd.DataFrame(columns=RULE_COLUMNS)
    else:
        rules = association_rules(frequent, metric='confidence', min_threshold=MIN_CONF)
        rules = rules[RULE_COLUMNS]
else:
    # Minimal fallback focusing on singletons & pairs
    rules = _fallback_pair_rules(transactions, MIN_SUPPORT, MIN_CONF)

if rules.empty:
    print('No rules found at given thresholds. Consider lowering support/confidence.')
else:
    print('Total candidate rules:', len(rules))

# Lift is symmetric (A -> B and B -> A share the same value), so ties are
# guaranteed; break them by confidence so the top-5 cut-off is well defined.
rules_sorted = (
    rules
    .sort_values(['lift', 'confidence'], ascending=False)
    .head(5)
    .reset_index(drop=True)
)
display(rules_sorted)
rules_sorted.to_csv(ARTIFACT_DIR / 'top5_rules_partB.csv', index=False)
print('Saved top 5 rules to artifacts/top5_rules_partB.csv')

# 4. Analysis
A representative high-lift rule such as {bread} → {milk} (lift > 1) implies that the presence of bread meaningfully increases the probability that milk appears in the same basket relative to its baseline frequency. Practically, retailers can exploit this by: (1) cross-promoting milk near bread aisles, (2) bundling discounts to increase average basket value, and (3) ensuring synchronized replenishment to avoid stockouts that would reduce rule utility. Because lift normalizes a rule's confidence by the consequent's marginal support, it helps filter out spurious associations that are driven purely by item popularity. Still, rules must be validated over time: seasonality, promotions, and changing customer habits can erode rule strength. A/B testing of recommendations based on the rule (e.g., suggesting milk at online checkout after bread is added) quantifies the uplift in conversion. Additionally, combining rules with customer segmentation can personalize which associations to prioritize for distinct shopper cohorts.

In [None]:
# 5. Metadata & Persistence
# Run summary: the mining thresholds used and how many rules they produced.
metadata = dict(
    n_transactions=len(transactions),
    min_support=MIN_SUPPORT,
    min_confidence=MIN_CONF,
    rules_found=len(rules),
    top5_rules_count=len(rules_sorted),
)

# Serialize first, then write in one call; the file is closed automatically.
metadata_path = ARTIFACT_DIR / 'task3b_metadata.json'
metadata_path.write_text(json.dumps(metadata, indent=2))
print('Saved metadata to artifacts/task3b_metadata.json')

# Show the same summary as a one-row table in the notebook output.
display(pd.DataFrame([metadata]))

### Part B Complete
Generated transactions, mined rules with Apriori (or fallback), exported top 5 by lift, and provided actionable analysis.