# Market Basket Analysis

This notebook performs market basket analysis on the Sample Superstore dataset using a simple pair-counting approach in the spirit of Apriori. It identifies the most frequent item pairs and computes their support, confidence, and lift.


In [None]:
import pandas as pd
from itertools import combinations

# Read the Superstore export (the file is opened with ISO-8859-1 encoding)
df = pd.read_csv('Sample_Superstore.csv', encoding='ISO-8859-1')

# One transaction per order: the list of product names sharing an 'Order ID'
order_groups = df.groupby('Order ID')
transactions = order_groups['Product Name'].apply(list)

transaction_total = len(transactions)
print('Number of transactions:', transaction_total)


In [None]:
# Count co-occurring product pairs
from collections import Counter

# Tally, for every unordered product pair, the number of orders containing both
pair_counts = Counter()
for basket in transactions:
    # Collapse repeats within an order, then sort so each pair has a single
    # canonical (Item A, Item B) ordering
    distinct_products = sorted(set(basket))
    pair_counts.update(combinations(distinct_products, 2))

# Tabulate the counts, most frequent first, and display the top 10 pairs
pair_rows = [(first, second, n) for (first, second), n in pair_counts.items()]
pair_df = pd.DataFrame(pair_rows, columns=['Item A', 'Item B', 'Count'])
pair_df = pair_df.sort_values('Count', ascending=False)
pair_df.head(10)


In [None]:
# Compute basic metrics (support, confidence, lift) for the 10 most frequent pairs
from collections import defaultdict

total = len(transactions)

# Number of orders containing each product; set() makes repeated products
# within a single order count once
item_counts = defaultdict(int)
for items in transactions:
    for item in set(items):
        item_counts[item] += 1

# Restrict the metrics to the first 10 rows of pair_df
top_pairs = pair_df.head(10)

# Declaring the columns up front keeps results_df well-formed (and sortable
# by 'Lift') even when top_pairs is empty and no result rows are produced
metric_columns = ['Pair', 'Count', 'Support',
                  'Confidence (A->B)', 'Confidence (B->A)', 'Lift']

results = []
# Walk the three columns in lockstep instead of iterrows(), which builds a
# Series object for every row
for a, b, count in zip(top_pairs['Item A'], top_pairs['Item B'], top_pairs['Count']):
    support = count / total              # share of orders containing both items
    support_a = item_counts[a] / total   # share of orders containing item A
    support_b = item_counts[b] / total   # share of orders containing item B
    # Confidence (X->Y) = support(X and Y) / support(X); guards avoid division by zero
    confidence_ab = support / support_a if support_a > 0 else 0
    confidence_ba = support / support_b if support_b > 0 else 0
    # Lift = support(A and B) / (support(A) * support(B)); 1 means independence
    lift = support / (support_a * support_b) if support_a * support_b > 0 else 0
    results.append({'Pair': f"{a} & {b}", 'Count': count, 'Support': support,
                    'Confidence (A->B)': confidence_ab, 'Confidence (B->A)': confidence_ba, 'Lift': lift})

results_df = pd.DataFrame(results, columns=metric_columns)
# Display the pairs ordered by lift, strongest association first
results_df.sort_values('Lift', ascending=False)
