<a href="https://colab.research.google.com/github/krishatuladhar/Data-Warehousing-and-Data-Mining/blob/main/lab2dwm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab 2
# Apriori algorithm:
* Find frequently occurring itemsets using the Apriori algorithm.
* Compute the support of the frequent itemset.
* Compute the confidence and lift of an association rule.

# FP-Growth algorithm:
* Find frequently occurring itemsets using the FP-Growth algorithm.
* Compute the support of the frequent itemset.
* Compute the confidence and lift of an association rule.
* Compare the Apriori and FP-Growth algorithms.
* Note: Use sports.txt and space.txt as the input data.


In [None]:
# Frequent Itemset Mining using Apriori and FP-Growth

import pandas as pd
from itertools import combinations
from collections import defaultdict
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules
import time

# Settings
# Thresholds shared by both mining approaches: an itemset is kept when its
# relative support is at least MIN_SUPPORT, and a rule is kept when its
# confidence is at least MIN_CONFIDENCE.
MIN_SUPPORT = 0.15
MIN_CONFIDENCE = 0.7

# Dataset label -> location of the transaction file. Datasets are processed
# in the order they are listed here.
FILE_PATHS = {
    "space": "/content/drive/MyDrive/datawarehousinglab/space.txt",
    "sports": "/content/drive/MyDrive/datawarehousinglab/sports.txt",
}

# --- Helper Functions ---
def get_support(itemset, transactions):
    """Return the fraction of transactions that contain every item of itemset.

    Args:
        itemset: A ``set`` or ``frozenset`` of items.
        transactions: A sized collection of transactions, each an iterable
            of hashable items.

    Returns:
        The relative support as a float in ``[0, 1]``. An empty collection
        of transactions yields ``0.0`` instead of dividing by zero.
    """
    total = len(transactions)
    if total == 0:
        return 0.0
    # issubset accepts any iterable, so each transaction can be passed as-is.
    matches = sum(1 for tx in transactions if itemset.issubset(tx))
    return matches / total

def apriori_algorithm(transactions, min_support):
    """Find all frequent itemsets with a level-wise (Apriori-style) search.

    Args:
        transactions: A sized collection of transactions, each an iterable
            of hashable items.
        min_support: Minimum relative support (count / number of
            transactions) an itemset needs to be reported.

    Returns:
        A dict mapping each frequent itemset (``frozenset``) to the number
        of transactions that contain it. Empty when there are no
        transactions.
    """
    total = len(transactions)

    # Convert every transaction to a set exactly once. Besides avoiding a
    # rebuild on every level, this makes an item that is repeated inside one
    # transaction count only once towards its support.
    tx_sets = [frozenset(tx) for tx in transactions]

    # Step 1: Count individual items (once per transaction)
    item_counts = defaultdict(int)
    for tx_set in tx_sets:
        for item in tx_set:
            item_counts[frozenset([item])] += 1

    freq_items = {
        item: count
        for item, count in item_counts.items()
        if count / total >= min_support
    }
    all_frequent = dict(freq_items)
    current_freq = list(freq_items)
    k = 2

    # Step 2: Grow itemsets one item at a time until a level is empty
    while current_freq:
        # Join pairs of frequent (k-1)-itemsets; keep unions of exactly k items.
        candidates = set()
        for left, right in combinations(current_freq, 2):
            union = left | right
            if len(union) == k:
                candidates.add(union)

        # Count in how many transactions each candidate occurs.
        candidate_counts = defaultdict(int)
        for tx_set in tx_sets:
            for candidate in candidates:
                if candidate.issubset(tx_set):
                    candidate_counts[candidate] += 1

        current_freq = [
            candidate
            for candidate, count in candidate_counts.items()
            if count / total >= min_support
        ]
        all_frequent.update(
            {candidate: candidate_counts[candidate] for candidate in current_freq}
        )
        k += 1

    return all_frequent

def extract_rules(frequent_itemsets, transactions, min_confidence):
    """Derive association rules from frequent itemsets.

    For every frequent itemset with at least two items, each non-empty
    proper subset is tried as the antecedent and the remaining items as the
    consequent.

    Args:
        frequent_itemsets: Mapping of ``frozenset`` itemset -> number of
            transactions containing it.
        transactions: A sized collection of transactions, each an iterable
            of hashable items.
        min_confidence: Minimum confidence a rule needs to be reported.

    Returns:
        A list of dicts with the keys ``antecedents``, ``consequents``,
        ``support``, ``confidence`` and ``lift``; the three metrics are
        rounded to two decimals. Empty when there are no transactions.
    """
    total = len(transactions)
    if total == 0:
        # No data means no supports can be computed, hence no rules.
        return []

    # Build the transaction sets once and memoize supports: the same
    # antecedent/consequent shows up in many rules, and rescanning the whole
    # dataset for each occurrence is wasted work.
    tx_sets = [set(tx) for tx in transactions]
    support_cache = {}

    def support_of(items):
        """Return the relative support of ``items`` over ``tx_sets``."""
        if items not in support_cache:
            hits = sum(1 for tx_set in tx_sets if items <= tx_set)
            support_cache[items] = hits / total
        return support_cache[items]

    rules = []
    for itemset, count in frequent_itemsets.items():
        if len(itemset) < 2:
            continue
        support_itemset = count / total

        for size in range(1, len(itemset)):
            for combo in combinations(itemset, size):
                antecedent = frozenset(combo)
                consequent = itemset - antecedent
                support_ante = support_of(antecedent)
                support_cons = support_of(consequent)
                if support_ante == 0 or support_cons == 0:
                    # Only possible when the itemset counts do not match the
                    # transactions; confidence/lift are undefined, so skip.
                    continue
                confidence = support_itemset / support_ante
                lift = confidence / support_cons
                if confidence >= min_confidence:
                    rules.append({
                        'antecedents': set(antecedent),
                        'consequents': set(consequent),
                        'support': round(support_itemset, 2),
                        'confidence': round(confidence, 2),
                        'lift': round(lift, 2)
                    })
    return rules

# --- Main Processing Function ---
def analyze_dataset(name, path):
    """Mine one dataset with both algorithms and print rules and timings.

    The file at ``path`` is read as text: the first line is skipped, and on
    every following non-blank line the first comma-separated field is
    dropped and the remaining non-empty fields form one transaction.
    NOTE(review): this presumably means a header row and a leading
    transaction-id column - confirm against the input files.

    Args:
        name: Label used in the printed section header.
        path: Location of the transaction file.

    Returns:
        None. All results are printed.
    """
    print(f"\n=== Analyzing '{name}' Dataset ===")

    # Load transaction data
    with open(path, 'r') as file:
        next(file, None)  # skip the first line; tolerate an empty file
        transactions = [
            [item.strip() for item in line.strip().split(',')[1:] if item.strip()]
            for line in file
            if line.strip()  # blank lines are not transactions
        ]

    if not transactions:
        print(f"No transactions found in '{path}'.")
        return

    # --- Apriori ---
    print("\n[Apriori Algorithm]")
    start_apriori = time.perf_counter()
    apriori_itemsets = apriori_algorithm(transactions, MIN_SUPPORT)
    apriori_rules = extract_rules(apriori_itemsets, transactions, MIN_CONFIDENCE)
    apriori_time = time.perf_counter() - start_apriori

    if not apriori_rules:
        print("No association rules found with confidence >=", MIN_CONFIDENCE)
    else:
        apriori_rules_df = pd.DataFrame(apriori_rules)
        print(apriori_rules_df[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

    # --- FP-Growth ---
    print("\n[FP-Growth Algorithm]")
    start_fp = time.perf_counter()
    te = TransactionEncoder()
    df_encoded = pd.DataFrame(te.fit_transform(transactions), columns=te.columns_)
    fp_itemsets = fpgrowth(df_encoded, min_support=MIN_SUPPORT, use_colnames=True)
    if fp_itemsets.empty:
        # Without frequent itemsets there is nothing to derive rules from.
        fp_rules = pd.DataFrame()
    else:
        fp_rules = association_rules(fp_itemsets, metric="confidence", min_threshold=MIN_CONFIDENCE)
    fp_time = time.perf_counter() - start_fp

    if fp_rules.empty:
        print("No association rules found using FP-Growth with confidence >=", MIN_CONFIDENCE)
    else:
        print(fp_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

    # --- Time Comparison ---
    print("\n[Performance]")
    print(f"Apriori Time: {round(apriori_time, 4)}s")
    print(f"FP-Growth Time: {round(fp_time, 4)}s")

    print("\n[Summary]")
    print(f"Apriori Rules: {len(apriori_rules)}")
    print(f"FP-Growth Rules: {len(fp_rules)}")
    # Compare elapsed durations. Comparing the end timestamps would always
    # favour Apriori simply because it runs first.
    print("FP-Growth is faster." if fp_time < apriori_time else "Apriori is faster.")

# --- Run on All Datasets ---
# Process every configured dataset, in the order FILE_PATHS lists them.
for name in FILE_PATHS:
    path = FILE_PATHS[name]
    analyze_dataset(name, path)



=== Analyzing 'space' Dataset ===

[Apriori Algorithm]
No association rules found with confidence >= 0.7

[FP-Growth Algorithm]
No association rules found using FP-Growth with confidence >= 0.7

[Performance]
Apriori Time: 0.0004s
FP-Growth Time: 0.0431s

[Summary]
Apriori Rules: 0
FP-Growth Rules: 0
Apriori is faster.

=== Analyzing 'sports' Dataset ===

[Apriori Algorithm]
No association rules found with confidence >= 0.7

[FP-Growth Algorithm]
No association rules found using FP-Growth with confidence >= 0.7

[Performance]
Apriori Time: 0.0004s
FP-Growth Time: 0.0231s

[Summary]
Apriori Rules: 0
FP-Growth Rules: 0
Apriori is faster.
