In [17]:
import pandas as pd
from itertools import combinations

In [37]:
#  df = pd.DataFrame({
#         'Transactions': ['T1','T1','T1','T2','T3','T3','T3','T3','T4','T4','T4','T5','T5','T6','T6','T6','T6','T7','T7','T8','T8','T9','T9','T9','T9','T10','T10'], 
#         'Item': ['bread','butter','milk','milk','bread','milk','sugar','tea','bread','butter','milk','milk','cereals','milk','coffee','sugar','tea','milk','bread','cereals','butter','bread','cereals','sugar','tea','bread','coffee']})
# df.head()

df = pd.DataFrame({
        'TID': ['T1','T1','T1','T2','T2','T2','T2','T2','T3','T3','T3','T4','T4','T4','T4','T5','T5','T5','T5'], 
        'Item': ['bread','butter','milk','bread','butter','coffee','sugar','milk','butter','coffee','milk','butter','coffee','sugar','milk','bread','butter','coffee']})
df.head()

Unnamed: 0,Transactions,Item
0,T1,bread
1,T1,butter
2,T1,milk
3,T2,milk
4,T3,bread


In [38]:
def generate_one_hot_encoding(df):
    """
        This method generates one hot encoding for pandas dataframe
        Output to this method is also pandas dataframe
        This method utlizes in-built method from pandas to return one-hot encoding
    """
    one_hot_encoded = pd.get_dummies(df['Item'])
    
    basket = pd.concat([df['Transactions'], one_hot_encoded],axis=1)
    basket = basket.groupby('Transactions').sum().reset_index()
    return basket

In [41]:
basket = generate_one_hot_encoding(df)
basket = basket.drop(columns='Transactions', axis=1,)
basket.head()

Unnamed: 0,bread,butter,cereals,coffee,milk,sugar,tea
0,1,1,0,0,1,0,0
1,1,0,0,1,0,0,0
2,0,0,0,0,1,0,0
3,1,0,0,0,1,1,1
4,1,1,0,0,1,0,0


In [42]:
def generate_frequent_itemsets(df, min_support=0.5):
    """
    Implements Apriori algorithm to generate frequent itemsets from binary transaction data.
    
    Parameters:
    df (pd.DataFrame): A dataframe with binary values (0/1) where rows represent transactions and
                       columns represent items.
    min_support (float): Minimum support threshold.
    
    Returns:
    dict: A dictionary where keys are frequent itemsets (frozenset) and values are their support.
    """
    # Number of transactions
    num_transactions = len(df)
    
    # Step 1: Find frequent 1-itemsets
    item_support = {}
    for column in df.columns:
        
        print(type(int(df[column].sum())))
        print(type(num_transactions))
        support = int(df[column].sum()) / num_transactions
        if support >= min_support:
            item_support[frozenset([column])] = support
    
    # Initialize the frequent itemsets with frequent 1-itemsets
    frequent_itemsets = item_support.copy()
    
    # Step 2: Generate candidate itemsets of size k and filter by min_support
    k = 2
    current_itemsets = list(item_support.keys())
    
    while current_itemsets:
        # Generate candidate itemsets of size k by combining k-1 itemsets
        candidate_itemsets = []
        for i in range(len(current_itemsets)):
            for j in range(i + 1, len(current_itemsets)):
                candidate = current_itemsets[i].union(current_itemsets[j])
                if len(candidate) == k:
                    candidate_itemsets.append(candidate)
        
        # Calculate support for candidate itemsets
        candidate_support = {}
        for candidate in candidate_itemsets:
            # Check how many transactions contain this candidate
            support = df[list(candidate)].all(axis=1).sum() / num_transactions
            if support >= min_support:
                candidate_support[frozenset(candidate)] = support
        
        # Add frequent k-itemsets to the result
        frequent_itemsets.update(candidate_support)
        
        # Prepare for the next iteration (generate k+1 itemsets)
        current_itemsets = list(candidate_support.keys())
        k += 1
    
    return frequent_itemsets

In [43]:
import pandas as pd
from itertools import combinations

def generate_rules(frequent_itemsets, min_confidence=0.5):
    """
    Generate association rules from frequent itemsets along with confidence and lift metrics.
    
    Parameters:
    frequent_itemsets (dict): Dictionary where keys are itemsets and values are their support.
    min_confidence (float): Minimum confidence threshold for filtering rules.
    
    Returns:
    pd.DataFrame: A DataFrame of rules with antecedent, consequent, confidence, and lift.
    """
    rules = []
    itemsets = list(frequent_itemsets.keys())
    
    for itemset in itemsets:
        if len(itemset) > 1:
            # For each frequent itemset, generate all possible non-empty proper subsets (A)
            for i in range(1, len(itemset)):
                for antecedent in combinations(itemset, i):
                    antecedent = frozenset(antecedent)
                    consequent = itemset - antecedent
                    
                    if consequent:
                        # Calculate confidence: P(A → B) = support(A ∪ B) / support(A)
                        confidence = frequent_itemsets[itemset] / frequent_itemsets[antecedent]
                        
                        # Only consider rules with confidence greater than or equal to min_confidence
                        if confidence >= min_confidence:
                            # Calculate lift: Lift(A → B) = confidence(A → B) / support(B)
                            lift = confidence / frequent_itemsets[consequent]
                            
                            # Append the rule as a row to the list
                            rules.append({
                                'antecedent': set(antecedent),
                                'consequent': set(consequent),
                                'confidence': confidence,
                                'lift': lift
                            })
    
    # Convert the list of rules to a DataFrame
    return pd.DataFrame(rules)




In [61]:
def generate_rules_2(frequent_itemsets, min_confidence=0.5, max_antecedent=3, max_consequent=3):
    """
    Generate association rules from frequent itemsets along with confidence, lift, and support metrics.
    
    Parameters:
    frequent_itemsets (dict): Dictionary where keys are itemsets (frozenset) and values are their support.
    min_confidence (float): Minimum confidence threshold for filtering rules.
    max_antecedent (int): Maximum number of items allowed in the antecedent.
    max_consequent (int): Maximum number of items allowed in the consequent.
    
    Returns:
    pd.DataFrame: A DataFrame of rules with antecedent, consequent, confidence, lift, and support.
    """
    rules = []
    itemsets = list(frequent_itemsets.keys())
    
    for itemset in itemsets:
        if len(itemset) > 1:
            # For each frequent itemset, generate all possible non-empty proper subsets (A)
            for i in range(1, min(max_antecedent, len(itemset)) + 1):
                for antecedent in combinations(itemset, i):
                    antecedent = frozenset(antecedent)
                    consequent = itemset - antecedent
                    
                    if consequent and len(consequent) <= max_consequent:
                        # Calculate confidence: P(A → B) = support(A ∪ B) / support(A)
                        confidence = frequent_itemsets[itemset] / frequent_itemsets[antecedent]
                        
                        # Only consider rules with confidence greater than or equal to min_confidence
                        if confidence >= min_confidence:
                            # Calculate lift: Lift(A → B) = confidence(A → B) / support(B)
                            lift = confidence / frequent_itemsets[frozenset(consequent)]
                            
                            # Get support for the full itemset (antecedent ∪ consequent)
                            support = frequent_itemsets[itemset]
                            
                            # Append the rule as a row to the list
                            rules.append({
                                'antecedent': antecedent,
                                'consequent': frozenset(consequent),                                
                                'support': support,
                                'confidence': confidence,
                                'lift': lift,
                            })
    
    # Convert the list of rules to a DataFrame
    return pd.DataFrame(rules)


In [64]:
def generate_rules_3(frequent_itemsets, min_confidence=0.5, max_antecedent=2, max_consequent=2):
    """
    Generate association rules from frequent itemsets along with confidence, lift, and support metrics.
    
    Parameters:
    frequent_itemsets (dict): Dictionary where keys are itemsets (frozenset) and values are their support.
    min_confidence (float): Minimum confidence threshold for filtering rules.
    max_antecedent (int): Maximum number of items allowed in the antecedent.
    max_consequent (int): Maximum number of items allowed in the consequent.
    
    Returns:
    pd.DataFrame: A DataFrame of rules with antecedent, consequent, confidence, lift, and support.
    """
    rules = []
    itemsets = list(frequent_itemsets.keys())
    
    for itemset in itemsets:
        if len(itemset) > 1:
            # For each frequent itemset, generate all possible non-empty proper subsets (A)
            for i in range(1, min(max_antecedent, len(itemset)) + 1):
                for antecedent in combinations(itemset, i):
                    antecedent = frozenset(antecedent)
                    consequent = itemset - antecedent
                    
                    if consequent and len(consequent) <= max_consequent:
                        # Calculate confidence: P(A → B) = support(A ∪ B) / support(A)
                        confidence = frequent_itemsets[itemset] / frequent_itemsets[antecedent]
                        
                        # Only consider rules with confidence greater than or equal to min_confidence
                        if confidence >= min_confidence:
                            # Calculate lift: Lift(A → B) = confidence(A → B) / support(B)
                            lift = confidence / frequent_itemsets[frozenset(consequent)]
                            
                            # Get support for the full itemset (antecedent ∪ consequent)
                            support = frequent_itemsets[itemset]
                            
                            # Convert sets to strings for display without brackets
                            rules.append({
                                'antecedent': ', '.join(antecedent),
                                'consequent': ', '.join(consequent),
                                'confidence': confidence,
                                'lift': lift,
                                'support': support
                            })
    
    # Convert the list of rules to a DataFrame
    return pd.DataFrame(rules)

In [65]:
# Example usage with frequent itemsets from the previous step
frequent_itemsets = generate_frequent_itemsets(basket, 0.1)
# print(frequent_itemsets)
# Minimum confidence threshold
min_confidence = 0.1

# Generate rules
rules_df = generate_rules_3(frequent_itemsets, min_confidence,5,5)

# Display the rules DataFrame
rules_df.tail(10)

<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>


Unnamed: 0,antecedent,consequent,confidence,lift,support
132,"tea, milk","coffee, sugar",0.5,5.0,0.1
133,"tea, coffee","sugar, milk",1.0,5.0,0.1
134,"tea, sugar","coffee, milk",0.333333,3.333333,0.1
135,"coffee, milk","tea, sugar",1.0,3.333333,0.1
136,"sugar, milk","tea, coffee",0.5,5.0,0.1
137,"coffee, sugar","tea, milk",1.0,5.0,0.1
138,"tea, coffee, milk",sugar,1.0,3.333333,0.1
139,"sugar, tea, milk",coffee,0.5,2.5,0.1
140,"tea, coffee, sugar",milk,1.0,1.428571,0.1
141,"sugar, coffee, milk",tea,1.0,3.333333,0.1


In [66]:
rules_df.to_csv('apriori_3.csv')