#  Name -> Deven Chhajed
# Roll No-> 32
# Batch -> B1 (CSE)
# Prn -> 1032210789
# Association Rules

# What is Association Mining

**Purpose:** Identifies relationships, patterns, or associations between items in large datasets.

**Goal:** Discovers frequent itemsets or combinations of items that often occur together.

**Technique:** Uses algorithms like Apriori or FP-Growth to find these frequent itemsets.

**Association Rules:** Generates rules in the form of "if-then" statements to describe item relationships.

**Rules Interpretation:** Indicates the likelihood of certain items co-occurring based on support and confidence measures.

**Applications:** Commonly used in market basket analysis, recommendation systems, and various areas to reveal hidden patterns in data.

**Support and Confidence:** Measures used to quantify the significance and reliability of discovered associations.

**Scalability:** Important for handling large datasets efficiently in order to identify meaningful associations.

# Importing the pandas library as 'pd'

In [20]:
import pandas as pd

  and should_run_async(code)


# Creating the Data Set

In [21]:
data = {
    'Transaction': [1,2,3],
    'Item1': ['A','B','C'],
    'Item2': ['A','C','D'],
    'Item3': ['B','C','D'],
    'Item4': ['A','D','E'],
    'Item5': ['B','C','E'],
}

  and should_run_async(code)


# Creating the Data Frame

In [22]:
df = pd.DataFrame(data)
df

  and should_run_async(code)


Unnamed: 0,Transaction,Item1,Item2,Item3,Item4,Item5
0,1,A,A,B,A,B
1,2,B,C,C,D,C
2,3,C,D,D,E,E


# Sets Data Frame index to custom strings


In [23]:
df = df.set_index(
    [pd.Index(['Transaction 1', 'Transaction 2', 'Transaction 3'])])

  and should_run_async(code)


# Drops column named 'Transaction' from DataFrame 'df'
# Returns the modified Data Frame

In [24]:
df.drop('Transaction',axis=1,inplace=True)
df

  and should_run_async(code)


Unnamed: 0,Item1,Item2,Item3,Item4,Item5
Transaction 1,A,A,B,A,B
Transaction 2,B,C,C,D,C
Transaction 3,C,D,D,E,E


#  Defines minimum support value as 2


In [25]:
min_support = 2

  and should_run_async(code)


# Calculates item counts for each element in DataFrame 'df'
# Accumulates counts in 'item_counts' dictionary


In [26]:
item_counts = {}

for _, row in df.iterrows():
    item_counts_row = row.value_counts().to_dict()

    for item, count in item_counts_row.items():
        item_counts[item] = item_counts.get(item, 0) + count

  and should_run_async(code)


# Converts item_counts dictionary to DataFrame 'c1'
# With columns 'Item' and 'Count', containing item counts


In [27]:
c1 = pd.DataFrame(list(item_counts.items()), columns=['Item', 'Count'])
c1

  and should_run_async(code)


Unnamed: 0,Item,Count
0,A,3
1,B,3
2,C,4
3,D,3
4,E,2


# Filters items in DataFrame 'c1' with count >= minimum support
# Stores the result in DataFrame 'l1'


In [28]:
l1 = c1[c1['Count'] >= min_support]
l1

  and should_run_async(code)


Unnamed: 0,Item,Count
0,A,3
1,B,3
2,C,4
3,D,3
4,E,2


# Generates combinations of length 2 from items in DataFrame 'l1'
# Stores combinations in 'combinations_l2' as a list


In [29]:
from itertools import combinations
combinations_l2 = list(combinations(l1, 2))

  and should_run_async(code)


# Similiarly we will perform the same steps until we get our solution

In [30]:
item_pair_counts = {}

for _, row in df.iterrows():
    row_combinations = list(combinations(row, 2))

    for item_pair in row_combinations:
        if item_pair in item_pair_counts:
            item_pair_counts[item_pair] += 1
        else:
            item_pair_counts[item_pair] = 1

  and should_run_async(code)


In [31]:
c2 = pd.DataFrame(list(item_pair_counts.items()), columns=['Item Pair', 'Count'])
c2

  and should_run_async(code)


Unnamed: 0,Item Pair,Count
0,"(A, A)",3
1,"(A, B)",5
2,"(B, A)",1
3,"(B, B)",1
4,"(B, C)",3
5,"(B, D)",1
6,"(C, C)",3
7,"(C, D)",4
8,"(D, C)",1
9,"(C, E)",2


In [32]:
l2 = c2[c2['Count'] >= min_support]
l2

  and should_run_async(code)


Unnamed: 0,Item Pair,Count
0,"(A, A)",3
1,"(A, B)",5
4,"(B, C)",3
6,"(C, C)",3
7,"(C, D)",4
9,"(C, E)",2
11,"(D, E)",4


In [33]:
from itertools import combinations

combinations_l3 = list(combinations(l2['Item Pair'], 3))

item_triplet_counts = {}

for _, row in df.iterrows():
    row_combinations = list(combinations(row, 3))

    for item_triplet in row_combinations:
        if item_triplet in item_triplet_counts:
            item_triplet_counts[item_triplet] += 1
        else:
            item_triplet_counts[item_triplet] = 1

  and should_run_async(code)


In [34]:
c3 = pd.DataFrame(list(item_triplet_counts.items()), columns=['Item Triplet', 'Count'])
c3

  and should_run_async(code)


Unnamed: 0,Item Triplet,Count
0,"(A, A, B)",4
1,"(A, A, A)",1
2,"(A, B, A)",2
3,"(A, B, B)",2
4,"(B, A, B)",1
5,"(B, C, C)",3
6,"(B, C, D)",2
7,"(B, D, C)",1
8,"(C, C, D)",1
9,"(C, C, C)",1


In [35]:
l3= c3[c3['Count'] >= min_support]
l3

  and should_run_async(code)


Unnamed: 0,Item Triplet,Count
0,"(A, A, B)",4
2,"(A, B, A)",2
3,"(A, B, B)",2
5,"(B, C, C)",3
6,"(B, C, D)",2
10,"(C, D, C)",2
12,"(C, D, E)",4
14,"(D, D, E)",2
15,"(D, E, E)",2


In [36]:
from itertools import combinations

combinations_l4 = list(combinations(l2['Item Pair'], 4))

item_triplet_counts = {}

for _, row in df.iterrows():
    row_combinations = list(combinations(row, 4))

    for item_triplet in row_combinations:
        if item_triplet in item_triplet_counts:
            item_triplet_counts[item_triplet] += 1
        else:
            item_triplet_counts[item_triplet] = 1

  and should_run_async(code)


In [37]:
c4 = pd.DataFrame(list(item_triplet_counts.items()), columns=['Item Triplet', 'Count'])
c4

  and should_run_async(code)


Unnamed: 0,Item Triplet,Count
0,"(A, A, B, A)",1
1,"(A, A, B, B)",1
2,"(A, A, A, B)",1
3,"(A, B, A, B)",2
4,"(B, C, C, D)",1
5,"(B, C, C, C)",1
6,"(B, C, D, C)",2
7,"(C, C, D, C)",1
8,"(C, D, D, E)",2
9,"(C, D, E, E)",2


In [38]:
l4 = c4[c4['Count'] >= min_support]
l4

  and should_run_async(code)


Unnamed: 0,Item Triplet,Count
3,"(A, B, A, B)",2
6,"(B, C, D, C)",2
8,"(C, D, D, E)",2
9,"(C, D, E, E)",2


In [39]:
from itertools import combinations

combinations_l5 = list(combinations(l2['Item Pair'], 5))

item_triplet_counts = {}

for _, row in df.iterrows():
    row_combinations = list(combinations(row, 5))

    for item_triplet in row_combinations:
        if item_triplet in item_triplet_counts:
            item_triplet_counts[item_triplet] += 1
        else:
            item_triplet_counts[item_triplet] = 1

  and should_run_async(code)


In [40]:
c5 = pd.DataFrame(list(item_triplet_counts.items()), columns=['Item Triplet', 'Count'])
c5

  and should_run_async(code)


Unnamed: 0,Item Triplet,Count
0,"(A, A, B, A, B)",1
1,"(B, C, C, D, C)",1
2,"(C, D, D, E, E)",1


In [41]:
l5 = c5[c5['Count'] >= min_support]
l5

  and should_run_async(code)


Unnamed: 0,Item Triplet,Count


# Performing it on the CSV File (Store_Data)

In [None]:
from google.colab import files
uploaded = files.upload()

Saving store_data.csv to store_data.csv


# Installing the mlxtend library
`mlxtend` streamlines machine learning tasks with extra tools and utilities.

In [42]:
pip install mlxtend

  and should_run_async(code)




In [43]:
from mlxtend.preprocessing import TransactionEncoder  # TransactionEncoder for encoding transactional data
from mlxtend.frequent_patterns import apriori, association_rules  # apriori for frequent itemsets, association_rules for rule generation

  and should_run_async(code)


In [45]:
from mlxtend.frequent_patterns import apriori  # Import Apriori algorithm
from mlxtend.frequent_patterns import association_rules  # Import association rules functionality
import pandas as pd  # Import pandas library for data manipulation

# Sample dataset
data = {'TransactionID': [1, 2, 3, 4, 5],
        'Items': [['A', 'B', 'D'],
                  ['B', 'C', 'E'],
                  ['A', 'B', 'C', 'E'],
                  ['B', 'E'],
                  ['A', 'C', 'D']]}

df = pd.DataFrame(data)  # Create a DataFrame from the sample data

# Convert the 'Items' column to a one-hot encoded format
ohe_df = pd.get_dummies(df['Items'].apply(pd.Series).stack()).sum(level=0)

# Apply Apriori algorithm to find frequent itemsets with specified minimum support
frequent_itemsets = apriori(ohe_df, min_support=0.4, use_colnames=True)

# Generate association rules using the frequent itemsets and setting a minimum confidence threshold
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)

# Display the frequent itemsets found
print("The Frequent Itemsets:")
print(frequent_itemsets)

# Display the generated association rules
print("\nAssociation Rules:")
print(rules)

The Frequent Itemsets:
    support   itemsets
0       0.6        (A)
1       0.8        (B)
2       0.6        (C)
3       0.4        (D)
4       0.6        (E)
5       0.4     (A, B)
6       0.4     (C, A)
7       0.4     (D, A)
8       0.4     (C, B)
9       0.6     (E, B)
10      0.4     (C, E)
11      0.4  (C, E, B)

Association Rules:
  antecedents consequents  antecedent support  consequent support  support  \
0         (D)         (A)                 0.4                 0.6      0.4   
1         (E)         (B)                 0.6                 0.8      0.6   
2         (B)         (E)                 0.8                 0.6      0.6   
3      (C, E)         (B)                 0.4                 0.8      0.4   
4      (C, B)         (E)                 0.4                 0.6      0.4   

   confidence      lift  leverage  conviction  zhangs_metric  
0        1.00  1.666667      0.16         inf       0.666667  
1        1.00  1.250000      0.12         inf       0.500000  


  and should_run_async(code)
  ohe_df = pd.get_dummies(df['Items'].apply(pd.Series).stack()).sum(level=0)


In [46]:
file_path = 'store_data.csv'  # Define the file path for the CSV file ('store_data.csv')

  and should_run_async(code)


In [47]:
# Read the CSV file into a DataFrame
df = pd.read_csv(file_path, header=None)

# Create an empty DataFrame with columns 'Transaction ID' and 'Items'
result_df = pd.DataFrame(columns=['Transaction ID', 'Items'])

# Iterate through each row, extracting transaction IDs and items, and append them to the result DataFrame
for idx, row in df.iterrows():
    transaction_id = idx
    items = row.dropna().tolist()
    result_df = pd.concat([result_df, pd.DataFrame({'Transaction ID': [transaction_id], 'Items': [items]})], ignore_index=True)

# Convert the lists in the "Items" column to strings
result_df['Items'] = result_df['Items'].apply(lambda x: ', '.join(map(str, x)))

# Set 'Transaction ID' as the index of the DataFrame
result_df.set_index('Transaction ID', inplace=True)

# Display the resulting DataFrame
print(result_df)

  and should_run_async(code)


                                                            Items
Transaction ID                                                   
0               shrimp, almonds, avocado, vegetables mix, gree...
1                                        burgers, meatballs, eggs
2                                                         chutney
3                                                 turkey, avocado
4               mineral water, milk, energy bar, whole wheat r...
...                                                           ...
7496                              butter, light mayo, fresh bread
7497            burgers, frozen vegetables, eggs, french fries...
7498                                                      chicken
7499                                          escalope, green tea
7500            eggs, frozen smoothie, yogurt cake, low fat yo...

[7501 rows x 1 columns]


In [48]:
from mlxtend.frequent_patterns import apriori, association_rules  # Import Apriori algorithm and association rules functionalities

# Convert the lists in the "Items" column to strings
result_df['Items'] = result_df['Items'].apply(lambda x: ', '.join(map(str, x)))

# One-hot encode the 'Items' column
ohe_df = result_df['Items'].str.get_dummies(', ')

# Apply Apriori algorithm to find frequent itemsets based on specified minimum support
frequent_itemsets = apriori(ohe_df, min_support=0.4, use_colnames=True)

# Generate association rules with a minimum confidence threshold
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)

# Display the discovered frequent itemsets
print("The Frequent Itemsets:")
print(frequent_itemsets)

# Display the generated association rules
print("\nAssociation Rules:")
print(rules)

  and should_run_async(code)


The Frequent Itemsets:
        support                        itemsets
0      0.867618                             ( )
1      0.766165                             (,)
2      0.843221                             (a)
3      0.457672                             (b)
4      0.729636                             (c)
...         ...                             ...
10768  0.403280  ( , s, o, i, t, ,, a, n, l, e)
10769  0.426876  ( , r, s, i, t, ,, a, n, l, e)
10770  0.413412  ( , r, s, o, i, t, ,, a, l, e)
10771  0.430476  ( , r, s, o, i, t, ,, a, n, e)
10772  0.416078  ( , r, s, o, t, ,, a, n, l, e)

[10773 rows x 2 columns]

Association Rules:
       antecedents               consequents  antecedent support  \
0              (,)                       ( )            0.766165   
1              ( )                       (,)            0.867618   
2              ( )                       (a)            0.867618   
3              (a)                       ( )            0.843221   
4              

In [51]:
from mlxtend.frequent_patterns import apriori, association_rules  # Import functions for Apriori algorithm and association rules

def preprocess_data(dfs):
    df = pd.DataFrame(dfs)  # Convert provided data into a DataFrame
    ohe_df = pd.get_dummies(df['Items'].apply(pd.Series).stack()).sum(level=0)  # One-hot encode the 'Items' column
    return ohe_df  # Return the processed DataFrame

def find_frequent_itemsets(dataframe, min_support=0.4):
    frequent_itemsets = apriori(dataframe, min_support=min_support, use_colnames=True)  # Find frequent itemsets
    return frequent_itemsets  # Return the identified frequent itemsets

def generate_association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7):
    rules = association_rules(frequent_itemsets, metric=metric, min_threshold=min_threshold)  # Generate association rules
    return rules  # Return the generated association rules

def main():
    # Preprocess the data
    ohe_df = preprocess_data(result_df)

    # Find frequent itemsets
    frequent_itemsets = find_frequent_itemsets(ohe_df)

    # Generate association rules
    rules = generate_association_rules(frequent_itemsets)

    # Display the frequent itemsets
    print("The Frequent Itemsets:")
    print(frequent_itemsets)

    # Display the association rules
    print("\nAssociation Rules:")
    print(rules)

if __name__ == "__main__":
    main()  # Execute the main function if this script is run directly


  and should_run_async(code)
  ohe_df = pd.get_dummies(df['Items'].apply(pd.Series).stack()).sum(level=0)  # One-hot encode the 'Items' column


ValueError: ignored

In [52]:
from mlxtend.frequent_patterns import apriori, association_rules  # Import necessary functions

def preprocess_data(data):
    df = pd.DataFrame(data)  # Convert provided data into a DataFrame
    ohe_df = pd.get_dummies(df['Items'].apply(pd.Series).stack()).sum(level=0)  # One-hot encode the 'Items' column
    return ohe_df  # Return the processed DataFrame

def find_frequent_itemsets(dataframe, min_support=0.4):
    frequent_itemsets = apriori(dataframe, min_support=min_support, use_colnames=True)  # Find frequent itemsets
    return frequent_itemsets  # Return the identified frequent itemsets

def generate_association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7):
    rules = association_rules(frequent_itemsets, metric=metric, min_threshold=min_threshold)  # Generate association rules
    return rules  # Return the generated association rules

def main():
    # Sample dataset
    data = {'TransactionID': [1, 2, 3, 4, 5],
            'Items': [['A', 'B', 'D'],
                      ['B', 'C', 'E'],
                      ['A', 'B', 'C', 'E'],
                      ['B', 'E'],
                      ['A', 'C', 'D']]}

    # Preprocess the data
    ohe_df = preprocess_data(data)

    # Find frequent itemsets
    frequent_itemsets = find_frequent_itemsets(ohe_df)

    # Generate association rules
    rules = generate_association_rules(frequent_itemsets)

    # Display the frequent itemsets
    print("The Frequent Itemsets:")
    print(frequent_itemsets)

    # Display the association rules
    print("\nAssociation Rules:")
    print(rules)

if __name__ == "__main__":
    main()  # Execute the main function if this script is run directly


The Frequent Itemsets:
    support   itemsets
0       0.6        (A)
1       0.8        (B)
2       0.6        (C)
3       0.4        (D)
4       0.6        (E)
5       0.4     (A, B)
6       0.4     (C, A)
7       0.4     (D, A)
8       0.4     (C, B)
9       0.6     (E, B)
10      0.4     (C, E)
11      0.4  (C, E, B)

Association Rules:
  antecedents consequents  antecedent support  consequent support  support  \
0         (D)         (A)                 0.4                 0.6      0.4   
1         (E)         (B)                 0.6                 0.8      0.6   
2         (B)         (E)                 0.8                 0.6      0.6   
3      (C, E)         (B)                 0.4                 0.8      0.4   
4      (C, B)         (E)                 0.4                 0.6      0.4   

   confidence      lift  leverage  conviction  zhangs_metric  
0        1.00  1.666667      0.16         inf       0.666667  
1        1.00  1.250000      0.12         inf       0.500000  


  and should_run_async(code)
  ohe_df = pd.get_dummies(df['Items'].apply(pd.Series).stack()).sum(level=0)  # One-hot encode the 'Items' column


In [53]:
def preprocess_data(data):
    # Convert input data into a DataFrame and perform one-hot encoding
    df = pd.DataFrame(data)
    ohe_df = pd.get_dummies(df['Items'].apply(pd.Series).stack()).sum(level=0)
    return ohe_df  # Return the processed DataFrame

def find_frequent_itemsets(dataframe, min_support=0.4):
    num_transactions = len(dataframe)
    frequent_itemsets = {}  # Initialize a dictionary to store frequent itemsets

    # Calculate support for each item and identify frequent itemsets
    for column in dataframe.columns:
        support = dataframe[column].sum() / num_transactions
        if support >= min_support:
            frequent_itemsets[frozenset([column])] = support

    # Prune redundant itemsets to avoid repetition
    pruned_frequent_itemsets = {}
    for itemset in frequent_itemsets:
        is_redundant = False
        for larger_itemset in frequent_itemsets:
            if itemset != larger_itemset and itemset.issubset(larger_itemset) and frequent_itemsets[itemset] <= frequent_itemsets[larger_itemset]:
                is_redundant = True
                break

        if not is_redundant:
            pruned_frequent_itemsets[itemset] = frequent_itemsets[itemset]

    return pruned_frequent_itemsets  # Return the pruned frequent itemsets

def generate_association_rules(frequent_itemsets, transactions, min_confidence=0.0):
    rules = []  # Initialize a list to store generated association rules
    num_transactions = len(transactions)

    # Generate association rules based on frequent itemsets and minimum confidence
    for itemset, support in frequent_itemsets.items():
        if len(itemset) > 1:
            for antecedent in powerset(itemset):
                antecedent = frozenset(antecedent)
                consequent = itemset - antecedent

                antecedent_count = sum(1 for transaction in transactions if antecedent.issubset(transaction))
                consequent_count = sum(1 for transaction in transactions if consequent.issubset(transaction))
                both_count = sum(1 for transaction in transactions if antecedent.union(consequent).issubset(transaction))

                antecedent_support = antecedent_count / num_transactions
                consequent_support = consequent_count / num_transactions
                both_support = both_count / num_transactions

                confidence = support / antecedent_support if antecedent_support > 0 else 0

                if confidence >= min_confidence:
                    rules.append({
                        "antecedents": tuple(antecedent),
                        "consequents": tuple(consequent),
                        "antecedent support": antecedent_support,
                        "consequent support": consequent_support,
                        "support": support,
                        "confidence": confidence
                    })
    return rules  # Return the generated association rules

def powerset(s):
    return chain.from_iterable(combinations(s, r) for r in range(1, len(s)))

def display_results(frequent_itemsets, rules):
    # Display the frequent itemsets in DataFrame format
    frequent_itemsets_df = pd.DataFrame(list(frequent_itemsets.items()), columns=["itemsets", "support"])
    frequent_itemsets_df["itemsets"] = frequent_itemsets_df["itemsets"].apply(lambda x: tuple(x))  # Convert frozensets to tuples
    print("The Frequent Itemsets:")
    print(frequent_itemsets_df)

    # Display the association rules in DataFrame format
    rules_df = pd.DataFrame(rules)
    if not rules_df.empty:
        print("\nAssociation Rules:")
        print(rules_df)
    else:
        print("\nThere were no association rules found.")

def main():
    # Sample dataset
    data = {'TransactionID': [1, 2, 3, 4, 5],
            'Items': [['A', 'B', 'D'],
                      ['B', 'C', 'E'],
                      ['A', 'B', 'C', 'E'],
                      ['B', 'E'],
                      ['A', 'C', 'D']]}

    # Preprocess the data
    ohe_df = preprocess_data(data)

    # Find frequent itemsets
    frequent_itemsets = find_frequent_itemsets(ohe_df)

    # Generate association rules
    rules = generate_association_rules(frequent_itemsets, ohe_df)

    # Display the results
    display_results(frequent_itemsets, rules)

if __name__ == "__main__":
    main()  # Execute the main function if this script is run directly


The Frequent Itemsets:
  itemsets  support
0     (A,)      0.6
1     (B,)      0.8
2     (C,)      0.6
3     (D,)      0.4
4     (E,)      0.6

There were no association rules found.


  and should_run_async(code)
  ohe_df = pd.get_dummies(df['Items'].apply(pd.Series).stack()).sum(level=0)
