In [36]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.frequent_patterns import fpmax, fpgrowth
import pandas as pd

data = pd.read_csv('Groceries data train.csv')
print("First few rows of the raw data:")
print(data.head())



First few rows of the raw data:
   User_id       Date itemDescription    year  month  day  day_of_week
0   2351.0  1/01/2014         cleaner  2014.0    1.0  1.0          2.0
1   2226.0  1/01/2014         sausage  2014.0    1.0  1.0          2.0
2   1922.0  1/01/2014  tropical fruit  2014.0    1.0  1.0          2.0
3   2943.0  1/01/2014      whole milk  2014.0    1.0  1.0          2.0
4   1249.0  1/01/2014    citrus fruit  2014.0    1.0  1.0          2.0


In [37]:

# data.fillna(0, inplace=True)
data.fillna('', inplace=True)


  data.fillna('', inplace=True)


In [38]:
# Group the Data into Transactions by User_id and Date
grouped = data.groupby(['User_id', 'Date'])['itemDescription'].apply(list).reset_index()

# Extract the list of transactions
transactions = grouped['itemDescription'].tolist()

print("\nTotal number of transactions:", len(transactions))
print("\nSample transactions")
for t in transactions[:5]:
    print(t)


Total number of transactions: 8362

Sample transactions
['whole milk', 'pastry', 'salty snack']
['whole milk', 'soda']
['sausage', 'whole milk', 'rolls/buns']
['frankfurter', 'soda']
['frozen vegetables', 'other vegetables']


In [39]:
#One-Hot Encode the Transaction Data
te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_array, columns=te.columns_)

print("\nOne-hot encoded DataFrame preview:")
print(df_encoded.head())


One-hot encoded DataFrame preview:
          Instant food products  UHT-milk  abrasive cleaner  artif. sweetener  \
0  False                  False     False             False             False   
1  False                  False     False             False             False   
2  False                  False     False             False             False   
3  False                  False     False             False             False   
4  False                  False     False             False             False   

   baby cosmetics   bags  baking powder  bathroom cleaner   beef  ...  turkey  \
0           False  False          False             False  False  ...   False   
1           False  False          False             False  False  ...   False   
2           False  False          False             False  False  ...   False   
3           False  False          False             False  False  ...   False   
4           False  False          False             False  False  ...   

In [40]:
# Frequent Itemset Mining using Apriori

frequent_itemsets_apriori = apriori(df_encoded, min_support=0.05, use_colnames=True)

if frequent_itemsets_apriori.empty:
    print("No frequent itemsets found.")
else:
    print("Frequent Itemsets:")
    print(frequent_itemsets_apriori)

    #Generate Association Rules
    rules_apriori = association_rules(frequent_itemsets_apriori, metric="confidence", min_threshold=0.2)
    print("Association Rules:")
    print(rules_apriori)

Frequent Itemsets:
    support            itemsets
0  0.060751     (bottled water)
1  0.106673  (other vegetables)
2  0.101890        (rolls/buns)
3  0.060392   (root vegetables)
4  0.052739     (shopping bags)
5  0.094595              (soda)
6  0.130471        (whole milk)
7  0.078570            (yogurt)
Association Rules:
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, representativity, leverage, conviction, zhangs_metric, jaccard, certainty, kulczynski]
Index: []


In [41]:
# Frequent Itemset Mining using FP-growth
frequent_itemsets_fp = fpgrowth(df_encoded, min_support=0.05, use_colnames=True)
if frequent_itemsets_fp.empty:
    print("No frequent itemsets found.")
else:
    print("Frequent Itemsets (FP-growth):")
    print(frequent_itemsets_fp)

    rules_fp = association_rules(frequent_itemsets_fp, metric="confidence", min_threshold=0.2)
    print("Association Rules:")
    print(rules_fp)


Frequent Itemsets (FP-growth):
    support            itemsets
0  0.130471        (whole milk)
1  0.094595              (soda)
2  0.101890        (rolls/buns)
3  0.106673  (other vegetables)
4  0.060392   (root vegetables)
5  0.052739     (shopping bags)
6  0.060751     (bottled water)
7  0.078570            (yogurt)
Association Rules:
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, representativity, leverage, conviction, zhangs_metric, jaccard, certainty, kulczynski]
Index: []
