# File and libraries

In [1]:
import scipy.stats as stats
import pandas as pd, numpy as np
from IPython.display import display, HTML
from sklearn.preprocessing import LabelBinarizer #for dummification
from mlxtend.frequent_patterns import apriori, association_rules #for ARM

- apriori method - is used to first identify the most frequent itemsets in the data.
    - How does it work?
        - It takes a dataset of transactions (or a binary dataframe where each column represents an item and each row represents a transaction).
        - It finds frequent itemsets, meaning items that appear together in at least a minimum number of transactions (defined by min_support).

- association_rules - After obtaining frequent itemsets using apriori, you can extract association rules using association_rules.

    - How does it work?
        - It uses the frequent itemsets to generate rules like:
            {A} → {B} (if a customer buys A, there is a high probability they will buy B).
        - It evaluates rule strength using metrics such as:
        -   support → how often the rule appears in the dataset.
        - confidence → how often B is bought when A is bought.
        - lift → whether A and B occur together more often than expected by chance.

***Summary***
- Use Apriori (apriori) to find groups of items frequently bought together.
- Use Association Rules (association_rules) to generate rules like “if customers buy A, they are likely to buy B.”

In [2]:
import pandas as pd
import numpy as np


filepath=r'/Users/cozmaeug/Private/IST PG - DS/DaSH ENG/ist_dash_2024_rec/non_supervised_analysis/notebooks/dataset_2/df_bakery_encoded.csv'

file_tag = "Bakery Clustering"

data = pd.read_csv(filepath)


In [3]:
%run "scripts/dslabs_functions.py"

In [4]:
%run "scripts/data_functions.py"

data_functions lodaded


In [5]:
food_items = ['angbutter', 'plain bread', 'jam', 'croissant', "tiramisu croissant", "cacao deep", "pain au chocolat",
              "almond croissant", "gateau chocolat", "pandoro", "cheese cake",
              "orange pound", "wiener", "tiramisu", "merinque cookies"]  
drink_items = ['americano', 'caffe latte', "milk tea", "lemon ade", "vanila latte", "berry ade"] 

items = food_items + drink_items

data = data[items]
print(data)

      angbutter  plain bread  jam  croissant  tiramisu croissant  cacao deep  \
0           1.0          0.0  0.0        0.0                 3.0         0.0   
1           1.0          0.0  0.0        0.0                 1.0         0.0   
2           0.0          0.0  0.0        0.0                14.0         0.0   
3           1.0          1.0  0.0        0.0                 0.0         0.0   
4           2.0          0.0  0.0        0.0                 1.0         0.0   
...         ...          ...  ...        ...                 ...         ...   
2416        1.0          0.0  0.0        0.0                 1.0         0.0   
2417        1.0          1.0  0.0        0.0                 0.0         1.0   
2418        0.0          0.0  0.0        1.0                 1.0         0.0   
2419        2.0          0.0  0.0        0.0                 1.0         0.0   
2420        0.0          0.0  0.0        0.0                 0.0         0.0   

      pain au chocolat  almond croissan

In [6]:
# Convert all values > 0 to True (indicating presence of an item), otherwise False
data = data.applymap(lambda x: x > 0)

# Check if the conversion worked
print(data.describe())  # Should now have only True/False values
print(data)

       angbutter plain bread    jam croissant tiramisu croissant cacao deep  \
count       2421        2421   2421      2421               2421       2421   
unique         2           2      2         2                  2          2   
top         True       False  False     False              False      False   
freq        1973        1564   2201      1674               1642       2098   

       pain au chocolat almond croissant gateau chocolat pandoro  ...  \
count              2421             2421            2421    2421  ...   
unique                2                2               2       2  ...   
top               False            False           False   False  ...   
freq               1834             2219            2225    2078  ...   

       orange pound wiener tiramisu merinque cookies americano caffe latte  \
count          2421   2421     2421             2421      2421        2421   
unique            2      2        2                2         2           2   
top 

# Data processing

# Pattern mining application

In [7]:
print(data.columns)

Index(['angbutter', 'plain bread', 'jam', 'croissant', 'tiramisu croissant',
       'cacao deep', 'pain au chocolat', 'almond croissant', 'gateau chocolat',
       'pandoro', 'cheese cake', 'orange pound', 'wiener', 'tiramisu',
       'merinque cookies', 'americano', 'caffe latte', 'milk tea', 'lemon ade',
       'vanila latte', 'berry ade'],
      dtype='object')


## Parameterizable pattern discovery



In [8]:

from scipy.stats import binom
N = len(data)
probs = {col : data[[col]].eq(1).sum()[col]/N for col in data.columns}

def add_significance(patterns):
    patterns['significance'] = 0.0
    for i, pattern in patterns.iterrows():
        prob = 1
        for item in pattern['itemsets']: prob = prob * probs[item]
        patterns.at[i,'significance'] = 1-binom.cdf(pattern['support']*N-1, N, prob)

In [9]:
def find_patterns(mine_rules=True, min_patterns=10, min_length=2, max_pvalue=0.05, 
                  min_support=0.6, min_confidence=0.7, min_lift=1.4):
    patterns = {}
    min_support = 1
    while min_support>0:
    
        min_support = min_support*0.9
        print("Finding patterns with min sup %f"%min_support)
        patterns = apriori(data, min_support=min_support, use_colnames=True)
    
        if mine_rules and len(patterns)>0:
            patterns = association_rules(patterns, metric="lift", min_threshold=min_lift)
            patterns = patterns[['antecedents','consequents','support','confidence','lift']]
            patterns = patterns[(patterns['confidence'] >= min_confidence)]
            patterns['itemsets'] = [x | y for x, y in zip(patterns['antecedents'], patterns['consequents'])]
        
        patterns['length'] = patterns['itemsets'].apply(lambda x: len(x))
        patterns = patterns[(patterns['length'] >= min_length)]
        add_significance(patterns)
        patterns = patterns[(patterns['significance'] <= max_pvalue)]
            
        if len(patterns) >= min_patterns: break
    
    print("Number of found patterns:",len(patterns))
    return patterns

### find patterns inesperadamente frequentes

- using max of p_value=0.05 (5%)
    - This way we make sure we only consider patterns that have relevent statistic significance (most likely they are not random) 

In [10]:
df_freq_patterns = find_patterns(
    mine_rules = False,
    min_patterns = 20, 
    min_length = 2, 
    max_pvalue = 0.05, 
    min_support=0.50
)

df_freq_patterns

Finding patterns with min sup 0.900000
Finding patterns with min sup 0.810000
Finding patterns with min sup 0.729000
Finding patterns with min sup 0.656100
Finding patterns with min sup 0.590490
Finding patterns with min sup 0.531441
Finding patterns with min sup 0.478297
Finding patterns with min sup 0.430467
Finding patterns with min sup 0.387420
Finding patterns with min sup 0.348678
Finding patterns with min sup 0.313811
Finding patterns with min sup 0.282430
Finding patterns with min sup 0.254187
Finding patterns with min sup 0.228768
Finding patterns with min sup 0.205891
Finding patterns with min sup 0.185302
Finding patterns with min sup 0.166772
Finding patterns with min sup 0.150095
Finding patterns with min sup 0.135085
Finding patterns with min sup 0.121577
Finding patterns with min sup 0.109419
Finding patterns with min sup 0.098477
Finding patterns with min sup 0.088629
Finding patterns with min sup 0.079766
Finding patterns with min sup 0.071790
Finding patterns with min

Finding patterns with min sup 0.025032
Finding patterns with min sup 0.022528
Finding patterns with min sup 0.020276
Finding patterns with min sup 0.018248
Finding patterns with min sup 0.016423
Finding patterns with min sup 0.014781
Finding patterns with min sup 0.013303
Finding patterns with min sup 0.011973
Finding patterns with min sup 0.010775
Finding patterns with min sup 0.009698
Finding patterns with min sup 0.008728
Finding patterns with min sup 0.007855
Number of found patterns: 20


Unnamed: 0,support,itemsets,length,significance
39,0.079306,"(jam, plain bread)",2,0.0
64,0.095828,"(pain au chocolat, croissant)",2,8.713604e-05
100,0.043784,"(pain au chocolat, wiener)",2,0.01886498
105,0.018174,"(almond croissant, pandoro)",2,0.004296081
130,0.052871,"(jam, angbutter, plain bread)",3,3.025358e-13
201,0.025196,"(jam, plain bread, croissant)",3,1.482179e-10
202,0.01487,"(jam, tiramisu croissant, plain bread)",3,0.02253108
203,0.0095,"(jam, plain bread, cacao deep)",3,0.001099609
204,0.021479,"(pain au chocolat, jam, plain bread)",3,2.179914e-10
205,0.016522,"(jam, plain bread, orange pound)",3,8.228641e-07


### find patterns inesperadamente discriminativos

In [11]:
# Supondo que você tenha um DataFrame de padrões
df_patterns = find_patterns(min_support=0.5, max_pvalue=0.05, min_patterns=20, 
                            min_confidence=0.7, min_lift=1.4)

# Convertendo o frozenset em uma string para melhor exportação
df_patterns['antecedents'] = df_patterns['antecedents'].apply(lambda x: ', '.join(map(str, list(x))) if isinstance(x, frozenset) else str(x))
df_patterns['consequents'] = df_patterns['consequents'].apply(lambda x: ', '.join(map(str, list(x))) if isinstance(x, frozenset) else str(x))
df_patterns['itemsets'] = df_patterns['itemsets'].apply(lambda x: ', '.join(map(str, list(x))) if isinstance(x, frozenset) else str(x))

df_patterns


Finding patterns with min sup 0.900000
Finding patterns with min sup 0.810000
Finding patterns with min sup 0.729000
Finding patterns with min sup 0.656100
Finding patterns with min sup 0.590490
Finding patterns with min sup 0.531441
Finding patterns with min sup 0.478297
Finding patterns with min sup 0.430467
Finding patterns with min sup 0.387420
Finding patterns with min sup 0.348678
Finding patterns with min sup 0.313811
Finding patterns with min sup 0.282430
Finding patterns with min sup 0.254187
Finding patterns with min sup 0.228768
Finding patterns with min sup 0.205891
Finding patterns with min sup 0.185302
Finding patterns with min sup 0.166772
Finding patterns with min sup 0.150095
Finding patterns with min sup 0.135085
Finding patterns with min sup 0.121577
Finding patterns with min sup 0.109419
Finding patterns with min sup 0.098477
Finding patterns with min sup 0.088629
Finding patterns with min sup 0.079766
Finding patterns with min sup 0.071790
Finding patterns with min

Unnamed: 0,antecedents,consequents,support,confidence,lift,itemsets,length,significance
0,jam,plain bread,0.079306,0.872727,2.465429,"jam, plain bread",2,0.0
12,"jam, angbutter",plain bread,0.052871,0.864865,2.443218,"jam, angbutter, plain bread",3,3.025358e-13
22,"jam, croissant",plain bread,0.025196,0.802632,2.267411,"jam, plain bread, croissant",3,1.482179e-10
26,"jam, tiramisu croissant",plain bread,0.01487,0.765957,2.163807,"jam, tiramisu croissant, plain bread",3,0.02253108
30,"jam, cacao deep",plain bread,0.0095,0.821429,2.320512,"jam, plain bread, cacao deep",3,0.001099609
34,"jam, pain au chocolat",plain bread,0.021479,0.825397,2.331722,"jam, pain au chocolat, plain bread",3,2.179914e-10
38,"jam, gateau chocolat",plain bread,0.006196,0.789474,2.23024,"jam, plain bread, gateau chocolat",3,0.002199979
46,"jam, orange pound",plain bread,0.016522,0.888889,2.511085,"jam, plain bread, orange pound",3,8.228641e-07
50,"jam, wiener",plain bread,0.011565,0.875,2.471849,"jam, plain bread, wiener",3,2.2811e-05
54,"jam, americano",plain bread,0.011979,0.90625,2.56013,"jam, americano, plain bread",3,0.0001170352
