# File and libraries

In [None]:
import pandas as pd
import scipy.stats as stats
from mlxtend.frequent_patterns import apriori, association_rules



- apriori method - is used to first identify the most frequent itemsets in the data.
    - How does it work?
        - It takes a dataset of transactions (or a binary dataframe where each column represents an item and each row represents a transaction).
        - It finds frequent itemsets, meaning items that appear together in at least a minimum number of transactions (defined by min_support).

- association_rules - After obtaining frequent itemsets using apriori, you can extract association rules using association_rules.

    - How does it work?
        - It uses the frequent itemsets to generate rules like:
            {A} → {B} (if a customer buys A, there is a high probability they will buy B).
        - It evaluates rule strength using metrics such as:
            - support → how often the rule appears in the dataset.
            - confidence → how often B is bought when A is bought.
            - lift → whether A and B occur together more often than expected by chance.

***Summary***
- Use Apriori (apriori) to find groups of items frequently bought together.
- Use Association Rules (association_rules) to generate rules like “if customers buy A, they are likely to buy B.”

In [None]:
import pandas as pd
import numpy as np


# Short label for this analysis.
file_tag = "Bakery Clustering"

# Absolute location of the encoded bakery CSV (machine-specific path).
filepath = r'/Users/cozmaeug/Private/IST PG - DS/DaSH ENG/ist_dash_2024_rec/non_supervised_analysis/notebooks/dataset_2/df_bakery_encoded.csv'

# Load the CSV into the DataFrame that the following cells operate on.
data = pd.read_csv(filepath_or_buffer=filepath)


In [None]:
%run "scripts/dslabs_functions.py"

dslabs_functions lodaded


In [None]:
%run "scripts/data_functions.py"

data_functions lodaded


# Data processing

## Missing Values

### Filling MV with median
For K-means clustering, it's particularly important to avoid outliers or large deviations caused by extreme values, so median might be a safer choice than the mean.

In [None]:
# Replace missing values in 'total' and 'Purchase value' with the median of
# the respective column.
for _column in ('total', 'Purchase value'):
    data[_column] = data[_column].fillna(data[_column].median())



### Drop MV

In [None]:
# Discard every row that still contains at least one NaN in any column
# (axis="index" selects rows); the result is bound back to `data`.
data = data.copy().dropna(axis="index", how="any")

# Discretization if necessary

In [None]:
# Display the dtype of every column to check whether any needs discretization.
data.dtypes

# Pattern mining application

## Parameterizable pattern discovery



In [None]:

from scipy.stats import binom
# Number of rows (transactions) in the dataset.
N = data.shape[0]

# Marginal probability of each item: fraction of rows whose column equals 1.
probs = {item: data[item].eq(1).sum() / N for item in data.columns}

def add_significance(patterns, n_transactions=None, item_probs=None):
    """Attach a binomial-tail p-value to every pattern, in place.

    For each row, the probability of the itemset under independence is the
    product of its items' marginal probabilities. The ``significance`` column
    is P(X >= k) for X ~ Binomial(n, p), where k is the number of
    transactions containing the itemset (``support * n``). Small values mean
    the itemset occurs more often than independence would predict.

    Args:
        patterns: DataFrame with an ``itemsets`` column (iterables of item
            names) and a ``support`` column (relative frequency). A
            ``significance`` column is added or overwritten.
        n_transactions: Number of transactions the supports refer to.
            Defaults to the module-level ``N``.
        item_probs: Mapping from item name to its marginal probability.
            Defaults to the module-level ``probs``.

    Returns:
        None. ``patterns`` is modified in place.

    Raises:
        KeyError: If an item of some itemset is missing from the mapping.
    """
    n = N if n_transactions is None else n_transactions
    marginals = probs if item_probs is None else item_probs

    p_values = []
    for itemset, support in zip(patterns['itemsets'], patterns['support']):
        expected = 1.0
        for item in itemset:
            expected *= marginals[item]
        # support * n is an integer count in exact arithmetic, but in floating
        # point it can fall just below it (1/49*49 == 0.9999999999999999).
        # scipy floors the argument of cdf/sf, so round to the count first.
        observed = round(support * n)
        # sf(k - 1) is P(X >= k); unlike 1 - cdf(k - 1) it does not lose
        # precision to cancellation when the p-value is very small.
        p_values.append(binom.sf(observed - 1, n, expected))

    # Assign positionally so the result does not depend on the frame's index.
    patterns['significance'] = pd.Series(p_values, dtype=float).to_numpy()

In [None]:
def find_patterns(mine_rules=True, min_patterns=10, min_length=4, max_pvalue=0.1,
                  min_support=0.6, min_confidence=0.8, min_lift=1.4):
    """Lower the support threshold step by step until enough patterns are found.

    Starting at 0.9, the support threshold is multiplied by 0.9 on every
    iteration. Each iteration mines the module-level ``data`` with
    ``apriori``, optionally derives association rules, and keeps only
    patterns that are long enough and statistically significant (as scored
    by ``add_significance``). The search stops as soon as ``min_patterns``
    patterns survive, or once the threshold has reached ``min_support``.

    Args:
        mine_rules: If true, derive association rules from the frequent
            itemsets; otherwise return the frequent itemsets themselves.
        min_patterns: Number of surviving patterns that ends the search.
        min_length: Minimum number of items in a pattern (for rules,
            antecedent and consequent combined).
        max_pvalue: Upper bound on the ``significance`` column.
        min_support: Lowest support threshold that will be tried. The last
            iteration runs at exactly this value.
        min_confidence: Minimum rule confidence (only when ``mine_rules``).
        min_lift: Minimum rule lift (only when ``mine_rules``).

    Returns:
        DataFrame of the patterns found in the last iteration, including the
        added ``length`` and ``significance`` columns. It may hold fewer than
        ``min_patterns`` rows if the support floor was reached first.

    Raises:
        ValueError: If ``min_support`` is not in the interval (0, 1].
    """
    if not 0 < min_support <= 1:
        raise ValueError("min_support must be in the interval (0, 1]")

    support = 1.0
    while True:
        # Geometric descent, clamped so the final pass runs at the floor.
        support = max(support * 0.9, min_support)
        print("Finding patterns with min sup %f" % support)
        patterns = apriori(data, min_support=support, use_colnames=True)

        # association_rules is only called when there is something to mine.
        if mine_rules and len(patterns) > 0:
            rules = association_rules(patterns, metric="lift", min_threshold=min_lift)
            rules = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]
            # Copy so the new column is added to an independent frame, not a slice.
            patterns = rules[rules['confidence'] >= min_confidence].copy()
            patterns['itemsets'] = [
                antecedent | consequent
                for antecedent, consequent in zip(patterns['antecedents'], patterns['consequents'])
            ]

        patterns['length'] = patterns['itemsets'].map(len)
        # Copy again: add_significance writes a column into the frame it receives.
        patterns = patterns[patterns['length'] >= min_length].copy()
        add_significance(patterns)
        patterns = patterns[patterns['significance'] <= max_pvalue]

        if len(patterns) >= min_patterns or support <= min_support:
            break

    print("Number of found patterns:", len(patterns))
    return patterns

### Find unexpectedly frequent patterns

- Using a maximum p-value of 0.05 (5%)
    - This ensures we only keep patterns that are statistically significant, i.e. unlikely to have occurred by chance.

In [None]:
# Mine frequent itemsets only (no association rules) with the settings below.
_freq_pattern_settings = dict(
    mine_rules=False,
    min_patterns=60,
    min_length=4,
    max_pvalue=0.05,
    min_support=0.30,
)
df_freq_patterns = find_patterns(**_freq_pattern_settings)

# Export the result as a semicolon-separated file, without the index column.
df_freq_patterns.to_csv('dataset_2/bakery_freq_patterns', sep=';', index=False)

df_freq_patterns

### Find unexpectedly discriminative patterns

In [None]:
# Mine association rules; copy the result so the column rewrites below act on
# an independent frame and not on a filtered slice.
df_patterns = find_patterns(min_support=0.3, max_pvalue=0.05, min_patterns=15,
                            min_confidence=0.9, min_lift=1.5).copy()


def _itemset_to_text(value):
    """Render a frozenset as a sorted, comma-separated string; anything else via str()."""
    if isinstance(value, frozenset):
        # frozenset iteration order is arbitrary and can differ between runs;
        # sorting makes the exported text reproducible.
        return ', '.join(sorted(map(str, value)))
    return str(value)


# Convert the frozenset columns to plain strings for a cleaner export.
for _column in ('antecedents', 'consequents', 'itemsets'):
    df_patterns[_column] = df_patterns[_column].apply(_itemset_to_text)

# Export as a semicolon-separated file, without the index column.
# NOTE(review): the file name mentions citi_bike although this notebook mines
# the bakery dataset - confirm the intended output path.
df_patterns.to_csv('data/citi_bike_discr_patterns.csv', sep=';', index=False)

df_patterns


## Parameterizable pattern discovery function, version 2