In [1]:
# Importar librerías necesarias
from mlxtend.frequent_patterns import fpgrowth, association_rules
import pandas as pd
import numpy as np
import time

In [2]:
# Cargar datos
df = pd.read_parquet('../datasets/online_retail_clean.parquet')
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2023-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2023-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2023-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2023-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2023-12-01 08:26:00,3.39,17850.0,United Kingdom


In [3]:
# Crear matriz binaria (transacciones x productos)
basket = (
    df.groupby(
        by=['InvoiceNo', 'StockCode']
    )['Quantity'].sum().unstack().fillna(0)
)
basket = basket.gt(0).astype(int)
basket

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214V,90214W,90214Y,90214Z,BANK CHARGES,C2,DOT,M,PADS,POST
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536366,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536367,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536368,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536369,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581583,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
581584,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
581585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
581586,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Aplicar algoritmo FWGrowth
frequent_itemsets = fpgrowth(basket, min_support=0.01, use_colnames=True)



In [5]:
# Generar reglas de asociación
rules = association_rules(
    frequent_itemsets,
    metric='confidence',
    min_threshold=0.5
)

In [6]:
# Filtrar reglas de asociación con lift > 1
rules = rules.loc[rules['lift'] > 1].sort_values('lift', ascending=False)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
306,(23172),(23171),0.012087,0.014569,0.010900,0.901786,61.895899,1.0,0.010724,10.033475,0.995881,0.691781,0.900334,0.824967
305,(23171),(23172),0.014569,0.012087,0.010900,0.748148,61.895899,1.0,0.010724,3.922595,0.998390,0.691781,0.745067,0.824967
223,(22746),"(22748, 22745)",0.013598,0.013706,0.010037,0.738095,53.851894,1.0,0.009850,3.765850,0.994960,0.581250,0.734456,0.735189
220,"(22748, 22745)",(22746),0.013706,0.013598,0.010037,0.732283,53.851894,1.0,0.009850,3.684501,0.995069,0.581250,0.728593,0.735189
302,(23174),(23175),0.014461,0.014677,0.011116,0.768657,52.370391,1.0,0.010904,4.259137,0.995299,0.616766,0.765211,0.763005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233,(22698),(22423),0.030002,0.091895,0.016674,0.555755,6.047715,1.0,0.013917,2.044155,0.860464,0.158462,0.510800,0.368600
295,(23173),(22423),0.019102,0.091895,0.010468,0.548023,5.963567,1.0,0.008713,2.009182,0.848524,0.104133,0.502285,0.330970
189,(22697),(22423),0.037287,0.091895,0.020181,0.541245,5.889809,1.0,0.016755,1.979497,0.862370,0.185149,0.494821,0.380429
186,(22699),(22423),0.042251,0.091895,0.022664,0.536398,5.837074,1.0,0.018781,1.958805,0.865239,0.203291,0.489485,0.391511


In [7]:
# Mostrar las 10 mejores reglas
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10))

        antecedents     consequents   support  confidence       lift
306         (23172)         (23171)  0.010900    0.901786  61.895899
305         (23171)         (23172)  0.010900    0.748148  61.895899
223         (22746)  (22748, 22745)  0.010037    0.738095  53.851894
220  (22748, 22745)         (22746)  0.010037    0.732283  53.851894
302         (23174)         (23175)  0.011116    0.768657  52.370391
301         (23175)         (23174)  0.011116    0.757353  52.370391
224         (22745)  (22748, 22746)  0.010037    0.588608  50.735237
219  (22748, 22746)         (22745)  0.010037    0.865116  50.735237
307         (23170)         (23172)  0.010630    0.600610  49.689732
308         (23172)         (23170)  0.010630    0.879464  49.689732
