In [3]:
import pandas as pd

# Raw-CSV endpoints for the products table and the order/product table.
url_products = "https://github.com/it-ces/Rules-puj/blob/main/products.csv?raw=true"
url_orders = "https://github.com/it-ces/Rules-puj/blob/main/order_products__train.csv?raw=true"

# Download both tables (products first, then orders).
products, orders = (pd.read_csv(source) for source in (url_products, url_orders))

# Inner join on product_id: only order rows whose product_id also exists in
# `products` are kept.
dfMerged = orders.merge(products, on="product_id", how="inner")

# Collect the product names of each order_id into one list per order,
# then flatten the result into a plain list of lists.
names_by_order = dfMerged.groupby("order_id")["product_name"]
transactions = names_by_order.apply(list).tolist()
print(f"Primeras transacciones:\n{transactions[:5]}")
from mlxtend.preprocessing import TransactionEncoder

# One-hot encode the transactions: learn the item vocabulary first,
# then turn every transaction into a row of that vocabulary.
te = TransactionEncoder()
te.fit(transactions)
te_array = te.transform(transactions)

# Label the encoded columns with the item names learned by the encoder.
dfBinary = pd.DataFrame(data=te_array, columns=te.columns_)

# Sanity check: show the first rows of the encoded matrix.
print(f"Matriz binaria generada (primeras filas):\n{dfBinary.head()}")

from mlxtend.frequent_patterns import apriori

# Support threshold passed to apriori; adjust as needed.
min_support = 0.01

# Mine the frequent itemsets, reporting items by column name
# (use_colnames=True) instead of by column index.
frequent_itemsets = apriori(
    dfBinary,
    use_colnames=True,
    min_support=min_support,
)

# Quick look at the first itemsets found.
print(f"Conjuntos frecuentes encontrados:\n{frequent_itemsets.head()}")

Primeras transacciones:
[['Bulgarian Yogurt', 'Organic 4% Milk Fat Whole Milk Cottage Cheese', 'Organic Celery Hearts', 'Cucumber Kirby', 'Lightly Smoked Sardines in Olive Oil', 'Bag of Organic Bananas', 'Organic Hass Avocado', 'Organic Whole String Cheese'], ['Grated Pecorino Romano Cheese', 'Spring Water', 'Organic Half & Half', 'Super Greens Salad', 'Cage Free Extra Large Grade AA Eggs', 'Prosciutto, Americano', 'Organic Garnet Sweet Potato (Yam)', 'Asparagus'], ['Shelled Pistachios', 'Organic Biologique Limes', 'Organic Raw Unfiltered Apple Cider Vinegar', 'Organic Baby Arugula', 'Organic Hot House Tomato', 'Green Peas', 'Bunched Cilantro', 'Flat Parsley, Bunch', 'Fresh Dill'], ['Roasted Turkey', 'Organic Cucumber', 'Organic Grape Tomatoes', 'Organic Pomegranate Kernels', 'Organic Raspberries', 'Organic Whole Strawberries', 'Organic Blueberries'], ['Natural Spring Water', 'Organic Orange Juice With Calcium & Vitamin D', 'Whole Milk Greek Blended Vanilla Bean Yogurt', 'Bag of Organi

In [4]:
from mlxtend.frequent_patterns import association_rules

# Derive association rules from the frequent itemsets, using confidence
# as the metric with a threshold of 0.3.
rules = association_rules(frequent_itemsets, min_threshold=0.3, metric="confidence")

# Columns shown in both reports below.
rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']

# Preview the generated rules.
print(f"Reglas de asociación generadas (primeras filas):\n{rules[rule_columns].head()}")

# Keep only the rules whose lift is at least 1.0.
filtered_rules = rules.loc[rules['lift'] >= 1.0]
print(f"\nReglas filtradas (lift >= 1.0):\n{filtered_rules[rule_columns]}")
from scipy.sparse import csr_matrix
import numpy as np

# Re-express the one-hot matrix in CSR (compressed sparse row) form.
order_matrix_sparse = csr_matrix(dfBinary.values)
print(f"Tamaño de la matriz dispersa: {order_matrix_sparse.shape}")

# Manual per-item support, for comparison: column totals divided by the
# number of rows in the matrix.
n_rows = order_matrix_sparse.shape[0]
column_totals = np.asarray(order_matrix_sparse.sum(axis=0)).ravel()
item_support = column_totals / n_rows
print(f"Ítems con soporte (manual):\n{item_support[:10]}")  # first 10 values only
import time

# Both runs share one support threshold so the two timings are comparable.
timing_min_support = 0.02

# Time mlxtend.apriori on dfBinary as built earlier in the notebook.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
frequent_itemsets_apriori = apriori(dfBinary, min_support=timing_min_support, use_colnames=True)
mlxtend_time = time.perf_counter() - start_time

# Wrap the CSR matrix in a pandas sparse DataFrame so apriori can consume it.
# The conversion happens outside the timed region so that only the apriori
# call itself is measured, exactly as in the run above.
dfSparse = pd.DataFrame.sparse.from_spmatrix(order_matrix_sparse, columns=dfBinary.columns)

# Time the same apriori call on the sparse representation. Previously this
# section timed an empty placeholder, so it always reported ~0.00 seconds.
start_time = time.perf_counter()
frequent_itemsets_sparse = apriori(dfSparse, min_support=timing_min_support, use_colnames=True)
sparse_time = time.perf_counter() - start_time

# Both representations encode the same data, so they must yield the same
# number of frequent itemsets; otherwise the timing comparison is invalid.
assert len(frequent_itemsets_sparse) == len(frequent_itemsets_apriori), (
    "sparse and dense apriori runs found a different number of itemsets"
)

print(f"Tiempo con mlxtend.apriori: {mlxtend_time:.2f} segundos")
print(f"Tiempo con matriz dispersa: {sparse_time:.2f} segundos")

Reglas de asociación generadas (primeras filas):
              antecedents               consequents   support  confidence  \
0  (Organic Hass Avocado)  (Bag of Organic Bananas)  0.018444    0.331825   
1   (Organic Raspberries)  (Bag of Organic Bananas)  0.013566    0.320952   
2   (Organic Raspberries)    (Organic Strawberries)  0.012728    0.301118   

      lift  
0  2.81256  
1  2.72040  
2  3.62671  

Reglas filtradas (lift >= 1.0):
              antecedents               consequents   support  confidence  \
0  (Organic Hass Avocado)  (Bag of Organic Bananas)  0.018444    0.331825   
1   (Organic Raspberries)  (Bag of Organic Bananas)  0.013566    0.320952   
2   (Organic Raspberries)    (Organic Strawberries)  0.012728    0.301118   

      lift  
0  2.81256  
1  2.72040  
2  3.62671  
Tamaño de la matriz dispersa: (131209, 39123)
Ítems con soporte (manual):
[1.67671425e-04 7.62142841e-06 7.62142841e-06 4.57285704e-05
 7.62142841e-05 7.62142841e-06 2.28642852e-05 8.38357125e-05
