In [4]:
import numpy as np
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from scipy.sparse import csr_matrix
import time

In [8]:
products = pd.read_csv('order_products__train.csv')
orders = pd.read_csv('products.csv')

dfMerged = pd.merge(orders, products, on="product_id", how="inner")

# Agrupar productos por transacción para crear listas de transacciones

transactions = dfMerged.groupby("order_id")["product_name"].apply(list).tolist()
print(f"Primeras transacciones:\n{transactions[:5]}")

item_mapping = {item: idx for idx, item in enumerate(sorted(set(item for transaction in transactions for item in transaction)))}
rows, cols = [], []
for row_idx, transaction in enumerate(transactions):
    for item in transaction:
        rows.append(row_idx)
        cols.append(item_mapping[item])

# Crear la matriz dispersa
order_matrix_sparse = csr_matrix(([1] * len(rows), (rows, cols)), shape=(len(transactions), len(item_mapping)))

Primeras transacciones:
[['Organic Celery Hearts', 'Organic 4% Milk Fat Whole Milk Cottage Cheese', 'Bag of Organic Bananas', 'Organic Whole String Cheese', 'Lightly Smoked Sardines in Olive Oil', 'Organic Hass Avocado', 'Bulgarian Yogurt', 'Cucumber Kirby'], ['Spring Water', 'Prosciutto, Americano', 'Grated Pecorino Romano Cheese', 'Super Greens Salad', 'Cage Free Extra Large Grade AA Eggs', 'Asparagus', 'Organic Garnet Sweet Potato (Yam)', 'Organic Half & Half'], ['Organic Raw Unfiltered Apple Cider Vinegar', 'Shelled Pistachios', 'Organic Biologique Limes', 'Organic Baby Arugula', 'Organic Hot House Tomato', 'Bunched Cilantro', 'Green Peas', 'Fresh Dill', 'Flat Parsley, Bunch'], ['Roasted Turkey', 'Organic Whole Strawberries', 'Organic Pomegranate Kernels', 'Organic Raspberries', 'Organic Cucumber', 'Organic Blueberries', 'Organic Grape Tomatoes'], ['Organic Whole Grassmilk Milk', 'Garbanzo Beans', 'Geranium Liquid Dish Soap', 'Corn Maize Tortillas', 'Organic Chocolate Almondmilk Pu

In [9]:
# Función para calcular soporte de elementos individuales
def calculate_support(item_indices, data_matrix):
    """ Calcula el soporte de un conjunto de ítems. """
    item_mask = data_matrix[:, item_indices].toarray().all(axis=1)
    support = np.sum(item_mask) / data_matrix.shape[0]
    return support

In [10]:
# Función para generar ítems frecuentes utilizando matrices dispersas
def apriori_manual(data_matrix, min_support):
    """ Implementación del algoritmo Apriori optimizada. """
    num_items = data_matrix.shape[1]
    frequent_itemsets = []
    current_itemsets = [[i] for i in range(num_items)]
    
    while current_itemsets:
        next_itemsets = []
        item_supports = []
        
        # Calcular soporte para cada conjunto actual
        for itemset in current_itemsets:
            support = calculate_support(itemset, data_matrix)
            if support >= min_support:
                frequent_itemsets.append((itemset, support))
                item_supports.append(itemset)
        
        # Generar nuevas combinaciones de ítems frecuentes actuales
        for i in range(len(item_supports)):
            for j in range(i + 1, len(item_supports)):
                combined_itemset = sorted(set(item_supports[i]) | set(item_supports[j]))
                if len(combined_itemset) == len(item_supports[i]) + 1:
                    next_itemsets.append(combined_itemset)
        
        current_itemsets = next_itemsets  # Actualizar conjuntos actuales
    
    return frequent_itemsets

In [11]:
# Ajustar los umbrales para balancear calidad y rendimiento
min_support = 0.01  # Ajusta según la calidad de reglas requerida

# Medir tiempo con mlxtend.apriori
start_time = time.time()
frequent_itemsets_apriori = apriori(pd.DataFrame(order_matrix_sparse.toarray(), columns=item_mapping.keys()), 
                                    min_support=min_support, use_colnames=True)
mlxtend_time = time.time() - start_time

MemoryError: Unable to allocate 38.2 GiB for an array with shape (131209, 39123) and data type int64

In [12]:
# Medir tiempo con Apriori manual optimizado
start_time = time.time()
frequent_itemsets_manual = apriori_manual(order_matrix_sparse, min_support)
sparse_time = time.time() - start_time

print(f"Frecuentes calculados manualmente: {len(frequent_itemsets_manual)} conjuntos.")
print(f"Tiempo con mlxtend.apriori: {mlxtend_time:.2f} segundos")
print(f"Tiempo con matriz dispersa (manual): {sparse_time:.2f} segundos.")


KeyboardInterrupt: 

In [None]:
# Analizar métricas de las reglas
def generate_rules(frequent_itemsets, min_confidence):
    """ Generar reglas de asociación. """
    rules = []
    for itemset, support in frequent_itemsets:
        if len(itemset) > 1:
            for i in range(len(itemset)):
                antecedent = itemset[:i] + itemset[i+1:]
                consequent = [itemset[i]]
                antecedent_support = calculate_support(antecedent, order_matrix_sparse)
                
                # Calcular confianza y lift
                if antecedent_support > 0:
                    confidence = support / antecedent_support
                    if confidence >= min_confidence:
                        rules.append({
                            'antecedent': antecedent,
                            'consequent': consequent,
                            'support': support,
                            'confidence': confidence
                        })
    return rules

min_confidence = 0.3
rules = generate_rules(frequent_itemsets_manual, min_confidence)
print(f"Reglas generadas: {len(rules)}")
for rule in rules[:5]:  # Mostrar las primeras reglas
    print(f"Regla: {rule}")