In [1]:
from utils import *
from functools import reduce
import polars as pl
from datetime import datetime
from collections import defaultdict
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

In [2]:
def build_item_user_matrix(df):
    """
    Builds an item-user matrix from a given DataFrame, where the rows represent SKUs (items)
    and the columns represent users. The values in the matrix are the total quantities of
    items purchased by each user.

    Args:
        df (pd.DataFrame): A pandas DataFrame containing transaction data with at least
                           the following columns:
                           - 'account_id': The unique identifier for each user (customer).
                           - 'sku_id': A list of SKUs (items) purchased in each transaction.
                           - 'items_phys_cases': The quantity of each item purchased.

    Returns:
        Dict[str, Dict[int, int]]: A nested dictionary where:
                                   - The keys of the outer dictionary are SKUs (items).
                                   - The keys of the inner dictionary are account IDs (users).
                                   - The values of the inner dictionary are the total quantities of each SKU
                                     purchased by the corresponding user.

    Example:
        df = pd.DataFrame({
            'account_id': [1, 2, 1],
            'sku_id': [['item1', 'item2'], ['item2'], ['item1']],
            'items_phys_cases': [3, 2, 1]
        })
        build_item_user_matrix(df)
        # Output: {'item1': {1: 4}, 'item2': {1: 3, 2: 2}}

    Notes:
        - This function assumes that the 'sku_id' column contains lists of SKUs, and each
          list represents the items purchased in a single transaction.
        - The 'items_phys_cases' column represents the total quantity of items in that transaction,
          which is distributed equally across the SKUs in the 'sku_id' list.
    """
    # Initialize an empty item-user matrix using defaultdict
    item_user_matrix = defaultdict(lambda: defaultdict(int))
    # Iterate over each row in the DataFrame
    for _, row in df.iterrows():
        # For each SKU in the transaction, update the matrix with the quantity purchased
        for sku in row['sku_id']:
            item_user_matrix[sku][row['account_id']] += row['items_phys_cases']
    # Return the completed item-user matrix
    return item_user_matrix

def predict_next_k_basket(history, item_similarity, items, k):
    """
    Predicts the top-k items that a customer is likely to purchase in their next shopping basket.

    Args:
        history (List[List[str]]): A list of lists where each inner list contains the SKUs (items) 
                                   that the customer has purchased in previous baskets.
        item_similarity (np.ndarray): A 2D numpy array representing the cosine similarity between items. 
                                      The similarity matrix should have the same order as the `items` list.
        items (List[str]): A list of all unique SKUs (items) that are included in the item similarity matrix.
        k (int): The number of top items to recommend for the next basket.

    Returns:
        List[str]: A list of the top-k predicted SKUs (items) that the customer is likely to purchase next.

    Example:
        history = [["sku1", "sku2"], ["sku3", "sku1"], ["sku2", "sku4"]]
        item_similarity = np.array([[1.0, 0.8, 0.4], [0.8, 1.0, 0.3], [0.4, 0.3, 1.0]])
        items = ["sku1", "sku2", "sku3", "sku4"]
        k = 2
        predict_next_k_basket(history, item_similarity, items, k)
        # Output: ["sku2", "sku3"]

    Notes:
        - The function first flattens the history to calculate the frequency of each SKU.
        - It then identifies the most similar items to those frequently purchased by the customer.
        - Finally, it returns the top-k items based on the similarity scores.
    """
    # Flatten the history to get a list of all items purchased by the customer
    all_items = [sku for basket in history for sku in basket]
    # Count the frequency of each item in the purchase history
    item_freq = Counter(all_items)

    recommendations = []
    # Generate recommendations based on item similarity
    for item, freq in item_freq.items():
        if item in items:
            # Find indices of the top-k most similar items
            similar_items_idx = np.argsort(item_similarity[items.index(item)])[::-1][:k]
            # Extend the recommendations list with similar items, weighted by frequency
            recommendations.extend([(items[i], item_similarity[items.index(item), i] * freq) for i in similar_items_idx])
    # Sort recommendations by score (descending order) and return the top-k items
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)
    return [r[0] for r in recommendations[:k]]

In [3]:
# Read the csv files of the atributes
file_path_atributes = "../data/processed/atributes.parquet"
atributes = pl.read_parquet(file_path_atributes)
# Read the csv files of the transactions
file_path_transactions = "../data/processed/transactions.parquet"
transactions = pl.read_parquet(file_path_transactions)
# Read the csv files of the baskets
file_path_baskets = "../data/processed/baskets.parquet"
baskets = pl.read_parquet(file_path_baskets)
# Read the csv files of the temporal metrics of the transactions
file_path_temporal_metrics = "../data/processed/temporal_metrics.parquet"
temporal_metrics = pl.read_parquet(file_path_temporal_metrics)
# Display the first rows of the DataFrame of the transactions
print(baskets.head())

shape: (5, 6)
┌──────────────┬────────────┬────────────────┬───────────────────┬──────────────┬──────────────────┐
│ invoice_date ┆ account_id ┆ order_id       ┆ sku_id            ┆ sku_id_count ┆ items_phys_cases │
│ ---          ┆ ---        ┆ ---            ┆ ---               ┆ ---          ┆ ---              │
│ date         ┆ str        ┆ str            ┆ list[str]         ┆ u32          ┆ i64              │
╞══════════════╪════════════╪════════════════╪═══════════════════╪══════════════╪══════════════════╡
│ 2022-05-24   ┆ 398523     ┆ 512-3599611-0  ┆ ["7026", "23902", ┆ 7            ┆ 18               │
│              ┆            ┆                ┆ … "23287"]        ┆              ┆                  │
│ 2022-05-24   ┆ 185697     ┆ 512-3601992-0  ┆ ["25119", "2201", ┆ 4            ┆ 10               │
│              ┆            ┆                ┆ … "7038"]         ┆              ┆                  │
│ 2022-05-24   ┆ 417417     ┆ 102-38960009-0 ┆ ["7651"]          ┆ 1         

## Predicción usando TIFUKNN

Se implementarán los algoritmos TIFUKNN y , pues son los dos algoritmos que combinan de forma adecuada la simplicidad con buena performance en los benchmark públicos según la literatura. El proceso es el siguiente:

0.- Transformo el dataset de transactions a pandas 

1.- Matriz de artículos y usuarios: creamos una matriz donde las filas representan los artículos y las columnas representan a los usuarios. Los valores de la matriz son las cantidades de artículos comprados por cada usuario.

2.- Similitud de coseno: calculamos la similitud de coseno entre artículos para identificar SKU similares.

3.- Predicción: el algoritmo TIFUKNN predice la próxima cesta en función de los artículos más similares a los del historial de compras del cliente.

4.- Evaluación: calculamos la precisión y la recuperación en K para medir qué tan bien el modelo predice la próxima cesta.

In [4]:
pd_transactions = transactions.to_pandas()
transactions.head()

Unnamed: 0_level_0,account_id,sku_id,invoice_date,order_id,items_phys_cases
i64,str,str,date,str,i64
0,"""430606""","""7038""",2022-07-29,"""512-3880249-0""",100
1,"""323267""","""14933""",2022-07-29,"""512-3882307-0""",1
2,"""357825""","""21971""",2022-07-23,"""512-3852880-0""",8
3,"""444926""","""7038""",2022-08-05,"""512-3913163-0""",20
4,"""450771""","""7030""",2022-08-16,"""512-3957000-0""",5


In [5]:
pd_transactions['biweekly_period'] = pd_transactions['invoice_date'].dt.isocalendar().week // 2
biweekly_train_baskets = pd_transactions.groupby(['account_id', 'biweekly_period']).agg({
    'sku_id': lambda x: list(x),
    'items_phys_cases': 'sum'
}).reset_index()
max_period = pd_transactions['biweekly_period'].max()
train_data = pd_transactions[pd_transactions['biweekly_period'] < max_period - 1]
eval_data = pd_transactions[pd_transactions['biweekly_period'] >= max_period - 1]
print("Biweekly Train Baskets:")
print(biweekly_train_baskets.shape)
print("Training Data:")
print(train_data.shape)
print("Evaluation Data:")
print(eval_data.shape)

Biweekly Train Baskets:
(23536, 4)
Training Data:
(199015, 7)
Evaluation Data:
(81813, 7)


In [6]:
biweekly_train_baskets.head()

Unnamed: 0,account_id,biweekly_period,sku_id,items_phys_cases
0,100640,12,"[25452, 23287, 1418, 1422, 25454, 11522, 25384...",11
1,100640,13,"[19336, 23902, 2218, 19336, 25452, 1483, 24118...",13
2,100640,14,"[24118, 21973]",6
3,100640,15,"[7038, 23902, 24118, 1416, 1418]",5
4,100640,16,"[24118, 21973, 21973, 7038, 25644, 1483, 16527...",10


In [7]:
train_data.head()

Unnamed: 0,Unnamed: 1,account_id,sku_id,invoice_date,order_id,items_phys_cases,biweekly_period
0,0,430606,7038,2022-07-29,512-3880249-0,100,15
1,1,323267,14933,2022-07-29,512-3882307-0,1,15
2,2,357825,21971,2022-07-23,512-3852880-0,8,14
3,3,444926,7038,2022-08-05,512-3913163-0,20,15
5,5,174250,14191,2022-07-01,512-3757476-0,70,13


In [8]:
item_user_matrix = build_item_user_matrix(train_data)
users = train_data['account_id'].unique()
items = list(item_user_matrix.keys())
item_user_vectors = np.array([
    [item_user_matrix[item].get(user, 0) for user in users]
    for item in items
])
item_similarity = cosine_similarity(item_user_vectors)
train_data_grouped = train_data.groupby('account_id').agg({'sku_id': lambda x: list(x)})
k_items=5
predicted_baskets = {}
for account_id, history in train_data_grouped['sku_id'].items():
    predicted_baskets[account_id] = predict_next_k_basket(history, item_similarity, items, k=k_items)


In [9]:
# Evaluate the model
precisions = []
recalls = []
f1_scores = []
precision_over_n_scores = []  # Renaming the list to avoid conflict
n_items=3
# Group the evaluation data by account_id and biweekly_period
eval_data_grouped = eval_data.groupby('account_id').agg({'sku_id': lambda x: list(x)})

for account_id, actual_basket in eval_data_grouped['sku_id'].items():
    if account_id in predicted_baskets:
        predicted_basket = predicted_baskets[account_id]
        precision = precision_at_k(predicted_basket, actual_basket[0],k_items)
        recall = recall_at_k(predicted_basket, actual_basket[0],k_items)
        f1 = f1_at_k(precision, recall)
        precision_at_k_over_n_score = precision_at_k_over_n(predicted_basket, actual_basket[0],n_items,k_items)  # Renaming variable
        
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        precision_over_n_scores.append(precision_at_k_over_n_score)  # Updating the list name

average_precision = np.mean(precisions)
average_recall = np.mean(recalls)
average_f1 = np.mean(f1_scores)
average_precision_at_k_over_n = np.mean(precision_over_n_scores)  # Using updated list name

print(f"Average Precision of k = 5: {average_precision:.4f}")
print(f"Average Recall: {average_recall:.4f}")
print(f"Average F1 Score: {average_f1:.4f}")
print(f"Average Precision of k=5 over n: {average_precision_at_k_over_n:.4f}")

Average Precision of k = 5: 0.4524
Average Recall: 0.5564
Average F1 Score: 0.4943
Average Precision of k=5 over n: 0.7160
