In [1]:
from utils import *
from functools import reduce
import polars as pl
from datetime import datetime
from collections import defaultdict
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

In [2]:
# Read the csv files of the atributes
file_path_atributes = "../data/processed/atributes.parquet"
atributes = pl.read_parquet(file_path_atributes)
# Read the csv files of the transactions
file_path_transactions = "../data/processed/transactions.parquet"
transactions = pl.read_parquet(file_path_transactions)
# Read the csv files of the baskets
file_path_baskets = "../data/processed/baskets.parquet"
baskets = pl.read_parquet(file_path_baskets)
# Read the csv files of the temporal metrics of the transactions
file_path_temporal_metrics = "../data/processed/temporal_metrics.parquet"
temporal_metrics = pl.read_parquet(file_path_temporal_metrics)
# Display the first rows of the DataFrame of the transactions
print(baskets.head())

shape: (5, 6)
┌──────────────┬────────────┬────────────────┬───────────────────┬──────────────┬──────────────────┐
│ invoice_date ┆ account_id ┆ order_id       ┆ sku_id            ┆ sku_id_count ┆ items_phys_cases │
│ ---          ┆ ---        ┆ ---            ┆ ---               ┆ ---          ┆ ---              │
│ date         ┆ str        ┆ str            ┆ list[str]         ┆ u32          ┆ i64              │
╞══════════════╪════════════╪════════════════╪═══════════════════╪══════════════╪══════════════════╡
│ 2022-05-24   ┆ 398523     ┆ 512-3599611-0  ┆ ["7026", "23902", ┆ 7            ┆ 18               │
│              ┆            ┆                ┆ … "23287"]        ┆              ┆                  │
│ 2022-05-24   ┆ 185697     ┆ 512-3601992-0  ┆ ["25119", "2201", ┆ 4            ┆ 10               │
│              ┆            ┆                ┆ … "7038"]         ┆              ┆                  │
│ 2022-05-24   ┆ 417417     ┆ 102-38960009-0 ┆ ["7651"]          ┆ 1         

## Modelado

Se implementarán los algoritmos TIFUKNN y , pues son los dos algoritmos que combinan de forma adecuada la simplicidad con buena performance en los benchmark públicos según la literatura. El proceso es el siguiente:

1.- Matriz de artículos y usuarios: creamos una matriz donde las filas representan los artículos y las columnas representan a los usuarios. Los valores de la matriz son las cantidades de artículos comprados por cada usuario.

2.- Similitud de coseno: calculamos la similitud de coseno entre artículos para identificar SKU similares.

3.- Predicción: el algoritmo TIFUKNN predice la próxima cesta en función de los artículos más similares a los del historial de compras del cliente.

4.- Evaluación: calculamos la precisión y la recuperación en K para medir qué tan bien el modelo predice la próxima cesta.

In [3]:
pd_transactions = transactions.to_pandas()
transactions.head()

Unnamed: 0_level_0,account_id,sku_id,invoice_date,order_id,items_phys_cases
i64,str,str,date,str,i64
0,"""430606""","""7038""",2022-07-29,"""512-3880249-0""",100
1,"""323267""","""14933""",2022-07-29,"""512-3882307-0""",1
2,"""357825""","""21971""",2022-07-23,"""512-3852880-0""",8
3,"""444926""","""7038""",2022-08-05,"""512-3913163-0""",20
4,"""450771""","""7030""",2022-08-16,"""512-3957000-0""",5


In [4]:
#Define Biweekly Periods
pd_transactions['biweekly_period'] = pd_transactions['invoice_date'].dt.isocalendar().week // 2
#Aggregate Transactions into Biweekly Baskets
biweekly_train_baskets = pd_transactions.groupby(['account_id', 'biweekly_period']).agg({
    'sku_id': lambda x: list(x),
    'items_phys_cases': 'sum'
}).reset_index()
#Separate the Data into Training and Evaluation Sets
max_period = pd_transactions['biweekly_period'].max()
train_data = pd_transactions[pd_transactions['biweekly_period'] < max_period - 1]
eval_data = pd_transactions[pd_transactions['biweekly_period'] >= max_period - 1]
print("Biweekly Train Baskets:")
print(biweekly_train_baskets.shape)
print("Training Data:")
print(train_data.shape)
print("Evaluation Data:")
print(eval_data.shape)

Biweekly Train Baskets:
(23536, 4)
Training Data:
(199015, 7)
Evaluation Data:
(81813, 7)


In [5]:
biweekly_train_baskets.head()

Unnamed: 0,account_id,biweekly_period,sku_id,items_phys_cases
0,100640,12,"[25452, 23287, 1418, 1422, 25454, 11522, 25384...",11
1,100640,13,"[19336, 23902, 2218, 19336, 25452, 1483, 24118...",13
2,100640,14,"[24118, 21973]",6
3,100640,15,"[7038, 23902, 24118, 1416, 1418]",5
4,100640,16,"[24118, 21973, 21973, 7038, 25644, 1483, 16527...",10


In [6]:
train_data.head()

Unnamed: 0,Unnamed: 1,account_id,sku_id,invoice_date,order_id,items_phys_cases,biweekly_period
0,0,430606,7038,2022-07-29,512-3880249-0,100,15
1,1,323267,14933,2022-07-29,512-3882307-0,1,15
2,2,357825,21971,2022-07-23,512-3852880-0,8,14
3,3,444926,7038,2022-08-05,512-3913163-0,20,15
5,5,174250,14191,2022-07-01,512-3757476-0,70,13


In [7]:
# Build the item-user matrix
# each row corresponds to an SKU and each column to a customer. The values in the matrix will be the total items_phys_cases purchased by the customer for that SKU.
def build_item_user_matrix(df):
    item_user_matrix = defaultdict(lambda: defaultdict(int))

    for _, row in df.iterrows():
        for sku in row['sku_id']:
            item_user_matrix[sku][row['account_id']] += row['items_phys_cases']

    return item_user_matrix

item_user_matrix = build_item_user_matrix(train_data)

In [8]:
#Compute the Cosine Similarity between Items based on the item-user matrix.
users = train_data['account_id'].unique()
items = list(item_user_matrix.keys())
item_user_vectors = np.array([
    [item_user_matrix[item].get(user, 0) for user in users]
    for item in items
])

# Compute the cosine similarity between items
item_similarity = cosine_similarity(item_user_vectors)


In [32]:
def predict_next_basket(history, item_similarity, items, k=5):
    all_items = [sku for basket in history for sku in basket]
    item_freq = Counter(all_items)

    recommendations = []
    for item, freq in item_freq.items():
        if item in items:
            similar_items_idx = np.argsort(item_similarity[items.index(item)])[::-1][:k]
            recommendations.extend([(items[i], item_similarity[items.index(item), i] * freq) for i in similar_items_idx])

    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)
    return [r[0] for r in recommendations[:k]]

predicted_baskets = {}

for account_id, history in train_data_grouped['sku_id'].items():
    predicted_baskets[account_id] = predict_next_basket(history, item_similarity, items, k=5)

In [49]:
def precision_at_k(predicted, actual, k=5):
    predicted_set = set(predicted[:k])
    actual_set = set(actual)
    return len(predicted_set & actual_set) / len(predicted_set)

def recall_at_k(predicted, actual, k=5):
    predicted_set = set(predicted[:k])
    actual_set = set(actual)
    return len(predicted_set & actual_set) / len(actual_set)

def f1_at_k(precision, recall):
    if precision + recall == 0:
        return 0
    return 2 * (precision * recall) / (precision + recall)

def precision_at_k_over_2(predicted, actual, k=5):
    predicted_set = set(predicted[:k])
    actual_set = set(actual)
    result = len(predicted_set & actual_set) / 3
    return result

# Evaluate the model
precisions = []
recalls = []
f1_scores = []
precision_over_2_scores = []  # Renaming the list to avoid conflict

# Group the evaluation data by account_id and biweekly_period
eval_data_grouped = eval_data.groupby('account_id').agg({'sku_id': lambda x: list(x)})

for account_id, actual_basket in eval_data_grouped['sku_id'].items():
    if account_id in predicted_baskets:
        predicted_basket = predicted_baskets[account_id]
        precision = precision_at_k(predicted_basket, actual_basket[0], k=5)
        recall = recall_at_k(predicted_basket, actual_basket[0], k=5)
        f1 = f1_at_k(precision, recall)
        precision_at_k_over_2_score = precision_at_k_over_2(predicted_basket, actual_basket[0], k=5)  # Renaming variable
        
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        precision_over_2_scores.append(precision_at_k_over_2_score)  # Updating the list name

average_precision = np.mean(precisions)
average_recall = np.mean(recalls)
average_f1 = np.mean(f1_scores)
average_precision_at_k_over_2 = np.mean(precision_over_2_scores)  # Using updated list name

print(f"Average Precision of k = 5: {average_precision:.4f}")
print(f"Average Recall: {average_recall:.4f}")
print(f"Average F1 Score: {average_f1:.4f}")
print(f"Average Precision of k=5 over 2: {average_precision_at_k_over_2:.4f}")


Average Precision of k = 5: 0.4524
Average Recall: 0.5564
Average F1 Score: 0.4943
Average Precision of k=5 over 2: 0.7160
