In [1]:
# Star import kept first on purpose: the explicit imports below must keep
# taking precedence over any same-named symbol exported by utils.
from utils import *

from collections import Counter
from collections import defaultdict
from datetime import datetime
from functools import reduce
from typing import Dict, Any

import gower
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform

In [2]:
# Load the processed transactions (parquet) and order them chronologically
# within each account. sort_values returns a new frame, so the result has to
# be assigned; calling it without assignment leaves the data unsorted.
file_path_transactions = "../data/processed/transactions.parquet"
transactions = pl.read_parquet(file_path_transactions)
pd_transactions = (
    transactions.to_pandas()
    .sort_values(by=['account_id', 'invoice_date'])
    .reset_index(drop=True)
)
# Load the processed baskets (parquet)
file_path_baskets = "../data/processed/baskets.parquet"
baskets = pl.read_parquet(file_path_baskets)
# Load the temporal metrics of the transactions (parquet)
file_path_temporal_metrics = "../data/processed/temporal_metrics.parquet"
temporal_metrics = pl.read_parquet(file_path_temporal_metrics)
pd_temporal_metrics = temporal_metrics.to_pandas()
# Load the customer attributes (parquet)
file_path_atributes = "../data/processed/atributes.parquet"
atributes = pl.read_parquet(file_path_atributes)
pd_atributes = atributes.to_pandas()
# 'poc' is the customer identifier; it is cast to string because it is later
# used as the merge key against transactions' 'account_id'.
# NOTE(review): presumably 'account_id' is also a string -- confirm the dtypes match.
pd_atributes['poc'] = pd_atributes['poc'].astype(str)
pd_atributes.head()

Unnamed: 0,column_0,poc,bussinesssegment,totalvolumen,skudistintospromediosxorden,skudistintostotales,concentracion,segmentounico,canal
0,10,175519,HighUsage,5.18752,4.1,16,Medio,4.Activos,Kioscos/Maxikioscos
1,13,28533,HighUsage,4.76866,3.9211,34,Alto,4.Activos,Tradicional
2,19,32182,PowerUsage,5.9793,6.75,34,Alto,4.Activos,Tradicional
3,20,327976,MinimalUsage,6.02852,3.5833,14,Alto,4.Activos,COMIDA
4,24,354640,PowerUsage,7.525,3.2,18,Bajo,4.Activos,Tradicional


## Entrenamiento

### Separación de los datos

In [4]:
# Bucket every invoice into a two-week period derived from its ISO week number.
# NOTE(review): the ISO year is not part of the key, so data spanning more than
# one calendar year would merge weeks of different years -- confirm the date range.
pd_transactions['biweekly_period'] = (
    pd_transactions['invoice_date'].dt.isocalendar().week.floordiv(2)
)

# One basket per (account, period): the list of SKUs and the total physical cases.
# NOTE(review): despite its name this frame is built from every period,
# including the one held out for evaluation below.
biweekly_train_baskets = (
    pd_transactions
    .groupby(['account_id', 'biweekly_period'])
    .agg({'sku_id': lambda x: list(x), 'items_phys_cases': 'sum'})
    .reset_index()
)

# Hold out the most recent period for evaluation; everything earlier is training data.
max_period = pd_transactions['biweekly_period'].max()
train_data = pd_transactions.loc[pd_transactions['biweekly_period'] < max_period]
eval_data = pd_transactions.loc[pd_transactions['biweekly_period'] == max_period]

# Summary of the split (each label is followed by its value on the next line).
print('Max period', max_period, sep='\n')
print("Biweekly Train Baskets:", biweekly_train_baskets.shape, sep='\n')
print("Training Data:", train_data.shape, sep='\n')
print("Evaluation Data:", eval_data.shape, sep='\n')
# The percentage is expressed relative to the size of the training set.
print("Percentage of Evaluation Data:", 100 * len(eval_data) / len(train_data), sep='\n')

Max period
17
Biweekly Train Baskets:
(23536, 4)
Training Data:
(244171, 7)
Evaluation Data:
(36657, 7)
Percentage of Evaluation Data:
15.012839362577866


## Aplicar la distancia de Gower y la agrupación jerárquica para segmentar a los usuarios

Distancia de Gower: esta métrica de distancia es ideal para conjuntos de datos con tipos mixtos (numéricos, ordinales categóricos y nominales categóricos).

Agrupamiento jerárquico: utilizamos la función de enlace con el método "promedio" para realizar la agrupación aglomerativa.

Dendrograma: el dendrograma ayuda a visualizar el proceso de agrupación y a decidir dónde cortar el árbol para formar los clústeres.

Asignación de clústeres: utilizamos fcluster para asignar etiquetas de clústeres en función del dendrograma.

In [None]:
# Compute the Gower distance matrix on the descriptive attributes only.
# - 'poc' is the customer identifier and 'column_0' looks like a leftover row
#   index from an earlier export (see the head() output above); neither
#   describes customer behaviour, so they must not influence the distances.
# - 'cluster' is added to pd_atributes by a later cell; excluding it keeps this
#   cell idempotent when it is re-run after the cluster assignment.
# errors='ignore' lets the cell run whether or not those columns are present.
non_feature_cols = ['poc', 'column_0', 'cluster']
gower_dist = gower.gower_matrix(
    pd_atributes.drop(columns=non_feature_cols, errors='ignore')
)

In [None]:
# Perform hierarchical clustering using the Gower distance matrix.
# linkage() treats a 2-D array as raw observation vectors and would compute
# Euclidean distances between the ROWS of the matrix. To cluster on the Gower
# distances themselves, the square matrix has to be passed in condensed
# (upper-triangle) form. checks=False skips the strict symmetry test, which
# can fail on floating-point round-off.
Z = linkage(squareform(gower_dist, checks=False), method='average')

# Plot the dendrogram to visualize the clusters
plt.figure(figsize=(14, 10))
dendrogram(Z, labels=pd_atributes['poc'].values)
plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel("POC")
plt.ylabel("Distance")
plt.show()

In [None]:
# Cut the dendrogram so that at most `num_clusters` flat clusters are formed,
# store the label of every POC next to its attributes and show the cluster sizes.
num_clusters = 7
clusters = fcluster(Z, t=num_clusters, criterion='maxclust')
pd_atributes['cluster'] = clusters
pd_atributes.groupby(by=clusters)['poc'].count()

In [None]:
# Summary statistics of the numeric attributes per cluster
# (statistics as rows, clusters as columns).
numerical = [
    "totalvolumen",
    "skudistintospromediosxorden",
    "skudistintostotales",
]
pd_atributes.groupby('cluster')[numerical].describe().transpose()

## Entrenamiento del modelo utilizando la variable de clusterización que creamos y el algoritmo TIFUKNN.

El proceso es el siguiente:

* Agregar segmentación al dataset de transacciones: asignar a cada usuario en el conjunto de datos de transacciones a un clúster según los resultados de la segmentación.

* Construir una matriz de usuario-artículo por grupo: construimos la matriz de usuario-artículo por separado para cada grupo.

* Calcular la similitud de los elementos: calculamos la similitud del coseno entre los elementos dentro de cada grupo.

* Predecir la próxima canasta con TIFUKNN: mejoramos el algoritmo TIFUKNN considerando solo usuarios similares dentro del mismo grupo.

* Evaluar el modelo: medimos la efectividad de las predicciones usando precisión, recuperación y puntaje F1, además de nuestra métrica de efectividad sobre 3 SKU.

Este enfoque garantiza que la predicción de la próxima canasta se adapte al comportamiento específico de los usuarios dentro del mismo grupo, lo que genera recomendaciones más personalizadas y precisas. 

In [None]:
# Attach the cluster label of each POC to its transactions. The left join keeps
# every transaction; accounts without attributes end up with a null poc/cluster.
pd_transactions_cluster = pd.merge(
    pd_transactions,
    pd_atributes[['poc', 'cluster']],
    how='left',
    left_on='account_id',
    right_on='poc',
)
# Row count after the join
print(len(pd_transactions_cluster))
# Percentage of transaction rows whose account has no matching POC
print(100 * (pd_transactions_cluster['poc'].isna().sum() / len(pd_transactions)))
# Null count per column
pd_transactions_cluster.isna().sum()

In [None]:
# Keep only fully populated rows.
# NOTE(review): this removes rows with a null in ANY column, not only those
# without a cluster -- confirm that is the intent.
pd_transactions_cluster = pd_transactions_cluster.dropna()
print(pd_transactions_cluster.shape)
# Sanity check: every column should now report zero nulls
pd_transactions_cluster.isna().sum()

In [None]:
# Persist the clustered transactions so they can be reloaded without redoing the segmentation.
pd_transactions_cluster.to_parquet(path="../data/processed/pd_transactions_cluster.parquet")

## Inference 

This is the model trained on the whole dataset; the cells below show how to use the function to predict the next basket for one account id. The algorithm takes about 6 seconds to compute the item-user similarity matrix per cluster, and each prediction takes less than 1 second, so it could be served in production from an AWS Lambda function or a similar service in Azure or GCP.

We chose to keep the number of items (k_item) at five, since it gave the best combination of good training results and a number of items that can be displayed as recommendations in the app without requiring too much scrolling.

In [None]:
# Build one item-user matrix per cluster from the clustered transactions
# (helper imported from utils).
cluster_item_user_matrices = build_item_user_matrix_by_cluster(
    pd_transactions_cluster,
    cluster_col='cluster',
    sku_col='sku_id',
    user_col='account_id',
    qty_col='items_phys_cases',
)

# Derive the per-cluster similarity structure from those matrices
# (helper imported from utils).
cluster_item_similarity = compute_similarity_by_cluster(
    cluster_item_user_matrices,
    pd_transactions_cluster,
    cluster_col='cluster',
    user_col='account_id',
)

In [None]:
# Number of SKUs recommended per account
k_item = 5

# Predict the next basket of every account present in the clustered transactions.
predicted_baskets = {
    account_id: predict_next_basket_clustered(
        account_id,
        cluster_item_similarity=cluster_item_similarity,
        df=pd_transactions_cluster,
        user_col='account_id',
        cluster_col='cluster',
        sku_col='sku_id',
        k=k_item,
    )
    for account_id in pd_transactions_cluster['account_id'].unique()
}

# Display one prediction as an example (nothing is shown if there are none)
print("Predicted Baskets:")
for account_id, basket in list(predicted_baskets.items())[:1]:
    print(f"Account {account_id}: {basket}")

In [None]:
# Coverage of the predictions, as a percentage of ...
# ... the POCs that have attributes
print(100 * len(predicted_baskets) / len(pd_atributes))
# ... the distinct accounts that appear in the transactions
print(100 * len(predicted_baskets) / len(pd_transactions['account_id'].unique()))