In [44]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.model_selection import train_test_split



In [45]:
commerces  = pd.read_csv("data/commerces.csv")
products = pd.read_csv("data/product.csv")
transactions = pd.read_csv("data/transactions.csv")

In [46]:
# Crear el grafo dirigido
G = nx.DiGraph()


In [78]:
commerces.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id_commerce  100 non-null    int64 
 1   district     100 non-null    object
dtypes: int64(1), object(1)
memory usage: 1.7+ KB


In [48]:
# Agregar las conexiones de los distritos
for _, row in commerces.iterrows():
    G.add_edge('C' + str(row['id_commerce']), row['district'])


In [79]:
products.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id_product  50 non-null     int64 
 1   name        50 non-null     object
 2   category    50 non-null     object
 3   price       50 non-null     int64 
dtypes: int64(2), object(2)
memory usage: 1.7+ KB


In [50]:
# Agregar las conexiones de las categorías de productos
for _, row in products.iterrows():
    G.add_edge("P" + str(row['id_product']), row['category'])

In [80]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   id_commerce  500000 non-null  int64
 1   id_product   500000 non-null  int64
 2   quantity     500000 non-null  int64
 3   price        500000 non-null  int64
dtypes: int64(4)
memory usage: 15.3 MB


In [146]:
# Crear un conjunto de train y test separando productos por usuario
def split_train_test(df, test_size=0.2):
    train_data = []
    test_data = []

    # Iterar por cada usuario
    for user, user_data in df.groupby('id_commerce'):
        # Dividir en train y test para los productos del usuario
        user_train, user_test = train_test_split(user_data, test_size=test_size, random_state=42)
        
        # Agregar los datos divididos
        train_data.append(user_train)
        test_data.append(user_test)

    # Combinar los datos de todos los usuarios
    train_df = pd.concat(train_data)
    test_df = pd.concat(test_data)

    return train_df, test_df


def split_data_per_client(transactions, test_size=0.4):
    train_data = pd.DataFrame()
    test_data = pd.DataFrame()

    # Agrupar transacciones por cliente (id_commerce)
    grouped = transactions.groupby('id_commerce')

    for commerce_id, group in grouped:
        # Obtener los productos comprados por el comercio
        products = group['id_product'].unique()

        # Usar train_test_split para dividir en 80% train y 20% test
        train_products, test_products = train_test_split(products, test_size=test_size)

        # Filtrar las transacciones de entrenamiento y prueba
        train_data = pd.concat([train_data, group[group['id_product'].isin(train_products)]])
        test_data = pd.concat([test_data, group[group['id_product'].isin(test_products)]])

    return train_data, test_data


# Dividir el conjunto de interacciones
train_transactions, test_transactions = split_data_per_client(transactions, test_size=0.2)

In [147]:
train_transactions.shape

(400144, 4)

In [148]:
test_transactions.shape

(99856, 4)

In [149]:
400574 + 99426

500000

In [150]:
# Agregar las interacciones con pesos (cantidad de compras)
for _, row in train_transactions.iterrows():
    G.add_edge("C" + str(row['id_commerce']), "P"+ str(row['id_product']), weight=row['quantity'])



In [151]:

# Aplicar el algoritmo de PageRank con pesos
pagerank_scores_train = nx.pagerank(G, weight='weight')

In [152]:
pagerank_scores_train

{'C1': 0.002781725252583384,
 'Providencia': 0.002963426226898468,
 'C2': 0.002781725252583384,
 'Penalolen': 0.003023324502756195,
 'C3': 0.002781725252583384,
 'C4': 0.002781725252583384,
 'Nunoa': 0.0029858362851968474,
 'C5': 0.002781725252583384,
 'C6': 0.002781725252583384,
 'Macul': 0.0029214453691000317,
 'C7': 0.002781725252583384,
 'C8': 0.002781725252583384,
 'C9': 0.002781725252583384,
 'La Florida': 0.002962374025893638,
 'C10': 0.002781725252583384,
 'C11': 0.002781725252583384,
 'C12': 0.002781725252583384,
 'C13': 0.002781725252583384,
 'C14': 0.002781725252583384,
 'C15': 0.002781725252583384,
 'C16': 0.002781725252583384,
 'C17': 0.002781725252583384,
 'C18': 0.002781725252583384,
 'C19': 0.002781725252583384,
 'C20': 0.002781725252583384,
 'C21': 0.002781725252583384,
 'C22': 0.002781725252583384,
 'C23': 0.002781725252583384,
 'C24': 0.002781725252583384,
 'C25': 0.002781725252583384,
 'C26': 0.002781725252583384,
 'C27': 0.002781725252583384,
 'C28': 0.002781725252

In [153]:
# Convertir los resultados a un DataFrame
df_pagerank_train = pd.DataFrame(list(pagerank_scores_train.items()), columns=["Nodo", "PageRank"])
df_pagerank_train.sort_values(by="PageRank", ascending=False, inplace=True)

# Mostrar los resultados de PageRank
df_pagerank_train


# Convertir los resultados a un DataFrame
#df = pd.DataFrame(list(pagerank_scores.items()), columns=["Nodo", "PageRank"])
#df.sort_values(by="PageRank", ascending=False, inplace=True)


Unnamed: 0,Nodo,PageRank
113,Ropa,0.085645
106,Hogar,0.066192
108,Electrónica,0.066179
111,Alimentos,0.060664
116,Juguetes,0.053661
...,...,...
56,C52,0.002782
57,C53,0.002782
58,C54,0.002782
59,C55,0.002782


In [172]:
# Función para recomendar productos a un usuario basado en PageRank
def recomendar_productos(usuario, productos_comprados, distrito):
    # Obtener todos los productos del grafo
    productos = [n for n in G.nodes() if n.startswith("P")]
    print(productos)
    
    # Obtener productos del distrito del usuario
    productos_distrito = [n for n in G.neighbors(distrito) if n.startswith("P")]
    
    # Filtrar productos que no ha comprado el usuario
    productos_a_recomendar = [p for p in productos_distrito if p not in productos_comprados]
    
    # Si no hay productos disponibles en el distrito, recomendar productos globales
    if not productos_a_recomendar:
        productos_a_recomendar = [p for p in productos if p not in productos_comprados]
    
    # Filtrar y ordenar por PageRank los productos que no ha comprado el usuario
    recomendaciones = df_pagerank_train[df_pagerank_train['Nodo'].isin(productos_a_recomendar)].sort_values(by="PageRank", ascending=False)
    
    return recomendaciones




# Ejemplo: recomendar productos a U1 que pertenece al distrito D1
usuario = "C1"
#productos_comprados = ["P1", "P2", "P48"]
productos_comprados = ["P" + str(i) for i in train_transactions[train_transactions["id_commerce"] == 1].id_product.unique()]

distrito_usuario = "Providencia"

recomendaciones = recomendar_productos(usuario, productos_comprados, distrito_usuario)

# Mostrar las recomendaciones
recomendaciones


['Providencia', 'Penalolen', 'P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15', 'P16', 'P17', 'P18', 'P19', 'P20', 'P21', 'P22', 'P23', 'P24', 'P25', 'P26', 'P27', 'P28', 'P29', 'P30', 'P31', 'P32', 'P33', 'P34', 'P35', 'P36', 'P37', 'P38', 'P39', 'P40', 'P41', 'P42', 'P43', 'P44', 'P45', 'P46', 'P47', 'P48', 'P49', 'P50']


Unnamed: 0,Nodo,PageRank
140,P31,0.007796
131,P22,0.007765
121,P12,0.007689
153,P44,0.00758
132,P23,0.007466
138,P29,0.007396
133,P24,0.007385
135,P26,0.007383
152,P43,0.007276
110,P4,0.00701


In [164]:
train_transactions[train_transactions.id_commerce == 1].id_product.unique()

array([32, 33, 35, 13, 45, 11,  6, 36, 20,  8,  5,  1, 47,  9, 39,  7, 27,
       38, 50, 49, 42, 15, 46, 18, 17, 40, 37, 41, 16, 34,  3, 10, 30, 48,
        2, 19, 25, 21, 28, 14])

In [166]:
test_transactions[test_transactions.id_commerce == 1].id_product.unique()

array([43, 44, 23, 22,  4, 24, 12, 31, 26, 29])

In [157]:
pagerank_scores_train

{'C1': 0.002781725252583384,
 'Providencia': 0.002963426226898468,
 'C2': 0.002781725252583384,
 'Penalolen': 0.003023324502756195,
 'C3': 0.002781725252583384,
 'C4': 0.002781725252583384,
 'Nunoa': 0.0029858362851968474,
 'C5': 0.002781725252583384,
 'C6': 0.002781725252583384,
 'Macul': 0.0029214453691000317,
 'C7': 0.002781725252583384,
 'C8': 0.002781725252583384,
 'C9': 0.002781725252583384,
 'La Florida': 0.002962374025893638,
 'C10': 0.002781725252583384,
 'C11': 0.002781725252583384,
 'C12': 0.002781725252583384,
 'C13': 0.002781725252583384,
 'C14': 0.002781725252583384,
 'C15': 0.002781725252583384,
 'C16': 0.002781725252583384,
 'C17': 0.002781725252583384,
 'C18': 0.002781725252583384,
 'C19': 0.002781725252583384,
 'C20': 0.002781725252583384,
 'C21': 0.002781725252583384,
 'C22': 0.002781725252583384,
 'C23': 0.002781725252583384,
 'C24': 0.002781725252583384,
 'C25': 0.002781725252583384,
 'C26': 0.002781725252583384,
 'C27': 0.002781725252583384,
 'C28': 0.002781725252

In [158]:

# Función para obtener las top-k recomendaciones personalizadas por usuario
def obtener_top_k_recomendaciones(pagerank_scores, usuario, productos_vistos, k):
    # Ordenar por PageRank y obtener los productos que el usuario no ha visto
    sorted_scores = sorted(pagerank_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Filtrar solo los productos que el usuario no ha visto
    top_k_recomendaciones = [nodo for nodo, score in sorted_scores if nodo.startswith("P") and nodo not in productos_vistos][:k]
    
    return top_k_recomendaciones

# Función para obtener las top-k recomendaciones solo de productos que el usuario no haya comprado
def obtener_top_k_recomendaciones(pagerank_scores, usuario, productos_vistos, k):
    # Ordenar por PageRank y obtener los productos que el usuario no ha visto
    sorted_scores = sorted(pagerank_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Filtrar solo los productos que el usuario no ha visto (excluir los comprados en train)
    top_k_recomendaciones = [nodo for nodo, score in sorted_scores if nodo.startswith("P") and nodo not in productos_vistos][:k]
    
    return top_k_recomendaciones

# Función para obtener las top-k recomendaciones solo de productos que el usuario no haya comprado
def obtener_top_k_recomendaciones(pagerank_scores, usuario, train_df, k):
    # Obtener los productos que el usuario ya ha visto en el conjunto de train
    productos_vistos = set(train_df[train_df['id_commerce'] == usuario]['id_product'].tolist())
    
    # Filtrar las recomendaciones basadas en productos que el usuario no ha visto/comprado
    sorted_scores = sorted(pagerank_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Filtrar productos que el usuario no ha visto en el conjunto de train
    top_k_recomendaciones = [nodo for nodo, score in sorted_scores if nodo.startswith("P") and nodo not in productos_vistos][:k]
    
    return top_k_recomendaciones




In [159]:

vistos = ["P" + str(i) for i in train_transactions[train_transactions["id_commerce"] == 1].id_product.unique()]
top_k_recomendaciones = obtener_top_k_recomendaciones(pagerank_scores_train,1,train_transactions, 5)

In [160]:
top_k_recomendaciones 

['P32', 'P27', 'P2', 'P46', 'P35']

In [161]:
train_transactions[train_transactions["id_commerce"] == 1].id_product.unique()

array([32, 33, 35, 13, 45, 11,  6, 36, 20,  8,  5,  1, 47,  9, 39,  7, 27,
       38, 50, 49, 42, 15, 46, 18, 17, 40, 37, 41, 16, 34,  3, 10, 30, 48,
        2, 19, 25, 21, 28, 14])

In [162]:
test_transactions[test_transactions["id_commerce"] == 1].id_product.unique()

array([43, 44, 23, 22,  4, 24, 12, 31, 26, 29])