In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.model_selection import train_test_split



In [2]:
commerces  = pd.read_csv("data/commerces.csv")
products = pd.read_csv("data/product.csv")
transactions = pd.read_csv("data/transactions.csv")

In [3]:
# Crear el grafo dirigido
G = nx.DiGraph()


In [4]:
commerces.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id_commerce  100 non-null    int64 
 1   district     100 non-null    object
dtypes: int64(1), object(1)
memory usage: 1.7+ KB


In [5]:
# Agregar las conexiones de los distritos
for _, row in commerces.iterrows():
    
    commerce = row['id_commerce']
    district = row['district']
    print(commerce, district)
    
    # Agregar los nodos con atributos
    G.add_node(commerce, tipo='commerce')
    G.add_node(district, tipo='district')
    
    # Agregar la arista con el peso
    #G.add_edge(commerce, district)    
    G.add_edge('C' + str(commerce), district)


1 Providencia
2 Penalolen
3 Penalolen
4 Nunoa
5 Nunoa
6 Macul
7 Macul
8 Providencia
9 La Florida
10 Macul
11 Macul
12 Providencia
13 Macul
14 Providencia
15 Penalolen
16 La Florida
17 Providencia
18 La Florida
19 Nunoa
20 La Florida
21 Macul
22 Nunoa
23 Providencia
24 La Florida
25 Nunoa
26 La Florida
27 Macul
28 La Florida
29 Nunoa
30 Penalolen
31 Nunoa
32 Macul
33 Penalolen
34 La Florida
35 Nunoa
36 Macul
37 Penalolen
38 Nunoa
39 La Florida
40 Nunoa
41 La Florida
42 Providencia
43 Penalolen
44 Nunoa
45 Penalolen
46 Nunoa
47 Nunoa
48 Penalolen
49 Penalolen
50 Providencia
51 Nunoa
52 Macul
53 La Florida
54 Penalolen
55 Providencia
56 Penalolen
57 Providencia
58 Providencia
59 Providencia
60 Providencia
61 Providencia
62 Macul
63 Penalolen
64 Nunoa
65 La Florida
66 Providencia
67 Penalolen
68 Penalolen
69 Providencia
70 La Florida
71 Providencia
72 Providencia
73 Penalolen
74 La Florida
75 La Florida
76 Macul
77 Penalolen
78 Penalolen
79 Penalolen
80 Penalolen
81 Nunoa
82 Penalolen
83 L

In [6]:
products.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id_product  50 non-null     int64 
 1   name        50 non-null     object
 2   category    50 non-null     object
 3   price       50 non-null     int64 
dtypes: int64(2), object(2)
memory usage: 1.7+ KB


In [7]:
# Agregar las conexiones de las categorías de productos
for _, row in products.iterrows():
    
    product = row['id_product']
    category = row['category']
    
    # Agregar los nodos con atributos
    G.add_node(product, tipo='product')
    G.add_node(category, tipo='category')
        
    
    G.add_edge("P" + str(product), category)

In [8]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   id_commerce  500000 non-null  int64
 1   id_product   500000 non-null  int64
 2   quantity     500000 non-null  int64
 3   price        500000 non-null  int64
dtypes: int64(4)
memory usage: 15.3 MB


In [9]:
# Crear un conjunto de train y test separando productos por usuario
def split_train_test(df, test_size=0.2):
    train_data = []
    test_data = []

    # Iterar por cada usuario
    for user, user_data in df.groupby('id_commerce'):
        # Dividir en train y test para los productos del usuario
        user_train, user_test = train_test_split(user_data, test_size=test_size, random_state=42)
        
        # Agregar los datos divididos
        train_data.append(user_train)
        test_data.append(user_test)

    # Combinar los datos de todos los usuarios
    train_df = pd.concat(train_data)
    test_df = pd.concat(test_data)

    return train_df, test_df


def split_data_per_client(transactions, test_size=0.4):
    train_data = pd.DataFrame()
    test_data = pd.DataFrame()

    # Agrupar transacciones por cliente (id_commerce)
    grouped = transactions.groupby('id_commerce')

    for commerce_id, group in grouped:
        # Obtener los productos comprados por el comercio
        products = group['id_product'].unique()

        # Usar train_test_split para dividir en 80% train y 20% test
        train_products, test_products = train_test_split(products, test_size=test_size)

        # Filtrar las transacciones de entrenamiento y prueba
        train_data = pd.concat([train_data, group[group['id_product'].isin(train_products)]])
        test_data = pd.concat([test_data, group[group['id_product'].isin(test_products)]])

    return train_data, test_data


# Dividir el conjunto de interacciones
train_transactions, test_transactions = split_data_per_client(transactions, test_size=0.2)

In [10]:
train_transactions.shape

(399979, 4)

In [11]:
test_transactions.shape

(100021, 4)

In [12]:
400574 + 99426

500000

In [13]:
# Agregar las interacciones con pesos (cantidad de compras)
for _, row in train_transactions.iterrows():    
    commerce = row['id_commerce']
    product = row['id_product']    
    G.add_edge("C" + str(commerce), "P"+ str(product), weight=row['quantity'])

In [14]:

# Aplicar el algoritmo de PageRank con pesos
pagerank_scores_train = nx.pagerank(G, weight='weight')

In [15]:
pagerank_scores_train

{1: 0.0021766020392817954,
 'Providencia': 0.002354158015034166,
 'C1': 0.0021766020392817954,
 2: 0.0021766020392817954,
 'Penalolen': 0.0024130037258625146,
 'C2': 0.0021766020392817954,
 3: 0.0021766020392817954,
 'C3': 0.0021766020392817954,
 4: 0.0021766020392817954,
 'Nunoa': 0.002372634862955303,
 'C4': 0.0021766020392817954,
 5: 0.0021766020392817954,
 'C5': 0.0021766020392817954,
 6: 0.0021766020392817954,
 'Macul': 0.002315955412729135,
 'C6': 0.0021766020392817954,
 7: 0.0021766020392817954,
 'C7': 0.0021766020392817954,
 8: 0.0021766020392817954,
 'C8': 0.0021766020392817954,
 9: 0.0021766020392817954,
 'La Florida': 0.0023532466904813082,
 'C9': 0.0021766020392817954,
 10: 0.0021766020392817954,
 'C10': 0.0021766020392817954,
 11: 0.0021766020392817954,
 'C11': 0.0021766020392817954,
 12: 0.0021766020392817954,
 'C12': 0.0021766020392817954,
 13: 0.0021766020392817954,
 'C13': 0.0021766020392817954,
 14: 0.0021766020392817954,
 'C14': 0.0021766020392817954,
 15: 0.00217660

In [16]:
# Convertir los resultados a un DataFrame
df_pagerank_train = pd.DataFrame(list(pagerank_scores_train.items()), columns=["Nodo", "PageRank"])
df_pagerank_train.sort_values(by="PageRank", ascending=False, inplace=True)

# Mostrar los resultados de PageRank
df_pagerank_train


# Convertir los resultados a un DataFrame
#df = pd.DataFrame(list(pagerank_scores.items()), columns=["Nodo", "PageRank"])
#df.sort_values(by="PageRank", ascending=False, inplace=True)


Unnamed: 0,Nodo,PageRank
212,Ropa,0.067572
207,Electrónica,0.051722
205,Hogar,0.051269
210,Alimentos,0.047819
215,Juguetes,0.041548
...,...,...
94,C45,0.002177
95,46,0.002177
96,C46,0.002177
97,47,0.002177


In [17]:
#nodo = "Providencia"
#predecesor_a_excluir = "C1"


def rec(u= None, d = None):

    #todos los nodos asociados al distrito
    print(u, d)
    predecesores_filtrados = [n for n in G.predecessors(d) if n != u]

    prods = set()

    #visitar cada uno de los comercios del mismo district
    for comercio in predecesores_filtrados:
        for producto in G.neighbors(comercio):
            if G.nodes[producto].get("tipo") != 'district':
                #print(producto)
                if u not in list(G.predecessors(producto)):
                    prods.add(producto)

    recomendaciones = df_pagerank_train[df_pagerank_train['Nodo'].isin(prods)].sort_values(by="PageRank", ascending=False)
    return recomendaciones    

In [18]:
a = rec("C1","Providencia")
a 

C1 Providencia


Unnamed: 0,Nodo,PageRank
208,P2,0.00628
240,P31,0.006213
221,P12,0.006129
241,P32,0.006116
245,P36,0.005939
225,P16,0.005848
220,P11,0.005808
223,P14,0.00578
256,P47,0.005577
254,P45,0.005487


In [19]:
b = rec("C8","Providencia")
b

C8 Providencia


Unnamed: 0,Nodo,PageRank
240,P31,0.006213
249,P40,0.00608
246,P37,0.006044
218,P9,0.006042
259,P50,0.005992
217,P8,0.005931
225,P16,0.005848
253,P44,0.005684
222,P13,0.005636
254,P45,0.005487


In [20]:
#!!!! IMPORTANTE



# Suponiendo que ya tienes los predecesores filtrados
#nodo = "Nunoa"
#predecesor_a_excluir = "C22"

nodo = "Providencia"
predecesor_a_excluir = "C8" 

predecesores_filtrados = [n for n in G.predecessors(nodo) if n != predecesor_a_excluir]
#predecesores_filtrados

prods = set()

for n in predecesores_filtrados:
    for vecino in G.neighbors(n):
        if G.nodes[vecino].get("tipo") != 'district':
            #print(vecino)
            if predecesor_a_excluir not in list(G.predecessors(vecino)):
                prods.add(vecino)
prods


recomendaciones = df_pagerank_train[df_pagerank_train['Nodo'].isin(prods)].sort_values(by="PageRank", ascending=False)
recomendaciones    

Unnamed: 0,Nodo,PageRank
240,P31,0.006213
249,P40,0.00608
246,P37,0.006044
218,P9,0.006042
259,P50,0.005992
217,P8,0.005931
225,P16,0.005848
253,P44,0.005684
222,P13,0.005636
254,P45,0.005487


In [21]:
train_transactions[train_transactions.id_commerce == 8].id_product.unique()

array([39, 23, 25, 35, 46, 15, 47, 42,  7,  5, 14, 36, 33, 27, 21,  4, 17,
       26, 18,  2, 24, 43,  3,  1, 12, 20,  6, 30, 34, 10, 28, 11, 38, 32,
       22, 19, 49, 29, 41, 48])

In [22]:
test_transactions[test_transactions.id_commerce == 8].id_product.unique()

array([31,  9,  8, 50, 40, 45, 13, 37, 44, 16])

In [23]:
train_transactions[train_transactions.id_commerce == 1].id_product.unique()

array([33, 35, 13, 43,  6, 44, 20,  8, 23,  5,  1,  9, 22,  4, 39,  7, 27,
       38, 50, 24, 49, 42, 15, 46, 18, 17, 40, 37, 41, 34,  3, 10, 26, 30,
       48, 29, 19, 25, 21, 28])

In [24]:

test_transactions[test_transactions.id_commerce == 1].id_product.unique()


array([32, 45, 11, 36, 47, 16, 12, 31,  2, 14])

In [25]:
pagerank_scores_train

{1: 0.0021766020392817954,
 'Providencia': 0.002354158015034166,
 'C1': 0.0021766020392817954,
 2: 0.0021766020392817954,
 'Penalolen': 0.0024130037258625146,
 'C2': 0.0021766020392817954,
 3: 0.0021766020392817954,
 'C3': 0.0021766020392817954,
 4: 0.0021766020392817954,
 'Nunoa': 0.002372634862955303,
 'C4': 0.0021766020392817954,
 5: 0.0021766020392817954,
 'C5': 0.0021766020392817954,
 6: 0.0021766020392817954,
 'Macul': 0.002315955412729135,
 'C6': 0.0021766020392817954,
 7: 0.0021766020392817954,
 'C7': 0.0021766020392817954,
 8: 0.0021766020392817954,
 'C8': 0.0021766020392817954,
 9: 0.0021766020392817954,
 'La Florida': 0.0023532466904813082,
 'C9': 0.0021766020392817954,
 10: 0.0021766020392817954,
 'C10': 0.0021766020392817954,
 11: 0.0021766020392817954,
 'C11': 0.0021766020392817954,
 12: 0.0021766020392817954,
 'C12': 0.0021766020392817954,
 13: 0.0021766020392817954,
 'C13': 0.0021766020392817954,
 14: 0.0021766020392817954,
 'C14': 0.0021766020392817954,
 15: 0.00217660

In [26]:

vistos = ["P" + str(i) for i in train_transactions[train_transactions["id_commerce"] == 1].id_product.unique()]
top_k_recomendaciones = obtener_top_k_recomendaciones(pagerank_scores_train,1,train_transactions, 5)

NameError: name 'obtener_top_k_recomendaciones' is not defined

In [None]:
top_k_recomendaciones 

In [None]:
train_transactions[train_transactions["id_commerce"] == 1].id_product.unique()

In [None]:
test_transactions[test_transactions["id_commerce"] == 1].id_product.unique()

In [None]:
x = 10  # Variable global

def mi_funcion():
    x = 5  # Esta es una variable local que "somete" a la global dentro de la función
    print(f"Valor de x dentro de la función: {x}")

mi_funcion()
print(f"Valor de x fuera de la función: {x}")
