In [59]:
import pandas as pd
from itertools import combinations

# Especifica la ruta del archivo CSV
path = 'datathon/dataset/outfit_preprocessed.csv'

df = pd.read_csv(path)

df.head(10)

Unnamed: 0,cod_outfit,cod_modelo_color
0,1,51000622-02
1,1,43067759-01
2,1,53060518-02
3,1,53030594-08
4,1,43077762-01
5,1,43063724-OR
6,1,43075794-OR
7,2086,51000622-02
8,2086,53003778-OR
9,2086,53010582-TM


## Obtenim totes les possibles arestes del graf

In [60]:
# Generar todas las combinaciones posibles de productos (pares)
all_products = sorted(set(df['cod_modelo_color']))
all_pairs = list(combinations(all_products, 2))

all_pairs_df = pd.DataFrame(all_pairs, columns=['producto1', 'producto2'])
all_pairs_df['weight'] = 0

# Agrupar por outfit y encontrar todos los pares de productos dentro de cada outfit
pairs = df.groupby('cod_outfit')['cod_modelo_color'].apply(lambda x: list(combinations(sorted(x), 2)))
flat_pairs = [pair for sublist in pairs for pair in sublist]
pair_df = pd.DataFrame(flat_pairs, columns=['producto1', 'producto2'])

# Contar la frecuencia de cada par
edges_with_weights = pair_df.groupby(['producto1', 'producto2']).size().reset_index(name='weight')

# Todas las combinaciones posibles de pares de productos
final_edges = all_pairs_df.merge(edges_with_weights, on=['producto1', 'producto2'], how='left')
final_edges['weight'] = final_edges['weight_y'].fillna(0) + final_edges['weight_x']
final_edges.drop(['weight_x', 'weight_y'], axis=1, inplace=True)

final_edges

Unnamed: 0,producto1,producto2,weight
0,37000577-10,37000577-30,0.0
1,37000577-10,37010681-99,0.0
2,37000577-10,37010684-99,0.0
3,37000577-10,37010684-CU,0.0
4,37000577-10,37010741-37,0.0
...,...,...,...
39984148,67091003-99,87040069-OR,0.0
39984149,67091003-99,87062013-OR,0.0
39984150,67106705-99,87040069-OR,0.0
39984151,67106705-99,87062013-OR,0.0


In [61]:
import itertools

# Agrupa por pares únicos de productos y cuenta las ocurrencias
conteo_parejas = df.groupby('cod_outfit')['cod_modelo_color'].apply(lambda x: list(itertools.combinations(x, 2))).explode().value_counts().reset_index(name='cantidad')

# Muestra el nuevo DataFrame con las parejas de productos y la cantidad de veces que salen juntos en outfits
print(conteo_parejas)

                            index  cantidad
0      (53091158-OR, 53080809-OR)        81
1      (53090511-CU, 53030691-OR)        74
2      (53090511-CU, 53080809-OR)        74
3      (57023725-09, 57044409-TO)        61
4      (57023725-09, 57010282-01)        57
...                           ...       ...
70254  (57066023-01, 57000342-TC)         1
70255  (57000265-12, 57000345-12)         1
70256  (57000345-12, 57040377-70)         1
70257  (47005943-01, 47005857-OR)         1
70258  (57035922-99, 57995935-52)         1

[70259 rows x 2 columns]


In [62]:
parejas = set(conteo_parejas["index"])

Obtenim Parelles Negatives (weight = 0)

In [63]:
import random

data_0 = []

for _ in range(20000):
    n = random.randint(0, 39000000)
    obs = final_edges.iloc[n]
    pair = (obs["producto1"], obs["producto2"])

    while float(obs["weight"]) != 0.0 or pair in parejas:
        n = random.randint(0, 39000000)
        obs = final_edges.iloc[n]
        pair = (obs["producto1"], obs["producto2"])

    data_0.append(str(pair))

In [65]:
d_0 = pd.DataFrame(data_0, columns=["index"])

d_0.to_csv("d_0.csv", index=False)

Unnamed: 0,index
0,"('53083770-99', '57089403-99')"
1,"('43095781-07', '57040105-06')"
2,"('57058269-PL', '57065922-99')"
3,"('47087884-05', '53055744-99')"
4,"('57036303-99', '67040592-08')"
...,...
19995,"('53090538-07', '57086019-TN')"
19996,"('53063709-56', '57085148-92')"
19997,"('47025869-30', '57086713-85')"
19998,"('51020888-40', '51093029-99')"


In [9]:
final_edges

Unnamed: 0,producto1,producto2,weight
0,37000577-10,37000577-30,0.0
1,37000577-10,37010681-99,0.0
2,37000577-10,37010684-99,0.0
3,37000577-10,37010684-CU,0.0
4,37000577-10,37010741-37,0.0
...,...,...,...
39984148,67091003-99,87040069-OR,0.0
39984149,67091003-99,87062013-OR,0.0
39984150,67106705-99,87040069-OR,0.0
39984151,67106705-99,87062013-OR,0.0
