In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

In [None]:
# Seed NumPy's legacy global RNG once for the notebook.
# (The split-generation loop further down re-seeds it per split.)
GLOBAL_SEED = 42
np.random.seed(GLOBAL_SEED)

In [None]:
# Load the input CSV; the path is relative to the notebook's working directory.
RETAIL_CSV_PATH = "online_retail_II_clean.csv"
df = pd.read_csv(RETAIL_CSV_PATH)

In [None]:
# Collapse the line-level rows to one row per (customer, product).
#
# NetPrice is the money spent by the customer on the product. It has to be
# computed per input row and then summed: multiplying the summed quantity by
# the unweighted mean price misstates the total whenever the same product was
# bought at different prices or in different quantities
# (e.g. 1 @ 10 and 10 @ 5 is 60, not 11 * 7.5 = 82.5).
#
# NOTE: this cell must run on the line-level frame loaded from the CSV.
# Re-running it on its own output would recompute NetPrice from the already
# aggregated Quantity and Price columns.
line_revenue = df["Quantity"] * df["Price"]

df = (
    df.assign(NetPrice=line_revenue)
    .groupby(["Customer ID", "StockCode"])
    .agg({
        "Quantity": "sum",   # total units bought
        "Price": "mean",     # average listed unit price (kept for reference)
        "NetPrice": "sum",   # total spend, summed over the original rows
    })
    .reset_index()
)

In [None]:
# Build the customer x product matrix: one row per "Customer ID", one column
# per "StockCode", cells filled with NetPrice and 0 where the pair is missing.
pivot_spec = {
    "index": "Customer ID",
    "columns": "StockCode",
    "values": "NetPrice",
    "fill_value": 0,
}
df_matrix = df.pivot_table(**pivot_spec)

# Bare last expression so the notebook displays (rows, columns).
df_matrix.shape

In [None]:
# Keep only the rows (users) that have at least `min_purchases` non-zero
# entries in the customer x product matrix.
min_purchases = 20

full_matrix = df_matrix.values
nonzero_per_user = (full_matrix != 0).sum(axis=1)
valid_users = nonzero_per_user >= min_purchases

S = full_matrix[valid_users, :]

print(f"Matriz base: {S.shape[0]} usuarios, {S.shape[1]} productos")

In [None]:
# Split settings: how many random train/test splits to generate, and the
# fraction of each user's positive entries that the split loop holds out.
num_splits, mask_prop = 50, 0.5

# Root folder that receives one sub-directory per split.
os.makedirs("splits_random", exist_ok=True)

In [None]:
for split in range(1, num_splits + 1):
    # Re-seed the legacy global RNG with the split number so that every split
    # can be regenerated on its own.
    np.random.seed(split)

    # mask[u, p] is True where entry (u, p) of S is held out for the test set.
    mask = np.zeros_like(S, dtype=bool)

    for row, purchases in enumerate(S):
        bought = np.flatnonzero(purchases > 0)
        if bought.size == 0:
            # Nothing positive to hold out for this user.
            continue
        # Hold out a `mask_prop` share of the positive entries, at least one.
        n_holdout = max(1, int(mask_prop * bought.size))
        held_out = np.random.choice(bought, size=n_holdout, replace=False)
        mask[row, held_out] = True

    # Train keeps the unmasked entries; test keeps only the masked ones.
    S_train = S.copy()
    S_train[mask] = 0
    S_test = np.where(mask, S, 0)

    # Drop users left without any non-zero train entry, from both matrices so
    # that their rows stay aligned.
    # NOTE(review): this reuses the name `valid_users` of the base-matrix
    # filter defined in an earlier cell — confirm nothing later relies on it.
    valid_users = S_train.any(axis=1)
    S_train, S_test = S_train[valid_users, :], S_test[valid_users, :]

    # Normalize: every train entry becomes its share of the grand train total.
    S_train_norm = S_train / S_train.sum()

    # One directory per split, zero-padded to two digits.
    split_dir = f"splits_random/split_{split:02d}"
    os.makedirs(split_dir, exist_ok=True)

    # Persist the normalized train matrix and the raw test matrix, both as
    # header-only-index CSV and as .npy.
    train_matrix = pd.DataFrame(S_train_norm)
    test_matrix = pd.DataFrame(S_test)

    for filename, frame in (("train.csv", train_matrix), ("test.csv", test_matrix)):
        frame.to_csv(os.path.join(split_dir, filename), index=False)

    for filename, array in (("S_train.npy", S_train_norm), ("S_test.npy", S_test)):
        np.save(os.path.join(split_dir, filename), array)

    # Count of positive entries and their share of all cells, per matrix.
    # NOTE(review): these four values are not reported in the lines visible
    # here — presumably printed further down; confirm.
    num_transactions_train = (S_train > 0).sum()
    num_transactions_test = (S_test > 0).sum()
    density_train = num_transactions_train / S_train.size
    density_test = num_transactions_test / S_test.size

    print(f"Split {split:02d}:")
