In [None]:
import os
import random
from math import ceil

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA, PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm

from utils.data import (
    extract_embedding, get_interactions_dataframe,
    mark_evaluation_rows, get_holdout,
)
from utils.hashing import pre_hash, HashesContainer
from utils.sampling import StrategyHandler
from utils.similarity import HybridScorer, VisualSimilarityHandler


# Triplet sampling

In [None]:
# Dataset selection.
# Recommended settings for each supported dataset:
#   - UGallery:  no max profile size; group interactions allowed.
#   - Wikimedia: max profile size of 10; group interactions allowed.
#   - Pinterest: max profile size of 10; group interactions disabled; use cache and big mode.
#   - Tradesy:   max profile size of 10; group interactions disabled; use cache and big mode.
DATASET = "Wikimedia"
# Fail fast on an unsupported (or misspelled) dataset name
assert DATASET in ("UGallery", "Wikimedia", "Pinterest", "Tradesy")


In [None]:
# Mode
# Use 'MODE_PROFILE = True' for CuratorNet-like training
# Use 'MODE_PROFILE = False' for VBPR-like training
MODE_PROFILE = True
# Human-readable name of the mode; it prefixes the train/validation/evaluation
# output file names.
MODE_PROFILE_VERBOSE = "profile" if MODE_PROFILE else "user"
# Use 'MODE_BIG = True' for big embeddings
# Use 'MODE_BIG = False' for regular-size embeddings
MODE_BIG = False

In [None]:
# Feature extractor
# This name is embedded in the embedding file name
# ("<dataset>_embedding-<FEATURE_EXTRACTOR>.npy"), so it selects which
# embedding file gets loaded.
FEATURE_EXTRACTOR = "resnet50"

In [None]:
# Paths (general): every input and output file lives under data/<DATASET>/
_DATASET_DIR = os.path.join("data", DATASET)
_DATASET_SLUG = DATASET.lower()
EMBEDDING_PATH = os.path.join(_DATASET_DIR, f"{_DATASET_SLUG}_embedding-{FEATURE_EXTRACTOR}.npy")
INTERACTIONS_PATH = os.path.join(_DATASET_DIR, f"{_DATASET_SLUG}.csv")
OUTPUT_TRAIN_PATH = os.path.join(_DATASET_DIR, f"{MODE_PROFILE_VERBOSE}-train.csv")
OUTPUT_VALID_PATH = os.path.join(_DATASET_DIR, f"{MODE_PROFILE_VERBOSE}-validation.csv")
OUTPUT_EVAL_PATH = os.path.join(_DATASET_DIR, f"{MODE_PROFILE_VERBOSE}-evaluation.csv")

# Caching: when USE_CACHE is True the PCA output and the cluster labels are
# read from these files instead of being recomputed (the files must exist).
USE_CACHE = True
CACHE_PCA_PATH = os.path.join(_DATASET_DIR, f"{_DATASET_SLUG}_pca.npy")
CACHE_LABELS_PATH = os.path.join(_DATASET_DIR, f"{_DATASET_SLUG}_labels.npy")

# General constants
RNG_SEED = 0  # None leaves the global generators unseeded

# Visual cluster constants
CLUSTERING_RNG = None  # random_state handed to the k-means estimators
CLUSTERING_N_CLUSTERS = 100
CLUSTERING_N_INIT = 8
CLUSTERING_N_TIMES = 20  # number of k-means restarts to compare
PCA_COMPONENTS = 200
# Silhouette is computed on a subsample only in big mode
if MODE_BIG:
    SILHOUETTE_SAMPLE_SIZE = 40_000
else:
    SILHOUETTE_SAMPLE_SIZE = None

# Sampling constants
EVAL_ROWS_PER_USER = 10
GROUP_USER_INTERACTIONS_BY_TIMESTAMP = True
# NOTE(review): the per-dataset notes next to DATASET list a max profile size
# of 10 for Wikimedia/Pinterest/Tradesy; None only matches UGallery -- confirm.
MAX_PROFILE_SIZE = None
TOTAL_SAMPLES_TRAIN = 10_000_000
TOTAL_SAMPLES_VALID = 500_000

# Sampling constants (original)
ARTIST_BOOST = 0.2
CONFIDENCE_MARGIN = 0.18
# Share of samples assigned to the "fake" strategies; both modes currently use 0
if MODE_PROFILE is True:
    FAKE_COEF = 0.
else:
    FAKE_COEF = 0.
FINE_GRAINED_THRESHOLD = 0.7
# All of these coefficients must lie within [0, 1]
for _coef in (ARTIST_BOOST, CONFIDENCE_MARGIN, FINE_GRAINED_THRESHOLD, FAKE_COEF):
    assert 0. <= _coef <= 1.


In [None]:
# Seed both the stdlib and the NumPy global generators when a seed is configured
if RNG_SEED is not None:
    print(f"\nUsing random seed... ({RNG_SEED})")
    for _seed_fn in (random.seed, np.random.seed):
        _seed_fn(RNG_SEED)


In [None]:
# Read the serialized embedding from disk
print(f"\nLoading embedding from file... ({EMBEDDING_PATH})")
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can
# run arbitrary code -- only load embedding files from a trusted source.
raw_embedding = np.load(EMBEDDING_PATH, allow_pickle=True)

# Split the embedding into the feature matrix and the item id -> index mapping
# (the third returned value is not needed here)
print("\nExtracting data into variables...")
features, item_id2index, _ = extract_embedding(raw_embedding, verbose=True)
print(f">> Features shape: {features.shape}")
# The raw container is no longer needed; drop it to release memory
del raw_embedding


In [None]:
# Visual clusters, steps 1 & 2: z-score + PCA, or reuse the cached PCA output.
# Note that `features` is overwritten with the reduced representation.
if not USE_CACHE:
    print("\nCreating visual clusters (step 1): z-score normalization of embedding")
    scaler = StandardScaler()
    if MODE_BIG:
        features = scaler.partial_fit(features).transform(features)
    else:
        features = scaler.fit_transform(features)
    print(f">> Features shape: {features.shape}")

    print("\nCreating visual clusters (step 2): Conduct PCA to reduce dimension")
    # Big mode uses the incremental PCA variant
    pca_class = IncrementalPCA if MODE_BIG else PCA
    features = pca_class(n_components=PCA_COMPONENTS).fit_transform(features)
    # Persist the reduced features so later runs can set USE_CACHE = True
    np.save(CACHE_PCA_PATH, features, allow_pickle=True)
else:
    print(f"\nCreating visual clusters (steps 1 & 2): Loading cached PCA ({CACHE_PCA_PATH})")
    features = np.load(CACHE_PCA_PATH, allow_pickle=True)
print(f">> Features shape: {features.shape}")

# Visual clusters, step 3: k-means labels, or reuse the cached labels
if not USE_CACHE:
    print("\nCreating visual clusters (step 3): Perform k-means clustering")
    # Pick the estimator and its arguments once; big mode uses mini-batches
    kmeans_class = MiniBatchKMeans if MODE_BIG else KMeans
    kmeans_kwargs = dict(
        n_clusters=CLUSTERING_N_CLUSTERS,
        max_iter=2000,
        n_init=CLUSTERING_N_INIT,
        random_state=CLUSTERING_RNG,
    )
    if MODE_BIG:
        kmeans_kwargs["batch_size"] = 2000

    # Keep the restart with the highest silhouette score
    best_score = float("-inf")
    best_clusterer = None
    for run in range(CLUSTERING_N_TIMES):
        candidate = kmeans_class(**kmeans_kwargs).fit(features)
        candidate_score = silhouette_score(
            features, candidate.labels_, sample_size=SILHOUETTE_SAMPLE_SIZE,
        )
        if candidate_score > best_score:
            best_clusterer, best_score = candidate, candidate_score
        # A fixed random_state stops after the first run (presumably because
        # further restarts would repeat the same clustering)
        if CLUSTERING_RNG is not None:
            break
        print(f">> Silhouette score ({run + 1:02}/{CLUSTERING_N_TIMES:02}): {candidate_score:.6f}")
    clusterer_labels = best_clusterer.labels_
    del best_clusterer  # Release some memory
    np.save(CACHE_LABELS_PATH, clusterer_labels, allow_pickle=True)
else:
    print(f"\nCreating visual clusters (step 3): Loading cached labels ({CACHE_LABELS_PATH})")
    clusterer_labels = np.load(CACHE_LABELS_PATH, allow_pickle=True)
    # The score of the cached labels is recomputed for the log line below
    best_score = silhouette_score(features, clusterer_labels, sample_size=SILHOUETTE_SAMPLE_SIZE)
print(f">> Best Silhouette score: {best_score}")


In [None]:
# Load interactions CSVs
print("\nLoading interactions from files...")
interactions_df = get_interactions_dataframe(
    INTERACTIONS_PATH,
    display_stats=True,
)

# Apply 'item_id2index', to work with indexes only.
# Item ids are compared as strings; rows whose item is absent from the
# embedding mapping are dropped and counted.
print("\nApply 'item_id2index' mapping for items...")
interactions_df["item_id"] = interactions_df["item_id"].map(str)
has_known_item = interactions_df["item_id"].isin(item_id2index)
n_missing_ids = interactions_df.loc[~has_known_item, "item_id"].count()
# Filter and remap in one chained step so the column is never assigned on a
# filtered slice of the original frame.
interactions_df = interactions_df[has_known_item].assign(
    item_id=lambda df: df["item_id"].map(item_id2index)
)
print(f">> Mapping applied, ({n_missing_ids} values not in 'item_id2index')")

# Store mapping from user_id to index (0-index, no skipping).
# NOTE(review): np.argsort returns a permutation of 0..n-1, so the indexes are
# contiguous but are not the sorted rank of each user id. It is left unchanged
# so that generated user indexes stay the same -- confirm this is intended.
print("\nCreate 'user_id2index' mapping for users...")
unique_user_ids = interactions_df["user_id"].unique()
new_user_ids = np.argsort(unique_user_ids)
user_id2index = dict(zip(unique_user_ids, new_user_ids))

# Apply 'user_id2index', to work with indexes only
print("\nApply 'user_id2index' mapping for users...")
has_known_user = interactions_df["user_id"].isin(user_id2index)
n_missing_ids = interactions_df.loc[~has_known_user, "user_id"].count()
interactions_df = interactions_df[has_known_user].assign(
    user_id=lambda df: df["user_id"].map(user_id2index)
)
print(f">> Mapping applied, ({n_missing_ids} values not in 'user_id2index')")

# Create helper mapping from idx to data.
# Items that never appear in the interactions keep the artist id -1.
print("\nCreating mappings from index to data")
artist_by_idx = np.full((features.shape[0],), -1)
# Select the column first so only the item -> artist dict is built
item_to_artist = interactions_df.set_index("item_id")["artist_id"].to_dict()
for item_id, artist_id in item_to_artist.items():
    artist_by_idx[item_id] = artist_id
cluster_by_idx = clusterer_labels

# Create helper mapping from data to idxs
print("\nCreating mappings from data to index")
artistId2artworkIndexes = interactions_df.groupby("artist_id")["item_id"].apply(list).to_dict()
clustId2artIndexes = dict()
for i, cluster in enumerate(cluster_by_idx):
    clustId2artIndexes.setdefault(cluster, []).append(i)

# Form interactions baskets, grouping by timestamp and user_id
if GROUP_USER_INTERACTIONS_BY_TIMESTAMP:
    print("\nForm interactions groups (baskets), by timestamp and user_id...")
    # After grouping, 'item_id' holds the list of items of each basket
    interactions_df = interactions_df.groupby(["timestamp", "user_id"])["item_id"].apply(list)
    interactions_df = interactions_df.reset_index()
    interactions_df = interactions_df.sort_values("timestamp")
    interactions_df = interactions_df.reset_index(drop=True)
else:
    print("\nInteractions groups (baskets), by timestamp and user_id, skipped")

# Mark interactions used for evaluation procedure
print("\nApply evaluation split...")
interactions_df = mark_evaluation_rows(interactions_df, threshold=EVAL_ROWS_PER_USER)
# Check if new column exists and has boolean dtype
assert interactions_df["evaluation"].dtype.name == "bool"
print(f">> Interactions: {interactions_df.shape}")

# Split interactions data according to evaluation column
evaluation_df, interactions_df = get_holdout(interactions_df)
assert not interactions_df.empty
assert not evaluation_df.empty
print(f">> Evaluation: {evaluation_df.shape} | Interactions: {interactions_df.shape}")


In [None]:
print("\nCreating helpers instances...")
# Container used to detect duplicated samples through their hashes
hashes_container = HashesContainer()
# Score helpers: visual similarity built from the cluster labels and features,
# and a hybrid scorer that also receives the artist of every item
vissimhandler = VisualSimilarityHandler(clusterer_labels, features)
hybrid_scorer = HybridScorer(vissimhandler, artist_by_idx, artist_boost=ARTIST_BOOST)

# Sampling budget: the total is split between "real" and "fake" strategies
# according to FAKE_COEF, then evenly among the strategies of each kind
print("\nCalculating important values...")
N_REAL_STRATEGIES = 2
N_FAKE_STRATEGIES = 2
print(f">> There are {N_REAL_STRATEGIES} real strategies and {N_FAKE_STRATEGIES} fake strategies")
_REAL_COEF = 1 - FAKE_COEF
N_SAMPLES_PER_REAL_STRAT_TRAIN = ceil(_REAL_COEF * TOTAL_SAMPLES_TRAIN / N_REAL_STRATEGIES)
N_SAMPLES_PER_REAL_STRAT_VALID = ceil(_REAL_COEF * TOTAL_SAMPLES_VALID / N_REAL_STRATEGIES)
N_SAMPLES_PER_FAKE_STRAT_TRAIN = ceil(FAKE_COEF * TOTAL_SAMPLES_TRAIN / N_FAKE_STRATEGIES)
N_SAMPLES_PER_FAKE_STRAT_VALID = ceil(FAKE_COEF * TOTAL_SAMPLES_VALID / N_FAKE_STRATEGIES)
N_USERS = interactions_df["user_id"].nunique()
N_ITEMS = len(features)
print(f">> N_USERS = {N_USERS} | N_ITEMS = {N_ITEMS}")

# Handler that exposes the sampling strategies used by the following cells
print("\nCreating samples using custom strategies...")
_strategy_options = dict(
    threshold=FINE_GRAINED_THRESHOLD,
    confidence_margin=CONFIDENCE_MARGIN,
    user_as_items=MODE_PROFILE,
    max_profile_size=MAX_PROFILE_SIZE,
)
strategy_handler = StrategyHandler(
    interactions_df,
    vissimhandler, hybrid_scorer,
    clustId2artIndexes, cluster_by_idx,
    artistId2artworkIndexes, artist_by_idx,
    **_strategy_options,
)


In [None]:
print(">> Strategy #1: Given real profile, recommend profile")
# Per-user quota for each split: the per-strategy target spread over the
# users, rounded up
train_quota_per_user = ceil(N_SAMPLES_PER_REAL_STRAT_TRAIN / N_USERS)
valid_quota_per_user = ceil(N_SAMPLES_PER_REAL_STRAT_VALID / N_USERS)

# Training split
samples_train_1 = strategy_handler.strategy_1(train_quota_per_user, hashes_container)
assert len(samples_train_1) >= N_SAMPLES_PER_REAL_STRAT_TRAIN

# Validation split (the same hashes container is passed again)
samples_valid_1 = strategy_handler.strategy_1(valid_quota_per_user, hashes_container)
assert len(samples_valid_1) >= N_SAMPLES_PER_REAL_STRAT_VALID

print(f">> Strategy #1 Training samples ({len(samples_train_1)}) and validation samples ({len(samples_valid_1)})")


In [None]:
print(">> Strategy #2: Given fake profile, recommend profile")
# Per-item quota for each split: the per-strategy target spread over the
# items, rounded up (0 when FAKE_COEF is 0)
train_quota_per_item = ceil(N_SAMPLES_PER_FAKE_STRAT_TRAIN / N_ITEMS)
valid_quota_per_item = ceil(N_SAMPLES_PER_FAKE_STRAT_VALID / N_ITEMS)

# Training split
samples_train_2 = strategy_handler.strategy_2(train_quota_per_item, hashes_container)
assert len(samples_train_2) >= N_SAMPLES_PER_FAKE_STRAT_TRAIN

# Validation split (the same hashes container is passed again)
samples_valid_2 = strategy_handler.strategy_2(valid_quota_per_item, hashes_container)
assert len(samples_valid_2) >= N_SAMPLES_PER_FAKE_STRAT_VALID

print(f">> Strategy #2: Training samples ({len(samples_train_2)}) and validation samples ({len(samples_valid_2)})")


In [None]:
print(">> Strategy #3: Given real profile, recommend items according to hybrid scorer")
# Per-user quota for each split: the per-strategy target spread over the
# users, rounded up
train_quota_per_user = ceil(N_SAMPLES_PER_REAL_STRAT_TRAIN / N_USERS)
valid_quota_per_user = ceil(N_SAMPLES_PER_REAL_STRAT_VALID / N_USERS)

# Training split
samples_train_3 = strategy_handler.strategy_3(train_quota_per_user, hashes_container)
assert len(samples_train_3) >= N_SAMPLES_PER_REAL_STRAT_TRAIN

# Validation split (the same hashes container is passed again)
samples_valid_3 = strategy_handler.strategy_3(valid_quota_per_user, hashes_container)
assert len(samples_valid_3) >= N_SAMPLES_PER_REAL_STRAT_VALID

print(f">> Strategy #3: Training samples ({len(samples_train_3)}) and validation samples ({len(samples_valid_3)})")


In [None]:
print(">> Strategy #4: Given fake profile, recommend items according to hybrid scorer")
# Per-item quota for each split: the per-strategy target spread over the
# items, rounded up (0 when FAKE_COEF is 0)
train_quota_per_item = ceil(N_SAMPLES_PER_FAKE_STRAT_TRAIN / N_ITEMS)
valid_quota_per_item = ceil(N_SAMPLES_PER_FAKE_STRAT_VALID / N_ITEMS)

# Training split
samples_train_4 = strategy_handler.strategy_4(train_quota_per_item, hashes_container)
assert len(samples_train_4) >= N_SAMPLES_PER_FAKE_STRAT_TRAIN

# Validation split (the same hashes container is passed again)
samples_valid_4 = strategy_handler.strategy_4(valid_quota_per_item, hashes_container)
assert len(samples_valid_4) >= N_SAMPLES_PER_FAKE_STRAT_VALID

print(f">> Strategy #4: Training samples ({len(samples_train_4)}) and validation samples ({len(samples_valid_4)})")


In [None]:
# Report the collision counters kept by the hashes container and by the
# visual similarity handler
print("\nLog detected collisions...")
collision_totals = (
    ("hash", hashes_container.collisions),
    ("visual", vissimhandler.count),
)
for collision_kind, collision_total in collision_totals:
    print(f">> Total {collision_kind} collisions: {collision_total}")


In [None]:
# Flatten the per-strategy sample lists into one list per split
print("\nMerging strategies samples into a single list")

# Training split: show size and first sample of each strategy, then concatenate
train_by_strategy = (samples_train_1, samples_train_2, samples_train_3, samples_train_4)
for number, strategy_samples in enumerate(train_by_strategy, start=1):
    first_sample = strategy_samples[0] if strategy_samples else None
    print(f">> Strategy {number}: Size: {len(strategy_samples):8d} | Sample: {first_sample}")
TRAINING_DATA = []
for strategy_samples in train_by_strategy:
    TRAINING_DATA.extend(strategy_samples)
print(f">> Training samples: {len(TRAINING_DATA)}")

# Validation split: same procedure
valid_by_strategy = (samples_valid_1, samples_valid_2, samples_valid_3, samples_valid_4)
for number, strategy_samples in enumerate(valid_by_strategy, start=1):
    first_sample = strategy_samples[0] if strategy_samples else None
    print(f">> Strategy {number}: Size: {len(strategy_samples):8d} | Sample: {first_sample}")
VALIDATION_DATA = []
for strategy_samples in valid_by_strategy:
    VALIDATION_DATA.extend(strategy_samples)
print(f">> Validation samples: {len(VALIDATION_DATA)}")


In [None]:
# Sanity-check every generated sample (training and validation together):
# no duplicated hashes, item indexes in range, distinct and visually different
# positive/negative items and, for samples tied to a user, a profile made of
# that user's items with the positive item scoring above the negative one.
print("\nNaive triples validation and looking for duplicates...")
validation_hash_check = HashesContainer()
all_samples = [
    triple
    for subset in (TRAINING_DATA, VALIDATION_DATA)
    for triple in subset
]
user_ids = interactions_df["user_id"].unique()
# Set membership is O(1); testing against the NumPy array would scan it once
# per sample.
known_user_ids = set(user_ids)
# Lazily filled cache: user index -> set of the items the user interacted with
user_data = dict()
for triple in tqdm(all_samples, desc="Naive validation"):
    profile, pi, ni, ui = triple
    # enroll() is asserted to be truthy for every sample; the hash key is the
    # profile in profile mode and the user index otherwise
    if MODE_PROFILE:
        assert validation_hash_check.enroll(pre_hash((profile, pi, ni)))
    else:
        assert validation_hash_check.enroll(pre_hash((ui, pi, ni), contains_iter=False))
    assert 0 <= pi < N_ITEMS
    assert 0 <= ni < N_ITEMS
    assert pi != ni
    assert not vissimhandler.same(pi, ni)
    # ui == -1 marks a sample that is not tied to a user; the user-level checks
    # below are skipped for it
    if ui == -1:
        continue
    assert ui in known_user_ids
    if ui not in user_data:
        user = interactions_df[interactions_df["user_id"] == ui]
        # hstack flattens the per-row item lists (baskets) into one array
        user_data[ui] = set(np.hstack(user["item_id"].values))
    user_artworks = user_data[ui]
    assert all(i in user_artworks for i in profile)
    spi = hybrid_scorer.get_score(ui, user_artworks, pi)
    sni = hybrid_scorer.get_score(ui, user_artworks, ni)
    assert spi > sni
print(">> No duped hashes found")


In [None]:
def _join_ids(ids):
    """Serialize an iterable of ids as one space-separated string."""
    return " ".join(map(str, ids))


print("\nCreating output files (train and valid)...")
_SAMPLE_COLUMNS = ["profile", "pi", "ni", "ui"]

# Training samples: the profile list is stored as a space-separated string
df_train = pd.DataFrame(TRAINING_DATA, columns=_SAMPLE_COLUMNS)
df_train["profile"] = df_train["profile"].map(_join_ids)
print(f">> Saving training samples ({OUTPUT_TRAIN_PATH})")
df_train.to_csv(OUTPUT_TRAIN_PATH, index=False)

# Validation samples: same layout as the training file
df_validation = pd.DataFrame(VALIDATION_DATA, columns=_SAMPLE_COLUMNS)
df_validation["profile"] = df_validation["profile"].map(_join_ids)
print(f">> Saving validation samples ({OUTPUT_VALID_PATH})")
df_validation.to_csv(OUTPUT_VALID_PATH, index=False)

# Evaluation data: work on a copy so 'evaluation_df' itself is left untouched
df_evaluation = evaluation_df.copy()
# 'predict' is only joined when interactions were grouped into baskets
if GROUP_USER_INTERACTIONS_BY_TIMESTAMP:
    df_evaluation["predict"] = df_evaluation["predict"].map(_join_ids)
df_evaluation["profile"] = df_evaluation["profile"].map(_join_ids)
print(f">> Saving evaluation data ({OUTPUT_EVAL_PATH})")
df_evaluation.to_csv(OUTPUT_EVAL_PATH, index=False)
