In [1]:
import pandas as pd 
import numpy as np

# Generate synthetic data with latent factors

In [None]:
def simulate_like_dislike_matrix(
    n_users=10_000,
    n_movies=5_000,
    n_factors=10,
    user_factor_std=1.0,
    movie_factor_std=1.0,
    random_state=12
):
    """Simulate a dense user x movie like/dislike matrix from latent factors.

    Every user and every movie gets a Gaussian latent-factor vector. The
    affinity of a (user, movie) pair is the dot product of the two vectors,
    the like-probability is the logistic sigmoid of that affinity, and the
    observed like (1) / dislike (0) is one Bernoulli draw per pair.

    Note that every output indexed by (user, movie) is dense: it holds
    ``n_users * n_movies`` entries (50 million with the defaults), so scale
    the sizes down for quick experiments.

    Parameters
    ----------
    n_users : int
        Number of users (rows of the matrices).
    n_movies : int
        Number of movies (columns of the matrices).
    n_factors : int
        Dimension of the latent-factor vectors.
    user_factor_std : float
        Standard deviation of the zero-mean normal user factors.
    movie_factor_std : float
        Standard deviation of the zero-mean normal movie factors.
    random_state
        Seed forwarded to ``np.random.default_rng``; the same value
        reproduces the same draws.

    Returns
    -------
    probs : np.ndarray
        ``(n_users, n_movies)`` array of like-probabilities.
    likes : np.ndarray
        ``(n_users, n_movies)`` array of 0/1 Bernoulli draws from ``probs``.
    user_factors_df : pd.DataFrame
        One row per user: ``user_id`` followed by ``factor_0`` ...
        ``factor_{n_factors - 1}``.
    movie_factors_df : pd.DataFrame
        One row per movie: ``movie_id`` followed by the same factor columns.
    interactions_df : pd.DataFrame
        Long-format table with columns ``user_id``, ``movie_id`` and
        ``liked``; one row per (user, movie) pair, ordered user-major so it
        lines up with ``likes.ravel()``.
    """
    rng = np.random.default_rng(random_state)

    # Latent factors. Users are drawn before movies; keeping this order keeps
    # the random stream (and therefore the output for a given seed) stable.
    U = rng.normal(0, user_factor_std, size=(n_users, n_factors))
    V = rng.normal(0, movie_factor_std, size=(n_movies, n_factors))

    # Affinity & probabilities
    scores = U @ V.T
    # For strongly negative scores exp(-score) overflows to +inf, which then
    # saturates to the correct limiting probability 0.0. Silence only that
    # overflow so the call neither warns nor fails under
    # np.seterr(over="raise"); the computed values are unchanged.
    with np.errstate(over="ignore"):
        probs = 1 / (1 + np.exp(-scores))
    # The affinity matrix is as large as the outputs and is no longer needed.
    del scores

    # Bernoulli draws
    likes = rng.binomial(1, probs)

    # Both factor tables share the same factor column names.
    factor_columns = [f"factor_{k}" for k in range(n_factors)]

    # Users factor DataFrame
    user_factors_df = pd.DataFrame(U, columns=factor_columns)
    user_factors_df.insert(0, "user_id", np.arange(n_users))

    # Movies factor DataFrame
    movie_factors_df = pd.DataFrame(V, columns=factor_columns)
    movie_factors_df.insert(0, "movie_id", np.arange(n_movies))

    # Interactions DataFrame: repeat/tile enumerate the pairs in the same
    # row-major order that likes.ravel() flattens the matrix.
    user_ids = np.repeat(np.arange(n_users), n_movies)
    movie_ids = np.tile(np.arange(n_movies), n_users)

    interactions_df = pd.DataFrame({
        "user_id": user_ids,
        "movie_id": movie_ids,
        "liked": likes.ravel()
    })

    return probs, likes, user_factors_df, movie_factors_df, interactions_df