In [1]:
%reload_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')
from logging import getLogger
from pathlib import Path
import os
import sys
sys.path.append(os.pardir)

import numpy as np
import pandas as pd 
from tqdm import tqdm
from sklearn.utils import check_random_state

import matplotlib.pyplot as plt
from utils import fix_seed, empty_metrics
from run import run_dynamic_match
from visualization_seed import plot_match_per, plot_number_user_retain, plot_user_retain

from synthetic_data import generate_data, generate_reward_data, train_model
import conf

In [2]:
# Settings that the reward-data generator and the model trainer both receive.
shared_kwargs = dict(
    dim=conf.dim,
    T=conf.T,
    alpha_param=conf.alpha_param,
    beta_param=conf.beta_param,
    random_state=conf.random_state,
)

# Reward data is generated once and then handed to the trainer below.
reward_data = generate_reward_data(**shared_kwargs)

# The trainer additionally needs the training-set size and the reward data.
model = train_model(
    n_train=conf.n_train,
    reward_data=reward_data,
    **shared_kwargs,
)

# Short aliases for configuration values used by later cells.
T, n_x, n_y = conf.T, conf.n_x, conf.n_y

In [None]:
# Run the dynamic-matching simulation once per seed and collect, for every
# (seed, method, t), the mean of each tracked metric into one table that is
# finally written to <df_path>/all_data_results.csv.
logger = getLogger(__name__)
logger.info(f"The current working directory is {Path().cwd()}")

# log path
log_path = Path("../result/T")
df_path = log_path / "df"
df_path.mkdir(exist_ok=True, parents=True)

# Column order of the exported table. The exposure/fairness columns come last
# so the CSV layout matches what earlier runs of this notebook produced.
result_columns = [
    "seed", "t", "method",
    "match_x", "match_y",
    "active_users_x", "active_users_y",
    "user_retain_x", "user_retain_y",
    "true_user_retain_x", "true_user_retain_y",
    "exposure_x", "exposure_y",
    "fair_x", "fair_y",
]
# Everything after the three identifier columns is a metric averaged per step.
metric_columns = result_columns[3:]

# Rows for all seeds are accumulated here and turned into a DataFrame once,
# instead of concatenating a growing DataFrame on every iteration.
records = []

for seed in tqdm(range(conf.num_seeds), desc="Processing seeds"):
    # Each seed gets its own random state, offset from the base one in conf.
    seed_random_state = conf.random_state + 1 + seed
    random_ = check_random_state(seed_random_state)
    results = {method: empty_metrics(conf.T, n_x, n_y) for method in conf.method_list}

    dataset = generate_data(
        n_x=conf.n_x,
        n_y=conf.n_y,
        dim=conf.dim,
        rel_noise=conf.rel_noise,
        T=conf.T,
        K=conf.K,
        kappa=conf.kappa,
        reward_data=reward_data,
        alpha_param=conf.alpha_param,
        beta_param=conf.beta_param,
        random_state=seed_random_state,
        random_=random_,
    )

    # `results` is passed in and read back below, so run_dynamic_match is
    # expected to fill it in place.
    run_dynamic_match(
        dataset,
        model=model,
        proportion=conf.proportion,
        reward_type=conf.reward_type,
        ranking_metric=conf.ranking_metric,
        results=results,
        noise=conf.noise,
        candidate_retention=conf.candidate_retention,
        random_state=seed_random_state,
    )

    for method, metrics in results.items():
        # NOTE(review): step 0 is skipped, as in the original loop --
        # presumably no meaningful metrics exist at t=0; confirm.
        for t in range(1, conf.T):
            record = {"seed": seed, "t": t, "method": method}
            record.update({name: metrics[name][t].mean() for name in metric_columns})
            records.append(record)

# NOTE: after the loop, `dataset` and `results` refer to the LAST seed only;
# later cells that use them therefore describe a single run.
all_data = pd.DataFrame(records, columns=result_columns)
# Cheap guard kept from the original so `t` is always numeric downstream.
all_data["t"] = pd.to_numeric(all_data["t"], errors="coerce")
all_data.to_csv(df_path / "all_data_results.csv", index=False)


In [4]:
# Write the aggregated results table to CSV without the index column.
# Same destination as the write at the end of the seed-loop cell, so running
# this cell overwrites that file with the current contents of `all_data`.
all_data.to_csv(path_or_buf=df_path / "all_data_results.csv", index=False)

In [5]:
# plot_histogram comes from the same module as the plotting helpers imported
# in the first cell; it is pulled in here, separately from them.
from visualization_seed import plot_histogram
# Methods to iterate over in the following cell, as configured in conf.
method_list = conf.method_list

In [6]:
# Derive, for every method, metrics that cover both sides at once by joining
# the x-side and y-side arrays along axis 1, plus masked / alpha-shifted
# variants of the match metric.
# NOTE: `dataset` and `results` are the objects left behind by the seed loop,
# i.e. they belong to the last seed that was processed.

# Flattened alpha values of both sides, x first and then y -- the same order in
# which the per-side arrays are joined below.
alpha = np.concatenate([dataset['alpha_x'].reshape(-1), dataset['alpha_y'].reshape(-1)], axis=0)

# Metrics stored as "<key>_x" / "<key>_y" that get a combined "<key>" entry.
# NOTE(review): the original cell concatenated "active_users" twice in a row;
# the duplicate is dropped here. Possibly "user_retain" was intended -- confirm.
combined_keys = ("match", "exposure", "fair", "true_user_retain", "active_users")

for method in method_list:
    method_results = results[method]

    for key in combined_keys:
        method_results[key] = np.concatenate(
            [method_results[f"{key}_x"], method_results[f"{key}_y"]], axis=1
        )

    # Entries whose active_users value is 0 are blanked out in the
    # "active" variants below.
    inactive = method_results["active_users"] == 0

    # Match values with inactive entries replaced by NaN.
    method_results["active_match"] = method_results["match"].copy()
    method_results["active_match"][inactive] = np.nan

    # Match values shifted by alpha (alpha is broadcast against the array).
    method_results["effective_match"] = method_results["match"] - alpha

    # Alpha-shifted match values with inactive entries replaced by NaN.
    method_results["effective_active_match"] = method_results["effective_match"].copy()
    method_results["effective_active_match"][inactive] = np.nan


In [7]:
import pickle

# Serialize the `results` dict to <df_path>/results.pkl.
results_pkl_path = df_path / "results.pkl"
with results_pkl_path.open("wb") as f:
    pickle.dump(results, f)

In [8]:
# Replace the method list taken from conf.method_list with an explicit
# two-method selection.
method_list = [
    "MRet (best)",
    "FairCo (lam=100)",
]

In [None]:
# One figure with two side-by-side panels (1 row x 2 columns, figsize 18 x 6).
fig, axes = plt.subplots(1, 2, figsize=(18, 6))
# Left panel: plot_match_per drawn from the aggregated table `all_data`.
plot_match_per(all_data, ax=axes[0], side="both", n_x=n_x, n_y=n_y)
# Right panel: plot_number_user_retain drawn from the same table; both helpers
# are called with side="both" and the configured n_x / n_y.
plot_number_user_retain(all_data, ax=axes[1], side="both", n_x=n_x, n_y=n_y)