In [1]:
import os.path as osp
from omegaconf import OmegaConf
import numpy as np
from sklearn.metrics import average_precision_score, f1_score
from scipy.stats import ttest_rel
import matplotlib.pyplot as plt
import pandas as pd
import scipy
import torch
from tqdm.notebook import tqdm

eps = np.finfo("float").eps

plt.style.use(["science", "ieee"])
out_path = osp.join("../outputs/figures")
metric_res_dicts_path = osp.join(out_path, "metric_res_dicts.npy")


In [2]:
def build_label_ratio_dicts(results_path):
    res_dict = OmegaConf.load(results_path)

    # Build absolute path
    res_dict = {
        key: osp.join(res_dict["base_path"], value)
        for key, value in res_dict.items()
        if key != "base_path"
    }

    no_cf_dict = {key: value for key, value in res_dict.items() if "_no_cf" in key}
    with_cf_dict = {key: value for key, value in res_dict.items() if "_with_cf" in key}
    return no_cf_dict, with_cf_dict


def load_preds(base_path):
    no_cf_dict, with_cf_dict = build_label_ratio_dicts(base_path)

    labels = np.load(osp.join(list(no_cf_dict.values())[0], "labels.npy"))

    no_cf_preds, with_cf_preds = [], []
    for (key_a, path_a), (key_b, path_b) in zip(
        no_cf_dict.items(), with_cf_dict.items()
    ):
        preds_a = np.load(osp.join(path_a, "preds.npy"))
        preds_b = np.load(osp.join(path_b, "preds.npy"))

        ap_a = average_precision_score(labels, preds_a)  # ,average='micro')
        ap_b = average_precision_score(labels, preds_b)  # ,average='micro')

        print(f"{key_a} {key_b} [{ap_a:.3f} {ap_b:.3f}]. size={len(preds_a)}")
        no_cf_preds.append(preds_a)
        with_cf_preds.append(preds_b)

    return {
        "no_cf_preds": no_cf_preds,
        "with_cf_preds": with_cf_preds,
        "labels": labels,
    }


In [3]:
dataset_mapping = {
    "Beauty": "Beauty",
    "Toys_and_Games": "Toys",
    "Clothing_Shoes_and_Jewelry": "Clothing",
    "movielens": "MovieLens",
}

preds_dict = {}
for dataset_name, print_name in dataset_mapping.items():
    print(dataset_name)
    preds_dict[print_name] = load_preds(
        osp.join(f"../outputs/{dataset_name}/results.yaml")
    )

Beauty
label_ratio_0.1_no_cf label_ratio_0.1_with_cf [0.070 0.052]. size=3624
label_ratio_0.2_no_cf label_ratio_0.2_with_cf [0.239 0.238]. size=3624
label_ratio_0.3_no_cf label_ratio_0.3_with_cf [0.268 0.271]. size=3624
label_ratio_0.4_no_cf label_ratio_0.4_with_cf [0.301 0.296]. size=3624
label_ratio_0.5_no_cf label_ratio_0.5_with_cf [0.314 0.319]. size=3624
label_ratio_0.6_no_cf label_ratio_0.6_with_cf [0.329 0.330]. size=3624
label_ratio_0.7_no_cf label_ratio_0.7_with_cf [0.327 0.330]. size=3624
label_ratio_0.8_no_cf label_ratio_0.8_with_cf [0.344 0.338]. size=3624
label_ratio_0.9_no_cf label_ratio_0.9_with_cf [0.349 0.353]. size=3624
label_ratio_1.0_no_cf label_ratio_1.0_with_cf [0.354 0.358]. size=3624
Toys_and_Games
label_ratio_0.1_no_cf label_ratio_0.1_with_cf [0.078 0.098]. size=3572
label_ratio_0.2_no_cf label_ratio_0.2_with_cf [0.168 0.183]. size=3572
label_ratio_0.3_no_cf label_ratio_0.3_with_cf [0.295 0.294]. size=3572
label_ratio_0.4_no_cf label_ratio_0.4_with_cf [0.315 0.

In [4]:
def mean_confidence_interval(data, confidence=0.9):
    a = 1.0 * np.array(data)
    n = len(a)
    se = scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2.0, n - 1)
    return h


def compare_results(preds_dict: dict, eval_function,metric_dict:dict):
    for dataset_name, dataset_dict in preds_dict.items():
        print(dataset_name)
        if dataset_name in metric_dict:
            print(f'{dataset_name} exists in metric_dict')
            continue

        labels = dataset_dict["labels"]
        no_cf_preds = dataset_dict["no_cf_preds"]
        with_cf_preds = dataset_dict["with_cf_preds"]
        df = single_set_compare_results(
            labels, no_cf_preds, with_cf_preds, eval_function
        )
        metric_dict[dataset_name] = df
        print(df[["no_cf", "with_cf"]].round(2).T)
    return metric_dict

def single_label_ratio_compare_results(label_ratio,labels,preds_a, preds_b,eval_function):
        # Define output
        res_dict = {'label_ratio': label_ratio}

        # Evaluate performance
        perf_a = eval_function(labels, preds_a)
        perf_b = eval_function(labels, preds_b)

        res_dict["pvalue"] = ttest_rel(perf_a, perf_b).pvalue

        # No CF
        res_dict["no_cf"] = np.mean(perf_a)
        res_dict["no_cf_std"] = np.std(perf_a)
        res_dict["no_cf_ci"] = mean_confidence_interval(perf_a)

        # With CF
        res_dict["with_cf"] = np.mean(perf_b)
        res_dict["with_cf_std"] = np.std(perf_b)
        res_dict["with_cf_ci"] = mean_confidence_interval(perf_b)

        return res_dict

def single_set_compare_results(
    labels, no_cf_pred_list, with_cf_pred_list, eval_function
):  

    # Defining a dict
    res_dicts = []
    total = len(no_cf_pred_list)
    label_ratios = np.arange(0.1, 1.1, 0.1)
    for label_ratio, preds_a, preds_b in tqdm(zip(label_ratios, no_cf_pred_list, with_cf_pred_list), total=total):

        res_dict = single_label_ratio_compare_results(label_ratio,labels,preds_a, preds_b,eval_function)
        res_dicts.append(res_dict)

    df = pd.DataFrame(res_dicts)
    df.set_index('label_ratio')
    df["improvement"] = df["with_cf"] / df["no_cf"] - 1.0

    return df


def plot_performance(df):
    fig, ax = plt.subplots(1, 1, dpi=150)
    ax.plot(df.index, df["no_cf"], label="Baseline")
    ax.plot(df.index, df["with_cf"], label="With CF")

    for i, key in enumerate(["no_cf", "with_cf"]):
        ax.fill_between(
            df.index,
            df[key] - df[key + "_ci"],
            df[key] + df[key + "_ci"],
            color=f"C{i}",
            alpha=0.1,
        )

    ax.set_xlabel("Label ratio")
    ax.legend()
    return ax


def plot_pvalues(df):
    fig, ax = plt.subplots(1, 1, dpi=150)
    ax.plot(df.index, df["pvalue"])
    plt.axhline(y=0.05, color="r", linestyle="-", label="p=0.05")
    ax.set_xlabel("Label ratio")
    ax.set_ylabel("p-value")
    ax.legend()
    plt.show()


# Define metrics

In [5]:
def calc_top1_acc(labels,preds):
    return np.array([labels[n][top1] for n, top1 in enumerate(np.argmax(preds,axis=1))])

def calc_ap_score(labels,preds):
    aps = []
    num_experiments = 100
    num_samples = int(0.9 * len(labels))

    idxs_list = np.random.randint(low=0,high=len(labels), size=(num_experiments,num_samples))
    for idxs in idxs_list:
        labels_chosen, preds_chosen = labels[idxs],preds[idxs]
        mask = labels_chosen.sum(axis=0) > 0
        ap = average_precision_score(labels_chosen[:,mask],preds_chosen[:,mask])
        aps.append(ap)
    return np.array(aps)

def cale_f1_score(labels, preds,thresh=0.95):
    f1s = []
    num_experiments = 10
    num_samples = int(0.9 * len(labels))
    for _ in range(num_experiments):
        idxs = np.random.randint(0,len(labels),num_samples)
        labels_chosen, preds_chosen = labels[idxs],preds[idxs]
        mask = labels_chosen.sum(axis=0) > 0
        f1 = f1_score(labels_chosen[:,mask],preds_chosen[:,mask]>thresh,average='samples')
        f1s.append(f1)
    return np.array(f1s)

metric_funcs = {'ap':calc_ap_score, 'top1_acc': calc_top1_acc}#, 'f1':cale_f1_score}


# Compute metrics

In [6]:
if osp.exists(metric_res_dicts_path):
    metric_res_dicts= np.load(metric_res_dicts_path,allow_pickle=True).item()
else:
    metric_res_dicts = {}

for metric_name, metric_func in metric_funcs.items():
    print(metric_name)

    # Initilize output: if metric exsits, use previous results
    single_metric_res_dict = {}
    if metric_name in metric_res_dicts:
        single_metric_res_dict = metric_res_dicts[metric_name]

    # metric -> dataset -> performance dataframe
    single_metric_res_dict = compare_results(preds_dict, metric_func,single_metric_res_dict)

    # Add to dict
    metric_res_dicts[metric_name] = single_metric_res_dict
    np.save(metric_res_dicts_path,metric_res_dicts)
    print()

np.save(metric_res_dicts_path,metric_res_dicts)



ap
Beauty


  0%|          | 0/10 [00:00<?, ?it/s]

            0     1     2     3     4     5     6     7     8     9
no_cf    0.07  0.25  0.28  0.31  0.32  0.34  0.34  0.36  0.36  0.36
with_cf  0.05  0.25  0.28  0.30  0.33  0.34  0.34  0.35  0.36  0.37
Toys


  0%|          | 0/10 [00:00<?, ?it/s]

            0     1    2     3     4     5     6     7     8     9
no_cf    0.08  0.17  0.3  0.32  0.35  0.37  0.38  0.40  0.41  0.41
with_cf  0.10  0.19  0.3  0.33  0.37  0.39  0.40  0.41  0.42  0.42
Clothing


  0%|          | 0/10 [00:00<?, ?it/s]

# Plot performance

In [None]:
for metric_name, single_metric_res_dict in metric_res_dicts.items():
    print(metric_name)
    fig, axs = plt.subplots(1, 4, figsize=(6, 1.5))
    for row, (dataset_name, df) in enumerate(single_metric_res_dict.items()):
        ax = axs[row]

        ax.plot(df.index, df["no_cf"], label="Baseline")
        ax.plot(df.index, df["with_cf"], label="With CF")

        for i, key in enumerate(["no_cf", "with_cf"]):
            ax.fill_between(
                df.index,
                df[key] - df[key + "_ci"],
                df[key] + df[key + "_ci"],
                color=f"C{i}",
                alpha=0.1,
            )

        ax.set_xlabel("Label ratio \n\n ({}) {}".format(chr(row + 97), dataset_name))
    axs[0].legend()
    plt.tight_layout()
    plt.show()

    fig, axs = plt.subplots(1, 4, figsize=(6, 1.5))
    for row, (dataset_name, df) in enumerate(single_metric_res_dict.items()):
        ax = axs[row]

        ax.bar(df.index, df["improvement"], width=0.075)
        ax.set_xlabel("Label ratio \n\n ({}) {}".format(chr(row + 97), dataset_name))
    axs[0].set_ylabel("Relative \n improvement (\%)")

    plt.tight_layout()
    plt.show()
    
    break


# mAP

In [None]:
def compute_recall_at_k(labels, preds,k: int = 5):
    recall_sum, item_num = 0, 0

    recalls = []
    for pred, label in zip(torch.tensor(preds), torch.tensor(labels)):
        _, pred_idx = torch.topk(pred, k=k)  # The predicted labels
        label_idx = torch.where(label == 1)[0]  # The ground truth labels

        # In case there are no labels
        if len(label_idx) == 0:
            continue

        # Recal per item
        recall_i = sum(el in pred_idx for el in label_idx) / len(label_idx)

        recalls.append(recall_i)

    return recalls

df = compare_results(labels, no_cf_preds, with_cf_preds, compute_recall_at_k)

ax = plot_performance(df)
ax.set_ylabel('Recall@5')
plt.show()

print(df.round(2))

In [None]:
def compute_recall_at_k(labels, preds,k: int = 10):
    recall_sum, item_num = 0, 0

    recalls = []
    for pred, label in zip(torch.tensor(preds), torch.tensor(labels)):
        _, pred_idx = torch.topk(pred, k=k)  # The predicted labels
        label_idx = torch.where(label == 1)[0]  # The ground truth labels

        # In case there are no labels
        if len(label_idx) == 0:
            continue

        # Recal per item
        recall_i = sum(el in pred_idx for el in label_idx) / len(label_idx)

        recalls.append(recall_i)

    return recalls

df = compare_results(labels, no_cf_preds, with_cf_preds, compute_recall_at_k)

ax = plot_performance(df)
ax.set_ylabel('Recall@10')
plt.show()

print(df.round(2))

In [None]:
def compute_f1_score(labels, preds,thresh=0.95):
    f1s = []
    num_experiments = 75
    num_samples = int(0.9 * len(labels))
    for _ in range(num_experiments):
        idxs = np.random.randint(0,len(labels),num_samples)
        labels_chosen, preds_chosen = labels[idxs],preds[idxs]
        mask = labels_chosen.sum(axis=0) > 0
        f1 = f1_score(labels_chosen[:,mask],preds_chosen[:,mask]>thresh,average='samples')
        f1s.append(f1)
    return np.array(f1s)

df = compare_results(labels, no_cf_preds, with_cf_preds, compute_f1_score)

ax = plot_performance(df)
ax.set_ylabel('F1 score')
plt.show()

print(df.round(2))