# Annotator Simulation
In this notebook, we simulate annotators for the classification data sets for which only the ground-truth labels are given. Before executing this notebook, you need to download or create the data sets by running the notebook [`data_set_creation_download.ipynb`](./data_set_creation_download.ipynb).

In [None]:
%load_ext autoreload
%autoreload 2

import sys

sys.path.append("../")

import numpy as np
import torch

from evaluation.data_utils import DATA_PATH
from evaluation.architecture_utils import get_gt_net

from IPython.display import display, HTML

from lfma.utils import (
    annot_sim_clf_cluster,
    compute_annot_perf_clf,
    generate_expert_cluster_combinations,
)

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import check_random_state

from torch.utils.data import TensorDataset, DataLoader
from torchvision.transforms import Resize

# Global seed constant kept for reproducibility.
RANDOM_STATE = 0

# Run on the GPU whenever CUDA is available and fall back to the CPU otherwise.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

### Annotator Sets
In the following, we define the simulation of annotator sets (described in the accompanying article) as data set configuration dictionaries.

In [None]:
# Configuration of annotator sets simulation per data set.
def _annotator_set(data_set_name, random_state, n_clusters, name_appendix=None):
    """Build the configuration dictionary of one simulated annotator set.

    Three compositions are supported and selected through `name_appendix`:

    - `None`: 10 annotators with 1 adversarial, 2 cluster-specialized, and 1 target-specialized annotator,
    - `"-random"`: the composition above plus 100 additional random annotators (110 in total),
    - `"-inductive"`: the first composition with all counts multiplied by ten (100 in total).

    Parameters
    ----------
    data_set_name : str
        Name of the data set the annotators are simulated for.
    random_state : int
        Seed of the simulation of this annotator set.
    n_clusters : int
        Number of clusters used for simulating cluster-dependent annotator performances.
    name_appendix : str or None, default=None
        Appendix of the saved file names; the key is omitted from the dictionary if `None`.

    Returns
    -------
    config : dict
        Configuration dictionary of the annotator set.
    """
    with_random_annotators = name_appendix == "-random"
    factor = 10 if name_appendix == "-inductive" else 1
    n_random_annotators = 100 if with_random_annotators else 0
    config = {
        "data_set_name": data_set_name,
        "random_state": random_state,
        "n_annotators": 10 * factor + n_random_annotators,
        "n_adversarial_annotators": 1 * factor,
        "n_cluster_specialized_annotators": 2 * factor,
        "n_target_specialized_annotators": 1 * factor,
    }
    # Optional keys are only present for the annotator sets that need them.
    if with_random_annotators:
        config["n_random_annotators"] = n_random_annotators
    config["n_clusters"] = n_clusters
    if name_appendix is not None:
        config["name_appendix"] = name_appendix
    return config


configs_data_sets = [
    _annotator_set("toy-classification", 1, n_clusters=4),
    _annotator_set("toy-classification", 5, n_clusters=4, name_appendix="-inductive"),
    _annotator_set("letter", 6, n_clusters=10),
    _annotator_set("letter", 7, n_clusters=10, name_appendix="-random"),
    _annotator_set("letter", 9, n_clusters=10, name_appendix="-inductive"),
    _annotator_set("fmnist", 10, n_clusters=10),
    _annotator_set("fmnist", 11, n_clusters=10, name_appendix="-random"),
    _annotator_set("fmnist", 13, n_clusters=10, name_appendix="-inductive"),
    _annotator_set("cifar10", 14, n_clusters=10),
    _annotator_set("cifar10", 15, n_clusters=10, name_appendix="-random"),
    _annotator_set("cifar10", 17, n_clusters=10, name_appendix="-inductive"),
    _annotator_set("svhn", 18, n_clusters=10),
    _annotator_set("svhn", 19, n_clusters=10, name_appendix="-random"),
    _annotator_set("svhn", 21, n_clusters=10, name_appendix="-inductive"),
]

### Simulation
In the following, we perform the simulation of the annotator sets configured above.

In [None]:
for data_dict in configs_data_sets:
    # Read the configuration of the current annotator set; missing entries fall back to defaults.
    data_set_name = data_dict.get("data_set_name")
    random_state = data_dict.get("random_state", None)
    n_annotators = data_dict.get("n_annotators", 10)
    n_adv_annotators = data_dict.get("n_adversarial_annotators", 0)
    n_cluster_spec_annotators = data_dict.get("n_cluster_specialized_annotators", 0)
    n_target_spec_annotators = data_dict.get("n_target_specialized_annotators", 0)
    n_random_annotators = data_dict.get("n_random_annotators", 0)
    n_clusters = data_dict.get("n_clusters", 10)
    name_appendix = data_dict.get("name_appendix", "")

    # Type code per annotator: 0 = randomly drawn cluster accuracies, 1 = adversarial,
    # 2 = cluster-specialized, 3 = target-specialized, 4 = random.
    annotator_types = np.zeros(n_annotators, dtype=float)

    print(data_set_name)

    # Load data.
    try:
        X = np.load(f"{DATA_PATH}/{data_set_name}-X.npy")
        y_true = np.load(f"{DATA_PATH}/{data_set_name}-y-true.npy")
        classes = np.unique(y_true)
        n_classes = len(classes)
    except FileNotFoundError as e:
        print(f"{e}. Continue with the next data set.")
        continue

    # Transform potential images to vectors.
    if data_set_name in ["cifar10", "svhn"]:
        # Embed the images with the pretrained network returned by `get_gt_net`.
        resnet_dict = get_gt_net(
            data_set_name=data_set_name, n_classes=n_classes, n_features=None, dropout_rate=0, pretrained=True
        )[0]
        # NOTE(review): the embedding network is never switched to eval mode here; unless `get_gt_net`
        # already does so, normalization layers may rely on per-batch statistics -- confirm.
        resnet = resnet_dict["gt_embed_x"].to(DEVICE)
        tensor_x = torch.Tensor(X)
        transform = Resize((224, 224))
        dataset = TensorDataset(tensor_x)
        # Keep the sample order (`shuffle=False`) such that embeddings stay aligned with `y_true`.
        dataloader = DataLoader(dataset, batch_size=128, shuffle=False)
        X = []
        with torch.no_grad():
            for x in dataloader:
                x_embed = resnet(transform(x[0].to(DEVICE))).cpu().numpy()
                X.append(x_embed)
        X = np.concatenate(X)
    elif data_set_name in ["fmnist"]:
        # Flatten the last three axes of each image into one feature vector before standardizing.
        X = X.reshape(*X.shape[:-3], -1)
        X = StandardScaler().fit_transform(X)
    else:
        X = StandardScaler().fit_transform(X)

    # Annotators being neither target-specialized nor random are simulated with cluster-wise accuracies.
    n_cluster_based_annotators = n_annotators - n_target_spec_annotators - n_random_annotators
    if n_adv_annotators + n_cluster_spec_annotators > n_cluster_based_annotators:
        # Otherwise, the slices below would be truncated silently and annotator types would be mislabeled.
        raise ValueError(
            f"Inconsistent annotator counts for '{data_set_name}{name_appendix}': "
            f"{n_adv_annotators} adversarial + {n_cluster_spec_annotators} cluster-specialized annotators "
            f"exceed the {n_cluster_based_annotators} cluster-based annotators."
        )

    # Randomly generate accuracies (one row per cluster-based annotator, one column per cluster).
    random_state = check_random_state(random_state)
    min_acc = 1 / n_classes
    A = random_state.uniform(min_acc, 1, n_cluster_based_annotators * n_clusters)
    A = A.reshape((n_cluster_based_annotators, n_clusters))

    # Generate adversarial annotators.
    A[:n_adv_annotators] = 0.05
    annotator_types[:n_adv_annotators] = 1

    # Generate specialized annotators who are either really well on a cluster or bad.
    annotator_types[n_adv_annotators : n_adv_annotators + n_cluster_spec_annotators] = 2
    n_good_clusters = int(n_clusters / 2 + 0.5)
    good_clusters_indices = generate_expert_cluster_combinations(
        n_annotators=n_cluster_spec_annotators,
        n_clusters=n_clusters,
        n_expert_clusters=n_good_clusters,
        random_state=random_state,
    )
    for a_idx, a in enumerate(
        range(
            n_adv_annotators,
            n_adv_annotators + n_cluster_spec_annotators,
        )
    ):
        A[a] = 0.05
        A[a, good_clusters_indices[a_idx]] = 0.95

    # Simulate labels of cluster-based annotators.
    if len(A) > 0:
        y, y_cluster = annot_sim_clf_cluster(
            X=X,
            y_true=y_true,
            cluster_annot_perfs=A,
            random_state=random_state,
        )
    else:
        # Without cluster-based annotators, the clusters are still required for the annotator features below.
        # The clustering is seeded to keep this fallback reproducible.
        y = np.zeros((X.shape[0], 0))
        y_cluster = MiniBatchKMeans(n_clusters=n_clusters, random_state=random_state).fit_predict(X)

    # Generate specialized annotators who are either really well on a class or bad.
    if n_target_spec_annotators > 0:
        annotator_types[y.shape[1] : y.shape[1] + n_target_spec_annotators] = 3
        A = np.full((n_target_spec_annotators, n_classes), 0.05)
        n_good_classes = int(n_classes / 2 + 0.5)
        good_class_indices = generate_expert_cluster_combinations(
            n_annotators=n_target_spec_annotators,
            n_clusters=n_classes,
            n_expert_clusters=n_good_classes,
            random_state=random_state,
        )
        for a in range(n_target_spec_annotators):
            A[a, good_class_indices[a]] = 0.95

        # Simulate labels of target-based annotators.
        # The true labels serve as the only feature; presumably each cluster then coincides with a class.
        y_target, _ = annot_sim_clf_cluster(
            X=y_true.reshape(-1, 1),
            y_true=y_true,
            cluster_annot_perfs=A,
            random_state=random_state,
        )

        y = np.hstack((y, y_target))

    # Generate random annotators who draw each label uniformly from the observed classes.
    if n_random_annotators > 0:
        annotator_types[y.shape[1] : y.shape[1] + n_random_annotators] = 4
        y_random = np.column_stack(
            [random_state.choice(classes, replace=True, size=len(X)) for _ in range(n_random_annotators)]
        )
        y = np.column_stack((y, y_random))

    # Generate annotator features: one-hot annotator type, class-wise accuracies, and cluster-wise accuracies.
    n_annotator_types = len(np.unique(annotator_types))
    A = np.zeros((n_annotators, n_annotator_types + n_classes + n_clusters))
    # The default sparse output is converted explicitly because the keyword requesting a dense output was
    # renamed from `sparse` to `sparse_output` in scikit-learn 1.2 and `sparse` was removed in 1.4.
    A[:, :n_annotator_types] = OneHotEncoder().fit_transform(annotator_types.reshape(-1, 1)).toarray()
    for a_idx in range(n_annotators):
        # The diagonal of the row-normalized confusion matrix holds the accuracy per true class.
        class_acc = np.diag(
            confusion_matrix(y_true=y_true, y_pred=y[:, a_idx], labels=np.arange(n_classes), normalize="true")
        )
        A[a_idx, n_annotator_types : n_annotator_types + n_classes] = class_acc
        for c in range(n_clusters):
            is_cluster_c = np.equal(y_cluster, c)
            A[a_idx, n_annotator_types + n_classes + c] = np.equal(y[is_cluster_c, a_idx], y_true[is_cluster_c]).mean()

    # Print performances of simulated annotators.
    performances = compute_annot_perf_clf(y_true=y_true, y=y)
    display(HTML(performances.to_html()))

    # Save annotator features and simulated annotations.
    np.save(f"{DATA_PATH}/{data_set_name}-A{name_appendix}.npy", A.astype(np.float32))
    np.save(f"{DATA_PATH}/{data_set_name}-y{name_appendix}.npy", y.astype(np.int64))