In [41]:
import torch
import numpy as np
import evaluate
from torch.utils.data import DataLoader
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding
)

model_name_or_path = '../1_finetune_for_MRC'
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
metric = evaluate.load("glue", 'mnli')

def load_target_dataset():
    raw_datasets = load_dataset("multi_nli")
    train_dataset = raw_datasets['train'].filter(lambda x: x["genre"] == "slate" or x["genre"] == "travel")
    train_dataset = train_dataset.train_test_split(train_size=0.01) # this is here for preprocessing labels

    raw_datasets = DatasetDict({
        'train': train_dataset['train'],
        'validation_matched': raw_datasets['validation_matched'].filter(lambda x: x['genre'] == 'slate' or x['genre'] == 'travel'),
        'validation_mismatched': raw_datasets['validation_mismatched'],
    })

    def preprocess_function(examples):
        # this is set up for MNLI, taken from run_glue_no_trainer
        sentence1_key = 'premise'
        sentence2_key = 'hypothesis'

        # Tokenize the texts
        texts = (
            (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
        )
        result = tokenizer(*texts, padding=False, max_length=128, truncation=True)
        result["labels"] = examples["label"]
        return result

    processed_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_datasets["train"].column_names,
        desc="Running tokenizer on dataset",
    )

    target_dataset = processed_datasets['validation_matched']
    target_dataset = target_dataset.train_test_split(test_size=0.010)['test'] # TODO for dev
    return target_dataset

def load_pretrained_model_mnli():
    config = AutoConfig.from_pretrained(model_name_or_path, num_labels=3, finetuning_task='mnli')
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name_or_path,
        from_tf=bool(".ckpt" in model_name_or_path),
        config=config,
    )
    return model

def load_pretrained_model_squad():
    config = AutoConfig.from_pretrained(model_name_or_path)
    model = AutoModelForQuestionAnswering.from_pretrained(
        model_name_or_path,
        from_tf=bool(".ckpt" in model_name_or_path),
        config=config,
    )
    return model

target_dataset = load_target_dataset() # this is target datset D in the leep paper
model = load_pretrained_model_mnli() # this is theta in the leep paper, pretrained on some other dataset

model.eval()

Found cached dataset multi_nli (/fs/classhomes/spring2023/cmsc828a/c828a017/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39)
100%|██████████| 3/3 [00:01<00:00,  1.99it/s]
Loading cached processed dataset at /fs/classhomes/spring2023/cmsc828a/c828a017/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-0e2f950f890e5695.arrow
Loading cached processed dataset at /fs/classhomes/spring2023/cmsc828a/c828a017/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-c75e104972489eea.arrow
Loading cached processed dataset at /fs/classhomes/spring2023/cmsc828a/c828a017/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-fcf01ce179d8a164.arrow
Loading cached processed dataset at /fs/classhomes/spring2023/cmsc828a/c828a017/.c

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [42]:
# step 1: obtain dummy distribution \theta(x_i)
target_label_space = [0, 1, 2] # Y
source_label_space = [0, 1, 2] # Z
ground_truths = np.array(target_dataset['labels']) # y_i
n = len(target_dataset)

target_dataloader = DataLoader(target_dataset, collate_fn=DataCollatorWithPadding(tokenizer), batch_size=8)
predictions = []

for step, batch in enumerate(target_dataloader):
    with torch.no_grad():
        batch_outputs = model(**batch)
        batch_predictions = batch_outputs.logits.argmax(dim=-1)
        predictions += batch_predictions

label, count = np.unique(predictions, return_counts=True)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [43]:
dict(zip(*(label, count)))

{0: 3, 1: 25, 2: 12}

In [44]:
dummy_distribution = {z_i: 0 for z_i in source_label_space} # initialize all label probabilities to 0 s.t. if it's not predicted, it still shows up
dummy_distribution.update({label[i]: count[i]/n for i in range(len(label))})
dummy_distribution

{0: 0.075, 1: 0.625, 2: 0.3}

In [45]:
# step 2.1: compute empirical joint distribution P(y, z)
empirical_joint_distribution = {}

for y_i in target_label_space:
    for z_i in source_label_space:
        empirical_joint_distribution[(y_i, z_i)] = dummy_distribution[z_i]*sum(ground_truths == y_i)/n

empirical_joint_distribution

{(0, 0): 0.03,
 (0, 1): 0.25,
 (0, 2): 0.12,
 (1, 0): 0.0225,
 (1, 1): 0.1875,
 (1, 2): 0.09,
 (2, 0): 0.0225,
 (2, 1): 0.1875,
 (2, 2): 0.09}

In [46]:
# step 2.2: compute empirical marginal distribution P(z)
empirical_marginal_distribution = {y: 0 for y in target_label_space}

for y_i in target_label_space:
    for z_i in source_label_space:
        empirical_marginal_distribution[y_i] += empirical_joint_distribution[y_i, z_i]

empirical_marginal_distribution

{0: 0.4, 1: 0.3, 2: 0.3}

In [47]:
# step 2.3: compute empirical conditional distribution P(y|z)
empirical_conditional_distribution = {(y,z): empirical_joint_distribution[(y, z)]/empirical_marginal_distribution[z] for y in target_label_space for z in source_label_space}
empirical_conditional_distribution

{(0, 0): 0.075,
 (0, 1): 0.8333333333333334,
 (0, 2): 0.4,
 (1, 0): 0.056249999999999994,
 (1, 1): 0.625,
 (1, 2): 0.3,
 (2, 0): 0.056249999999999994,
 (2, 1): 0.625,
 (2, 2): 0.3}

In [48]:
# step 3: compute LEEP score
T = 1/n*sum([np.log(sum([empirical_conditional_distribution[(y_i, z)]*dummy_distribution[z] for z in source_label_space])) for y_i in ground_truths])
T

-0.6088557759186706