# Getting False Positives

## Load Pretrained Model and Validation Examples

In [29]:
from T5FineTuner import T5FineTuner, RPDataset
from utils import get_folds
import torch
import argparse
from transformers import T5Tokenizer
from torch.utils.data import Dataset, DataLoader

DATASET = "RP-Crowd-3"
MODEL_NAME_OR_PATH = "GermanT5/t5-efficient-oscar-german-small-el32"
WANDB_PROJECT_NAME = "rp-crowd-3-folds-t5-efficient-small-el32"
OUTPUT_DIR = "./GermanT5-RP-Mod/t5-efficient-oscar-german-small-el32/"
SOURCE = f"./Datasets/{DATASET}-folds.csv"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME_OR_PATH)
#TODO: can I automatically pick the best checkpoint path?
checkpoint_path = "./GermanT5-RP-Mod/t5-efficient-oscar-german-small-el32/lr-0.0004-wd-0.1/epoch=3-val_accuracy=0.74-val_loss=0.28.ckpt"
train_inputs, train_targets, val_inputs, val_targets = get_folds(SOURCE)

train_dataset = RPDataset(tokenizer, train_inputs, train_targets)
valid_dataset = RPDataset(tokenizer, val_inputs, val_targets)

checkpoint = torch.load(checkpoint_path)
hparams = checkpoint["hyper_parameters"]
hparams["train_dataset"] = train_dataset
hparams["val_dataset"] = valid_dataset
new_args = argparse.Namespace(**hparams)

new_model = T5FineTuner.load_from_checkpoint(checkpoint_path, hparams=new_args)

[nltk_data] Downloading package punkt to /home/dobby/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[nltk_data] Downloading package punkt to /home/dobby/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Get False Positives

In [6]:
print(len(valid_dataset))

1260


In [None]:
indices_false_positives = []
for i in range(len(valid_dataset)):
    batch = valid_dataset[i]

In [25]:
val_dataloader = DataLoader(valid_dataset, batch_size=len(valid_dataset), num_workers=4)
indices_false_positives = []

for batch in val_dataloader:
    labels = batch["target_ids"]

    outs = new_model.model.generate(input_ids=batch["source_ids"], 
                                attention_mask=batch["source_mask"], 
                                max_length=2)

    dec = [new_model.tokenizer.decode(ids) for ids in outs]
    target = [new_model.tokenizer.decode(label) for label in labels]

    new_outputs = [s[6:] for s in dec]
    new_targets = [s[:-4] for s in target]
    
    # get indices of false positives
    indices = [i for i in range(len(new_outputs)) if new_outputs[i] != new_targets[i] and new_outputs[i] == "problematisch"]

    # can I decode from the input ids?... probably not
    indices_false_positives.extend(indices)

print(indices_false_positives)


[3, 7, 11, 29, 41, 43, 57, 71, 73, 89, 93, 103, 125, 157, 171, 177, 211, 215, 217, 259, 265, 267, 273, 275, 279, 289, 291, 297, 299, 305, 311, 341, 375, 389, 399, 401, 405, 417, 435, 439, 453, 461, 463, 465, 467, 477, 481, 493, 501, 503, 511, 527, 529, 533, 535, 537, 541, 545, 549, 555, 557, 559, 563, 571, 575, 583, 585, 589, 599, 603, 609, 613, 617, 619, 627, 629, 631, 645, 647, 665, 673, 683, 685, 691, 693, 697, 705, 717, 739, 795, 797, 801, 819, 821, 829, 837, 843, 851, 855, 859, 877, 879, 885, 887, 893, 921, 937, 943, 947, 965, 967, 981, 993, 1003, 1011, 1013, 1019, 1025, 1039, 1049, 1059, 1063, 1079, 1111, 1115, 1123, 1157, 1163, 1171, 1173, 1183, 1193, 1199, 1205, 1213, 1215, 1239, 1251, 1259]


In [28]:
import pandas as pd
import numpy as np
false_positive_text = [val_inputs[i] for i in indices_false_positives]
np.save("./false_pos/RP-Mod-false-pos.csv", false_positive_text)

In [23]:
val_inputs[indices_false_positives[7]]

'classification Warum wird nicht berichtet wofür die Kurden demonstrieren Doch nicht für eine Menschenkette vom Hbf bis zum Landtag Ich brauche mehr Details '

## Explain Using Shapley Values

In [None]:
import shap
# build an explainer using a token masker
explainer = shap.Explainer(f, tokenizer)

In [26]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("GermanT5/t5-efficient-oscar-german-small-el32")

model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro