# Getting False Positives

## Load Pretrained Model and Validation Examples

In [3]:
from T5FineTuner import T5FineTuner, RPDataset
from utils import get_folds
import torch
import argparse
from transformers import T5Tokenizer
from torch.utils.data import Dataset, DataLoader

# --- Experiment configuration ------------------------------------------------
DATASET = "RP-Crowd-3"
MODEL_NAME_OR_PATH = "GermanT5/t5-efficient-oscar-german-small-el32"
WANDB_PROJECT_NAME = "rp-crowd-3-folds-t5-efficient-small-el32"
OUTPUT_DIR = "./GermanT5-RP-Mod/t5-efficient-oscar-german-small-el32/"
SOURCE = f"./Datasets/{DATASET}-folds.csv"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME_OR_PATH)
#TODO: can I automatically pick the best checkpoint path?
checkpoint_path = "./GermanT5-RP-Mod/t5-efficient-oscar-german-small-el32/lr-0.0004-wd-0.1/epoch=3-val_accuracy=0.74-val_loss=0.28.ckpt"
train_inputs, train_targets, val_inputs, val_targets = get_folds(SOURCE)

train_dataset = RPDataset(tokenizer, train_inputs, train_targets)
valid_dataset = RPDataset(tokenizer, val_inputs, val_targets)

# Only the stored hyper-parameters are read from this dict, so map every tensor
# to the CPU: a checkpoint written on a GPU machine then also loads on a
# CPU-only one.
checkpoint = torch.load(checkpoint_path, map_location="cpu")
hparams = checkpoint["hyper_parameters"]
# The datasets are injected into the hyper-parameters before the model is
# rebuilt from them.
hparams["train_dataset"] = train_dataset
hparams["val_dataset"] = valid_dataset
new_args = argparse.Namespace(**hparams)

new_model = T5FineTuner.load_from_checkpoint(checkpoint_path, hparams=new_args)
# This notebook only runs inference. Switch to eval mode explicitly so dropout
# is disabled and repeated runs of the cells below give the same predictions.
new_model.eval()

## Get False Positives

In [6]:
# Sanity check: how many validation examples are we about to score?
num_val_examples = len(valid_dataset)
print(num_val_examples)

1260


In [None]:
# NOTE(review): unfinished draft. The loop fetches every validation example but
# never uses `batch`, so the only lasting effect of this cell is resetting
# `indices_false_positives` to an empty list.
indices_false_positives = []
for i in range(len(valid_dataset)):
    batch = valid_dataset[i]

In [25]:
# Collect the dataset-level indices of validation examples that the model labels
# "problematisch" although the gold label is different (false positives).
val_dataloader = DataLoader(valid_dataset, batch_size=len(valid_dataset), num_workers=4)
indices_false_positives = []

# Inference only: make sure dropout is off so the result is reproducible.
new_model.model.eval()

# Position of the current batch's first example inside `valid_dataset`. With the
# single full-size batch above this stays 0, but tracking it keeps the indices
# correct if `batch_size` is ever lowered to save memory.
offset = 0

for batch in val_dataloader:
    labels = batch["target_ids"]

    # max_length=2 is kept from the original cell; the decoded output there
    # consisted of the leading pad token plus one label token.
    outs = new_model.model.generate(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        max_length=2,
    )

    # Let the tokenizer drop "<pad>" / "</s>" instead of slicing a fixed number
    # of characters off the decoded strings, which silently breaks if the
    # special-token text or spacing differs.
    new_outputs = [
        new_model.tokenizer.decode(ids, skip_special_tokens=True).strip()
        for ids in outs
    ]
    new_targets = [
        new_model.tokenizer.decode(label, skip_special_tokens=True).strip()
        for label in labels
    ]

    # False positive: predicted "problematisch" while the target says otherwise.
    indices = [
        offset + i
        for i, (predicted, expected) in enumerate(zip(new_outputs, new_targets))
        if predicted != expected and predicted == "problematisch"
    ]

    # can I decode from the input ids?... probably not
    indices_false_positives.extend(indices)
    offset += len(new_outputs)

print(indices_false_positives)


[3, 7, 11, 29, 41, 43, 57, 71, 73, 89, 93, 103, 125, 157, 171, 177, 211, 215, 217, 259, 265, 267, 273, 275, 279, 289, 291, 297, 299, 305, 311, 341, 375, 389, 399, 401, 405, 417, 435, 439, 453, 461, 463, 465, 467, 477, 481, 493, 501, 503, 511, 527, 529, 533, 535, 537, 541, 545, 549, 555, 557, 559, 563, 571, 575, 583, 585, 589, 599, 603, 609, 613, 617, 619, 627, 629, 631, 645, 647, 665, 673, 683, 685, 691, 693, 697, 705, 717, 739, 795, 797, 801, 819, 821, 829, 837, 843, 851, 855, 859, 877, 879, 885, 887, 893, 921, 937, 943, 947, 965, 967, 981, 993, 1003, 1011, 1013, 1019, 1025, 1039, 1049, 1059, 1063, 1079, 1111, 1115, 1123, 1157, 1163, 1171, 1173, 1183, 1193, 1199, 1205, 1213, 1215, 1239, 1251, 1259]


In [None]:
import os
import pandas as pd
import numpy as np

# Look up the raw input text of every false positive found above.
false_positive_text = [val_inputs[i] for i in indices_false_positives]

# np.save raises if the target directory is missing, so create it first.
os.makedirs("./false_pos", exist_ok=True)
# np.save always writes NumPy's binary .npy format and appends ".npy" to names
# that lack it. The previous "...false-pos.csv" argument therefore produced
# "...false-pos.csv.npy"; the real file name is spelled out here so the on-disk
# result is unchanged but no longer surprising. Load it with np.load, not a CSV
# reader.
np.save("./false_pos/RP-Mod-false-pos.csv.npy", false_positive_text)

## Get True Negatives

In [9]:
# Collect the dataset-level indices of validation examples that the model
# correctly labels "unproblematisch" (true negatives).
val_dataloader = DataLoader(valid_dataset, batch_size=len(valid_dataset), num_workers=4)
indices_true_negatives = []

# Inference only: make sure dropout is off so the result is reproducible.
new_model.model.eval()

# Position of the current batch's first example inside `valid_dataset`. With the
# single full-size batch above this stays 0, but tracking it keeps the indices
# correct if `batch_size` is ever lowered to save memory.
offset = 0

for batch in val_dataloader:
    labels = batch["target_ids"]

    # max_length=2 is kept from the original cell; the decoded output there
    # consisted of the leading pad token plus one label token.
    outs = new_model.model.generate(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        max_length=2,
    )

    # Let the tokenizer drop "<pad>" / "</s>" instead of slicing a fixed number
    # of characters off the decoded strings, which silently breaks if the
    # special-token text or spacing differs.
    new_outputs = [
        new_model.tokenizer.decode(ids, skip_special_tokens=True).strip()
        for ids in outs
    ]
    new_targets = [
        new_model.tokenizer.decode(label, skip_special_tokens=True).strip()
        for label in labels
    ]

    # True negative: prediction matches the target and both are "unproblematisch".
    indices = [
        offset + i
        for i, (predicted, expected) in enumerate(zip(new_outputs, new_targets))
        if predicted == expected and predicted == "unproblematisch"
    ]

    # can I decode from the input ids?... probably not
    indices_true_negatives.extend(indices)
    offset += len(new_outputs)

print(indices_true_negatives)

[1, 3, 5, 9, 13, 17, 19, 21, 23, 25, 27, 31, 33, 35, 37, 39, 45, 47, 49, 51, 53, 55, 59, 61, 63, 65, 67, 69, 75, 77, 79, 81, 83, 85, 87, 91, 95, 97, 99, 101, 105, 107, 109, 111, 113, 115, 117, 119, 123, 127, 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 159, 161, 163, 165, 167, 169, 173, 175, 179, 181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203, 205, 207, 209, 213, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, 257, 261, 263, 269, 271, 277, 281, 283, 285, 287, 293, 295, 301, 303, 307, 309, 313, 315, 317, 319, 321, 323, 325, 327, 329, 331, 333, 335, 337, 339, 343, 345, 347, 349, 351, 353, 355, 357, 359, 361, 363, 365, 367, 369, 371, 373, 377, 379, 381, 383, 385, 387, 391, 393, 395, 397, 403, 407, 409, 411, 413, 415, 419, 421, 423, 425, 427, 431, 433, 437, 441, 443, 445, 447, 449, 451, 455, 457, 459, 465, 469, 471, 473, 475, 479, 483, 485, 487, 489, 491, 495, 497, 499, 503, 505, 507, 513, 515, 517, 51

In [10]:
import os
import pandas as pd
import numpy as np

# Look up the raw input text of every true negative found above.
true_negative_text = [val_inputs[i] for i in indices_true_negatives]

# np.save raises if the target directory is missing, so create it first.
os.makedirs("./true_neg", exist_ok=True)
# np.save appends ".npy" to names that lack it, so this writes
# "./true_neg/RP-Mod-true-neg.npy" (NumPy binary format; read with np.load).
np.save("./true_neg/RP-Mod-true-neg", true_negative_text)

### Test Random Strings

In [28]:
from T5FineTuner import T5FineTuner, RPDataset
from utils import get_folds
import torch
import argparse
from transformers import T5Tokenizer
from torch.utils.data import Dataset, DataLoader

# --- Experiment configuration ------------------------------------------------
DATASET = "RP-Crowd-3"
MODEL_NAME_OR_PATH = "GermanT5/t5-efficient-oscar-german-small-el32"
WANDB_PROJECT_NAME = "rp-crowd-3-folds-t5-efficient-small-el32"
OUTPUT_DIR = "./GermanT5-RP-Mod/t5-efficient-oscar-german-small-el32/"
SOURCE = f"./Datasets/{DATASET}-folds.csv"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME_OR_PATH)
#TODO: can I automatically pick the best checkpoint path?
checkpoint_path = "./augmented/500short-RP-Mod-results/t5-efficient-oscar-german-small-el32/learning_rate-0.0004-weight_decay-0.1/epoch=5-val_accuracy=0.72-val_loss=0.29.ckpt"
train_inputs, train_targets, val_inputs, val_targets = get_folds(SOURCE)

train_dataset = RPDataset(tokenizer, train_inputs, train_targets)
valid_dataset = RPDataset(tokenizer, val_inputs, val_targets)

# Only the stored hyper-parameters are read from this dict, so map every tensor
# to the CPU: a checkpoint written on a GPU machine then also loads on a
# CPU-only one.
checkpoint = torch.load(checkpoint_path, map_location="cpu")
hparams = checkpoint["hyper_parameters"]
# The datasets are injected into the hyper-parameters before the model is
# rebuilt from them.
hparams["train_dataset"] = train_dataset
hparams["val_dataset"] = valid_dataset
new_args = argparse.Namespace(**hparams)

# NOTE: this rebinds `new_model` (and `tokenizer`, the datasets, ...) to the
# checkpoint configured in this cell. Index lists computed earlier in the
# kernel session with a different checkpoint do not describe this model.
new_model = T5FineTuner.load_from_checkpoint(checkpoint_path, hparams=new_args)
# Inference only: switch to eval mode explicitly so dropout is disabled and
# repeated predictions are reproducible.
new_model.eval()

In [26]:
# Probe the classifier with one harmless sentence repeated 100 times, i.e. a
# very long input that is fed to the model without truncation or padding.
text = "Ich liebe Katzen und Hunde und die Umwelt und alles das gruen ist. Mein Leben ist so schoen"*100
token_ids = tokenizer.encode(text)
input_ids = torch.tensor([token_ids])
outs = new_model.model.generate(input_ids, max_length=2)
decoded_output = tokenizer.decode(outs[0])
# Drop the first six characters (the leading pad-token text) to show the label.
decoded_output[6:]

'unproblematisch'

In [18]:
# Inspect the attention mask the tokenizer produces for a short sentence that is
# padded up to 512 positions.
text = "Gekommen um zu gehen..."
encoded_text = tokenizer.encode_plus(
    text,
    max_length=512,
    padding="max_length",
    return_tensors="pt",
)
encoded_text["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0

In [43]:
def classify(text):
    """Predict the label for a single input string.

    The text is tokenized to exactly 512 positions (padded or truncated),
    passed to ``new_model.model.generate`` together with its attention mask,
    and the generated label token is decoded.

    NOTE(review): the validation inputs shown elsewhere in this notebook start
    with a ``'classification '`` prefix. This function passes ``text`` through
    unchanged, so callers may need to add that prefix themselves; confirm
    against how ``RPDataset`` builds its inputs.

    Args:
        text: Raw input string to classify.

    Returns:
        The decoded prediction with special tokens removed, e.g.
        ``'unproblematisch'``.
    """
    # Inference only: keep dropout disabled so the same text always yields the
    # same label.
    new_model.model.eval()

    tokenizer_output = tokenizer.encode_plus(
        text,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = tokenizer_output["input_ids"]
    attention_mask = tokenizer_output["attention_mask"]
    outs = new_model.model.generate(input_ids, attention_mask=attention_mask, max_length=2)

    # Let the tokenizer strip "<pad>" / "</s>" rather than slicing off a fixed
    # six characters, which returns garbage whenever the decoded prefix is not
    # exactly "<pad> ".
    return tokenizer.decode(outs[0], skip_special_tokens=True).strip()

# Quick probe with a short, neutral filler phrase (leading and trailing spaces
# are intentional and kept as-is).
probe_prediction = classify(" In der Tat ")
probe_prediction

'unproblematisch'

In [23]:
# Show the input text behind the eighth recorded false positive.
example_index = indices_false_positives[7]
val_inputs[example_index]

'classification Warum wird nicht berichtet wofür die Kurden demonstrieren Doch nicht für eine Menschenkette vom Hbf bis zum Landtag Ich brauche mehr Details '

## Explain Using Shapley Values

In [None]:
import shap
# Build a SHAP explainer that uses the tokenizer as its token masker.
# NOTE(review): `f` (the prediction function handed to the explainer) is not
# defined anywhere in the notebook above this cell, so on a fresh kernel this
# line raises NameError. Define `f` before running this cell.
explainer = shap.Explainer(f, tokenizer)

### Load a model from checkpoint

In [26]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the weights published under this hub id (the same id the tokenizer is
# loaded from elsewhere in this notebook).
hub_model_id = "GermanT5/t5-efficient-oscar-german-small-el32"
model = T5ForConditionalGeneration.from_pretrained(hub_model_id)

# Display the module tree to inspect the architecture.
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro