# Gradient-based word deletion

I want to see which words are masked in which order, and whether that makes sense.

In [1]:
import sys
sys.path.append('/home/jxm3/research/deidentification/unsupervised-deidentification')

In [2]:
from dataloader import WikipediaDataModule
import os

num_cpus = os.cpu_count()

dm = WikipediaDataModule(
    document_model_name_or_path="roberta-base",
    profile_model_name_or_path="google/tapas-base",
    max_seq_length=128,
    dataset_name='wiki_bio',
    dataset_train_split='train[:1024]', # not used in this notebook
    dataset_val_split='val[:20%]',
    dataset_version='1.2.0',
    word_dropout_ratio=0.0,
    word_dropout_perc=0.0,
    num_workers=1,
    train_batch_size=64,
    eval_batch_size=64
)
dm.setup("fit")

Initializing WikipediaDataModule with num_workers = 1 and mask token `<mask>`
loading wiki_bio[1.2.0] split train[:1024]


Downloading builder script:   0%|          | 0.00/2.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)


loading wiki_bio[1.2.0] split val[:20%]


Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)


  0%|          | 0/1024 [00:00<?, ?ex/s]

Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-7d07543b6205ca87.arrow


  0%|          | 0/1024 [00:00<?, ?ex/s]

  0%|          | 0/14566 [00:00<?, ?ex/s]

  0%|          | 0/14566 [00:00<?, ?ex/s]

  0%|          | 0/15 [00:00<?, ?ba/s]

In [5]:
from model import CoordinateAscentModel
from model_cfg import model_paths_dict

checkpoint_path = model_paths_dict["model_5"]


model = CoordinateAscentModel.load_from_checkpoint(
    checkpoint_path,
    document_model_name_or_path="roberta-base",
    profile_model_name_or_path="google/tapas-base",
    learning_rate=1e-5,
    pretrained_profile_encoder=False,
    lr_scheduler_factor=0.5,
    lr_scheduler_patience=1,
    train_batch_size=1,
    num_workers=1,
    gradient_clip_val=10.0,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Initialized model with learning_rate = 1e-05 and patience 1


## 2. Define attack in TextAttack 

In [6]:
import textattack

### (a) Beam search + replace with `[MASK]`

In [7]:
class WordSwapSingleWord(textattack.transformations.word_swap.WordSwap):
    """Takes a sentence and transforms it by replacing with a single fixed word.
    """
    single_word: str
    def __init__(self, single_word: str = "?", **kwargs):
        super().__init__(**kwargs)
        self.single_word = single_word

    def _get_replacement_words(self, _word: str):
        return [self.single_word]

transformation = WordSwapSingleWord(single_word=dm.document_tokenizer.mask_token)
transformation(textattack.shared.AttackedText("Hello my name is Jack"))

[<AttackedText "<mask> my name is Jack">,
 <AttackedText "Hello <mask> name is Jack">,
 <AttackedText "Hello my <mask> is Jack">,
 <AttackedText "Hello my name <mask> Jack">,
 <AttackedText "Hello my name is <mask>">]

### (b) "Attack success" as fullfilment of the metric

In [8]:
from typing import List
import torch

class ChangeClassificationToBelowTopKClasses(textattack.goal_functions.ClassificationGoalFunction):
    k: int
    def __init__(self, *args, k: int = 1, **kwargs):
        self.k = k
        super().__init__(*args, **kwargs)

    def _is_goal_complete(self, model_output, _):
        original_class_score = model_output[self.ground_truth_output]
        num_better_classes = (model_output > original_class_score).sum()
        return num_better_classes >= self.k

    def _get_score(self, model_output, _):
        return 1 - model_output[self.ground_truth_output]
    
    
    """have to reimplement the following method to change the precision on the sum-to-one condition."""
    def _process_model_outputs(self, inputs, scores):
        """Processes and validates a list of model outputs.
        This is a task-dependent operation. For example, classification
        outputs need to have a softmax applied.
        """
        # Automatically cast a list or ndarray of predictions to a tensor.
        if isinstance(scores, list):
            scores = torch.tensor(scores)

        # Ensure the returned value is now a tensor.
        if not isinstance(scores, torch.Tensor):
            raise TypeError(
                "Must have list, np.ndarray, or torch.Tensor of "
                f"scores. Got type {type(scores)}"
            )

        # Validation check on model score dimensions
        if scores.ndim == 1:
            # Unsqueeze prediction, if it's been squeezed by the model.
            if len(inputs) == 1:
                scores = scores.unsqueeze(dim=0)
            else:
                raise ValueError(
                    f"Model return score of shape {scores.shape} for {len(inputs)} inputs."
                )
        elif scores.ndim != 2:
            # If model somehow returns too may dimensions, throw an error.
            raise ValueError(
                f"Model return score of shape {scores.shape} for {len(inputs)} inputs."
            )
        elif scores.shape[0] != len(inputs):
            # If model returns an incorrect number of scores, throw an error.
            raise ValueError(
                f"Model return score of shape {scores.shape} for {len(inputs)} inputs."
            )
        elif not ((scores.sum(dim=1) - 1).abs() < 1e-4).all():
            # Values in each row should sum up to 1. The model should return a
            # set of numbers corresponding to probabilities, which should add
            # up to 1. Since they are `torch.float` values, allow a small
            # error in the summation.
            scores = torch.nn.functional.softmax(scores, dim=1)
            if not ((scores.sum(dim=1) - 1).abs() < 1e-4).all():
                raise ValueError("Model scores do not add up to 1.")
        return scores.cpu()


## (c) Model wrapper that computes similarities of input documents with validation profiles

In [9]:
import numpy as np
import tqdm

def precompute_profile_embeddings():
    model.profile_model.cuda()
    model.profile_model.eval()

    model.val_profile_embeddings = np.zeros((len(dm.val_dataset), model.profile_embedding_dim))
    for val_batch in tqdm.tqdm(dm.val_dataloader()[0], desc="Precomputing val embeddings", colour="green", leave=False):
        with torch.no_grad():
            profile_embeddings = model.forward_profile(batch=val_batch)
        model.val_profile_embeddings[val_batch["text_key_id"]] = profile_embeddings.cpu()
    model.val_profile_embeddings = torch.tensor(model.val_profile_embeddings, dtype=torch.float32)
    model.profile_model.train()

precompute_profile_embeddings()

                                                                              5.53it/s]

In [10]:
import transformers
from model.model import Model

class MyModelWrapper(textattack.models.wrappers.ModelWrapper):
    model: Model
    tokenizer: transformers.AutoTokenizer
    profile_embeddings: torch.Tensor
    max_seq_length: int
    
    def __init__(self, model: Model, tokenizer: transformers.AutoTokenizer, max_seq_length: int = 128):
        self.model = model
        self.model.eval()
        self.tokenizer = tokenizer
        self.profile_embeddings = torch.tensor(model.val_profile_embeddings)
        self.max_seq_length = max_seq_length
                 
    def to(self, device):
        self.model.to(device)
        self.profile_embeddings.to(device)
        return self # so semantics `model = MyModelWrapper().to('cuda')` works properly

    def __call__(self, text_input_list: List[str], batch_size=32):
        model_device = next(self.model.parameters()).device
        
        doc_tokenized = self.tokenizer.batch_encode_plus(
            text_input_list,
            max_length=self.max_seq_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        doc_tokenized = {f'document__{k}': v for k,v in doc_tokenized.items()}
        with torch.no_grad():
            document_embeddings = self.model.forward_document(batch=doc_tokenized, document_type='document')
            document_to_profile_logits = document_embeddings @ self.profile_embeddings.T.to(model_device)
            document_to_profile_probs = torch.nn.functional.softmax(
                document_to_profile_logits, dim=-1
            )
        assert document_to_profile_probs.shape == (len(text_input_list), len(self.profile_embeddings))
        return document_to_profile_probs
            

## (d) Dataset that loads Wikipedia documents with names as labels

Oh, and it filters out examples that are too long.

In [11]:
from typing import Tuple

from collections import OrderedDict

import datasets

class WikiDataset(textattack.datasets.Dataset):
    dataset: datasets.Dataset
    
    def __init__(self, dm: WikipediaDataModule):
        self.shuffled = True
        self.dataset = [ex for ex in dm.val_dataset]
        self.label_names = list(dm.val_dataset['name'])
    
    def __len__(self) -> int:
        return len(self.dataset)
    
    def __getitem__(self, i: int) -> Tuple[OrderedDict, int]:
        input_dict = OrderedDict([
            ('document', self.dataset[i]['document'])
        ])
        return input_dict, self.dataset[i]['text_key_id']
        

## 3. Run attack once

In [13]:
class MaxNumWordsModified(textattack.constraints.PreTransformationConstraint):
    def __init__(self, max_num_words: int):
        self.max_num_words = max_num_words

    def _get_modifiable_indices(self, current_text):
        """Returns the word indices in current_text which are able to be
        modified."""

        if len(current_text.attack_attrs["modified_indices"]) >= self.max_num_words:
            return set()
        else:
            return set(range(len(current_text.words)))

    def extra_repr_keys(self):
        return ["max_num_words"]

In [14]:
model_wrapper = MyModelWrapper(model=model, tokenizer=dm.document_tokenizer)
model_wrapper.to('cuda')

  self.profile_embeddings = torch.tensor(model.val_profile_embeddings)


<__main__.MyModelWrapper at 0x7f5b80b48940>

In [77]:
from textattack.shared import utils


def get_modified_idxs_in_order(at: textattack.shared.AttackedText) -> List[int]:
    """Traverses linked-list of attacked texts from attack process
    and creates a list of the modified word indices.
    """
    modified_word_idxs = []
    while True:
        if 'newly_modified_indices' not in at.attack_attrs:
            break
        modified_word_idxs.extend(at.attack_attrs['newly_modified_indices'])
        at = at.attack_attrs['prev_attacked_text']
    modified_word_idxs = modified_word_idxs[::-1]
    return modified_word_idxs[::-1]


def diff_color_with_idxs(at: textattack.attack_results.AttackResult, color_method=None):
    """Highlights the difference between two texts using color.
    
    This version also adds idx numbers to show which words were masked in which order.

    Has to account for deletions and insertions from original text to
    perturbed. Relies on the index map stored in
    ``self.original_result.attacked_text.attack_attrs["original_index_map"]``.
    """
    t1 = at.original_result.attacked_text
    t2 = at.perturbed_result.attacked_text

    if color_method is None:
        return t1.printable_text(), t2.printable_text()

    color_1 = at.original_result.get_text_color_input()
    color_2 = at.perturbed_result.get_text_color_perturbed()

    # iterate through and count equal/unequal words
    words_1_idxs = []
    t2_equal_idxs = set()
    original_index_map = t2.attack_attrs["original_index_map"]
    for t1_idx, t2_idx in enumerate(original_index_map):
        if t2_idx == -1:
            # add words in t1 that are not in t2
            words_1_idxs.append(t1_idx)
        else:
            w1 = t1.words[t1_idx]
            w2 = t2.words[t2_idx]
            if w1 == w2:
                t2_equal_idxs.add(t2_idx)
            else:
                words_1_idxs.append(t1_idx)

    # words to color in t2 are all the words that didn't have an equal,
    # mapped word in t1
    words_2_idxs = list(sorted(set(range(t2.num_words)) - t2_equal_idxs))

    # make lists of colored words
    words_1 = [t1.words[i] for i in words_1_idxs]
    words_1 = [utils.color_text(w, color_1, color_method) for w in words_1]
    
    # First, replace words with `word_xx` where xx is the index
    # of the order that word was modified.
    word_modification_order = {word_idx: swap_idx+1 for swap_idx, word_idx in enumerate(get_modified_idxs_in_order(t2))}
    words_2 = [f'{t2.words[i]}__{word_modification_order[i]}' for i in words_2_idxs]
    words_2 = [utils.color_text(w, color_2, color_method) for w in words_2]

    t1 = at.original_result.attacked_text.replace_words_at_indices(
        words_1_idxs, words_1
    )
    t2 = at.perturbed_result.attacked_text.replace_words_at_indices(
        words_2_idxs, words_2
    )

    key_color = ("bold", "underline")
    return (
        t1.printable_text(key_color=key_color, key_color_method=color_method),
        t2.printable_text(key_color=key_color, key_color_method=color_method),
    )

In [90]:
type(results_iterable[0].original_result)

textattack.goal_function_results.classification_goal_function_result.ClassificationGoalFunctionResult

In [102]:
from textattack.loggers import CSVLogger
from textattack.shared import AttackedText

import pandas as pd
class CustomCSVLogger(CSVLogger):
    """Logs attack results to a CSV."""

    def log_attack_result(self, result: textattack.goal_function_results.ClassificationGoalFunctionResult):
        # TODO print like 'mask1', 'mask2',
        original_text, perturbed_text = diff_color_with_idxs(result, color_method=self.color_method)
        original_text = original_text.replace("\n", AttackedText.SPLIT_TOKEN)
        perturbed_text = perturbed_text.replace("\n", AttackedText.SPLIT_TOKEN)
        result_type = result.__class__.__name__.replace("AttackResult", "")
        row = {
            "original_person": result.original_result._processed_output[0],
            "original_text": original_text,
            "original_text_id_bm25": bm25.get_scores(result.original_result.attacked_text.text.split()).argmax(),
            "perturbed_person": result.perturbed_result._processed_output[0],
            "perturbed_text": perturbed_text,
            "perturbed_text_id_bm25": bm25.get_scores(result.perturbed_result.attacked_text.text.split()).argmax(),
            "original_score": result.original_result.score,
            "perturbed_score": result.perturbed_result.score,
            "original_output": result.original_result.output,
            "perturbed_output": result.perturbed_result.output,
            "ground_truth_output": result.original_result.ground_truth_output,
            "num_queries": result.num_queries,
            "result_type": result_type,
        }
        self.df = pd.concat([self.df, pd.DataFrame([row])], ignore_index=True)
        self._flushed = False

In [103]:
from typing import List

from nltk.corpus import stopwords
from rank_bm25 import BM25Okapi

eng_stopwords = stopwords.words('english')
from tqdm.auto import tqdm
tqdm.pandas()


def get_words_from_doc(s: List[str]) -> List[str]:
    words = s.split()
    return [w for w in words if not w in eng_stopwords]

def make_table_str(ex):
    ex['table_str'] = (
        ' '.join(ex['input_text']['table']['column_header'] + ex['input_text']['table']['content'])
    )
    return ex

prof_data = dm.val_dataset.map(make_table_str)
profile_corpus = prof_data['table_str']

tokenized_profile_corpus = [
    get_words_from_doc(prof) for prof in profile_corpus
]

bm25 = BM25Okapi(tokenized_profile_corpus)

Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-1cd29f04bf8f9344.arrow


In [104]:
# 
#  Initialize attack
# 

from textattack import Attack
from textattack.constraints.pre_transformation import MaxWordIndexModification, RepeatModification

goal_function = ChangeClassificationToBelowTopKClasses(model_wrapper, k=1)
constraints = [
    RepeatModification(),
    MaxWordIndexModification(max_length=dm.max_seq_length),
    MaxNumWordsModified(max_num_words=50)
]
transformation = WordSwapSingleWord(single_word=dm.document_tokenizer.mask_token)
search_method = textattack.search_methods.BeamSearch(beam_width=4)

attack = Attack(
    goal_function, constraints, transformation, search_method
)

from tqdm import tqdm # tqdm provides us a nice progress bar.
from textattack.attack_results import SuccessfulAttackResult
from textattack import Attacker
from textattack import AttackArgs

attack_args = AttackArgs(num_examples=15, disable_stdout=True)
dataset = WikiDataset(dm)

attacker = Attacker(attack, dataset, attack_args)

results_iterable = attacker.attack_dataset()

logger = CustomCSVLogger(color_method='html')

# 
# Run attack
# 
from tqdm import tqdm
for result in results_iterable:
    tqdm._instances.clear() # Doesn't fix the progress bar :-(
    logger.log_attack_result(result)

from IPython.display import display, HTML

display(HTML(logger.df.to_html(escape=False)))

textattack: No entry found for goal function <class '__main__.ChangeClassificationToBelowTopKClasses'>.
textattack: Unknown if model of class <class 'model.coordinate_ascent.CoordinateAscentModel'> compatible with goal function <class '__main__.ChangeClassificationToBelowTopKClasses'>.


Attack(
  (search_method): BeamSearch(
    (beam_width):  4
  )
  (goal_function):  ChangeClassificationToBelowTopKClasses
  (transformation):  WordSwapSingleWord
  (constraints): 
    (0): RepeatModification
    (1): MaxWordIndexModification(
        (max_length):  128
      )
    (2): MaxNumWordsModified(
        (max_num_words):  50
      )
  (is_black_box):  True
) 




  0%|          | 0/15 [00:00<?, ?it/s][A
  7%|▋         | 1/15 [00:06<01:26,  6.20s/it][A
[Succeeded / Failed / Skipped / Total] 1 / 0 / 0 / 1:   7%|▋         | 1/15 [00:06<01:26,  6.21s/it][A
[Succeeded / Failed / Skipped / Total] 1 / 0 / 0 / 1:  13%|█▎        | 2/15 [00:06<00:42,  3.26s/it][A
[Succeeded / Failed / Skipped / Total] 2 / 0 / 0 / 2:  13%|█▎        | 2/15 [00:06<00:42,  3.26s/it][A
[Succeeded / Failed / Skipped / Total] 2 / 0 / 0 / 2:  20%|██        | 3/15 [00:07<00:30,  2.51s/it][A
[Succeeded / Failed / Skipped / Total] 3 / 0 / 0 / 3:  20%|██        | 3/15 [00:07<00:30,  2.51s/it][A
[Succeeded / Failed / Skipped / Total] 3 / 0 / 0 / 3:  27%|██▋       | 4/15 [00:08<00:23,  2.16s/it][A
[Succeeded / Failed / Skipped / Total] 4 / 0 / 0 / 4:  27%|██▋       | 4/15 [00:08<00:23,  2.16s/it][A
[Succeeded / Failed / Skipped / Total] 4 / 0 / 0 / 4:  33%|███▎      | 5/15 [00:09<00:19,  1.93s/it][A
[Succeeded / Failed / Skipped / Total] 5 / 0 / 0 / 5:  33%|███▎      | 5/15


+-------------------------------+---------+
| Attack Results                |         |
+-------------------------------+---------+
| Number of successful attacks: | 15      |
| Number of failed attacks:     | 0       |
| Number of skipped attacks:    | 0       |
| Original accuracy:            | 100.0%  |
| Accuracy under attack:        | 0.0%    |
| Attack success rate:          | 100.0%  |
| Average perturbed word %:     | 19.69%  |
| Average num. words per input: | 51.87   |
| Avg num queries:              | 1552.47 |
+-------------------------------+---------+


textattack: Logging to CSV at path results.csv
textattack: CSVLogger exiting without calling flush().





Unnamed: 0,original_person,original_text,original_text_id_bm25,perturbed_person,perturbed_text,perturbed_text_id_bm25,original_score,perturbed_score,original_output,perturbed_output,ground_truth_output,num_queries,result_type
0,Michael iii of alexandria,"pope michael iii of alexandria ( also known as khail iii ) was the coptic pope of alexandria and patriarch of the see of st. mark ( 880 -- 907 ) .in 882 , the governor of egypt , ahmad ibn tulun , forced khail to pay heavy contributions , forcing him to sell a church and some attached properties to the local jewish community .this building was at one time believed to have later become the site of the cairo geniza .",0,Khuwaylid ibn asad,"pope <mask__6> <mask__1> <mask__3> alexandria ( <mask__4> known as khail <mask__5> ) was the coptic pope of alexandria and patriarch of the see of st. mark ( 880 -- <mask__2> ) .in 882 , the governor of egypt , ahmad ibn tulun , forced khail to pay heavy contributions , forcing him to sell a church and some attached properties to the local jewish community .this building was at one time believed to have later become the site of the cairo geniza .",0,0.0001561642,0.760638,0,4224,0,1474,Successful
1,Hui jun,hui jun is a male former table tennis player from china .,1,Liu xiaolong,<mask__3> <mask__2> is a male former <mask__1> tennis player from china .,4330,0.0,0.984631,1,10388,1,88,Successful
2,Okan öztürk,okan Öztürk ( born 30 november 1977 ) is a turkish professional footballer .he currently plays as a striker for yeni malatyaspor .,2,Adem büyük,<mask__5> <mask__4> ( born 30 november <mask__2> ) is a turkish professional footballer .he currently plays as a striker for <mask__3> <mask__1> .,5167,0.0,0.549581,2,2279,2,301,Successful
3,Marie stephan,"marie stephan , ( born march 14 , 1996 ) is a professional squash player who represents france .she reached a career-high world ranking of world no. 101 in july 2015 .",3,Laura pomportes,"<mask__2> <mask__4> , ( born march 14 , <mask__3> ) is a professional squash player who represents france .she reached a career-high world ranking of world no. 101 in july <mask__1> .",3,0.0,0.980871,3,4726,3,328,Successful
4,Leonard l. martino,leonard l. martino is a former democratic member of the pennsylvania house of representatives .he was born in butler to michael and angela pitullio martino .,4,Lester k. fryer,<mask__4> <mask__3>. <mask__2> is a former democratic member of the pennsylvania house of representatives .he was born in butler <mask__1> michael and angela pitullio martino .,4,2.384186e-07,0.703081,4,3564,4,302,Successful
5,Salome jens,"salome jens ( born may 8 , 1935 ) is an american stage , film and television actress .she is perhaps best known for portraying the female changeling on '' '' .",5,Linda kozlowski,"<mask__4> <mask__2> ( born <mask__1> 8 , <mask__3> ) is an american stage , film and television actress .she is perhaps best known for portraying the female changeling on '' '' .",6420,0.0,0.995706,5,9248,5,302,Successful
6,Carl crawford,"carl demonte crawford ( born august 5 , 1981 ) , nicknamed `` the perfect storm '' , is an american professional baseball left fielder with the los angeles dodgers of major league baseball ( mlb ) .he bats and throws left-handed .crawford was drafted by the tampa bay devil rays in the second round ( 52nd overall ) of the 1999 major league baseball draft .he made his major league debut in 2002 .crawford has more triples ( 121 ) than any other active baseball player .",6,Josh hamilton,"<mask__30> <mask__29> <mask__28> ( <mask__27> <mask__26> <mask__5> , <mask__1> ) , <mask__25> <mask__24> <mask__23> <mask__22> <mask__21> '' , <mask__20> <mask__19> <mask__18> <mask__17> <mask__16> <mask__15> <mask__14> <mask__13> <mask__12> <mask__11> <mask__10> <mask__9> of major league baseball ( mlb ) .he bats and throws left-handed .<mask__8> was drafted by the <mask__2> <mask__3> <mask__6> <mask__4> in the second round ( 52nd overall ) of the 1999 major league baseball draft .he made his major league debut in 2002 .<mask__7> has more triples ( 121 ) than any other active baseball player .",9148,0.0,0.667488,6,9080,6,7036,Successful
7,Jim bob,"jim bob ( born james neil morrison on 22 november 1960 ) is a british musician and author , best known as the singer of indie punk band carter usm .",7,David morrell,"<mask__2> <mask__5> ( born <mask__1> neil morrison on 22 <mask__4> <mask__6> ) is a <mask__3> musician and author , best known as the singer of indie punk band carter usm .",7,1.430511e-06,0.963303,7,7586,7,508,Successful
8,Riddick parker,"riddick parker ( born november 20 , 1972 in emporia , virginia ) is a former professional american football defensive lineman for the seattle seahawks , san diego chargers , new england patriots , baltimore ravens , and san francisco 49ers of the national football league .",8,Jeff faulkner,"<mask__6> <mask__7> ( born <mask__2> <mask__3> , <mask__5> in <mask__4> , virginia ) is a former professional american football defensive lineman for the seattle seahawks , san diego chargers , new england patriots , baltimore ravens , and san francisco <mask__1> of the national football league .",8,0.0,0.669122,8,102,8,867,Successful
9,Blessed osanna of cattaro -lrb- ozana kotorska -rrb-,blessed osanna of cattaro t.o.s.d. ( ) was a catholic visionary and anchoress from cattaro ( kotor ) .she was a teenage convert from orthodoxy of serbian descent from montenegro ( zeta ) .she became a dominican tertiary and was posthumously venerated as a saint in kotor .she was later beatified in 1934 .,9,Nikola selnički,blessed <mask__6> <mask__5> <mask__7> t.o.s.d. ( ) was a catholic visionary and anchoress from <mask__3> ( <mask__1> ) .she was a teenage convert from orthodoxy of serbian descent from <mask__4> ( zeta ) .she became a dominican tertiary and was posthumously venerated as a saint in <mask__2> .she was later beatified in 1934 .,9,0.0,0.827921,9,11956,9,1167,Successful


In [105]:
# 
#  Initialize attack
# 

from textattack import Attack
from textattack.constraints.pre_transformation import MaxWordIndexModification, RepeatModification

goal_function = ChangeClassificationToBelowTopKClasses(model_wrapper, k=10)
constraints = [
    RepeatModification(),
    MaxWordIndexModification(max_length=dm.max_seq_length),
    MaxNumWordsModified(max_num_words=50)
]
transformation = WordSwapSingleWord(single_word=dm.document_tokenizer.mask_token)
search_method = textattack.search_methods.BeamSearch(beam_width=4)

attack = Attack(
    goal_function, constraints, transformation, search_method
)

from tqdm import tqdm # tqdm provides us a nice progress bar.
from textattack.attack_results import SuccessfulAttackResult
from textattack import Attacker
from textattack import AttackArgs

attack_args = AttackArgs(num_examples=15, disable_stdout=True)
dataset = WikiDataset(dm)

attacker = Attacker(attack, dataset, attack_args)

results_iterable = attacker.attack_dataset()

logger = CustomCSVLogger(color_method='html')

# 
# Run attack
# 
from tqdm import tqdm
for result in results_iterable:
    tqdm._instances.clear() # Doesn't fix the progress bar :-(
    logger.log_attack_result(result)

from IPython.display import display, HTML

display(HTML(logger.df.to_html(escape=False)))

textattack: No entry found for goal function <class '__main__.ChangeClassificationToBelowTopKClasses'>.
textattack: Unknown if model of class <class 'model.coordinate_ascent.CoordinateAscentModel'> compatible with goal function <class '__main__.ChangeClassificationToBelowTopKClasses'>.


Attack(
  (search_method): BeamSearch(
    (beam_width):  4
  )
  (goal_function):  ChangeClassificationToBelowTopKClasses
  (transformation):  WordSwapSingleWord
  (constraints): 
    (0): RepeatModification
    (1): MaxWordIndexModification(
        (max_length):  128
      )
    (2): MaxNumWordsModified(
        (max_num_words):  50
      )
  (is_black_box):  True
) 



[Succeeded / Failed / Skipped / Total] 14 / 1 / 0 / 15: 100%|██████████| 15/15 [05:56<00:00, 23.78s/it]


+-------------------------------+---------+
| Attack Results                |         |
+-------------------------------+---------+
| Number of successful attacks: | 14      |
| Number of failed attacks:     | 1       |
| Number of skipped attacks:    | 0       |
| Original accuracy:            | 100.0%  |
| Accuracy under attack:        | 6.67%   |
| Attack success rate:          | 93.33%  |
| Average perturbed word %:     | 35.92%  |
| Average num. words per input: | 51.87   |
| Avg num queries:              | 3827.13 |
+-------------------------------+---------+


textattack: Logging to CSV at path results.csv
textattack: CSVLogger exiting without calling flush().





Unnamed: 0,original_person,original_text,original_text_id_bm25,perturbed_person,perturbed_text,perturbed_text_id_bm25,original_score,perturbed_score,original_output,perturbed_output,ground_truth_output,num_queries,result_type
0,Michael iii of alexandria,"pope michael iii of alexandria ( also known as khail iii ) was the coptic pope of alexandria and patriarch of the see of st. mark ( 880 -- 907 ) .in 882 , the governor of egypt , ahmad ibn tulun , forced khail to pay heavy contributions , forcing him to sell a church and some attached properties to the local jewish community .this building was at one time believed to have later become the site of the cairo geniza .",0,Khuwaylid ibn asad,"pope <mask__10> <mask__4> <mask__8> alexandria ( <mask__5> known as khail <mask__6> ) was the coptic pope of <mask__3> <mask__7> patriarch of the <mask__1> of st. <mask__2> ( 880 -- <mask__9> ) .in 882 , the governor of egypt , ahmad ibn tulun , forced khail to pay heavy contributions , forcing him to sell a church and some attached properties to the local jewish community .this building was at one time believed to have later become the site of the cairo geniza .",0,0.0001561642,0.996305,0,4224,0,2522,Successful
1,Hui jun,hui jun is a male former table tennis player from china .,1,Kepler orellana,<mask__4> <mask__2> is a male former <mask__3> tennis player from <mask__1> .,4330,0.0,0.997194,1,3964,1,120,Successful
2,Okan öztürk,okan Öztürk ( born 30 november 1977 ) is a turkish professional footballer .he currently plays as a striker for yeni malatyaspor .,2,Mattia montini,<mask__8> <mask__7> ( born 30 november <mask__5> ) is <mask__2> <mask__3> professional footballer .he currently plays as a <mask__1> for <mask__4> <mask__6> .,5167,0.0,0.998215,2,581,2,469,Successful
3,Marie stephan,"marie stephan , ( born march 14 , 1996 ) is a professional squash player who represents france .she reached a career-high world ranking of world no. 101 in july 2015 .",3,Laura pomportes,"<mask__18> <mask__19> , ( <mask__8> <mask__16> 14 , <mask__20> ) <mask__5> <mask__3> <mask__15> <mask__11> <mask__9> <mask__4> <mask__14> <mask__2> .<mask__1> reached <mask__12> <mask__10> world <mask__6> <mask__7> world no. 101 <mask__13> july <mask__17> .",3,0.0,0.997809,3,4726,3,1320,Successful
4,Leonard l. martino,leonard l. martino is a former democratic member of the pennsylvania house of representatives .he was born in butler to michael and angela pitullio martino .,4,William w. pendleton,<mask__10> <mask__9>. <mask__8> is a former democratic member of the <mask__1> house of representatives .he was born in <mask__2> <mask__7> <mask__6> and <mask__3> <mask__4> <mask__5> .,2511,2.384186e-07,0.998071,4,1488,4,746,Successful
5,Salome jens,"salome jens ( born may 8 , 1935 ) is an american stage , film and television actress .she is perhaps best known for portraying the female changeling on '' '' .",5,Linda kozlowski,"<mask__4> <mask__2> ( born <mask__1> 8 , <mask__3> ) is an american stage , film and television actress .she is perhaps best known for portraying the female changeling on '' '' .",6420,0.0,0.995706,5,9248,5,302,Successful
6,Carl crawford,"carl demonte crawford ( born august 5 , 1981 ) , nicknamed `` the perfect storm '' , is an american professional baseball left fielder with the los angeles dodgers of major league baseball ( mlb ) .he bats and throws left-handed .crawford was drafted by the tampa bay devil rays in the second round ( 52nd overall ) of the 1999 major league baseball draft .he made his major league debut in 2002 .crawford has more triples ( 121 ) than any other active baseball player .",6,Josh hamilton,"<mask__38> <mask__37> <mask__36> ( <mask__35> <mask__34> <mask__13> , <mask__8> ) , <mask__33> <mask__32> <mask__31> <mask__30> <mask__29> '' , <mask__28> <mask__27> <mask__26> <mask__25> <mask__24> <mask__23> <mask__22> <mask__21> <mask__20> <mask__19> <mask__18> <mask__17> of major league <mask__9> ( mlb ) .he bats and throws left-handed .<mask__16> <mask__5> drafted by the <mask__10> <mask__11> <mask__14> <mask__12> in the second round ( 52nd <mask__4> ) of the <mask__2> major league baseball draft .he made his major league debut in <mask__3> .<mask__15> has <mask__7> triples ( <mask__1> ) <mask__6> any other active baseball player .",9148,0.0,0.987027,6,9080,6,8364,Successful
7,Jim bob,"jim bob ( born james neil morrison on 22 november 1960 ) is a british musician and author , best known as the singer of indie punk band carter usm .",7,David morrell,"<mask__5> <mask__8> ( born <mask__3> neil morrison on 22 <mask__7> <mask__9> ) is a <mask__6> musician and author , best known as the <mask__1> of indie <mask__2> band carter <mask__4> .",2577,1.430511e-06,0.999806,7,7586,7,748,Successful
8,Riddick parker,"riddick parker ( born november 20 , 1972 in emporia , virginia ) is a former professional american football defensive lineman for the seattle seahawks , san diego chargers , new england patriots , baltimore ravens , and san francisco 49ers of the national football league .",8,Brad clontz,"<mask__21> <mask__20> ( born <mask__16> <mask__17> , <mask__18> <mask__9> <mask__19> , virginia ) <mask__5> a <mask__13> professional american <mask__6> <mask__1> <mask__2> <mask__7> the seattle seahawks , san diego <mask__3> , new england <mask__14> , baltimore ravens , and <mask__11> <mask__12> <mask__15> of <mask__8> <mask__10> <mask__4> league .",8,0.0,0.996013,8,13821,8,2239,Successful
9,Blessed osanna of cattaro -lrb- ozana kotorska -rrb-,blessed osanna of cattaro t.o.s.d. ( ) was a catholic visionary and anchoress from cattaro ( kotor ) .she was a teenage convert from orthodoxy of serbian descent from montenegro ( zeta ) .she became a dominican tertiary and was posthumously venerated as a saint in kotor .she was later beatified in 1934 .,9,Venerable celestina bottego,blessed <mask__7> <mask__6> <mask__8> t.o.s.d. ( ) was a catholic visionary and anchoress from <mask__4> ( <mask__2> ) .she was a teenage convert from orthodoxy of <mask__1> descent from <mask__5> ( zeta ) .she became a dominican tertiary and was posthumously venerated as a saint in <mask__3> .she was later beatified in 1934 .,9,0.0,0.989361,9,4419,9,1339,Successful


In [106]:
# 
#  Initialize attack
# 

from textattack import Attack
from textattack.constraints.pre_transformation import MaxWordIndexModification, RepeatModification

goal_function = ChangeClassificationToBelowTopKClasses(model_wrapper, k=100)
constraints = [
    RepeatModification(),
    MaxWordIndexModification(max_length=dm.max_seq_length),
    MaxNumWordsModified(max_num_words=50)
]
transformation = WordSwapSingleWord(single_word=dm.document_tokenizer.mask_token)
search_method = textattack.search_methods.BeamSearch(beam_width=4)

attack = Attack(
    goal_function, constraints, transformation, search_method
)

from tqdm import tqdm # tqdm provides us a nice progress bar.
from textattack.attack_results import SuccessfulAttackResult
from textattack import Attacker
from textattack import AttackArgs

attack_args = AttackArgs(num_examples=15, disable_stdout=True)
dataset = WikiDataset(dm)

attacker = Attacker(attack, dataset, attack_args)

results_iterable = attacker.attack_dataset()

logger = CustomCSVLogger(color_method='html')

# 
# Run attack
# 
from tqdm import tqdm
for result in results_iterable:
    tqdm._instances.clear() # Doesn't fix the progress bar :-(
    logger.log_attack_result(result)

from IPython.display import display, HTML

display(HTML(logger.df.to_html(escape=False)))

textattack: No entry found for goal function <class '__main__.ChangeClassificationToBelowTopKClasses'>.
textattack: Unknown if model of class <class 'model.coordinate_ascent.CoordinateAscentModel'> compatible with goal function <class '__main__.ChangeClassificationToBelowTopKClasses'>.


Attack(
  (search_method): BeamSearch(
    (beam_width):  4
  )
  (goal_function):  ChangeClassificationToBelowTopKClasses
  (transformation):  WordSwapSingleWord
  (constraints): 
    (0): RepeatModification
    (1): MaxWordIndexModification(
        (max_length):  128
      )
    (2): MaxNumWordsModified(
        (max_num_words):  50
      )
  (is_black_box):  True
) 



[Succeeded / Failed / Skipped / Total] 11 / 4 / 0 / 15: 100%|██████████| 15/15 [08:11<00:00, 32.80s/it]


+-------------------------------+---------+
| Attack Results                |         |
+-------------------------------+---------+
| Number of successful attacks: | 11      |
| Number of failed attacks:     | 4       |
| Number of skipped attacks:    | 0       |
| Original accuracy:            | 100.0%  |
| Accuracy under attack:        | 26.67%  |
| Attack success rate:          | 73.33%  |
| Average perturbed word %:     | 53.93%  |
| Average num. words per input: | 51.87   |
| Avg num queries:              | 5066.33 |
+-------------------------------+---------+


textattack: Logging to CSV at path results.csv
textattack: CSVLogger exiting without calling flush().





Unnamed: 0,original_person,original_text,original_text_id_bm25,perturbed_person,perturbed_text,perturbed_text_id_bm25,original_score,perturbed_score,original_output,perturbed_output,ground_truth_output,num_queries,result_type
0,Michael iii of alexandria,"pope michael iii of alexandria ( also known as khail iii ) was the coptic pope of alexandria and patriarch of the see of st. mark ( 880 -- 907 ) .in 882 , the governor of egypt , ahmad ibn tulun , forced khail to pay heavy contributions , forcing him to sell a church and some attached properties to the local jewish community .this building was at one time believed to have later become the site of the cairo geniza .",0,Khuwaylid ibn asad,"<mask__4> <mask__14> <mask__8> <mask__13> alexandria ( <mask__9> known as khail <mask__10> ) was the coptic <mask__3> of <mask__7> <mask__11> patriarch of the <mask__5> of st. <mask__6> ( <mask__2> -- <mask__12> ) .in <mask__1> , the governor of egypt , ahmad ibn tulun , forced khail to pay heavy contributions , forcing him to sell a church and some attached properties to the local jewish community .this building was at one time believed to have later become the site of the cairo geniza .",12300,0.0001561642,0.999998,0,4224,0,3506,Successful
1,Hui jun,hui jun is a male former table tennis player from china .,1,Mshindo msolla,<mask__8> <mask__7> is <mask__2> <mask__5> former <mask__9> <mask__1> <mask__3> <mask__4> <mask__6> .,11934,0.0,0.999873,1,2234,1,220,Successful
2,Okan öztürk,okan Öztürk ( born 30 november 1977 ) is a turkish professional footballer .he currently plays as a striker for yeni malatyaspor .,2,Ögmundur kristinsson,<mask__7> <mask__6> ( born <mask__1> november <mask__8> ) is <mask__3> <mask__4> professional footballer .he currently plays as a <mask__2> for <mask__9> <mask__5> .,5167,0.0,0.999544,2,12664,2,517,Successful
3,Marie stephan,"marie stephan , ( born march 14 , 1996 ) is a professional squash player who represents france .she reached a career-high world ranking of world no. 101 in july 2015 .",3,Jim gregory,"<mask__24> <mask__27> , ( <mask__15> <mask__23> <mask__5> , <mask__26> ) <mask__12> <mask__10> <mask__22> <mask__18> <mask__16> <mask__11> <mask__21> <mask__9> .<mask__8> <mask__1> <mask__19> <mask__17> <mask__6> <mask__13> <mask__14> <mask__7> <mask__2>. <mask__3> <mask__20> <mask__4> <mask__25> .",3057,0.0,0.999408,3,12602,3,1432,Successful
4,Leonard l. martino,leonard l. martino is a former democratic member of the pennsylvania house of representatives .he was born in butler to michael and angela pitullio martino .,4,Edward t. begay,<mask__15> <mask__13>. <mask__14> is a former <mask__2> member of the <mask__6> <mask__1> <mask__4> <mask__3> .<mask__5> was born in <mask__7> <mask__12> <mask__11> and <mask__8> <mask__9> <mask__10> .,11934,2.384186e-07,0.999745,4,2914,4,1006,Successful
5,Salome jens,"salome jens ( born may 8 , 1935 ) is an american stage , film and television actress .she is perhaps best known for portraying the female changeling on '' '' .",5,Brittany underwood,"<mask__17> <mask__16> ( born <mask__15> 8 , <mask__14> ) <mask__2> <mask__3> <mask__8> <mask__11> , <mask__4> <mask__1> television <mask__10> .<mask__9> is <mask__13> best <mask__7> <mask__5> portraying <mask__6> <mask__12> changeling on '' '' .",12691,0.0,0.998784,5,4215,5,1082,Successful
6,Carl crawford,"carl demonte crawford ( born august 5 , 1981 ) , nicknamed `` the perfect storm '' , is an american professional baseball left fielder with the los angeles dodgers of major league baseball ( mlb ) .he bats and throws left-handed .crawford was drafted by the tampa bay devil rays in the second round ( 52nd overall ) of the 1999 major league baseball draft .he made his major league debut in 2002 .crawford has more triples ( 121 ) than any other active baseball player .",6,Brooks brown,"<mask__50> <mask__49> <mask__48> ( <mask__47> <mask__46> <mask__25> , <mask__20> ) , <mask__45> <mask__44> <mask__43> <mask__42> <mask__41> '' , <mask__40> <mask__39> <mask__38> <mask__37> <mask__36> <mask__35> <mask__34> <mask__33> <mask__32> <mask__31> <mask__30> <mask__29> of major league <mask__21> ( <mask__7> ) .he <mask__3> <mask__4> throws <mask__5> .<mask__28> <mask__17> drafted by <mask__11> <mask__22> <mask__23> <mask__26> <mask__24> <mask__9> <mask__1> second round ( 52nd <mask__16> ) of <mask__2> <mask__14> major league baseball draft .he made <mask__8> major league debut <mask__10> <mask__15> .<mask__27> <mask__12> <mask__19> <mask__6> ( <mask__13> ) <mask__18> any other active baseball player .",9148,0.0,0.999502,6,9110,6,9876,Failed
7,Jim bob,"jim bob ( born james neil morrison on 22 november 1960 ) is a british musician and author , best known as the singer of indie punk band carter usm .",7,Stuart tosh,"<mask__8> <mask__11> ( born <mask__7> <mask__6> <mask__3> on 22 <mask__10> <mask__12> ) is a <mask__9> musician and author , best known as the <mask__1> of <mask__4> <mask__5> <mask__2> carter usm .",7,1.430511e-06,0.999983,7,2287,7,952,Successful
8,Riddick parker,"riddick parker ( born november 20 , 1972 in emporia , virginia ) is a former professional american football defensive lineman for the seattle seahawks , san diego chargers , new england patriots , baltimore ravens , and san francisco 49ers of the national football league .",8,Gino berretta,"<mask__37> <mask__38> ( <mask__11> <mask__33> <mask__34> , <mask__36> <mask__26> <mask__35> , <mask__2> ) <mask__22> <mask__9> <mask__30> <mask__1> <mask__14> <mask__23> <mask__18> <mask__19> <mask__24> <mask__3> <mask__15> <mask__17> , <mask__6> <mask__5> <mask__20> , <mask__13> <mask__8> <mask__31> , <mask__12> <mask__7> , <mask__16> <mask__28> <mask__29> <mask__32> <mask__4> <mask__25> <mask__27> <mask__21> <mask__10> .",1281,0.0,0.996465,8,8592,8,2851,Failed
9,Blessed osanna of cattaro -lrb- ozana kotorska -rrb-,blessed osanna of cattaro t.o.s.d. ( ) was a catholic visionary and anchoress from cattaro ( kotor ) .she was a teenage convert from orthodoxy of serbian descent from montenegro ( zeta ) .she became a dominican tertiary and was posthumously venerated as a saint in kotor .she was later beatified in 1934 .,9,Adam kozłowiecki,blessed <mask__14> <mask__12> <mask__13> <mask__2>.o.s.d. ( ) <mask__6> a catholic <mask__1> and anchoress <mask__4> <mask__11> ( <mask__8> ) .she was a teenage convert from orthodoxy of <mask__7> descent from <mask__10> ( zeta ) .she became a dominican tertiary and was posthumously <mask__5> as a <mask__3> in <mask__9> .she was later beatified in 1934 .,9,0.0,0.999988,9,5212,9,2287,Successful


In [107]:
# 
#  Initialize attack
# 

from textattack import Attack
from textattack.constraints.pre_transformation import MaxWordIndexModification, RepeatModification

goal_function = ChangeClassificationToBelowTopKClasses(model_wrapper, k=1000)
constraints = [
    RepeatModification(),
    MaxWordIndexModification(max_length=dm.max_seq_length),
    MaxNumWordsModified(max_num_words=50)
]
transformation = WordSwapSingleWord(single_word=dm.document_tokenizer.mask_token)
search_method = textattack.search_methods.BeamSearch(beam_width=4)

attack = Attack(
    goal_function, constraints, transformation, search_method
)

from tqdm import tqdm # tqdm provides us a nice progress bar.
from textattack.attack_results import SuccessfulAttackResult
from textattack import Attacker
from textattack import AttackArgs

attack_args = AttackArgs(num_examples=15, disable_stdout=True)
dataset = WikiDataset(dm)

attacker = Attacker(attack, dataset, attack_args)

results_iterable = attacker.attack_dataset()

logger = CustomCSVLogger(color_method='html')

# 
# Run attack
# 
from tqdm import tqdm
for result in results_iterable:
    tqdm._instances.clear() # Doesn't fix the progress bar :-(
    logger.log_attack_result(result)

from IPython.display import display, HTML

display(HTML(logger.df.to_html(escape=False)))

textattack: No entry found for goal function <class '__main__.ChangeClassificationToBelowTopKClasses'>.
textattack: Unknown if model of class <class 'model.coordinate_ascent.CoordinateAscentModel'> compatible with goal function <class '__main__.ChangeClassificationToBelowTopKClasses'>.


Attack(
  (search_method): BeamSearch(
    (beam_width):  4
  )
  (goal_function):  ChangeClassificationToBelowTopKClasses
  (transformation):  WordSwapSingleWord
  (constraints): 
    (0): RepeatModification
    (1): MaxWordIndexModification(
        (max_length):  128
      )
    (2): MaxNumWordsModified(
        (max_num_words):  50
      )
  (is_black_box):  True
) 



[Succeeded / Failed / Skipped / Total] 6 / 9 / 0 / 15: 100%|██████████| 15/15 [09:27<00:00, 37.83s/it]


+-------------------------------+--------+
| Attack Results                |        |
+-------------------------------+--------+
| Number of successful attacks: | 6      |
| Number of failed attacks:     | 9      |
| Number of skipped attacks:    | 0      |
| Original accuracy:            | 100.0% |
| Accuracy under attack:        | 60.0%  |
| Attack success rate:          | 40.0%  |
| Average perturbed word %:     | 74.92% |
| Average num. words per input: | 51.87  |
| Avg num queries:              | 5579.4 |
+-------------------------------+--------+


textattack: Logging to CSV at path results.csv
textattack: CSVLogger exiting without calling flush().





Unnamed: 0,original_person,original_text,original_text_id_bm25,perturbed_person,perturbed_text,perturbed_text_id_bm25,original_score,perturbed_score,original_output,perturbed_output,ground_truth_output,num_queries,result_type
0,Michael iii of alexandria,"pope michael iii of alexandria ( also known as khail iii ) was the coptic pope of alexandria and patriarch of the see of st. mark ( 880 -- 907 ) .in 882 , the governor of egypt , ahmad ibn tulun , forced khail to pay heavy contributions , forcing him to sell a church and some attached properties to the local jewish community .this building was at one time believed to have later become the site of the cairo geniza .",0,Khuwaylid ibn asad,"<mask__29> <mask__39> <mask__34> <mask__36> <mask__1> ( <mask__37> known as khail <mask__38> ) was <mask__3> <mask__25> <mask__28> of <mask__32> <mask__33> <mask__23> of <mask__22> <mask__30> of <mask__4>. <mask__31> ( <mask__27> -- <mask__35> ) .in <mask__26> , the governor of <mask__24> , ahmad ibn tulun , forced khail <mask__11> pay <mask__19> contributions , <mask__20> <mask__17> to <mask__12> a <mask__18> <mask__21> <mask__14> <mask__13> <mask__16> <mask__15> the <mask__7> jewish community .this building was at <mask__9> <mask__8> believed <mask__10> have later become <mask__6> site of the <mask__5> <mask__2> .",11327,0.0001561642,1.0,0,4224,0,8206,Successful
1,Hui jun,hui jun is a male former table tennis player from china .,1,Mshindo msolla,<mask__9> <mask__8> <mask__1> <mask__3> <mask__6> former <mask__10> <mask__2> <mask__4> <mask__5> <mask__7> .,11934,0.0,0.99998,1,2234,1,228,Successful
2,Okan öztürk,okan Öztürk ( born 30 november 1977 ) is a turkish professional footballer .he currently plays as a striker for yeni malatyaspor .,2,Pietro manganelli,<mask__18> <mask__17> ( <mask__3> <mask__10> <mask__6> <mask__15> ) <mask__4> <mask__12> <mask__13> professional <mask__7> .<mask__1> <mask__2> <mask__5> <mask__9> a <mask__11> <mask__8> <mask__14> <mask__16> .,1224,0.0,0.999936,2,9936,2,769,Successful
3,Marie stephan,"marie stephan , ( born march 14 , 1996 ) is a professional squash player who represents france .she reached a career-high world ranking of world no. 101 in july 2015 .",3,Jim gregory,"<mask__24> <mask__27> , ( <mask__15> <mask__23> <mask__5> , <mask__26> ) <mask__12> <mask__10> <mask__22> <mask__18> <mask__16> <mask__11> <mask__21> <mask__9> .<mask__8> <mask__1> <mask__19> <mask__17> <mask__6> <mask__13> <mask__14> <mask__7> <mask__2>. <mask__3> <mask__20> <mask__4> <mask__25> .",3057,0.0,0.999408,3,12602,3,1432,Failed
4,Leonard l. martino,leonard l. martino is a former democratic member of the pennsylvania house of representatives .he was born in butler to michael and angela pitullio martino .,4,David c. brown,<mask__22> <mask__23>. <mask__21> <mask__2> <mask__6> <mask__1> <mask__10> <mask__8> <mask__3> <mask__7> <mask__14> <mask__9> <mask__12> <mask__11> .<mask__13> <mask__5> <mask__4> in <mask__15> <mask__20> <mask__19> and <mask__16> <mask__17> <mask__18> .,2433,2.384186e-07,0.999928,4,8209,4,1214,Successful
5,Salome jens,"salome jens ( born may 8 , 1935 ) is an american stage , film and television actress .she is perhaps best known for portraying the female changeling on '' '' .",5,Caroline sunshine,"<mask__25> <mask__23> ( <mask__3> <mask__22> <mask__1> , <mask__24> ) <mask__10> <mask__11> <mask__16> <mask__19> , <mask__12> <mask__9> <mask__8> <mask__18> .<mask__17> <mask__4> <mask__21> <mask__6> <mask__15> <mask__13> <mask__7> <mask__14> <mask__20> <mask__2> <mask__5> '' '' .",11804,0.0,0.998692,5,9882,5,1226,Failed
6,Carl crawford,"carl demonte crawford ( born august 5 , 1981 ) , nicknamed `` the perfect storm '' , is an american professional baseball left fielder with the los angeles dodgers of major league baseball ( mlb ) .he bats and throws left-handed .crawford was drafted by the tampa bay devil rays in the second round ( 52nd overall ) of the 1999 major league baseball draft .he made his major league debut in 2002 .crawford has more triples ( 121 ) than any other active baseball player .",6,Brooks brown,"<mask__50> <mask__49> <mask__48> ( <mask__47> <mask__46> <mask__25> , <mask__20> ) , <mask__45> <mask__44> <mask__43> <mask__42> <mask__41> '' , <mask__40> <mask__39> <mask__38> <mask__37> <mask__36> <mask__35> <mask__34> <mask__33> <mask__32> <mask__31> <mask__30> <mask__29> of major league <mask__21> ( <mask__7> ) .he <mask__3> <mask__4> throws <mask__5> .<mask__28> <mask__17> drafted by <mask__11> <mask__22> <mask__23> <mask__26> <mask__24> <mask__9> <mask__1> second round ( 52nd <mask__16> ) of <mask__2> <mask__14> major league baseball draft .he made <mask__8> major league debut <mask__10> <mask__15> .<mask__27> <mask__12> <mask__19> <mask__6> ( <mask__13> ) <mask__18> any other active baseball player .",9148,0.0,0.999502,6,9110,6,9876,Failed
7,Jim bob,"jim bob ( born james neil morrison on 22 november 1960 ) is a british musician and author , best known as the singer of indie punk band carter usm .",7,Alexandre imperatori,"<mask__19> <mask__22> ( <mask__11> <mask__18> <mask__17> <mask__14> <mask__5> <mask__2> <mask__21> <mask__23> ) <mask__6> a <mask__20> <mask__4> <mask__9> <mask__1> , <mask__10> <mask__3> <mask__8> the <mask__12> <mask__7> <mask__15> <mask__16> <mask__13> carter usm .",7,1.430511e-06,0.999974,7,6676,7,1392,Successful
8,Riddick parker,"riddick parker ( born november 20 , 1972 in emporia , virginia ) is a former professional american football defensive lineman for the seattle seahawks , san diego chargers , new england patriots , baltimore ravens , and san francisco 49ers of the national football league .",8,Gino berretta,"<mask__37> <mask__38> ( <mask__11> <mask__33> <mask__34> , <mask__36> <mask__26> <mask__35> , <mask__2> ) <mask__22> <mask__9> <mask__30> <mask__1> <mask__14> <mask__23> <mask__18> <mask__19> <mask__24> <mask__3> <mask__15> <mask__17> , <mask__6> <mask__5> <mask__20> , <mask__13> <mask__8> <mask__31> , <mask__12> <mask__7> , <mask__16> <mask__28> <mask__29> <mask__32> <mask__4> <mask__25> <mask__27> <mask__21> <mask__10> .",1281,0.0,0.996465,8,8592,8,2851,Failed
9,Blessed osanna of cattaro -lrb- ozana kotorska -rrb-,blessed osanna of cattaro t.o.s.d. ( ) was a catholic visionary and anchoress from cattaro ( kotor ) .she was a teenage convert from orthodoxy of serbian descent from montenegro ( zeta ) .she became a dominican tertiary and was posthumously venerated as a saint in kotor .she was later beatified in 1934 .,9,Paul darmanin,blessed <mask__19> <mask__18> <mask__17> <mask__7>.o.s.d. ( ) <mask__11> a catholic <mask__6> and anchoress <mask__9> <mask__16> ( <mask__13> ) .she <mask__1> a teenage convert from orthodoxy of <mask__12> descent from <mask__15> ( zeta ) .she became a dominican tertiary and was <mask__4> <mask__10> as a <mask__8> in <mask__14> .<mask__2> was <mask__3> <mask__5> in 1934 .,9,0.0,1.0,9,4593,9,2967,Successful
