<a href="https://colab.research.google.com/github/haoboooo/Robustness-of-MRC-Models-to-Entity-Renaming/blob/main/filter_name_rename.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1 using flair to tag sentences

In [29]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install flair



In [82]:
from flair.data import Sentence
from flair.models import SequenceTagger
import random

In [5]:
sentence = Sentence('I love Berlin, Lucia.') # create a sentence
tagger = SequenceTagger.load('ner') # load the NER tagger
tagger.predict(sentence) # run NER over the sentence

Downloading:   0%|          | 0.00/432M [00:00<?, ?B/s]

2022-04-10 20:34:28,059 loading file /root/.flair/models/ner-english/4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4
2022-04-10 20:34:30,828 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [11]:
print(sentence)
for entity in sentence.get_spans('ner'): # iterate over entities and print each
    print(entity)

Sentence: "I love Berlin , Lucia ." → ["Berlin"/LOC, "Lucia"/LOC]
Span[2:3]: "Berlin" → LOC (0.9968)
Span[4:5]: "Lucia" → LOC (0.8008)


    convert the sentence after prediction to dictionary and access the labels

In [15]:
print(sentence.to_dict(tag_type='ner'))

{'text': 'I love Berlin, Lucia.', 'ner': [{'value': 'LOC', 'confidence': 0.9967647790908813}, {'value': 'LOC', 'confidence': 0.8007730841636658}]}


In [19]:
for label in sentence.labels:
    print(label) # flair label type
for label in sentence.get_labels('ner'):
    print(label)

Span[2:3]: "Berlin" → LOC (0.9968)
Span[4:5]: "Lucia" → LOC (0.8008)
Span[2:3]: "Berlin" → LOC (0.9968)
Span[4:5]: "Lucia" → LOC (0.8008)


In [21]:
entities = sentence.to_dict(tag_type='ner')
print(entities)

{'text': 'I love Berlin, Lucia.', 'ner': [{'value': 'LOC', 'confidence': 0.9967647790908813}, {'value': 'LOC', 'confidence': 0.8007730841636658}]}


In [22]:
sentence = Sentence('George Washington went to Washington.')

# predict NER tags
tagger.predict(sentence)

# print sentence with predicted tags
print(sentence)

Sentence: "George Washington went to Washington ." → ["George Washington"/PER, "Washington"/LOC]


In [23]:
for entity in sentence.get_spans('ner'):
    print(entity)

Span[0:2]: "George Washington" → PER (0.9989)
Span[4:5]: "Washington" → LOC (0.9942)


In [24]:
# iterate over each entity
for entity in sentence.get_spans('ner'):
    
    # print entity text, start_position and end_position
    print(f'entity.text is: "{entity.text}"')
    print(f'entity.start_position is: "{entity.start_position}"')
    print(f'entity.end_position is: "{entity.end_position}"')
    
    # also print the value and score of its "ner"-label
    print(f'entity "ner"-label value is: "{entity.get_label("ner").value}"')
    print(f'entity "ner"-label score is: "{entity.get_label("ner").score}"\n')

entity.text is: "George Washington"
entity.start_position is: "0"
entity.end_position is: "17"
entity "ner"-label value is: "PER"
entity "ner"-label score is: "0.9988862872123718"

entity.text is: "Washington"
entity.start_position is: "26"
entity.end_position is: "36"
entity "ner"-label value is: "LOC"
entity "ner"-label score is: "0.9942096471786499"



In [25]:
from flair.models import SequenceTagger
from flair.tokenization import SegtokSentenceSplitter

# example text with many sentences
text = "This is a sentence. This is another sentence. I love Berlin."

# initialize sentence splitter
splitter = SegtokSentenceSplitter()

# use splitter to split text into list of sentences
sentences = splitter.split(text)

# predict tags for sentences
tagger = SequenceTagger.load('ner')
tagger.predict(sentences)

# iterate through sentences and print predicted labels
for sentence in sentences:
    print(sentence)

2022-04-10 21:05:14,597 loading file /root/.flair/models/ner-english/4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4
2022-04-10 21:05:16,570 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
Sentence: "This is a sentence ."
Sentence: "This is another sentence ."
Sentence: "I love Berlin ." → ["Berlin"/LOC]


# 2 access to the squad1 data

    define a squad example

In [30]:
class SquadExample(object):
    """
    A single training/test example for the Squad dataset.
    For examples without an answer, the start and end position are -1.
    """

    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None,
                 is_impossible=None):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible

    load data from json, and store each qa into an example instance

In [48]:
import json
with open('./drive/MyDrive/Colab Notebooks/544_project/dev-v1.1.json',"r", encoding='utf-8') as reader:
    input_data = json.load(reader)["data"]
def is_whitespace(c):
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
        return True
    return False

def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens   


In [84]:
# input_data ->list, input_data[0]["paragraphs"]->content list, input_data[0]["paragraphs"][0]->dic: context,qas, 
# input_data[0]["paragraphs"][0]["qas"] -> list of qas, input_data[0]["paragraphs"][0]["qas"][0] -> dic answers(list of same aws), question(string), id
print(input_data[0]["paragraphs"][0]["qas"][1])

{'answers': [{'answer_start': 249, 'text': 'Carolina Panthers'}, {'answer_start': 249, 'text': 'Carolina Panthers'}, {'answer_start': 249, 'text': 'Carolina Panthers'}], 'question': 'Which NFL team represented the NFC at Super Bowl 50?', 'id': '56be4db0acb8001400a502ed'}


In [78]:
examples = []
for entry in input_data[:2]:
    for paragraph in entry["paragraphs"][:2]:
        paragraph_text = paragraph["context"] # context paragraph
        # print(paragraph_text)

        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True

        # tokenizer the context
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)
        # print(doc_tokens) # tokenized tokens
        # print(char_to_word_offset) # index to tokens


        for qa in paragraph["qas"][:2]:
            qas_id = qa["id"]
            question_text = qa["question"]

            start_position = None
            end_position = None
            orig_answer_text = None

            answerdic = qa["answers"][0]
            orig_answer_text = answerdic["text"] # answer string
            answer_offset = answerdic["answer_start"] # answer index
            answer_length = len(orig_answer_text)
            start_position = char_to_word_offset[answer_offset] # answer index to word start
            end_position = char_to_word_offset[answer_offset + answer_length - 1] # answer index to word end
            actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) # get the answer from the tokens
            # print(orig_answer_text)
            # print(answer_offset)
            # print(start_position)
            # print(end_position)
            # print(actual_text)
            cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text)) # strip and join split
            # print(cleaned_answer_text)
            if actual_text.find(cleaned_answer_text) == -1:
              print("Could not find answer: '%s' vs. '%s'",actual_text, cleaned_answer_text)
              continue
            
            example = SquadExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position,
                is_impossible=None)
            examples.append(example)

In [79]:
print(examples[0])

<__main__.SquadExample object at 0x7f409bc27210>


In [80]:
counter = 0
for example in examples:
    if example.orig_answer_text!=None:
        counter += 1
        print(example.question_text, example.orig_answer_text)
    if counter>10:
        break

Which NFL team represented the AFC at Super Bowl 50? Denver Broncos
Which NFL team represented the NFC at Super Bowl 50? Carolina Panthers
Which Carolina Panthers player was named Most Valuable Player? Cam Newton
How many appearances have the Denver Broncos made in the Super Bowl? 8
What was Maria Curie the first female recipient of? Nobel Prize
What year was Casimir Pulaski born in Warsaw? 1745
Over how many species of trees can be found in the Saxon Garden? 100
What is a popular strolling destination for the Varsovians? Krasiński Palace Garden


In [97]:
def replaceEntityNames(originalParagraphs, candidateList, Tagger):
    QaList = []
    EntityList = []
    length = len(candidateList)
    for para in originalParagraphs[:10]: # iterate all context-qas
        qa_list = [] # question-answer pairs for each context
        entity_list = []
        hasInstance = False
        context = para['context'] # get context
        for qa in para['qas']: # iterate all question-answer pairs
            if not qa['answers']:
                continue
            hasPer = False
            answer = qa['answers'][0]['text'] # get the answer
            sentence = Sentence(answer)
            Tagger.predict(sentence)
            for entity in sentence.get_spans('ner'):
                if 'PER' == entity.tag: # whether the answer is PER tag
                    entity_list.append(entity.text)
                    hasPer = True
            if hasPer: # if answer has PER tag
                qa_list.append(qa['question']) # add all question answer pairs to list
                qa_list.append(answer)
                hasInstance = True
        if hasInstance:
            qas_list = [qa_list, context] # question-answer pairs and context
            QaList.append(qas_list) 
            EntityList.append(entity_list)

    for i in range(len(QaList)):
        qas = QaList[i][0] # all question answer pairs
        context = QaList[i][1] # get context
        for oldName in EntityList[i]: # each context each answer anme
            newName = candidateList[random.randint(0, length-1)] # get a new name randomly
            for j in range(len(qas)):
                qas[j] = qas[j].replace(oldName, newName)
            context = context.replace(oldName, newName)
        QaList[i][0] = qas
        QaList[i][1] = context

    return QaList

In [98]:
def writeToFile(data, candidates, Tagger, outputFile):
    perterbed_list = []
    for sample in data:
        qas = sample['paragraphs']
        perterbed_list.append(replaceEntityNames(qas, candidates, Tagger))
    with open(outputFile, 'w', encoding='utf-8') as f:
        for paragraphs in perterbed_list:
            for paragraph in paragraphs:
                for k in range(0, len(paragraph[0]), 2):
                    f.write(paragraph[0][k] + '/' + paragraph[0][k+1] + '\n')
                f.write('\n' + paragraph[1] + '\n\n')

In [99]:
nameList = [
    'James',
    'Mia Forcost',
    'gags iiopg asvg',
    'Ming Cheng'
]

In [100]:
replaceEntityNames(input_data[0]["paragraphs"],nameList,tagger)

[]
[]
[[['Which Carolina Panthers player was named Most Valuable Player?', 'Cam Newton', "Who was this season's NFL MVP?", 'Cam Newton', "Which Carolina Panthers team member was picked as the team's MVP in 2015? ", 'Cam Newton', 'Who is the quarterback for the Panthers?', 'Cam Newton', 'Who was the Most Valuable Player for the 2015 NFL season?', 'Cam Newton', 'Who was the 2015 NFL MVP?', 'Cam Newton'], 'The Panthers finished the regular season with a 15–1 record, and quarterback Cam Newton was named the NFL Most Valuable Player (MVP). They defeated the Arizona Cardinals 49–15 in the NFC Championship Game and advanced to their second Super Bowl appearance since the franchise was founded in 1995. The Broncos finished the regular season with a 12–4 record, and denied the New England Patriots a chance to defend their title from Super Bowl XLIX by defeating them 20–18 in the AFC Championship Game. They joined the Patriots, Dallas Cowboys, and Pittsburgh Steelers as one of four teams that ha