# Pre-Processing - Adversarial Attacks

We test the model's robustness to textual changes with an evaluation scheme
proposed in *On Adversarial Examples for Biomedical NLP Tasks*
(Araujo et al., 2020), which we further extended. We mimic a human operator
who makes typographical errors and expresses the original medical terms with
synonyms. Only biomedical terms are selected for the word/sentence
manipulations that follow.

In [None]:
from copy import deepcopy
import json
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import spacy
import scispacy
from scispacy.linking import EntityLinker
from tqdm import tqdm

%matplotlib inline

In [None]:
# Directory holding the annotation files (JSON Lines) read by the cells below.
ANNOTATIONS = "../../datasets/mimic_cxr/annotations/"

# Semantic type identifiers accepted by the attacks: an entity is perturbed
# only when the first type of one of its linked UMLS concepts is listed here.
# NOTE(review): "3300" does not follow the "T###" pattern of every other
# entry and looks like a typo -- confirm the intended identifier.
TUI_WHITELIST = [
    "3300", "T005", "T007", "T017", "T019", "T020", "T021", "T022", "T023",
    "T024", "T025", "T029", "T030", "T033", "T034", "T037", "T039", "T041",
    "T046", "T047", "T048", "T055", "T056", "T059", "T060", "T061",
    "T074", "T093", "T097", "T101", "T121", "T125", "T170", "T184", "T190",
    "T191", "T195", "T201"
]

### scispaCy

In [None]:
# Load the `en_core_sci_sm` model and attach the scispaCy entity linker,
# configured for the UMLS knowledge base with abbreviation resolution.
linker_config = {"linker_name": "umls", "resolve_abbreviations": True}
spacy_pipeline = spacy.load("en_core_sci_sm")
spacy_pipeline.add_pipe("scispacy_linker", config=linker_config)

# Keep a handle on the linker component; later cells query its `kb`.
linker = spacy_pipeline.get_pipe("scispacy_linker")

### Annotations

In [None]:
# Load the test split: one JSON object per line (JSON Lines).
# The encoding is explicit so the result does not depend on the platform
# default; the file is streamed line by line, and blank lines (e.g. a
# trailing newline), which `json.loads` would reject, are skipped.
with open(os.path.join(ANNOTATIONS, "test.jsonl"), "r",
          encoding="utf-8") as f:
    test = [json.loads(line) for line in f if line.strip()]

## Attacks

In [None]:
def swap_letters(text):
    """Transpose one random pair of adjacent characters in every word.

    The text is split on single spaces. Tokens shorter than two characters
    are left untouched; in every other token, a randomly chosen character is
    exchanged with its right-hand neighbour.

    Args:
        text: The string to perturb.

    Returns:
        The perturbed tokens re-joined with single spaces.
    """
    perturbed = []

    for token in text.split(" "):
        if len(token) >= 2:
            # Index of the left character of the pair to transpose.
            left = random.randint(0, len(token) - 2)
            token = (token[:left] + token[left + 1] + token[left]
                     + token[left + 2:])

        perturbed.append(token)

    return " ".join(perturbed)

In [None]:
def misplaced_letters(text):
    """Replace one random character per word with a neighbouring key.

    The text is split on single spaces. Tokens shorter than two characters
    are skipped. In every other token one position is drawn at random; if the
    character there has an entry in the neighbour table it is replaced by a
    randomly chosen neighbour (lower-cased when the original character was
    lower case), otherwise the token is left unchanged.

    Args:
        text: The string to perturb.

    Returns:
        The perturbed tokens re-joined with single spaces.
    """
    # Neighbouring keys of every letter key; one character per neighbour.
    neighbours = {
        "A": "QZWS", "B": "GVNH", "C": "DXFV", "D": "ESXCFR",
        "E": "34WSDR", "F": "RDCVGT", "G": "TFVBHY", "H": "YGBNJU",
        "I": "89UJKO", "J": "UHNMKI", "K": "IJMLO", "L": "OKP",
        "M": "JNK", "N": "HBMJ", "O": "90IKLP", "P": "OL",
        "Q": "12WA", "R": "45EDFT", "S": "WAZXDE", "T": "56RFGY",
        "U": "78YHJI", "V": "FCBG", "W": "23QASE", "X": "SZDC",
        "Y": "67TGHU", "Z": "ASX",
    }

    tokens = text.split(" ")

    for index, token in enumerate(tokens):
        if len(token) < 2:
            continue

        position = random.randint(0, len(token) - 1)
        original = token[position]
        candidates = neighbours.get(original.upper())

        # Characters without an entry (digits, punctuation, ...) are kept.
        if candidates is None:
            continue

        replacement = random.choice(candidates)
        if original.islower():
            replacement = replacement.lower()

        tokens[index] = token[:position] + replacement + token[position + 1:]

    return " ".join(tokens)

## NER

In [None]:
# Swapping attack: transpose adjacent letters inside every matched entity.
# The split is reloaded so the attack starts from unmodified annotations.
with open(os.path.join(ANNOTATIONS, "test.jsonl"), "r",
          encoding="utf-8") as f:
    test = [json.loads(line) for line in f if line.strip()]

for sample in tqdm(test):
    doc = spacy_pipeline(sample['text'])

    # Surface forms of the entities to perturb, de-duplicated in document
    # order. `str.replace` below rewrites every occurrence at once, so each
    # distinct surface form needs handling only once, and a dict (unlike a
    # set of span objects) keeps the iteration order stable between runs.
    umls_match = {}

    # Match entities with UMLS: keep an entity as soon as one of its linked
    # concepts has a whitelisted first semantic type.
    for entity in doc.ents:
        if any(linker.kb.cui_to_entity[candidate[0]].types[0] in TUI_WHITELIST
               for candidate in entity._.kb_ents):
            umls_match[str(entity)] = None

    # Augmentation with swapping. Strings are immutable, so the original
    # text can be used directly without copying it first.
    post_text = sample['text']
    for surface in umls_match:
        post_text = post_text.replace(surface, swap_letters(surface))

    sample['text'] = post_text

# with open(os.path.join(ANNOTATIONS, "test_attack_swapping.jsonl"), "w") as f:
#     for sample in tqdm(test):
#         f.write(json.dumps(sample) + "\n")

### Synonyms-based Attack

In [None]:
# Synonyms attack: replace every matched entity with an alias of the UMLS
# concept it was linked to. The split is reloaded so the attack starts from
# unmodified annotations.
with open(os.path.join(ANNOTATIONS, "test.jsonl"), "r",
          encoding="utf-8") as f:
    test = [json.loads(line) for line in f if line.strip()]

for sample in tqdm(test):
    doc = spacy_pipeline(sample['text'])
    umls_match = {}  # entity surface form -> replacement alias

    # Match entities with UMLS.
    for entity in doc.ents:
        # Use the first candidate that is whitelisted and has aliases, then
        # stop. Without the `break`, every later candidate would overwrite
        # the choice, so the last linked concept would supply the synonym.
        # NOTE(review): relies on scispaCy returning `kb_ents` sorted by
        # descending score, so that "first" means "best match".
        for candidate in entity._.kb_ents:
            concept = linker.kb.cui_to_entity[candidate[0]]

            if concept.types[0] not in TUI_WHITELIST:
                continue
            if not concept.aliases:  # nothing to substitute with
                continue

            umls_match[str(entity)] = random.choice(concept.aliases)
            break

    # Augmentation with synonyms. Strings are immutable, so the original
    # text can be used directly without copying it first.
    post_text = sample['text']
    for surface, synonym in umls_match.items():
        post_text = post_text.replace(surface, synonym)

    sample['text'] = post_text
    

# with open(os.path.join(ANNOTATIONS, "test_attack_synonyms.jsonl"), "w") as f:
#     for sample in tqdm(test):
#         f.write(json.dumps(sample) + "\n")    

### Replacement-based Attack

In [None]:
# Replacement attack: give every "healthy" sample the text of a randomly
# chosen sample from the rest of the split. The split is reloaded so the
# attack starts from unmodified annotations.
with open(os.path.join(ANNOTATIONS, "test.jsonl"), "r",
          encoding="utf-8") as f:
    test = [json.loads(line) for line in f if line.strip()]

# Partition the split in one pass instead of testing each sample for
# membership in `test_healthy`, which compares whole records pairwise.
# A sample is "healthy" when its labels sum to 1 and the largest label sits
# at index 8.
# NOTE(review): index 8 is presumably the "no finding" class -- confirm
# against the label order used to build the annotations.
test_healthy, test_rest = [], []
for sample in test:
    labels = sample['labels']
    is_healthy = np.sum(labels) == 1 and np.argmax(labels) == 8
    (test_healthy if is_healthy else test_rest).append(sample)

for sample in tqdm(test_healthy):
    # Randomly select an indication field from the `unhealthy` patients.
    sample['text'] = random.choice(test_rest)['text']

# with open(os.path.join(ANNOTATIONS, "test_attack_replacement.jsonl"), "w") as f:
#     for sample in tqdm(test_healthy):
#         f.write(json.dumps(sample) + "\n")

In [None]:
# Reload the split so the mapping is built from unmodified annotations.
with open(os.path.join(ANNOTATIONS, "test.jsonl"), "r",
          encoding="utf-8") as f:
    test = [json.loads(line) for line in f if line.strip()]

# Indices (among the first 14 labels) set to 1 for every sample, computed
# once up front instead of being re-derived for every pair of samples.
positives = [
    {k for k in range(14) if sample['labels'][k] == 1} for sample in test
]

# Map every sample id to the ids of the samples sharing no positive label
# with it. A sample is excluded from its own list by position, which avoids
# a field-by-field comparison of the two records for every pair.
test_map = {}

for index, sample in enumerate(test):
    test_map[sample['id']] = [
        other['id']
        for other_index, other in enumerate(test)
        if other_index != index
        and positives[index].isdisjoint(positives[other_index])
    ]

In [None]:
# Text of every sample keyed by id, built once so that each lookup below is
# a dictionary access rather than a scan over the whole split. When an id
# occurs more than once, the last occurrence wins.
text_by_id = {sample['id']: sample['text'] for sample in test}

test_shuffled = []

for sample in test:
    # Work on a copy so the records in `test` keep their original text.
    subject = deepcopy(sample)

    # Draw a sample with no positive label in common; `random.choice`
    # raises IndexError when the candidate list for this id is empty.
    random_subject_id = random.choice(test_map[subject['id']])
    subject['text'] = text_by_id[random_subject_id]

    test_shuffled.append(subject)
    
# with open(os.path.join(ANNOTATIONS,
#                        "test_attack_multilabel_replacement.jsonl"), "w") as f:
#     for sample in tqdm(test_shuffled):
#         f.write(json.dumps(sample) + "\n")