In [5]:
# Based on results from Adaptation theory and non-fluent aphasia in english by Salis and Edwards (2004)

In [6]:
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")
import numpy as np
import random
import re
from pattern.en import conjugate, lemma, lexeme, PRESENT, SG
from spacy.matcher import Matcher
from datasets import load_dataset
from nltk.tokenize import sent_tokenize
import re
from preprocess import preprocess, postprocess

In [7]:
dets = {'Art': ['a', 'an', 'the', ''],
           'Dem': ['this', 'that', 'these', 'those', ''],
           'Poss': ['my', 'your', 'his', 'her', 'its', 'our', 'their', '']}

In [8]:
def det_sub(det):
    for _, detrms in dets.items():
        if det.lower() in detrms:
            y = [j for j in detrms if det!=j]
            return random.choice(y)
    return ""

In [9]:
def handle_determiner(tok):
    x = np.random.uniform(0,1)
    utt = tok.text + " "
    
    # 3% of determiners were substituted
    if x >= 0.97:
        if tok.pos_ == "DET" or "Dem" in tok.morph.get('PronType') or "Yes" in tok.morph.get('Poss'):
            utt = det_sub(tok.text) + " "
        
    # 19% determiners were omitted
    if x >= 0.81:
        utt = " "
        
    return utt

In [10]:
def handle_verb(tok):
    x = np.random.uniform(0,1)
    utt = tok.text + " "

    # 5% of copula were substituted
    if x >= 0.95 and tok.pos_ == "AUX" and tok.dep_ == "ROOT":
        utt = conjugate(verb=tok.text,tense=PRESENT,number=SG) + " " 
        
    # 6% of lexical bound morphemes were removed
    if x >= 0.94 and tok.pos_ == "VERB":
        utt = tok.lemma_ + " "
    
    # 8% of auxiliary verbs were substituted
    if x >= 0.92 and tok.pos_ == "AUX" and tok.dep_ == "aux":
        utt = conjugate(verb=tok.text,tense=PRESENT,number=SG) + " " 
        
    # 12% of lexical verbs were substituted (tense error) 
    if x >= 0.88 and tok.pos_ == "VERB":
        utt = conjugate(verb=tok.text,tense=PRESENT,number=SG) + " " 
    
    # 14% of auxiliary verbs were omitted
    if x >= 0.86 and tok.pos_ == "AUX" and tok.dep_ == "aux":
        utt = " " 
    
    # 17% verbs were omitted
    if x >= 0.83:
        utt = " "
    return utt

In [11]:
def handle_preposition(tok):
    x = np.random.uniform(0,1)
    utt = tok.text + " "
    # 2% of prepositions were substituted
    if x >= 0.98:
        # TODO substitute
        pass
    # 37% of prepositions were omitted
    if x >= 0.63:
        utt = " "
    return utt

In [12]:
def handle_person_pron(tok):
    utt = tok.text + " "
    x = np.random.uniform(0,1)
    # 27% of personal pronouns were omitted
    if x >= 0.73:
        utt = " "
    return utt

In [13]:
def aphasic_speech(sentence):
    vp_pattern = [[{'POS': 'VERB', 'OP': '?'},
                   {'POS': 'ADV', 'OP': '*'},
                   {'POS': 'AUX', 'OP': '*'},
                   {'POS': 'VERB', 'OP': '+'}]]
    matcher = Matcher(nlp.vocab)
    matcher.add("Verb phrase", vp_pattern)
    n = 15
    aphasic_utt = ""
    doc = nlp(sentence)
    # print(sentence, len(sentence.split()))
    if len(sentence.split()) <= n:
        # get NPs
        noun_phrases = set()

        for nc in doc.noun_chunks:
            for nop in [nc, doc[nc.root.left_edge.i:nc.root.right_edge.i + 1]]:
                noun_phrases.add(nop.text.strip())
                # get VPs
        verb_phrases = matcher(doc)
        verb_phrases = [doc[start:end] for _, start, end in verb_phrases]

        try:
            ratio = len(noun_phrases) / len(verb_phrases)
        except:
            # print("Division by zero")
            return aphasic_utt

        X = np.random.uniform(0, 1)
        # print(ratio, X)
        if ratio > 2 and X <= 0.8:
            # skip sentence
            return aphasic_utt
    
        else:
            # don't skip sentence
            for tok in doc:
                if tok.pos_ in ["DET", "PRON"]:
                    aphasic_utt += handle_determiner(tok)      
                elif tok.pos_ in ["VERB", "AUX"]:
                    aphasic_utt += handle_verb(tok)
                elif tok.dep_ == "prep" or tok.pos_ == "ADP":
                    aphasic_utt += handle_preposition(tok)
                elif tok.morph.get('Case') == 'Nom' and tok.morph.get('Person') == 1:
                    aphasic_utt += handle_person_pron(tok)
                else:
                    aphasic_utt += tok.text + " "
    
    return aphasic_utt

In [14]:
ds = load_dataset("imdb")
texts = ds["train"]["text"]
sents = []
save_path = "data/synthetic_salis.csv"

for text in texts:
    t = re.sub(r'\<.*?\>', " ", text)
    sentences = sent_tokenize(t)
    for sent in sentences:
        sent = preprocess(sent)
        sents.append(" ".join(sent.split()))

broca_ds = sents[500:1000]
control_ds = sents[1000:1500]

print("Augmenting data")

augmented_sentences = []
control_sentences = []

for x in broca_ds:
    broca_sentence = aphasic_speech(x)
    if isinstance(broca_sentence, str):
        broca_sentence = postprocess(broca_sentence)
        if broca_sentence:
            augmented_sentences.append(broca_sentence)

for x in control_ds:
    control_sentence = postprocess(x)
    if control_sentence:
        control_sentences.append(control_sentence)

broca_data = pd.DataFrame(data={"modified": augmented_sentences, "label": [1] * len(augmented_sentences)})
control_data = pd.DataFrame(data={"modified": control_sentences, "label": [0] * len(control_sentences)})
data_full_scenario = pd.concat([broca_data, control_data], ignore_index=True)
data_full_scenario = data_full_scenario.sample(frac=1).reset_index(drop=True)
data_full_scenario.to_csv(save_path, sep=",", index=False)

print(f"Sentences retained after augmentation: {len(sents) / len(augmented_sentences)}")

Augmenting data
Sentences retained after augmentation: 2884.4444444444443
