##### NOTE: In CLAN, the C-NNLA command was used to get the distribution parameters. However, the error outputs from C-NNLA (e.g. % correct, % incorrect, % grammatical) were not very useful, since some CHA files do not annotate errors such as [* p] or [* s] and instead just replace the word with xxx, yyy or zzz, which makes those measures unreliable.

##### However, I am trying to include the 5 most common word errors from the C-NNLA measures, basing their occurrence on the Salis and Edwards (2004) paper.

##### This approach also does not consider any utterance errors/substitutions (marked as @u or [* p:n] in AphasiaBank)

In [3]:
import string

import spacy
import numpy as np
import pandas as pd
import scipy.stats as stats
import random
import re
from pattern.text.en import singularize, pluralize
import enchant
from preprocess import preprocess
# Enchant US-English dictionary; get_alt_word() uses it for spelling
# suggestions (d.suggest) and word validation (d.check).
d = enchant.Dict("en_US")
# spaCy English pipeline; aphasic_speech() reads POS tags, dependency labels,
# lemmas and morphology from the tokens it produces.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Determiner classes used for substitution errors.  det_sub() draws the
# replacement from the same class as the original word; the trailing empty
# string in each list lets the replacement be "no determiner at all".
dets = {
    "Art": ["a", "an", "the", ""],
    "Dem": ["this", "that", "these", "those", ""],
    "Poss": ["my", "your", "his", "her", "its", "our", "their", ""],
}

In [5]:
def det_sub(x):
    """Return a random replacement for determiner ``x`` from its own class.

    The classes are the lists in the module-level ``dets`` mapping.  The
    replacement is never the (lower-cased) word itself, and may be the empty
    string, which stands for omitting the determiner.

    Args:
        x: The determiner text; matching is case-insensitive.

    Returns:
        A different member of the same class, or ``""`` when ``x`` does not
        belong to any class.
    """
    lowered = x.lower()
    for det in dets.values():
        if lowered in det:
            # Filter with the lower-cased form as well: filtering with the
            # raw text let a capitalised "The" be "replaced" by "the".
            candidates = [j for j in det if j != lowered]
            return random.choice(candidates)
    return ""

In [6]:
def get_truncnorm(mean, std, min, max):
    """Draw one sample from a normal distribution truncated to [min, max].

    Args:
        mean: Location of the underlying normal distribution.
        std: Scale (standard deviation) of the underlying normal.
        min: Lower truncation bound, in the same units as ``mean``.
        max: Upper truncation bound, in the same units as ``mean``.

    Returns:
        A single sampled value.
    """
    # scipy expects the bounds expressed in standard deviations from the mean
    lower = (min - mean) / std
    upper = (max - mean) / std
    distribution = stats.truncnorm(lower, upper, loc=mean, scale=std)
    sample = distribution.rvs(size=1)
    return sample[0]

In [7]:
def get_curr_nv_ratio(nouns, verbs):
    """Return the noun-to-verb count ratio, or 0 if either collection is empty.

    Args:
        nouns: Collection of nouns currently kept in the utterance.
        verbs: Collection of verbs currently kept in the utterance.

    Returns:
        ``len(nouns) / len(verbs)``, or ``0`` when there are no nouns or no
        verbs.
    """
    if len(nouns) == 0 or len(verbs) == 0:
        return 0
    return len(nouns) / len(verbs)

In [8]:
def get_alt_word(tok):
    """Return a same-length dictionary word to stand in for ``tok``.

    Used to simulate p:w errors.  Candidates come from the suggestions of the
    module-level dictionary ``d``; only valid words with the same length as
    the token and a different spelling are considered.

    Args:
        tok: Token-like object exposing the word as ``tok.text``.

    Returns:
        A randomly chosen candidate, or ``tok.text`` unchanged when no
        candidate exists.
    """
    original = tok.text
    candidates = []
    for suggestion in d.suggest(original):
        if len(suggestion) != len(original):
            continue
        if not d.check(suggestion):
            continue
        if suggestion == original:
            continue
        candidates.append(suggestion)

    if candidates:
        replacement = random.choice(candidates)
        if replacement:
            return replacement

    # if we cannot find new word just return current word
    return original

In [9]:
def aphasic_speech(text):
    """Turn a sentence into a synthetic, Broca's-aphasia-like utterance.

    The sentence is parsed with the module-level spaCy pipeline ``nlp``.  A
    target utterance length, a noun/verb ratio and a percentage for each part
    of speech are sampled from the distributions hard-coded below.  Every
    token is then dropped, replaced by an error form, or kept, depending on
    its category and on those sampled targets.

    Args:
        text: The sentence to transform.

    Returns:
        A tuple ``(utterance, changed)``.  When the modified sentence has no
        more words than the sampled target length, ``utterance`` is that
        sentence and ``changed`` is ``True``.  Otherwise, and also when
        ``text`` contains no words at all, ``("", False)`` is returned.
    """
    word_pattern = "[a-zA-Z_]+"
    doc = nlp(text)

    # max length of modified sentences follows a gamma distribution
    # determine how long this sentence will be (max in broca data was 47)
    n = round(np.random.gamma(shape=1.804433, scale=1/0.505594))
    while n > 47 or n < 1:
        n = round(np.random.gamma(shape=1.804433, scale=1/0.505594))

    # length of original text
    length = len(re.findall(word_pattern, text))
    if length == 0:
        # skipped sentence due to length = 0
        return "", False

    # get possible n/v ratio for this sentence
    ratio_nv = np.random.gamma(shape=2.180031, scale=1/1.498104)

    # get the possible percentage of all POS
    # noun and verb distributions are gamma, rest are truncated normal
    percent_noun = np.random.gamma(shape=4.0047683, scale=1/0.1944749)
    percent_verb = np.random.gamma(shape=9.9920204, scale=1/0.5973042)
    percent_det = get_truncnorm(7.55312, 6.004386, 0, 27.79661)
    percent_prep = get_truncnorm(3.15664, 2.386052, 0, 15.05682)
    percent_adj = get_truncnorm(4.258013, 3.460436, 0, 21.05263)
    percent_adv = get_truncnorm(5.808547, 2.911826, 0, 15.88448)

    nouns = []
    verbs = []
    determiners = []
    prepositions = []
    adjectives = []
    adverbs = []

    # count no. of respective POS (same category tests, in the same order,
    # as the rewriting loop below so the two stay in sync)
    for tok in doc:
        if tok.pos_ == "NOUN":
            nouns.append(tok.text)
        elif tok.pos_ == "VERB" or tok.dep_ == "cop" or tok.tag_ in ["VBD", "VBN"]:
            verbs.append(tok.text)
        elif tok.dep_ == "det":
            determiners.append(tok.text)
        elif tok.dep_ == "prep":
            prepositions.append(tok.text)
        elif tok.pos_ == "ADJ":
            adjectives.append(tok.text)
        elif tok.pos_ == "ADV":
            adverbs.append(tok.text)

    utt = ""
    for tok in doc:
        # current n/v ratio of the words still kept in the utterance
        curr_ratio_nv = get_curr_nv_ratio(nouns, verbs)

        # Handle nouns
        if tok.pos_ == "NOUN":
            err = random.choice(["m0", "m1", "p", "none", "suk"])
            # if possible noun percent in sentence less than current
            # percent or if current n/v ratio is too big, remove noun
            # from sentence
            if (percent_noun <= (len(nouns)/length) * 100
                    or curr_ratio_nv > ratio_nv):
                utt += ' '
                nouns.remove(tok.text)
            # m:0s:a, m:+s, p:w errors (equally)
            elif err in ["m0", "m1"]:
                number = tok.morph.get("Number")
                if "Plur" in number:
                    utt += singularize(tok.text) + ' '
                elif "Sing" in number:
                    utt += pluralize(tok.text) + ' '
                else:
                    # no Number feature: keep the noun instead of silently
                    # dropping it
                    utt += tok.text + ' '
            elif err == "p" or err == "suk":
                utt += get_alt_word(tok) + ' '
            else:
                utt += tok.text + ' '

        # Handle verbs (copula and gerund/participles counted as verb)
        elif tok.pos_ == "VERB" or tok.dep_ == "cop" or tok.tag_ in ["VBD", "VBN"]:
            err = random.choice(["m0", "mv", "p", "none", "suk"])
            # if possible verb percent in sentence less than current
            # percent or if current n/v ratio too big remove verb
            # from sentence
            if (percent_verb <= (len(verbs)/length) * 100
                    or curr_ratio_nv > ratio_nv):
                utt += ' '
                verbs.remove(tok.text)
            # m:03s:a, m:vsg:a and p:w error equally
            elif err in ["m0", "mv"]:
                # lemmatize reg+irr 3rd sing
                if '3' in tok.morph.get("Person") and 'Sing' in tok.morph.get("Number"):
                    utt += tok.lemma_ + " "
                else:
                    utt += tok.text + " "
            elif err == "p" or err == "suk":
                utt += get_alt_word(tok) + ' '
            else:
                utt += tok.text + ' '

        # Handle determiners (pronouns are not determiners)
        elif tok.dep_ == "det":
            err = random.choice(["s:r:gc", "s:uk", "none", "s:r"])
            if percent_det <= (len(determiners)/length) * 100:
                utt += ' '
                determiners.remove(tok.text)
            # s:r:gc:pro  and s:r error (same for pronouns)
            # but clan has both
            elif err in ["s:r", "s:r:gc"]:
                if tok.pos_ == "DET" or "Dem" in tok.morph.get('PronType') or "Yes" in tok.morph.get('Poss'):
                    utt += det_sub(tok.text) + " "
                else:
                    utt += tok.text + " "
            elif err == "s:uk":
                utt += get_alt_word(tok) + ' '
            else:
                utt += tok.text + ' '

        # Handle prepositions
        elif tok.dep_ == "prep":
            # the error type has to be drawn; comparing the list itself with
            # "s:uk" could never be true
            err = random.choice(["s:uk", "none"])
            if percent_prep <= (len(prepositions)/length) * 100:
                utt += ' '
                prepositions.remove(tok.text)
            elif err == "s:uk":
                utt += get_alt_word(tok) + ' '
            else:
                utt += tok.text + ' '

        # Handle adjectives
        elif tok.pos_ == "ADJ":
            err = random.choice(["s:uk", "none"])
            if percent_adj <= (len(adjectives)/length) * 100:
                utt += ' '
                adjectives.remove(tok.text)
            elif err == "s:uk":
                utt += get_alt_word(tok) + ' '
            else:
                utt += tok.text + ' '

        # Handle adverbs
        elif tok.pos_ == "ADV":
            err = random.choice(["p", "none", "s:uk"])
            if percent_adv <= (len(adverbs)/length) * 100:
                utt += ' '
                adverbs.remove(tok.text)
            # p:w and s:uk errors (membership test: `err == "p" or "s:uk"`
            # was always true and made the "none" case unreachable)
            elif err in ("p", "s:uk"):
                utt += get_alt_word(tok) + ' '
            else:
                utt += tok.text + ' '

        # Handle particles ('s, not etc)
        elif tok.pos_ == "PART":
            x = np.random.uniform(0, 1)
            if tok.text.startswith("'s"):
                # m:0s error 50% of the time: keep the 's clitic (glued to
                # the previous word) for half of the draws, drop it otherwise.
                # x lies in [0, 1), so the threshold is 0.5, not 50.
                if x <= 0.5:
                    utt = utt[:-1] + tok.text + ' '
            elif tok.text.startswith("'") or tok.text.startswith("n't"):
                # other contracted particles are left out
                utt = utt[:-1] + ' '
            else:
                utt += tok.text + ' '

        # Handle auxillaries ('ve in i have)
        elif tok.pos_ == "AUX":
            if tok.text.startswith("'"):
                utt = utt[:-1] + tok.text + ' '
            else:
                utt += tok.text + ' '

        # Handling punctuation (like :, .)
        # `pos_` is the string tag; `pos` is an integer ID and never equals
        # "PUNCT", which made this branch unreachable.
        elif tok.pos_ == "PUNCT":
            utt = utt[:-1] + tok.text + ' '

        # all other words with respective POS have a chance of s:uk
        else:
            # only alphabetic tokens are sent to the dictionary; numbers,
            # symbols and whitespace tokens are copied through unchanged
            if tok.is_alpha and random.choice(["s:uk", "none"]) == "s:uk":
                utt += get_alt_word(tok) + ' '
            else:
                utt += tok.text + ' '

    utt = " ".join(utt.split())  # collapse runs of whitespace
    utt = re.sub(r'\s+([?.!",])', r'\1', utt)
    # print("Possible utt: ", utt)

    # only return sentences which are short enough
    n_words = len(re.findall(word_pattern, utt))
    if n >= n_words and n_words <= 47:
        return utt, True
    return '', False
    
def augment(filepath, save_path, include_canonical=False):
    """Placeholder that is not implemented yet.

    The arguments are accepted but ignored, and nothing is read or written.
    """
    return None

# Test area

In [10]:
para = """I received the brass that you sent me, and all looks well. Thank you very much for all your trouble and the extra 3 pieces. I feel that you have an outstanding company and are striving the best that you can to achieve customer satisfaction. I will be certain to tell my friends about US Reloading Supply. ~ Michelle Admin Note: By accident we shorted them two shells, so we sent them replacements."""

from nltk.tokenize import sent_tokenize

# Run every sentence of the sample paragraph through aphasic_speech and show
# the result; a separator line closes each sentence either way.
separator = "-----------------------------------------------"
sentences = sent_tokenize(para)
for sent in sentences:
    print("Original sentence: ", sent)
    aphasic, changed = aphasic_speech(sent)
    if changed:
        print("Final utter:", " ".join(aphasic.split()))
    print(separator)

Original sentence:  I received the brass that you sent me, and all looks well.
-----------------------------------------------
Original sentence:  Thank you very much for all your trouble and the extra 3 pieces.
-----------------------------------------------
Original sentence:  I feel that you have an outstanding company and are striving the best that you can to achieve customer satisfaction.
-----------------------------------------------
Original sentence:  I will be certain to tell my friends about US Reloading Supply.
-----------------------------------------------
Original sentence:  ~ Michelle Admin Note: By accident we shorted them two shells, so we sent them replacements.
-----------------------------------------------


In [11]:
# Print each token of a possessive phrase together with its coarse POS tag.
possessive_doc = nlp("Ruhi's books")
for x in possessive_doc:
    print(x.text, x.pos_)

Ruhi PROPN
's PART
books NOUN


## Test on some of the IMDB dataset

In [12]:
from datasets import load_dataset
from nltk.tokenize import sent_tokenize

# Load the "imdb" dataset through `datasets.load_dataset` and print its
# split summary.
ds = load_dataset("imdb")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [13]:
# Split the IMDB training reviews into sentences and keep the usable ones.
texts = ds["train"]["text"]
sents = []

for text in texts:
    # drop HTML tags before sentence splitting
    text = re.sub(r'\<.*?\>', " ", text)
    sentences = sent_tokenize(text)
    for sent in sentences:
        if not isinstance(sent, str):
            continue
        sent = re.sub(r'\<.*?\>', " ", sent)
        stripped = sent.rstrip()
        # skip sentences that are empty or consist of digits only
        if stripped.isdigit() or len(stripped) < 1:
            continue
        sents.append(sent)

# slice of the collected sentences that is fed to aphasic_speech
test_sents = sents[2000:4000]

In [14]:
test_sents

["And then there's the obligatory male-fantasy of an attractive straight woman suddenly deciding to give lesbianism a try -- PLEASE.",
 'Not only do I wish I could get my money back for the DVD rental, I also want those 112 minutes of my life back.',
 'What a ripoff.',
 'I was gifted with this movie as it had such a great premise, the friendship of three women bespoiled by one falling in love with a younger man.',
 'Intriguing.',
 'NOT!',
 'I hasten to add.',
 'These women are all drawn in extreme caricature, not very supportive of one another and conspiring and contriving to bring each other down.',
 "Anna Chancellor and Imelda Staunton could do no wrong in my book prior to seeing this, but here they are handed a dismal script and told to balance the action between slapstick and screwball, which doesn't work too well when the women are all well known professionals in a very small town.",
 'And for intelligent women they spend a whole pile of time bemoaning the lack of men/sex/lust in 

In [15]:
aphasic_sents = []
normal_sents = []

import inflect
p = inflect.engine()

for sent in test_sents:
    # no digits like in aphasiabank: spell every number out in words.
    # A single regex pass converts each digit run exactly once; replacing
    # match by match with str.replace corrupted longer numbers that contain
    # a shorter match (e.g. "1" and "112" in the same sentence).
    sent = re.sub("[0-9]+", lambda m: p.number_to_words(m.group()), sent)

    s = preprocess(sent)
    aphasic, changed = aphasic_speech(s)
    # keep only sentences that were accepted and are more than a lone period
    if changed and aphasic != ".":
        normal_sents.append(s)
        aphasic_sents.append(aphasic)

In [16]:
# Save the modified sentences next to the sentences they were derived from.
synthetic_pairs = pd.DataFrame(data={"modified": aphasic_sents, "original": normal_sents})
synthetic_pairs.to_csv("data/synthetic_clan.csv", sep=",", index=False)

# Post-processing of aphasic sentences
Some "control" sentences are also added.

In [17]:
import pandas as pd
from datasets import load_dataset

# Alternative corpus, currently disabled:
# ds = load_dataset('stas/c4-en-10k')
# Load the "imdb" dataset again and print its split summary.
ds = load_dataset("imdb")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [18]:
# Read back the sentence pairs stored in data/synthetic_clan.csv.
df = pd.read_csv("data/synthetic_clan.csv")

In [19]:
# Column of modified sentences and the column of their source sentences.
sentences = df['modified']
original = df["original"]

In [20]:
sentences

0                                  ripoffs.
1                                      not!
2                                     i to.
3                                    right.
4                                 four ten.
                       ...                 
502                     i am paul raddick,.
503             , twenty nine philadelphia.
504    sandler that a pert one and redound.
505             spanglish is, dooley squat!
506                                       !
Name: modified, Length: 507, dtype: object

In [21]:
original

0                                         what a ripoff.
1                                                   not!
2                                       i hasten to add.
3                                                 right.
4                                       four out of ten.
                             ...                        
502                            i am paul raddick, a.k.a.
503    panic attack of wtaf, channel twenty nine in p...
504    sandler encounters babes that like history of ...
505    spanglish is red hot, going overboard ain't do...
506                            never again never forget!
Name: original, Length: 507, dtype: object

In [22]:
import re
import string
from preprocess import postprocess
# Post-process every modified sentence; pairs whose post-processed sentence
# comes back empty are discarded.
broca_sents = []
original_sents = []
for sent, o in zip(sentences, original):
    x = postprocess(sent)
    if x == "":
        continue
    broca_sents.append(x)
    original_sents.append(o)

In [23]:
# Overwrite the CSV with the post-processed pairs.
postprocessed_pairs = pd.DataFrame(data={"modified": broca_sents, "original": original_sents})
postprocessed_pairs.to_csv("data/synthetic_clan.csv", sep=",", index=False)

In [24]:
import inflect
from preprocess import preprocess
p = inflect.engine()

# preprocess and post process in same way
from nltk.tokenize import sent_tokenize
texts = ds["train"]["text"]
sents = []
for text in texts:
    # drop HTML tags before sentence splitting
    text = re.sub(r'\<.*?\>', " ", text)
    # Iterate the tokenizer output directly: binding it to `sentences` would
    # overwrite the column of modified sentences that an earlier cell stores
    # under that name.
    for sent in sent_tokenize(text):
        if isinstance(sent, str):
            sent = re.sub(r'\<.*?\>', " ", sent)
            if not sent.rstrip().isdigit() and len(sent.rstrip()) >= 1:
                # no digits: spell every number out in words.  A single
                # regex pass converts each digit run exactly once; replacing
                # match by match with str.replace corrupted longer numbers
                # that contain a shorter match (e.g. "1" and "112").
                sent = re.sub("[0-9]+", lambda m: p.number_to_words(m.group()), sent)
                sent = preprocess(sent)
                sent = postprocess(sent)
                if sent != "":
                    sents.append(sent)

In [25]:
# Take as many control sentences as there are aphasic sentences, starting at
# index 4000 of the cleaned sentence list.
control_sents = sents[4000:4000+len(broca_sents)]

In [26]:
control_sents

["not that i haven't tried, mind you but i sit down, and i pop in the aged vhs, and i watch the openingand suddenly i'm five years old again and clutching my very own care bear and watching the movie with open eyes and an eager heart.",
 'i can see, objectively, that this movie is a bizarre combination of cuddly baby merchandising mascots and creepy prepubescent children with evil powers that has a thin story and uninteresting animation.',
 'but my inner five year old goes, yay!',
 'care bears!',
 'every time i think about it.',
 "so i'd only cautiously, reluctantly recommend this movie for those who saw it during their early youth and can call on the awesome power of nostalgia while watching it like me or those lovably cynical gen xy ers who deliberately seek out the wonderfully badstrange a category in which this moviedefinitely belongs.",
 'to those actually looking for a compelling movie or wholesome family entertainment you might want to keep looking.',
 'i really wanted to like t

In [27]:
# Print the number of aphasic and control sentences side by side.
print(len(broca_sents), len(control_sents))

490 490


In [28]:
# Label 1 marks the synthetic aphasic sentences, label 0 the control ones.
broca_data = pd.DataFrame(
    data={"modified": broca_sents, "label": [1] * len(broca_sents)}
)
control_data = pd.DataFrame(
    data={"modified": control_sents, "label": [0] * len(control_sents)}
)
# Stack both classes, shuffle the rows and renumber the index.
data_full_scenario = (
    pd.concat([broca_data, control_data], ignore_index=True)
    .sample(frac=1)
    .reset_index(drop=True)
)
data_full_scenario.to_csv("data/synthetic_clan_test.csv", sep=",", index=False)

In [29]:
data_full_scenario

Unnamed: 0,modified,label
0,that movie surprises me.,1
1,this is the worst documentary to come out of c...,0
2,"however, fellow green mile star doug hutchison...",0
3,"i failed to see why he was funny, although peo...",0
4,the only good performances here are from paul ...,0
...,...,...
975,this shows no imagination!,0
976,even when i find myself in one hundred percent...,0
977,and at not them selves serious!,1
978,but this was just so incredibly annoying.,0
