##### NOTE: In CLAN the C-NNLA command was used to get the distribution parameters. However, error outputs from C-NNLA were not very useful (like % correct ..., % incorrect, % grammatical etc) since some CHA files do not annotate errors like [* p], [* s] (but just replace it with xxx, yyy, zzz), making those measures unreliable

##### However, I am trying to include the 5 most common word errors from the C-NNLA measures and basing their occurance on the Salis and Edwards (2004) paper

##### This approach also does not consider any utterance errors/substitutions (marked as @u or [* p:n] in AphasiaBank)

In [11]:
import spacy
from spacy.matcher import Matcher
import numpy as np
import pandas as pd
from canonical_sents import get_canonical_sentences
import os
import scipy.stats as stats
import random
import re
from pattern.text.en import singularize, pluralize
from Levenshtein import apply_edit
import enchant
d = enchant.Dict("en_US")
nlp = spacy.load("en_core_web_sm")

In [12]:
special_characters = ['(.)', '[/]', '[//]', '‡', 'xxx', '+< ', '„', '+', '"" /..""', '+"/.', '+"', '+/?', '+//.',
                      '+//?', '[]', '<>', '_', '-', '^', ':', 'www .', '*PAR', '+/', '@o', '<', '>',
                      '//..', '//', '/..', '/', '"', 'ʌ', '..?', '0.', '0 .', '"" /.', ')', '(', "@u", "@si", "@k",
                      "@n", "$n", "$co", "$adj", "$on", "$v", "@l", 'æ', 'é', 'ð', 'ü', 'ŋ', 'ɑ', 'ɒ', 'ɔ', 'ə',
                      'ɚ', 'ɛ', 'ɜ', 'ɝ', 'ɡ', 'ɪ', 'ɹ', 'ɾ', 'ʃ', 'ʊ', 'ʒ', 'ʔ', 'ʤ', 'ʧ', 'ː', '˞', '͡', 'θ', "@q",
                      "@sspa", "@i", "@wp", "@sjpn", "@sdeu", "@p", "@sfra", "&", "*"]
ipa = ['æ', 'é', 'ð', 'ü', 'ŋ', 'ɑ', 'ɒ', 'ɔ', 'ə', 'ɚ', 'ɛ', 'ɜ', 'ɝ', 'ɡ', 'ɪ', 'ɹ', 'ɾ', 'ʃ', 'ʊ', 'ʒ', 'ʔ', 'ʤ',
       'ʧ', 'ː', '˞', '͡', 'θ', ]

In [13]:
def preprocess(utterance):
    # Remove IPA
    for special_character in ipa:
        utterance = utterance.replace(special_character, "")

    pattern1 = r"@u *\[: (.*?)]"
    pattern2 = r"\b\w*@u\w*\b"
    news = re.findall(pattern1, utterance)
    olds = re.findall(pattern2, utterance)
    if len(news) == len(olds):
        for new, old in zip(news, olds):
            if "x@n" in new:
                utterance = utterance.replace(old, "")
            else:
                utterance = utterance.replace(old, new)

    # Remove all actions: (e.g. &=points:picture)
    utterance = re.sub(r"\&\=[a-zA-Z:_0-9]+", "", utterance)

    # Remove all unicode errors   \W\d+\w\d+\W
    utterance = re.sub(r"\W\d+\w\d+\W", "", utterance)

    # Remove anything between [ and ]
    utterance = re.sub(r"\[.*?\]", "", utterance)

    # Remove  < and > for repetition [/] and retracing [//]
    utterance = re.sub(r"\<|\>", "", utterance)

    # Remove anything &+ or &* (eg &*INV)
    utterance = re.sub(r"\&\+[a-zA-Z]+", "", utterance)
    utterance = re.sub(r"\&\*INV:[a-zA-Z_]+", "", utterance)

    # Remove remaining special chars
    for special_character in special_characters:
        # EDITED: for words such as you_know and of_course
        if special_character == '_':
            utterance = utterance.replace(special_character, " ")
        utterance = utterance.replace(special_character, "")

    utterance = re.sub(' +', ' ', utterance)

    # Remove special characters from starting sentence
    remove_startswiths = [" ", ",", "!", ".", "?", "."]
    for start_string in remove_startswiths:
        if utterance.startswith(start_string):
            utterance = utterance[1:]

    # Remove final whitespaces (e.g. double space)
    utterance = re.sub(' +', ' ', utterance)
    # Remove trailing whitespaces before punctuation
    utterance = re.sub(r'\s+([?.!"])', r'\1', utterance)

    # remove extra spaces
    utterance = utterance.lstrip().rstrip().lower().replace("[]", "").replace("]", "")
    utterance = re.sub(' +|\t', ' ', utterance)
    utterance = re.sub(' ,', ',', utterance)

    return utterance

In [14]:
dets = {'Art': ['a', 'an', 'the', ''],
           'Dem': ['this', 'that', 'these', 'those', ''],
           'Poss': ['my', 'your', 'his', 'her', 'its', 'our', 'their', '']}

In [15]:
def det_sub(x):
    for _, det in dets.items():
        if x.lower() in det:
            y = [j for j in det if x!=j]
            return random.choice(y)
    return ""

In [16]:
def get_truncnorm(mean, std, min, max):
    # mean, std, min, max parameters dependent on their extracted normal
    # distributions
    a, b = (min - mean) / std, (max - mean) / std
    return stats.truncnorm(a, b, loc=mean, 
                                   scale=std).rvs(size=1)[0]

In [17]:
def get_curr_nv_ratio(nouns, verbs):
    if len(nouns) != 0 and len(verbs) != 0:
        curr_ratio_nv = len(nouns)/len(verbs)
    else:
        curr_ratio_nv = 0
    return curr_ratio_nv

In [18]:
def get_alt_word(tok):
    # get a new word which is the same length as old word
    # to simulate p:w errors
    possible_words = [x for x in d.suggest(tok.text) if len(x) ==
                      len(tok.text) and d.check(x) and x != tok.text] 
    if possible_words:
        new_word = random.choice(possible_words)
        if new_word:
            return new_word
    
    # if we cannot find new word just return current word
    return tok.text

In [19]:
# TODO: add s:uk, s:r and s:per errors 
def aphasic_speech(text):
    doc = nlp(text)
    # max length of modified sentences (log norm dist with log(27) being longest)
    # determine how long this sentence will be
    n = np.random.lognormal(mean=0.9162787, sigma=0.7350323)
    while n > 3.295:
        n = np.random.lognormal(mean=0.9162787, sigma=0.7350323)
        
    utt = ""
    # length of original text
    length = len(re.findall("[a-zA-Z_]+", text))
    
    if length != 0:
        # get possible n/v ratio for this sentence
        # n/v ratio is log-norm
        ratio_nv = get_truncnorm(1.461698, 1, 0.228, 6.818)
        # get the possible percentage of all POS
        percent_noun = get_truncnorm(20.68179, 9.794099, 5.991, 50)
        percent_verb = get_truncnorm(16.59888, 5.036936, 0, 26.267)
        percent_det = get_truncnorm(7.494686, 5.927496, 0, 27.79661)
        percent_prep = get_truncnorm(3.104844, 2.325505, 0, 15.05682)
        percent_adj = get_truncnorm(4.34104, 3.509376, 0, 21.05263)
        percent_adv = get_truncnorm(5.751157, 2.958618, 0, 15.88448)
        nouns = []
        verbs = []
        determiners = []
        prepositions = []
        adjectives = []
        adverbs = []
        
        # count no. of respective POS
        for tok in doc:
            if tok.pos_ == "NOUN":
                nouns.append(tok.text)
            elif tok.pos_ == "VERB" or tok.dep_ == "cop" or tok.tag_ in ["VBD", "VBN"]:
                verbs.append(tok.text)
            elif tok.dep_ == "det":
                determiners.append(tok.text)
            elif tok.dep_ == "prep":
                prepositions.append(tok.text)
            elif tok.pos_ == "ADJ":
                adjectives.append(tok.text)
            elif tok.pos_ == "ADV":
                adverbs.append(tok.text)
                
        for tok in doc:
            # current percentage of nouns and verbs in broca utterance
            curr_ratio_nv = get_curr_nv_ratio(nouns, verbs)
            
            # Handle nouns
            if tok.pos_ == "NOUN": 
                # if possible noun percent in sentence less than current
                # percent or if current n/v ratio is too big, remove noun 
                # from sentence
                errs = ["m0", "m1", "p", "none"]
                err = random.choice(errs)
                if (percent_noun <= (len(nouns)/length) * 100 
                        or curr_ratio_nv > ratio_nv) :
                    utt += ' '
                    nouns.remove(tok.text)
                # m:0s:a, m:+s, p:w errors (equally)
                elif err in ["m0", "m1"]:
                    if "Plur" in tok.morph.get("Number"):
                        utt += singularize(tok.text) + ' '
                    elif "Sing" in tok.morph.get("Number"):
                        utt += pluralize(tok.text) + ' '  
                elif err == "p":
                    utt += get_alt_word(tok) + ' '
                else:
                    utt += tok.text + ' '
                    
            # Handle verbs (copula and gerund/participles counted as verb)
            elif  tok.pos_ == "VERB" or tok.dep_ == "cop" or tok.tag_ in ["VBD", "VBN"]:
                errs = ["m0", "mv", "p", "none"]
                err = random.choice(errs)
                # if possible verb percent in sentence less than current 
                # percent or if current n/v ratio too big remove noun 
                # from sentence
                if (percent_verb <= (len(verbs)/length) * 100 
                        or curr_ratio_nv > ratio_nv):
                   utt += ' '
                   verbs.remove(tok.text)
                # m:03s:a, m:vsg:a and p:w error equally 
                elif err in ["m0", "mv"]:
                    # lemmatize reg+irr 3rd sing
                    if '3' in tok.morph.get("Person") and 'Sing' in tok.morph.get("Number"):
                        utt += tok.lemma_ + " "
                    else:
                        utt += tok.text + " "
                elif err == "p":
                    utt += get_alt_word(tok) + ' '
                else:
                    utt += tok.text + ' '
            
            # Handle determiners (pronouns are not determiners)
            elif tok.dep_ == "det":
                x = np.random.uniform(0,1)
                if percent_det <= (len(determiners)/length) * 100:
                   utt += ' '
                   determiners.remove(tok.text)
                # s:r:gc:pro error 50% times
                elif x <= 0.5:
                    if tok.pos_ == "DET" or "Dem" in tok.morph.get('PronType') or "Yes" in tok.morph.get('Poss'):
                        utt += det_sub(tok.text) + " "
                    else:
                        utt += tok.text + " "
                else:
                    utt += tok.text + ' '

            # Handle prepositions
            elif tok.dep_ == "prep":
                if percent_prep <= (len(prepositions)/length) * 100:
                   utt += ' '
                   prepositions.remove(tok.text)
                else:
                    utt += tok.text + ' '
            
            # Handle adjectives                 
            elif tok.pos_ == "ADJ":
                if percent_adj <= (len(adjectives)/length) * 100:
                   utt += ' '
                   adjectives.remove(tok.text)
                else:
                    utt += tok.text + ' '
            
            # Handle adverbs
            elif tok.pos_ == "ADV":
                errs = ["p", "none"]
                err = random.choice(errs)
                if percent_adv <= (len(adverbs)/length) * 100:
                   utt += ' '
                   adverbs.remove(tok.text)
                # p:w error
                elif err == "p":
                    utt += get_alt_word(tok) + ' '
                else:
                    utt += tok.text + ' '
            
            # Handle particles ('s, not etc)
            elif tok.pos_ == "PART":
                x = np.random.uniform(0,1)
                # m:0s error 50% times
                # missing plural suffix
                if tok.text.startswith("'") or tok.text.startswith("n't"):
                    if x <= 50:
                        utt = utt[:-1] + tok.text + ' ' 
                    else:
                        utt = utt[:-1] + ' ' 
                else:
                    utt += tok.text + ' '
            
            # Handle auxillaries ('ve in i have)
            elif tok.pos_ == "AUX":
                if tok.text.startswith("'"):
                    utt = utt[:-1] + tok.text+ ' '
                else:
                    utt += tok.text + ' '
            
            # Handling punctuation (like :, .)
            elif tok.pos == "PUNCT":                
                utt = utt[:-1] + tok.text+ ' '
                
            # all other words with respective POS unaffected
            else:
                utt += tok.text + ' '

        utt = " ".join(utt.split()) # remove trailing whitespaces
        utt = re.sub(r'\s+([?.!",])', r'\1', utt)
        # print("Possible utt: ", utt)
        
        # only return sentences which are short enough and are changed
        if (np.exp(n) >= len(re.findall("[a-zA-Z_]+", utt)) 
                and len(re.findall("[a-zA-Z_]+", utt)) <= 27):
            # print("Possible utt1: ", utt)
            if utt != text:
                return utt, True
            else: 
                return utt, False
        else:
            return '', False
        
    # skipped sentence due to length = 0
    else:
        return "", False
    
def augment(filepath, save_path, include_canonical=False):
    pass

# Test area

In [44]:
para = """I received the brass that you sent me, and all looks well. Thank you very much for all your trouble and the extra 3 pieces. I feel that you have an outstanding company and are striving the best that you can to achieve customer satisfaction. I will be certain to tell my friends about US Reloading Supply. ~ Michelle Admin Note: By accident we shorted them two shells, so we sent them replacements."""

from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(para)
for sent in sentences:
    print("Original sentence: ", sent)
    aphasic, changed = aphasic_speech(sent)
    if changed:
        print("Final utter:", " ".join(aphasic.split()).strip())
        print("-----------------------------------------------")
    else:
        print("-----------------------------------------------")

Original sentence:  I received the brass that you sent me, and all looks well.
Final utter: I brasses that you me, and all loops.
-----------------------------------------------
Original sentence:  Thank you very much for all your trouble and the extra 3 pieces.
-----------------------------------------------
Original sentence:  I feel that you have an outstanding company and are striving the best that you can to achieve customer satisfaction.
-----------------------------------------------
Original sentence:  I will be certain to tell my friends about US Reloading Supply.
Final utter: I will be certain to my friends US Reloading Supply.
-----------------------------------------------
Original sentence:  ~ Michelle Admin Note: By accident we shorted them two shells, so we sent them replacements.
-----------------------------------------------


## Test on some of the IMDB dataset

In [23]:
from datasets import load_dataset
from nltk.tokenize import sent_tokenize

ds = load_dataset("imdb")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [73]:
texts = ds["train"]["text"]
sents = []
for text in texts:
    t = re.sub(r'\<.*?\>', " ", text)
    sentences = sent_tokenize(t)
    for sent in sentences:
        sents.append(" ".join(sent.split()))
test_sents = sents[2000:4000]

In [74]:
test_sents

["And then there's the obligatory male-fantasy of an attractive straight woman suddenly deciding to give lesbianism a try -- PLEASE.",
 'Not only do I wish I could get my money back for the DVD rental, I also want those 112 minutes of my life back.',
 'What a ripoff.',
 'I was gifted with this movie as it had such a great premise, the friendship of three women bespoiled by one falling in love with a younger man.',
 'Intriguing.',
 'NOT!',
 'I hasten to add.',
 'These women are all drawn in extreme caricature, not very supportive of one another and conspiring and contriving to bring each other down.',
 "Anna Chancellor and Imelda Staunton could do no wrong in my book prior to seeing this, but here they are handed a dismal script and told to balance the action between slapstick and screwball, which doesn't work too well when the women are all well known professionals in a very small town.",
 'And for intelligent women they spend a whole pile of time bemoaning the lack of men/sex/lust in 

In [75]:
aphasic_sents = []
for sent in test_sents:
    s = preprocess(sent)
    aphasic, changed = aphasic_speech(s)
    if changed and aphasic !=".":
        print(aphasic)
        aphasic_sents.append(aphasic)

and then there 's an attractive straight suddenly decoding to five lesbianism a tries please.
i to.
, a cigarettes!
like i said, waste.
4 1
is first or So, as it to be a sexes and the pity.
macdowell a pat otherwise are and comedies, tarttongued anna chancellor.
46
i don't.
the... well, i don't knob what their rote is supposes to be.
maria's actings is that lacks feeling, her moment seem.
she seem to be to an cameras.
i'm sorry, i'd rather had watched it comport my rood.
it is and movie.it could have been way if it had stories and more damar.
my is everywhere, i don't any impression and effect a movies.
, it some parts to recommend.
parts war.
also, they wert lying field, conversations express their laves were and passionate.
a tudor giurgiu was amc theater sunday 1200pms 081006, with us watched the movie.
it doesn't make sense.
, it parts to recommend.
parts be.
alex willing to the ears piercing to appreciate kik's lore.
also, the they a, the conversations express their love wear and 

In [76]:
pd.DataFrame(data={"modified": aphasic_sents}).to_csv("data/synthetic.csv", sep=",", index=False)

# Post process of aphasic sentences
Also adding some "control" sentences

In [77]:
import pandas as pd
from datasets import load_dataset

# ds = load_dataset('stas/c4-en-10k')
ds = load_dataset("imdb")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [78]:
df = pd.read_csv("data/synthetic.csv")

In [79]:
sentences = df['modified']

In [80]:
sentences

0      and then there 's an attractive straight sudde...
1                                                  i to.
2                                        , a cigarettes!
3                                    like i said, waste.
4                                                    4 1
                             ...                        
772                                      man, one chick.
773                       it's not supposes it's movies.
774                                          oh, pain...
775                                   .okay, not all it.
776                                                    !
Name: modified, Length: 777, dtype: object

In [85]:
import re
broca_sents = []
for sent in sentences:
    if isinstance(sent, str):
        if not sent.rstrip().isdigit() and len(sent.rstrip()) >= 1:
            sent = re.sub(r'\s+([?.!",;\'])', r'\1', sent)
            sent = re.sub(r"^\W+", "", sent)
            sent = re.sub(r"'\W+'", "", sent)
            sent = re.sub(r"\(\s*(.*?)\s*\)",r'(\1)', sent)
            sent = sent.lower()
            broca_sents.append(" ".join(sent.split()))

In [86]:
broca_sents

["and then there's an attractive straight suddenly decoding to five lesbianism a tries please.",
 'i to.',
 'a cigarettes!',
 'like i said, waste.',
 '4 1',
 'is first or so, as it to be a sexes and the pity.',
 'macdowell a pat otherwise are and comedies, tarttongued anna chancellor.',
 "i don't.",
 "the... well, i don't knob what their rote is supposes to be.",
 "maria's actings is that lacks feeling, her moment seem.",
 'she seem to be to an cameras.',
 "i'm sorry, i'd rather had watched it comport my rood.",
 'it is and movie.it could have been way if it had stories and more damar.',
 "my is everywhere, i don't any impression and effect a movies.",
 'it some parts to recommend.',
 'parts war.',
 'also, they wert lying field, conversations express their laves were and passionate.',
 'a tudor giurgiu was amc theater sunday 1200pms 081006, with us watched the movie.',
 "it doesn't make sense.",
 'it parts to recommend.',
 'parts be.',
 "alex willing to the ears piercing to appreciate 

In [87]:
# preprocess and post process in same way
from nltk.tokenize import sent_tokenize
texts = ds["train"]["text"]
sents = []
for text in texts:
    t = re.sub(r'\<.*?\>', " ", text)
    sentences = sent_tokenize(text)
    for sent in sentences:
        if isinstance(sent, str):
            if not sent.rstrip().isdigit() and len(sent.rstrip()) >= 1:
                sent = preprocess(sent)
                sent = re.sub(r'\s+([?.!",;\'])', r'\1', sent)
                sent = re.sub(r'\s+([?.!",;\']){2,}|([?.!",;\']){2,}', r'\2', sent)
                sent = re.sub(r"^\W+", "", sent)
                sent = re.sub(r"'\W+'", "", sent)
                sent = re.sub(r"\(\s*(.*?)\s*\)",r'(\1)', sent)
                sent = sent.lower()
                sents.append(" ".join(sent.split()))

In [93]:
control_sents = sents[4000:4000+len(broca_sents)]

In [94]:
control_sents

['at first evelyn is reluctant but is persuaded when it is agreed stage hand samuel leo valeriano goes along as well.',
 "once there count marnack tells his guests that his father grandfather both cut the heads off their cheating wives using a ceremonial knife there's a feeling of unease when it turns out that evelyn looks exactly the same as the current count's wife who ran away not too long ago.",
 'along with having to worry about weirdo servants it turns out that someone wants to use the knife themselves to chop a few heads off.br br this italian production was written directed by alfredo rizzo is total, complete utter crap from start to finish.',
 "first things first lets start the criticism with the title the bloodsucker leads the dance, lets examine that title because when i do i feel somewhat cheated that there aren't any vampires or any form of bloodsucking whatsoever, noone'leads' anything at anytime there most certainly isn't any dance or dancing so don't expect any of these

In [95]:
print(len(broca_sents), len(control_sents))

775 775


In [96]:
broca_data = pd.DataFrame(data={"modified": broca_sents, "label": [1]*len(broca_sents)})
control_data = pd.DataFrame(data={"modified": control_sents, "label": [0]*len(control_sents)})
data_full_scenario = pd.concat([broca_data, control_data], ignore_index=True)
data_full_scenario = data_full_scenario.sample(frac=1).reset_index(drop=True)
data_full_scenario.to_csv("data/gen_data.csv", sep=",", index=False)

In [97]:
data_full_scenario

Unnamed: 0,modified,label
0,this movie draws upon three basic areas.,0
1,don't your excuses movie.,1
2,it is bad!,0
3,nick mancuso a his and to a roles in an films.,1
4,his branks.,1
...,...,...
1545,the arab worships oil.,0
1546,it is truly saddening to see an artist's work ...,0
1547,i groups this movies is basically.,1
1548,and.,1
