##### NOTE: In CLAN the C-NNLA command was used to get the distribution parameters. However, error outputs from C-NNLA were not very useful (like % correct ..., % incorrect, % grammatical etc) since some CHA files do not annotate errors like [* p], [* s] (but just replace it with xxx, yyy, zzz), making those measures unreliable

##### However, I am trying to include the 5 most common word errors from the C-NNLA measures 

##### This approach also does not consider any utterance errors/substitutions (marked as @u or [* p:n] in AphasiaBank)

In [1]:
import string

import spacy
import numpy as np
import pandas as pd
import scipy.stats as stats
import random
import re
from pattern.text.en import singularize, pluralize, conjugate
import enchant
from preprocess import preprocess
from spacy.matcher import Matcher
from string import printable

d = enchant.Dict("en_US")
nlp = spacy.load("en_core_web_sm")

In [2]:
# Determiner substitution groups consulted by det_sub(): a word is only ever
# swapped for another member of the group it belongs to. The empty string in
# each group means a substitution may also come out as "no word at all".
dets = {'Art': ['a', 'an', 'the', ''],
           'Dem': ['this', 'that', 'these', 'those', ''],
           'Poss': ['my', 'your', 'his', 'her', 'its', 'our', 'their', '']}

In [3]:
def det_sub(x):
    """Return a random substitute for determiner/pronoun ``x``.

    The substitute is drawn from the same group of ``dets`` that ``x``
    belongs to and is never ``x`` itself. Matching is case-insensitive, so
    capitalised input such as "The" cannot come back as "the".

    Args:
        x: The word to substitute.

    Returns:
        A lower-case word from the same group (possibly the empty string,
        which every group contains), or "" when ``x`` is in no group.
    """
    word = x.lower()
    for group in dets.values():
        if word in group:
            # Compare against the lowered form: filtering on the raw input
            # would leave the word's own lower-case spelling as a candidate.
            candidates = [alt for alt in group if alt != word]
            return random.choice(candidates)
    return ""

In [4]:
def get_truncnorm(mean, std, min, max):
    """Draw one sample from a normal(mean, std) truncated to [min, max].

    The four parameters come from the distribution fitted for the measure
    being sampled.
    """
    # scipy takes the clip points in standard-deviation units around the mean.
    lower = (min - mean) / std
    upper = (max - mean) / std
    distribution = stats.truncnorm(lower, upper, loc=mean, scale=std)
    return distribution.rvs(size=1)[0]

In [5]:
def get_curr_nv_ratio(nouns, verbs):
    """Return the noun-to-verb count ratio, or 0 if either collection is empty."""
    if len(nouns) == 0 or len(verbs) == 0:
        return 0
    return len(nouns) / len(verbs)

In [6]:
def get_alt_word(tok):
    """Return a random same-length dictionary word suggested for ``tok``.

    Used to simulate p:w errors. Candidates are the spell-checker's
    suggestions that have the same length as the token text, are accepted by
    the dictionary, and differ from the token text. When there is no such
    candidate the token text is returned unchanged.
    """
    original = tok.text
    candidates = []
    for suggestion in d.suggest(original):
        if len(suggestion) != len(original):
            continue
        if not d.check(suggestion):
            continue
        if suggestion == original:
            continue
        candidates.append(suggestion)

    if not candidates:
        # nothing suitable: keep the current word
        return original
    return random.choice(candidates)

In [7]:
# Per-error-type thresholds. aphasic_speech() draws a fresh uniform(0, 1)
# value per token for each error type and triggers the error when the draw is
# >= the limit, i.e. with probability 1 - limit (shown in parentheses).
m0sa_lim = 0.7      # m:0s:a             (30%)
ms_lim = 0.7        # m:+s(:a)           (30%)
m0s_lim = 0.7       # m:0s               (30%)
m03_lim = 0.6       # m:03s:a            (40%)
mvsg_lim = 0.6      # m:vsg:a            (40%)
pw_lim = 0.7        # p:w                (30%)
sgc_lim = 0.6       # s:r:gc             (40%)
suk_lim = 0.7       # s:uk               (30%)
sr_lim = 0.6        # s:r (for pronouns) (40%)
rep_lim = 0.4       # repetition of pronouns and interjections (60%)

In [8]:
# Verb-phrase token pattern for the spaCy Matcher: an optional verb, any
# number of adverbs, any number of auxiliaries, then one or more verbs.
vp_pattern = [[{'POS': 'VERB', 'OP': '?'},
               {'POS': 'ADV', 'OP': '*'},
               {'POS': 'AUX', 'OP': '*'},
               {'POS': 'VERB', 'OP': '+'}]]
# Module-level matcher used by check_complexity() to find verb phrases.
matcher = Matcher(nlp.vocab)
matcher.add("Verb phrase", vp_pattern)
# NOTE(review): not referenced anywhere else in the visible code.
aphasic_utt = ""

In [9]:
def check_complexity(doc):
    """Score a parsed sentence's complexity as noun phrases per verb phrase.

    Filter for overly complex sentences, from Misra et al.

    Args:
        doc: A spaCy ``Doc`` (needs ``noun_chunks`` and the module-level
            ``matcher``).

    Returns:
        A ``(ratio, too_complex)`` tuple. ``ratio`` is the number of distinct
        noun-phrase strings divided by the number of verb-phrase matches.
        ``too_complex`` is True when ``ratio > 2`` and a uniform(0, 1) draw
        is <= 0.8. A sentence with no verb-phrase match yields ``(0, True)``.
    """
    # Collect NPs: each noun chunk plus the full subtree span of its root,
    # de-duplicated by their stripped text.
    noun_phrases = set()
    for nc in doc.noun_chunks:
        root = nc.root
        for nop in (nc, doc[root.left_edge.i:root.right_edge.i + 1]):
            noun_phrases.add(nop.text.strip())

    # Collect VPs from the matcher hits.
    verb_phrases = [doc[start:end] for _, start, end in matcher(doc)]

    # Explicit guard instead of catching the division by zero with a bare
    # except, which would also have hidden unrelated errors.
    if not verb_phrases:
        return 0, True

    ratio = len(noun_phrases) / len(verb_phrases)
    X = np.random.uniform(0, 1)

    # Flag as too complex when the ratio is high and the random draw says
    # the sentence should be rejected.
    return ratio, ratio > 2 and X <= 0.8

In [10]:
# Punctuation that must not appear in a sentence handed to aphasic_speech().
# Raw string: the original spelling relied on `\]` being passed through
# unchanged, which is an invalid escape sequence in a normal string literal.
extra = r'"#$%&()*+-/:;<=>@[\]^_`{|}~"'

# Strip those characters from the allowed set in a single pass. What remains
# is digits, ASCII letters, whitespace and the punctuation ! ' , . ?
printable = printable.translate(str.maketrans('', '', extra))

In [11]:
def aphasic_speech(text):
    """Generate a synthetic Broca's-aphasia-style utterance from a sentence.

    Target shares for each part of speech (plus noun/verb and
    open/closed-class ratios) are sampled from fixed gamma and
    truncated-normal distributions. The sentence is then walked token by
    token: a word is dropped while its class exceeds its sampled share, and a
    surviving word may receive a word-level error (number flip on nouns,
    lemmatised 3rd-person-singular verbs, same-length spelling neighbour,
    substitution of demonstrative/possessive pronouns, repetition).

    Args:
        text: The sentence to transform.

    Returns:
        A ``(utterance, changed)`` tuple. ``changed`` is True and
        ``utterance`` holds the generated text when the result has more than
        5 and at most ``n`` words, ``n`` being a sampled length cap in
        [5, 47]. Otherwise ``("", False)`` is returned; this also happens
        when the input has no words, contains characters outside the allowed
        ``printable`` set, has no verbs, has a noun/verb ratio above the
        sampled control value, or runs out of closed-class words.
    """
    doc = nlp(text)

    # Sampled cap on the utterance length, redrawn until it is within [5, 47].
    n = round(np.random.gamma(shape=7.859547, scale=1/1.029798))
    while n > 47 or n < 5:
        n = round(np.random.gamma(shape=7.859547, scale=1/1.029798))

    utt = ""
    # Number of words in the original text; updated as words are dropped or
    # repeated so that the percentages below track the utterance being built.
    length = len(re.findall("[a-zA-Z_]+", text))

    # skipped sentence due to original length = 0
    if length == 0:
        return "", False

    # do not modify sentences with special characters
    if set(text).difference(printable):
        return "", False

    # get possible n/v ratio for this sentence
    ratio_nv = np.random.gamma(shape=2.180031, scale=1/1.498104)

    # get the possible percentage of all POS
    # values below are %-ages, not ratios
    # noun and verb distributions are gamma, rest are truncated normal
    percent_noun = np.random.gamma(shape=4.0047683, scale=1/0.1944749)
    percent_verb = np.random.gamma(shape=9.9920204, scale=1/0.5973042)
    open_close = np.random.gamma(shape=4.99415, scale=1/3.558095)
    percent_det = get_truncnorm(7.55312, 6.004386, 0, 27.79661)
    percent_prep = get_truncnorm(3.15664, 2.386052, 0, 15.05682)
    percent_adj = get_truncnorm(4.258013, 3.460436, 0, 21.05263)
    percent_adv = get_truncnorm(5.808547, 2.911826, 0, 15.88448)

    nouns = []
    verbs = []
    determiners = []
    prepositions = []
    adjectives = []
    adverbs = []
    interjections = []
    nv_control = np.random.gamma(shape=49.65696, scale=1/50.32268)

    # count no. of respective POS (same branch order as the rewrite loop
    # below, so every later .remove() finds the word it is looking for)
    for tok in doc:
        if tok.pos_ == "NOUN":
            nouns.append(tok.text)
        elif tok.pos_ == "VERB" or tok.dep_ == "cop" or tok.tag_ in ["VBD", "VBN"]:
            verbs.append(tok.text)
        # det:art and det:dem only
        elif tok.dep_ == "det" and ("Dem" in tok.morph.get('PronType') or "Art" in tok.morph.get('PronType')):
            determiners.append(tok.text)
        elif tok.dep_ == "prep":
            prepositions.append(tok.text)
        elif tok.pos_ == "ADJ":
            adjectives.append(tok.text)
        elif tok.pos_ == "ADV":
            adverbs.append(tok.text)
        elif tok.pos_ == "INTJ":
            interjections.append(tok.pos_)

    if len(verbs) == 0:
        return "", False

    # reject sentences whose noun/verb ratio exceeds the sampled control
    if nv_control < len(nouns)/len(verbs):
        return "", False

    for tok in doc:
        # current n/v ratio in the broca utterance
        curr_ratio_nv = get_curr_nv_ratio(nouns, verbs)

        open_class_num = len(nouns) + len(verbs) + len(adjectives) + len(adverbs)
        closed_class_num = length - open_class_num - len(interjections)

        # if length is 0 then no utterance; closed_class_num is also used as
        # a divisor below
        if length == 0 or closed_class_num == 0:
            return "", False

        # One draw per error type for this token.
        m0sa_prob = random.uniform(0, 1)     # m:0s:a
        ms_prob = random.uniform(0, 1)       # m:+s(:a)
        m0s_prob = random.uniform(0, 1)      # m:0s
        m03_prob = random.uniform(0, 1)      # m:03s:a
        mvsg_prob = random.uniform(0, 1)     # m:vsg:a
        pw_prob = random.uniform(0, 1)       # p:w
        sgc_prob = random.uniform(0, 1)      # s:r:gc
        suk_prob = random.uniform(0, 1)      # s:uk
        sr_prob = random.uniform(0, 1)       # s:r (for pronouns)
        rep_prob = random.uniform(0, 1)      # repetition of pronouns and
                                             # interjections
        remove = None
        add = False

        # Too many open-class words: pick one open class to thin out.
        if open_close < open_class_num/closed_class_num:
            remove = np.random.choice(["NOUN", "VERB", "ADJ", "ADV"])
        # Too few: allow interjections to be doubled.
        if open_close > open_class_num/closed_class_num:
            add = True

        # Handle nouns
        if tok.pos_ == "NOUN":
            # if possible noun percent in sentence less than current
            # percent or if current n/v ratio is too big, remove noun
            # from sentence
            if (percent_noun <= (len(nouns)/length) * 100
                    or curr_ratio_nv > ratio_nv or remove == tok.pos_):
                utt += ' '
                length -= 1
                nouns.remove(tok.text)
            # m:0s:a, m:+s errors: flip the grammatical number
            elif m0sa_prob >= m0sa_lim or ms_prob >= ms_lim:
                number = tok.morph.get("Number")
                if "Plur" in number:
                    utt += singularize(tok.text) + ' '
                elif "Sing" in number:
                    utt += pluralize(tok.text) + ' '
                else:
                    # No number feature: keep the noun as-is instead of
                    # silently losing it while it is still being counted.
                    utt += tok.text + ' '
            # p:w, s:uk errors
            elif pw_prob >= pw_lim or suk_prob >= suk_lim:
                utt += get_alt_word(tok) + ' '
            else:
                utt += tok.text + ' '

        # Handle verbs (copula and gerund/participles counted as verb)
        elif tok.pos_ == "VERB" or tok.dep_ == "cop" or tok.tag_ in ["VBD", "VBN"]:
            # if possible verb percent in sentence less than current
            # percent, or verbs were picked for thinning, remove the verb
            if (percent_verb <= (len(verbs)/length) * 100 or
                    remove == tok.pos_):
                utt += ' '
                length -= 1
                verbs.remove(tok.text)

            # m:03s:a, m:vsg:a error
            elif m03_prob >= m03_lim or mvsg_prob >= mvsg_lim:
                # lemmatize reg+irr 3rd sing
                if '3' in tok.morph.get("Person") and 'Sing' in tok.morph.get("Number"):
                    utt += tok.lemma_ + " "
                else:
                    utt += tok.text + " "
            # p:w error
            elif pw_prob >= pw_lim:
                utt += get_alt_word(tok) + ' '
            else:
                x = np.random.uniform(0, 1)
                if x >= 0.5:
                    # Fall back to the surface form should the conjugator
                    # come back with nothing for this word.
                    utt += (conjugate(tok.text, '3sg') or tok.text) + ' '
                else:
                    utt += tok.text + ' '

        # Handle determiners (art and dem)
        elif tok.dep_ == "det" and ("Dem" in tok.morph.get('PronType') or "Art" in tok.morph.get('PronType')):
            # if possible determiner percent in sentence less than current,
            # remove determiner
            if percent_det <= (len(determiners)/length) * 100:
                utt += ' '
                length -= 1
                determiners.remove(tok.text)
            else:
                utt += tok.text + ' '

        # Handle pronouns
        elif tok.pos_ == "PRON":
            # s:r:gc:pro and s:r error (same for pronouns)
            # but clan uses both versions
            if sgc_prob >= sgc_lim or sr_prob >= sr_lim:
                if "Dem" in tok.morph.get('PronType') or "Yes" in tok.morph.get('Poss'):
                    sub = det_sub(tok.text)
                    utt += sub + " "
                    if rep_prob >= rep_lim:
                        length += 1
                        utt += sub + " "
                else:
                    if rep_prob >= rep_lim:
                        length += 1
                        utt += tok.text + " "
                    utt += tok.text + " "
            else:
                if rep_prob >= rep_lim:
                    length += 1
                    utt += tok.text + " "
                utt += tok.text + " "

        # Handle prepositions
        elif tok.dep_ == "prep":
            if percent_prep <= (len(prepositions)/length) * 100:
                utt += ' '
                length -= 1
                prepositions.remove(tok.text)
            elif suk_prob >= suk_lim:
                utt += get_alt_word(tok) + ' '
            else:
                utt += tok.text + ' '

        # Handle adjectives
        elif tok.pos_ == "ADJ":
            if (percent_adj <= (len(adjectives)/length) * 100 or
                    remove == tok.pos_):
                utt += ' '
                length -= 1
                adjectives.remove(tok.text)
            elif suk_prob >= suk_lim:
                utt += get_alt_word(tok) + ' '
            else:
                utt += tok.text + ' '

        # Handle adverbs
        elif tok.pos_ == "ADV":
            if (percent_adv <= (len(adverbs)/length) * 100
                    or remove == tok.pos_):
                utt += ' '
                length -= 1
                adverbs.remove(tok.text)
            # p:w and s:uk errors
            elif pw_prob >= pw_lim or suk_prob >= suk_lim:
                utt += get_alt_word(tok) + ' '
            else:
                utt += tok.text + ' '

        # Handle particles ('s, not etc)
        elif tok.pos_ == "PART":
            if tok.text.startswith("'s"):
                # m:0s: the 's is glued onto the previous word only when its
                # draw clears the limit; otherwise it is left out.
                if m0s_prob >= m0s_lim:
                    utt = utt[:-1] + tok.text + ' '
            elif tok.text.startswith(("'", "n't", "nt", "v'e", "ve")):
                # other contracted particles are dropped
                utt = utt[:-1] + ' '
            else:
                utt += tok.text + ' '

        # Handle auxiliaries ('ve in i have)
        elif tok.pos_ == "AUX":
            if tok.text.startswith("'"):
                # contracted auxiliary: attach to the previous word
                utt = utt[:-1] + tok.text + ' '
            else:
                utt += tok.text + ' '

        # Handling punctuation (like :, .): attach to the previous word.
        # Compare the string tag `pos_`; `pos` is an integer id and never
        # equals a string, which made this branch unreachable.
        elif tok.pos_ == "PUNCT":
            utt = utt[:-1] + tok.text + ' '

        # Interjections may be doubled (same `pos_` vs `pos` fix as above).
        elif tok.pos_ == "INTJ":
            if rep_prob >= rep_lim or add:
                length += 1
                interjections.append(tok.pos_)
                utt += tok.text + " "
            utt += tok.text + " "

        # every other token is copied through unchanged
        else:
            utt += tok.text + ' '

    utt = " ".join(utt.split())  # collapse repeated / trailing whitespace
    utt = re.sub(r'\s+([?.!",])', r'\1', utt)  # no space before punctuation

    # only return sentences which are short enough
    word_count = len(re.findall("[a-zA-Z_]+", utt))
    if 5 < word_count <= n and word_count <= 47:
        return utt, True
    return '', False

# Test area

In [12]:
para = """I received the brass that you sent me. Thank you very much for all your trouble and the extra 3 pieces. I feel that you have an outstanding company and are striving the best that you can to achieve customer satisfaction. I will be certain to tell my friends about US Reloading Supply."""

from nltk.tokenize import sent_tokenize

# Smoke test: run every sentence of the sample paragraph through the
# generator and show the result when one was produced.
sentences = sent_tokenize(para)
for sent in sentences:
    print("Original sentence: ", sent)
    aphasic, changed = aphasic_speech(sent)
    if changed:
        print("Final utter:", " ".join(aphasic.split()).strip())
    # the separator is printed whether or not an utterance came back
    print("-----------------------------------------------")

Original sentence:  I received the brass that you sent me.
-----------------------------------------------
Original sentence:  Thank you very much for all your trouble and the extra 3 pieces.
-----------------------------------------------
Original sentence:  I feel that you have an outstanding company and are striving the best that you can to achieve customer satisfaction.
-----------------------------------------------
Original sentence:  I will be certain to tell my friends about US Reloading Supply.
-----------------------------------------------


In [13]:
# Print the text and coarse-grained POS tag spaCy assigns to each token.
for x in nlp("Ruhis notebook"):
    print(x.text, x.pos_)

Ruhis ADJ
notebook NOUN


## Test on some of the C4 dataset (`datablations/c4-filter-small`)

In [14]:
from datasets import load_dataset
from nltk.tokenize import sent_tokenize

# Load the source-text dataset and print its splits and columns.
ds = load_dataset("datablations/c4-filter-small")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'timestamp', 'url', 'meta', 'text_length', 'domain', 'perplexity', 'dup_ratio', 'pairs', 'repetitions', 'cluster'],
        num_rows: 100000
    })
})


In [15]:
# Flatten the training texts into a list of sentences, blanking out anything
# that looks like an <...> tag and skipping empty or digits-only sentences.
texts = ds["train"]["text"]
sents = []
for text in texts:
    text = re.sub(r'\<.*?\>', " ", text)
    sentences = sent_tokenize(text)
    for sent in sentences:
        if not isinstance(sent, str):
            continue
        sent = re.sub(r'\<.*?\>', " ", sent)
        if len(sent.rstrip()) < 1 or sent.rstrip().isdigit():
            continue
        sents.append(sent)

In [16]:
# Number of source sentences handed to the aphasic generator; the control
# sentences are later taken from the sentences after this index.
total_num_sents = 40000 # 1000
test_sents = sents[:total_num_sents]

In [18]:
# Parallel lists: aphasic_sents[i] was generated from normal_sents[i].
aphasic_sents = []
normal_sents = []

import inflect
from preprocess import postprocess
p = inflect.engine()
count = 0

for sent in test_sents:
    # no digits like in aphasiabank: spell every number out. Each match is
    # substituted in place; replacing by digit string would also rewrite that
    # string inside longer numbers (e.g. "1 of 10" -> "one of one0").
    sent = re.sub("[0-9]+", lambda m: p.number_to_words(m.group()), sent)

    s = preprocess(sent)
    aphasic, changed = aphasic_speech(s)
    count += 1
    if count % 1000 == 0:
        print(f"Processed {count} sentences, broca utts: {len(aphasic_sents)}")
    # keep only accepted utterances that are more than a lone full stop
    if changed and aphasic != ".":
        normal_sents.append(s)
        aphasic_sents.append(aphasic)

Processed 1000 sentences, broca utts: 44
Processed 2000 sentences, broca utts: 92
Processed 3000 sentences, broca utts: 126
Processed 4000 sentences, broca utts: 168
Processed 5000 sentences, broca utts: 197
Processed 6000 sentences, broca utts: 240
Processed 7000 sentences, broca utts: 269
Processed 8000 sentences, broca utts: 306
Processed 9000 sentences, broca utts: 358
Processed 10000 sentences, broca utts: 403
Processed 11000 sentences, broca utts: 432
Processed 12000 sentences, broca utts: 475
Processed 13000 sentences, broca utts: 523
Processed 14000 sentences, broca utts: 549
Processed 15000 sentences, broca utts: 586
Processed 16000 sentences, broca utts: 618
Processed 17000 sentences, broca utts: 636
Processed 18000 sentences, broca utts: 669
Processed 19000 sentences, broca utts: 714
Processed 20000 sentences, broca utts: 746
Processed 21000 sentences, broca utts: 785
Processed 22000 sentences, broca utts: 830
Processed 23000 sentences, broca utts: 854
Processed 24000 senten

# Post-processing of the aphasic sentences
This section also adds some "control" sentences.

In [None]:
# import pandas as pd
# from datasets import load_dataset
# 
# # ds = load_dataset('stas/c4-en-10k')
# ds = load_dataset("imdb")
# print(ds)

In [19]:
# Aliases for the generated utterances and their source sentences; the two
# lists are index-aligned.
sentences = aphasic_sents
original = normal_sents

In [20]:
import re
import string
from preprocess import postprocess

# Post-process every generated utterance and keep only the pairs whose
# post-processed text is non-empty, so both lists stay index-aligned.
kept_pairs = [
    (cleaned, source)
    for cleaned, source in (
        (postprocess(utterance), source_sent)
        for utterance, source_sent in zip(sentences, original)
    )
    if cleaned != ""
]
broca_sents = [cleaned for cleaned, _ in kept_pairs]
original_sents = [source for _, source in kept_pairs]

In [21]:
# Persist the aligned (modified, original) sentence pairs.
pd.DataFrame(data={"modified": broca_sents, "original": original_sents}).to_csv("data/test.csv", sep=",", index=False)

In [22]:
# Reload only the "modified" column; broca_sents is a pandas Series from
# here on rather than a list.
broca_sents = pd.read_csv("data/test.csv", sep=",")["modified"]

In [23]:
import inflect
from preprocess import preprocess, postprocess
p = inflect.engine()
import re

# post process and pre-process in same way
from nltk.tokenize import sent_tokenize
new_sents = []
# Control sentences come from the part of the corpus that was not fed to the
# aphasic generator.
sentences = sents[total_num_sents:]
# Collect 2.5755 control sentences per aphasic one; computed once instead of
# on every iteration.
num_control_sents = round(len(broca_sents)*2.5755)
for sent in sentences:
    if len(new_sents) >= num_control_sents:
        break
    if isinstance(sent, str):
        sent = re.sub(r'\<.*?\>', " ", sent)
        if not sent.rstrip().isdigit() and len(sent.rstrip()) >= 1:
            # no digits: spell every number out. Each match is substituted in
            # place; replacing by digit string would also rewrite that string
            # inside longer numbers (e.g. "1 of 10" -> "one of one0").
            sent = re.sub("[0-9]+", lambda m: p.number_to_words(m.group()), sent)
            sent = preprocess(sent)
            sent = postprocess(sent)
            if sent != "":
                new_sents.append(sent)

In [24]:
# Number of control sentences collected.
len(new_sents)

3603

In [25]:
# Number of aphasic sentences reloaded from the CSV.
len(broca_sents)

1399

In [26]:
# NOTE(review): the collection loop stops once new_sents reaches
# round(len(broca_sents)*2.5755) entries, so this upper bound (which also
# adds total_num_sents) is never reached and the slice copies the whole list.
control_sents = new_sents[:total_num_sents+round(len(broca_sents)*2.5755)]

In [27]:
# Show the aphasic vs. control sentence counts side by side.
print(len(broca_sents), len(control_sents))

1399 3603


In [28]:
# Build the labelled dataset: label 1 for the generated aphasic sentences,
# label 0 for the control sentences.
broca_data = pd.DataFrame(data={"preprocessed_text": broca_sents, "label": [1]*len(broca_sents)})
control_data = pd.DataFrame(data={"preprocessed_text": control_sents, "label": [0]*len(control_sents)})
data_full_scenario = pd.concat([broca_data, control_data], ignore_index=True)
# sample(frac=1) returns every row in random order, i.e. shuffles the two
# classes together; the index is then renumbered from 0.
data_full_scenario = data_full_scenario.sample(frac=1).reset_index(drop=True)
# data_full_scenario.to_csv("data/synthetic_clan_test.csv", sep=",", index=False)
data_full_scenario.to_csv("data/test_merge.csv", sep=",", index=False)

In [29]:
# Display the shuffled, labelled dataset.
data_full_scenario

Unnamed: 0,preprocessed_text,label
0,as i hoped they would be.,1
1,i walked away from the dinner party thinking i...,0
2,how do we we usu them?,1
3,we are committed to efficient safe and simple ...,0
4,​​picking winners is our business ​​copyright ...,0
...,...,...
4997,let us keep the good ones around.,0
4998,get your original tractor or combine parts her...,0
4999,on may zeroth of zerozero call it arson releas...,0
5000,in june zero the united states marine corps an...,0
