##### NOTE: The distribution parameters were obtained with the C-NNLA command in CLAN. However, the error-related outputs of C-NNLA (e.g. % correct, % incorrect, % grammatical) were not very useful: some CHA files do not annotate errors such as [* p] or [* s] and instead just replace the word with xxx, yyy or zzz, which makes those measures unreliable.

##### However, I am trying to include the 5 most common word errors from the C-NNLA measures 

##### This approach also does not consider any utterance errors/substitutions (marked as @u or [* p:n] in AphasiaBank)

##### This approach adds word repetition (no bigram repetition), keeps at least 60% of the original sentence (a modified sentence must have at least 5 and at most 20 words), does not modify sentences containing special characters, makes sure the result has at least as many nouns as verbs, and applies the NP/VP ratio filter used by Misra et al.

In [57]:
import string

import spacy
import numpy as np
import pandas as pd
import scipy.stats as stats
import random
import re
from pattern.text.en import singularize, pluralize
import enchant
from preprocess import preprocess
from spacy.matcher import Matcher
from string import printable

# Shared, module-level NLP resources used by the helper functions below.
# English spell-checking dictionary (used by get_alt_word for suggestions).
d = enchant.Dict("en_US")
# Small English spaCy pipeline used for tagging, parsing and noun chunks.
nlp = spacy.load("en_core_web_sm")
# Token matcher sharing the pipeline vocabulary; the verb-phrase pattern is
# registered on it inside aphasic_speech.
matcher = Matcher(nlp.vocab)


In [58]:
# Determiner substitution classes keyed by type (article, demonstrative,
# possessive). Each list also contains the empty string, so det_sub can
# pick "no determiner at all" as the substitute.
dets = {
    "Art": ["a", "an", "the", ""],
    "Dem": ["this", "that", "these", "those", ""],
    "Poss": ["my", "your", "his", "her", "its", "our", "their", ""],
}

In [59]:
def det_sub(x):
    """Return a random substitute for determiner ``x`` from its own class.

    The classes come from the module-level ``dets`` mapping. Matching is
    case-insensitive, and the substitute is guaranteed to differ from the
    lower-cased input, so "The" can never be "replaced" by "the".

    Parameters
    ----------
    x : str
        Surface form of the word to substitute.

    Returns
    -------
    str
        Another member of the same class (possibly the empty string, which
        means the determiner is dropped), or ``""`` when ``x`` is empty or
        does not belong to any class.
    """
    word = x.lower()
    if not word:
        # The empty string is only the "drop" placeholder inside each class,
        # not a determiner that should itself be substituted.
        return ""
    for candidates in dets.values():
        if word in candidates:
            # Filter on the lower-cased form, the same form used for the
            # membership test above.
            alternatives = [cand for cand in candidates if cand != word]
            return random.choice(alternatives)
    return ""

In [60]:
def get_truncnorm(mean, std, min, max):
    """Draw one sample from a normal distribution truncated to [min, max].

    ``mean`` and ``std`` describe the underlying (untruncated) normal; the
    bounds are converted to the standardised form scipy expects. Note that
    the ``min``/``max`` parameter names shadow the builtins inside this
    function.

    Returns the sample as a plain Python float.
    """
    # scipy's truncnorm takes its clip points in units of standard
    # deviations away from the mean
    lower = (min - mean) / std
    upper = (max - mean) / std
    distribution = stats.truncnorm(lower, upper, loc=mean, scale=std)
    draw = distribution.rvs(size=1)
    return float(draw[0])

In [61]:
def get_curr_nv_ratio(nouns, verbs):
    """Return the noun/verb count ratio, or 0 when either collection is empty."""
    noun_count, verb_count = len(nouns), len(verbs)
    if noun_count == 0 or verb_count == 0:
        # the ratio is reported as 0 whenever one side is missing
        return 0
    return noun_count / verb_count

In [62]:
def get_alt_word(tok):
    """Pick a different dictionary word of the same length as ``tok.text``.

    Used to simulate p:w errors. Candidates come from the spell-checker's
    suggestions for the token and must have the same length, pass the
    dictionary check, and differ from the token itself. When no candidate
    qualifies, the token's own text is returned.
    """
    original = tok.text
    candidates = []
    for suggestion in d.suggest(original):
        if len(suggestion) != len(original):
            continue
        if not d.check(suggestion):
            continue
        if suggestion == original:
            continue
        candidates.append(suggestion)

    if not candidates:
        # if we cannot find new word just return current word
        return original

    replacement = random.choice(candidates)
    return replacement if replacement else original

In [63]:
# Thresholds for the per-token error draws made in aphasic_speech. Each one
# is compared against a fresh uniform(0, 1) draw; a draw lands at or above
# the threshold 40% of the time.
_ERROR_THRESHOLD = 0.6

m0sa_lim = _ERROR_THRESHOLD   # m:0s:a
ms_lim = _ERROR_THRESHOLD     # m:+s(:a)
m0s_lim = _ERROR_THRESHOLD    # m:0s
m03_lim = _ERROR_THRESHOLD    # m:03s:a
mvsg_lim = _ERROR_THRESHOLD   # m:vsg:a
sgc_lim = _ERROR_THRESHOLD    # s:r:gc
sr_lim = _ERROR_THRESHOLD     # s:r (for pronouns)

In [64]:
def check_complexity(doc):
    """Compute the NP/VP ratio of ``doc`` and decide whether it is too complex.

    Sentence-complexity filter following Misra et al. Noun phrases are the
    distinct texts of each noun chunk and of the full subtree span of the
    chunk's root. Verb phrases are whatever the module-level ``matcher``
    returns, so the verb-phrase pattern must already be registered on it
    (in this notebook that happens inside ``aphasic_speech``).

    Parameters
    ----------
    doc : spacy.tokens.Doc
        Parsed sentence.

    Returns
    -------
    tuple
        ``(ratio, too_complex)``. When no verb phrase is matched the ratio
        is undefined and ``(0, True)`` is returned.
    """
    # get NPs
    noun_phrases = set()
    for nc in doc.noun_chunks:
        subtree_span = doc[nc.root.left_edge.i:nc.root.right_edge.i + 1]
        for nop in (nc, subtree_span):
            noun_phrases.add(nop.text.strip())

    # get VPs
    verb_phrases = [doc[start:end] for _, start, end in matcher(doc)]

    if not verb_phrases:
        # Explicit guard instead of a bare ``except`` around the division,
        # so unrelated errors are no longer silently swallowed.
        return 0, True

    ratio = len(noun_phrases) / len(verb_phrases)
    X = np.random.uniform(0, 1)

    # if too complex or going to reject sentence
    # return true for too complex
    # NOTE(review): aphasic_speech skips a sentence when ``ratio > 2 and
    # X > 0.8``, whereas this uses ``X <= 0.8`` - confirm which is intended.
    return ratio, ratio > 2 and X <= 0.8

In [65]:
# Characters treated as "special": aphasic_speech rejects any sentence that
# contains a character outside the filtered ``printable`` set built here.
extra = '"#$%&()*+-/:;<=>@[\\]^_`{|}~"'

# Remove all of them from the allowed-character string in a single pass.
printable = printable.translate(str.maketrans("", "", extra))

In [66]:
printable

"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!',.? \t\n\r\x0b\x0c"

In [67]:
def aphasic_speech(text):
    """Turn ``text`` into a synthetic Broca's-aphasia-like utterance.

    Steps:

    1. Reject sentences that contain no alphabetic word, contain a character
       outside the filtered ``printable`` set, have no verb phrase, or
       (20% of the time) have an NP/VP ratio above 2.
    2. Sample target part-of-speech percentages and a target noun/verb
       ratio. Nouns and verbs use gamma distributions, the other classes
       use truncated normals.
    3. Walk the tokens, dropping words of a class while that class is over
       its sampled percentage, and injecting number and inflection errors,
       pronoun repetition/substitution and contraction stripping.
    4. Accept the result only if it keeps enough words, has at most 20
       words, has at least as many remaining nouns as verbs and differs
       from the input.

    Parameters
    ----------
    text : str
        A single (pre-processed) sentence.

    Returns
    -------
    tuple[str, bool]
        ``(utterance, True)`` when a modified utterance was produced,
        otherwise ``('', False)``.
    """
    # length of original text (alphabetic words only); decremented below
    # every time a counted word is dropped
    length = len(re.findall("[a-zA-Z_]+", text))
    if length == 0:
        # skipped sentence due to original length = 0
        return "", False

    # do not modify sentences with special characters (checked before the
    # comparatively expensive spaCy parse)
    if set(text).difference(printable):
        return "", False

    # at minimum keep 60% of the text, with 5 words being the lowest
    n = max(5, round(length * 0.6))

    doc = nlp(text)

    # Register the verb-phrase pattern only once; calling ``add`` on every
    # invocation would keep re-registering the same pattern under this key.
    if "Verb phrase" not in matcher:
        vp_pattern = [[{'POS': 'VERB', 'OP': '?'},
                       {'POS': 'ADV', 'OP': '*'},
                       {'POS': 'AUX', 'OP': '*'},
                       {'POS': 'VERB', 'OP': '+'}]]
        matcher.add("Verb phrase", vp_pattern)

    # get NPs
    noun_phrases = set()
    for nc in doc.noun_chunks:
        for nop in (nc, doc[nc.root.left_edge.i:nc.root.right_edge.i + 1]):
            noun_phrases.add(nop.text.strip())

    # get VPs
    verb_phrases = [doc[start:end] for _, start, end in matcher(doc)]
    if not verb_phrases:
        # no verb phrases in sentence
        return '', False

    ratio = len(noun_phrases) / len(verb_phrases)
    X = np.random.uniform(0, 1)
    if ratio > 2 and X > 0.8:
        # skip sentence
        return '', False

    # get possible n/v ratio for this sentence
    ratio_nv = np.random.gamma(shape=2.180031, scale=1/1.498104)

    # get the possible percentage of all POS
    # values below are %-ages, not ratios
    # noun and verb distributions are gamma, rest are truncated normal
    percent_noun = np.random.gamma(shape=4.0047683, scale=1/0.1944749)
    percent_verb = np.random.gamma(shape=9.9920204, scale=1/0.5973042)
    # more nouns than verbs
    while percent_verb >= percent_noun:
        percent_noun = np.random.gamma(shape=4.0047683, scale=1/0.1944749)

    percent_det = get_truncnorm(7.55312, 6.004386, 0, 27.79661)
    percent_prep = get_truncnorm(3.15664, 2.386052, 0, 15.05682)
    percent_adj = get_truncnorm(4.258013, 3.460436, 0, 21.05263)
    percent_adv = get_truncnorm(5.808547, 2.911826, 0, 15.88448)

    nouns = []
    verbs = []
    determiners = []
    prepositions = []
    adjectives = []
    adverbs = []

    # count no. of respective POS
    for tok in doc:
        if tok.pos_ == "NOUN":
            nouns.append(tok.text)
        elif tok.pos_ == "VERB" or tok.dep_ == "cop" or tok.tag_ in ["VBD", "VBN"]:
            verbs.append(tok.text)
        # det:art and det:dem only
        elif tok.dep_ == "det" and ("Dem" in tok.morph.get('PronType') or "Art" in tok.morph.get('PronType')):
            determiners.append(tok.text)
        elif tok.dep_ == "prep":
            prepositions.append(tok.text)
        elif tok.pos_ == "ADJ":
            adjectives.append(tok.text)
        elif tok.pos_ == "ADV":
            adverbs.append(tok.text)

    def percent_of(words):
        # Current share (in %) of ``words`` among the remaining word count.
        # ``length`` is read at call time; the floor of 1 avoids a
        # ZeroDivisionError once every counted word has been dropped.
        return (len(words) / max(length, 1)) * 100

    utt = ""
    for tok in doc:
        # current n/v ratio among the words still kept in the utterance
        curr_ratio_nv = get_curr_nv_ratio(nouns, verbs)

        m0sa_prob = random.uniform(0, 1)     # m:0s:a
        ms_prob = random.uniform(0, 1)       # m:+s(:a)
        m0s_prob = random.uniform(0, 1)      # m:0s
        m03_prob = random.uniform(0, 1)      # m:03s:a
        mvsg_prob = random.uniform(0, 1)     # m:vsg:a
        sgc_prob = random.uniform(0, 1)      # s:r:gc
        sr_prob = random.uniform(0, 1)       # s:r (for pronouns)

        # Handle nouns
        if tok.pos_ == "NOUN":
            # if possible noun percent in sentence less than current
            # percent or if current n/v ratio is too big, remove noun
            # from sentence
            if percent_noun <= percent_of(nouns) or curr_ratio_nv > ratio_nv:
                utt += ' '
                length -= 1
                nouns.remove(tok.text)
            # m:0s:a, m:+s errors: flip the grammatical number
            elif m0sa_prob >= m0sa_lim or ms_prob >= ms_lim:
                number = tok.morph.get("Number")
                if "Plur" in number:
                    utt += singularize(tok.text) + ' '
                elif "Sing" in number:
                    utt += pluralize(tok.text) + ' '
                else:
                    # no number marking: keep the noun as-is instead of
                    # silently dropping it without updating the counts
                    utt += tok.text + ' '
            else:
                utt += tok.text + ' '

        # Handle verbs (copula and gerund/participles counted as verb)
        elif tok.pos_ == "VERB" or tok.dep_ == "cop" or tok.tag_ in ["VBD", "VBN"]:
            # if possible verb percent in sentence less than current
            # percent or if current n/v ratio too big remove verb
            # from sentence
            if percent_verb <= percent_of(verbs) or curr_ratio_nv > ratio_nv:
                utt += ' '
                length -= 1
                verbs.remove(tok.text)
            # m:03s:a, m:vsg:a error
            elif m03_prob >= m03_lim or mvsg_prob >= mvsg_lim:
                # lemmatize
                utt += tok.lemma_ + " "
            else:
                utt += tok.text + ' '

        # Handle determiners (art and dem)
        elif tok.dep_ == "det" and ("Dem" in tok.morph.get('PronType') or "Art" in tok.morph.get('PronType')):
            # if possible determiner percent in sentence less than current, remove determiner
            if percent_det <= percent_of(determiners):
                utt += ' '
                length -= 1
                determiners.remove(tok.text)
            else:
                utt += tok.text + ' '

        # Handle pronouns
        elif tok.pos_ == "PRON":
            # s:r:gc:pro  and s:r error (same for pronouns)
            # but clan uses both versions
            # The pronoun itself is always emitted first; what follows is
            # either a substitute or a second copy (repetition).
            utt += tok.text + " "  # for repetition
            if sgc_prob >= sgc_lim or sr_prob >= sr_lim:
                # (a former ``tok.pos_ == "DET"`` test here could never be
                # true inside the PRON branch and has been removed)
                if "Dem" in tok.morph.get('PronType') or "Yes" in tok.morph.get('Poss'):
                    utt += det_sub(tok.text) + " "
                else:
                    utt += tok.text + " "
            else:
                utt += tok.text + " "

        # Handle prepositions
        elif tok.dep_ == "prep":
            if percent_prep <= percent_of(prepositions):
                utt += ' '
                length -= 1
                prepositions.remove(tok.text)
            else:
                utt += tok.text + ' '

        # Handle adjectives
        elif tok.pos_ == "ADJ":
            if percent_adj <= percent_of(adjectives):
                utt += ' '
                length -= 1
                adjectives.remove(tok.text)
            else:
                utt += tok.text + ' '

        # Handle adverbs
        elif tok.pos_ == "ADV":
            if percent_adv <= percent_of(adverbs):
                utt += ' '
                length -= 1
                adverbs.remove(tok.text)
            else:
                utt += tok.text + ' '

        # Handle particles ('s, not etc)
        elif tok.pos_ == "PART":
            # m:0s error: the 's is re-attached to the previous word only
            # when the draw reaches the threshold, otherwise it is dropped
            if tok.text.startswith("'s"):
                if m0s_prob >= m0s_lim:
                    utt = utt[:-1] + tok.text + ' '
            elif tok.text.startswith(("'", "n't", "nt", "v'e", "ve")):
                # other contraction-like particles are always removed
                utt = utt[:-1] + ' '
            else:
                utt += tok.text + ' '

        # Handle auxiliaries ('ve in i have)
        elif tok.pos_ == "AUX":
            if tok.text.startswith("'"):
                # glue the clitic back onto the previous word
                utt = utt[:-1] + tok.text + ' '
            else:
                utt += tok.text + ' '

        # Handling punctuation (like :, .)
        elif tok.pos_ == "PUNCT":
            utt = utt[:-1] + tok.text + ' '

        # Handling interjections (for repetition)
        elif tok.pos_ == 'INTJ':
            utt += tok.text + " "

        # all other tokens are currently copied through unchanged
        else:
            utt += tok.text + ' '

    utt = " ".join(utt.split())  # collapse runs of whitespace
    utt = re.sub(r'\s+([?.!",])', r'\1', utt)

    # only return utterance if its long enough and has at least as many
    # nouns as verbs, and if the utterance is not the same as the original
    if (n <= len(re.findall("[a-zA-Z_]+", utt)) <= 20
            and len(nouns) >= len(verbs)
            and utt.lower() != text.lower()):
        return utt, True
    return '', False

# Test area

In [68]:
para = """I received the brass that you sent me. Thank you very much for all your trouble and the extra 3 pieces. I feel that you have an outstanding company and are striving the best that you can to achieve customer satisfaction. I will be certain to tell my friends about US Reloading Supply."""

from nltk.tokenize import sent_tokenize

# Quick smoke test: run every sentence of the sample paragraph through
# aphasic_speech and show the result (if any) under the original.
separator = "-----------------------------------------------"
sentences = sent_tokenize(para)
for sent in sentences:
    print("Original sentence: ", sent)
    aphasic, changed = aphasic_speech(sent)
    if changed:
        print("Final utter:", " ".join(aphasic.split()).strip())
    # the separator line is printed whether or not the sentence changed
    print(separator)

Original sentence:  I received the brass that you sent me.
Final utter: I I receive brasses that that you you me me.
-----------------------------------------------
Original sentence:  Thank you very much for all your trouble and the extra 3 pieces.
Final utter: you you all your your and 3 pieces.
-----------------------------------------------
Original sentence:  I feel that you have an outstanding company and are striving the best that you can to achieve customer satisfaction.
Final utter: I I that you you and are that that you you can to satisfactions.
-----------------------------------------------
Original sentence:  I will be certain to tell my friends about US Reloading Supply.
Final utter: I I will be to my friends US Reloading Supply.
-----------------------------------------------


In [69]:
text = "did you no longer want to use it?"

# Show the coarse part-of-speech tag spaCy assigns to each token.
for token in nlp(text):
    print(token.text, token.pos_)

did AUX
you PRON
no ADV
longer ADV
want VERB
to PART
use VERB
it PRON
? PUNCT


## Test on some of the C4 dataset

In [70]:
from datasets import load_dataset
from nltk.tokenize import sent_tokenize

# Load the "stas/c4-en-10k" dataset; the printed summary shows its splits
# and features.
ds = load_dataset("stas/c4-en-10k")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 10000
    })
})


In [71]:
# Gather the first ``total_num_sents`` usable sentences from the corpus:
# markup-like ``<...>`` spans are blanked out, and sentences that are empty
# or consist only of digits are skipped.
texts = ds["train"]["text"]
sents = []
total_num_sents = 3000
tag_pattern = re.compile(r'\<.*?\>')
for text in texts:
    text = tag_pattern.sub(" ", text)
    sentences = sent_tokenize(text)
    for sent in sentences:
        if not isinstance(sent, str):
            continue
        sent = tag_pattern.sub(" ", sent)
        stripped = sent.rstrip()
        if stripped and not stripped.isdigit():
            sents.append(sent)
    # the quota is only checked once a whole document has been processed
    if len(sents) >= total_num_sents:
        break

test_sents = sents[:total_num_sents]

In [72]:
test_sents

['Beginners BBQ Class Taking Place in Missoula!',
 'Do you want to get better at making delicious BBQ?',
 'You will have the opportunity, put this on your calendar now.',
 'Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers.',
 'He will be teaching a beginner level class for everyone who wants to get better with their culinary skills.',
 'He will teach you everything you need to know to compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and trimming, plus smoker and fire information.',
 'The cost to be in the class is $35 per person, and for spectators it is free.',
 'Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared.',
 "Discussion in 'Mac OS X Lion (10.7)' started by axboi87, Jan 20, 2012.",
 "I've got a 500gb internal drive and a 240gb SSD.",
 'When trying to restore using disk utility i\'m given the error "Not enough space on disk _

In [73]:
aphasic_sents = []
normal_sents = []

import inflect
from preprocess import postprocess
p = inflect.engine()
count = 0

# Run every digit-free test sentence through the generator and keep the
# (original, synthetic) pairs that actually changed.
for sent in test_sents:
    # no digits like in aphasiabank
    if re.search("[0-9]+", sent):
        continue

    s = preprocess(sent)
    aphasic, changed = aphasic_speech(s)
    count += 1
    if count % 1000 == 0:
        print(f"Processed {count} sentences, broca utts: {len(aphasic_sents)}")

    # discard unchanged results and bare full stops
    if not changed or aphasic == ".":
        continue
    # discard results that equal the input once post-processed
    if postprocess(aphasic.lower()) == s.lower():
        continue

    print(sent)
    print(s)
    print(postprocess(aphasic))
    print()
    normal_sents.append(s)
    aphasic_sents.append(aphasic)

Do you want to get better at making delicious BBQ?
do you want to get better at making delicious bbq?
do you you to make bbq?

You will have the opportunity, put this on your calendar now.
you will have the opportunity, put this on your calendar now.
you you will the this these your your calendars.

He will be teaching a beginner level class for everyone who wants to get better with their culinary skills.
he will be teaching a beginner level class for everyone who wants to get better with their culinary skills.
he he will be teach a beginners level classes everyone everyone who who to get their culinary skills.

Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared.
included in the cost will be either a t shirt or apron and you will be tasting samples of each meat that is prepared.
the will be either a or and you you will be taste samples each meat that that is.

I've done this several times going from larger HDD to smaller

# Post-processing of aphasic sentences
This section also adds some non-aphasic "control" sentences.

In [74]:
import pandas as pd
from datasets import load_dataset

# Load the same "stas/c4-en-10k" dataset that the earlier loading cell uses;
# it is the source of the control sentences collected further down.
ds = load_dataset('stas/c4-en-10k')
# Alternative source, currently disabled:
# ds = load_dataset("imdb")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 10000
    })
})


In [75]:
# Aliases for the generated pairs: ``sentences`` holds the synthetic
# utterances and ``original`` the matching pre-processed source sentences
# (both lists are appended to together, so they stay index-aligned).
sentences = aphasic_sents
original = normal_sents

In [76]:
sentences

['do you you to make bbq?',
 'you you will the, this these your your calendars.',
 'he he will be teach a beginners level classes everyone everyone who who to get their culinary skills.',
 'the will be either a or and you you will be taste samples each meat that that is.',
 'i i have done this times going and i i up a drive.',
 'or is it it think and play by sets rule?',
 'are to themselves themselves and their value.',
 'sysco, based houston, plans to make shares repurchases the two year.',
 'i i can not i i..',
 'something something that that will you you hear voice!',
 'how to and it it how it it applies lives!',
 'it it is that that must be matter what what your levels faith and maturities.',
 'i i sites brittlloyd.org.',
 'you you to it it?',
 ', what what i i to his our demand?',
 'i i do not to simplistic, but any or, of fact that pan that that stick will up.',
 'people normally problems up, but i i do not like it it since it it makes foods greasy.',
 'the that that you you migh

In [77]:
original

['do you want to get better at making delicious bbq?',
 'you will have the opportunity, put this on your calendar now.',
 'he will be teaching a beginner level class for everyone who wants to get better with their culinary skills.',
 'included in the cost will be either a t shirt or apron and you will be tasting samples of each meat that is prepared.',
 'i have done this several times going from larger hdd to smaller ssd and i wound up with a bootable ssd drive.',
 'or is it the rich think different and play by a different set of rules?',
 'rich people are willing to promote themselves and their value.',
 'sysco, based in houston, plans to make the share repurchases over the next two years.',
 'i cannot believe i manage..',
 'something new that will help you hear gods voice!',
 'how to understand and rightly divide it how it applies to life!',
 'it is a serious matter that must be reviewed no matter what your level of faith and maturity.',
 'i have existing web site brittlloyd.org.',
 

In [78]:
import re
import string
from preprocess import postprocess

# Post-process every synthetic utterance and keep only the pairs whose
# post-processed form is non-empty. A further repetition step on the
# post-processed text is currently disabled.
kept_pairs = [
    (cleaned, source)
    for cleaned, source in (
        (postprocess(sent), o) for sent, o in zip(sentences, original)
    )
    if cleaned != ""
]
broca_sents = [cleaned for cleaned, _ in kept_pairs]
original_sents = [source for _, source in kept_pairs]

In [79]:
# Persist the synthetic/original sentence pairs side by side.
synthetic_pairs = pd.DataFrame(data={"modified": broca_sents, "original": original_sents})
synthetic_pairs.to_csv("data/synthetic_clan_test.csv", sep=",", index=False)

In [80]:
# Reload the synthetic utterances from disk. From here on ``broca_sents``
# is the "modified" column of the saved CSV rather than the in-memory list.
# Alternative input file, currently disabled:
# broca_sents = pd.read_csv("data/synthetic_clan_c4.csv", sep=",")["modified"]
broca_sents = pd.read_csv("data/synthetic_clan_test.csv", sep=",")["modified"]

In [81]:
import inflect
from preprocess import preprocess, postprocess
p = inflect.engine()
import re

# preprocess and post process in same way
from nltk.tokenize import sent_tokenize

# Control sentences are taken from the documents starting at index
# ``total_num_sents`` of the corpus.
texts = ds["train"]["text"][total_num_sents:] 

# Number of control sentences to collect (2.5755 per synthetic utterance);
# computed once, since ``broca_sents`` does not change inside the loop.
target_num_controls = round(len(broca_sents)*2.5755)

sents = []
for text in texts:
    # checked once per document, so the final count can overshoot slightly
    if len(sents) >= target_num_controls:
        break
    text = re.sub(r'\<.*?\>', " ", text)
    sentences = sent_tokenize(text)
    for sent in sentences:
        if isinstance(sent, str):
            sent = re.sub(r'\<.*?\>', " ", sent)
            if not sent.rstrip().isdigit() and len(sent.rstrip()) >= 1:
                # no digits: spell out every digit run in a single pass.
                # Replacing runs one by one with str.replace corrupts longer
                # numbers that contain a shorter one (e.g. "1 and 12" became
                # "one and one2").
                sent = re.sub("[0-9]+",
                              lambda match: p.number_to_words(match.group()),
                              sent)
                sent = preprocess(sent)
                sent = postprocess(sent)
                if sent != "":
                    sents.append(sent)

In [82]:
len(sents)

1271

In [83]:
len(broca_sents)

492

In [84]:
# Trim the collected sentences to the target control count, i.e. 2.5755
# control sentences per synthetic utterance (the same factor the collection
# loop uses for its stopping condition).
control_sents = sents[:round(len(broca_sents)*2.5755)]
# Alternative (balanced classes), currently disabled:
# control_sents = sents[:len(broca_sents)]

In [85]:
control_sents

['well it just had to happen.',
 'the peach tree just decided it was time to flower and along comes an april snow shower.',
 'looks iffy for peaches again this year.',
 'however given the lack of moisture we have had this spring five inches or so of an april shower is truly a welcomed event and perhaps will bring may flowers.',
 'the melting snow will keep the temps from going below freezing.',
 'they might survive and give you peaches.',
 'hi jim we will see it was well below freezing last night.',
 'this happens most years and peaches seem to come only about once every three four years.',
 'we have tried to cover the tree from time to time but got lazy this year.',
 'crossing my fingers for the peaches!',
 'thanks it is going to take a little luck and sunshine.',
 'hope your day is going well.',
 'very pretty like a japanese print but a little sad too.',
 'hi eliza yes there is a yin yangness to the late season snow this year.',
 'probably another year without peaches yet we sure did

In [86]:
print(len(broca_sents), len(control_sents))

492 1267


In [87]:
# Label synthetic utterances 1 and control sentences 0, merge the two sets,
# shuffle the rows and write the combined data set to disk.
labelled_frames = [
    pd.DataFrame(data={"modified": group, "label": [label] * len(group)})
    for group, label in ((broca_sents, 1), (control_sents, 0))
]
broca_data, control_data = labelled_frames

data_full_scenario = (
    pd.concat(labelled_frames, ignore_index=True)
    .sample(frac=1)
    .reset_index(drop=True)
)
data_full_scenario.to_csv("data/synthetic_clan_merge_test.csv", sep=",", index=False)
# Alternative output file, currently disabled:
# data_full_scenario.to_csv("data/synthetic_clan_merge_c4.csv", sep=",", index=False)

In [88]:
data_full_scenario

Unnamed: 0,modified,label
0,medication treatments frequently help manage t...,0
1,completely renovated four bedroom three bath h...,0
2,then there is shopping madness.,0
3,it could also cost you dearly in many ways.,0
4,for now check out our amazing mood board!,0
...,...,...
1754,it it saves half energies i.e.,1
1755,manager response we are so glad that everythin...,0
1756,kessell is joined on the all state first team ...,0
1757,poke cakes are one of my favorite desserts bec...,0
