##### NOTE: In CLAN the C-NNLA command was used to get the distribution parameters. However, error outputs from C-NNLA were not very useful (like % correct ..., % incorrect, % grammatical etc) since some CHA files do not annotate errors like [* p], [* s] (but just replace it with xxx, yyy, zzz), making those measures unreliable

##### However, I am trying to include the 5 most common word errors from the C-NNLA measures and basing their occurance on the Salis and Edwards (2004) paper

##### This approach also does not consider any phonetic errors/substitutions (marked as @u or [* p])

In [87]:
special_characters = ['(.)', '[/]', '[//]', '‡', 'xxx', '+< ', '„', '+', '"" /..""', '+"/.', '+"', '+/?', '+//.',
                      '+//?', '[]', '<>', '_', '-', '^', ':', 'www .', '*PAR', '+/', '@o', '<', '>',
                      '//..', '//', '/..', '/', '"', 'ʌ', '..?', '0.', '0 .', '"" /.', ')', '(', "@u", "@si", "@k",
                      "@n", "$n", "$co", "$adj", "$on", "$v", "@l", 'æ', 'é', 'ð', 'ü', 'ŋ', 'ɑ', 'ɒ', 'ɔ', 'ə',
                      'ɚ', 'ɛ', 'ɜ', 'ɝ', 'ɡ', 'ɪ', 'ɹ', 'ɾ', 'ʃ', 'ʊ', 'ʒ', 'ʔ', 'ʤ', 'ʧ', 'ː', '˞', '͡', 'θ', "@q",
                      "@sspa", "@i", "@wp", "@sjpn", "@sdeu", "@p", "@sfra", "&", "*"]
ipa = ['æ', 'é', 'ð', 'ü', 'ŋ', 'ɑ', 'ɒ', 'ɔ', 'ə', 'ɚ', 'ɛ', 'ɜ', 'ɝ', 'ɡ', 'ɪ', 'ɹ', 'ɾ', 'ʃ', 'ʊ', 'ʒ', 'ʔ', 'ʤ',
       'ʧ', 'ː', '˞', '͡', 'θ', ]

In [88]:
import spacy
from spacy.matcher import Matcher
import numpy as np
import pandas as pd
from canonical_sents import get_canonical_sentences
import os
import scipy.stats as stats
import random
import re
from pattern.text.en import singularize, pluralize
nlp = spacy.load("en_core_web_sm")

In [89]:
def preprocess(utterance):
    # Remove IPA
    for special_character in ipa:
        utterance = utterance.replace(special_character, "")

    pattern1 = r"@u *\[: (.*?)]"
    pattern2 = r"\b\w*@u\w*\b"
    news = re.findall(pattern1, utterance)
    olds = re.findall(pattern2, utterance)
    if len(news) == len(olds):
        for new, old in zip(news, olds):
            if "x@n" in new:
                utterance = utterance.replace(old, "")
            else:
                utterance = utterance.replace(old, new)

    # Remove all actions: (e.g. &=points:picture)
    utterance = re.sub(r"\&\=[a-zA-Z:_0-9]+", "", utterance)

    # Remove all unicode errors   \W\d+\w\d+\W
    utterance = re.sub(r"\W\d+\w\d+\W", "", utterance)

    # Remove anything between [ and ]
    utterance = re.sub(r"\[.*?\]", "", utterance)

    # Remove  < and > for repetition [/] and retracing [//]
    utterance = re.sub(r"\<|\>", "", utterance)

    # Remove anything &+ or &* (eg &*INV)
    utterance = re.sub(r"\&\+[a-zA-Z]+", "", utterance)
    utterance = re.sub(r"\&\*INV:[a-zA-Z_]+", "", utterance)

    # Remove remaining special chars
    for special_character in special_characters:
        # EDITED: for words such as you_know and of_course
        if special_character == '_':
            utterance = utterance.replace(special_character, " ")
        utterance = utterance.replace(special_character, "")

    utterance = re.sub(' +', ' ', utterance)

    # Remove special characters from starting sentence
    remove_startswiths = [" ", ",", "!", ".", "?", "."]
    for start_string in remove_startswiths:
        if utterance.startswith(start_string):
            utterance = utterance[1:]

    # Remove final whitespaces (e.g. double space)
    utterance = re.sub(' +', ' ', utterance)
    # Remove trailing whitespaces before punctuation
    utterance = re.sub(r'\s+([?.!"])', r'\1', utterance)

    # remove extra spaces
    utterance = utterance.lstrip().rstrip().lower().replace("[]", "").replace("]", "")
    utterance = re.sub(' +|\t', ' ', utterance)
    utterance = re.sub(' ,', ',', utterance)

    return utterance

In [90]:
pronouns = {'Art': ['a', 'an', 'the', ''],
           'Dem': ['this', 'that', 'these', 'those', ''],
           'Poss': ['my', 'your', 'his', 'her', 'its', 'our', 'their', '']}

In [91]:
def pro_sub(pro):
    for _, pro1 in pronouns.items():
        if pro.lower() in pro1:
            y = [j for j in pro1 if pro!=j]
            return random.choice(y)
    return ""

In [92]:
def get_truncnorm(mean, std, min, max):
    # mean, std, min, max parameters dependent on their extracted normal
    # distributions
    a, b = (min - mean) / std, (max - mean) / std
    return stats.truncnorm(a, b, loc=mean, 
                                   scale=std).rvs(size=1)[0]

In [93]:
def get_curr_nv_ratio(nouns, verbs):
    if len(nouns) != 0 and len(verbs) != 0:
        curr_ratio_nv = len(nouns)/len(verbs)
    else:
        curr_ratio_nv = 0
    return curr_ratio_nv

In [94]:
# TODO: add p:w, s:uk, s:r and s:per errors 
def aphasic_speech(text):
    doc = nlp(text)
    # max length of modified sentences (log norm dist with log(27) being longest)
    # determine how long this sentence will be
    n = np.random.lognormal(mean=0.9162787, sigma=0.7350323)
    while n > 3.295:
        n = np.random.lognormal(mean=0.9162787, sigma=0.7350323)
        
    utt = ""
    # length of original text
    length = len(re.findall("[a-zA-Z_]+", text))
    
    if length != 0:
        # get possible n/v ratio for this sentence
        # n/v ratio is log-norm
        ratio_nv = get_truncnorm(1.461698, 1, 0.228, 6.818)
        # get the possible percentage of all POS
        percent_noun = get_truncnorm(20.68179, 9.794099, 5.991, 50)
        percent_verb = get_truncnorm(16.59888, 5.036936, 0, 26.267)
        percent_det = get_truncnorm(7.494686, 5.927496, 0, 27.79661)
        percent_prep = get_truncnorm(3.104844, 2.325505, 0, 15.05682)
        percent_adj = get_truncnorm(4.34104, 3.509376, 0, 21.05263)
        percent_adv = get_truncnorm(5.751157, 2.958618, 0, 15.88448)
        nouns = []
        verbs = []
        determiners = []
        prepositions = []
        adjectives = []
        adverbs = []
        
        # count no. of respective POS
        for tok in doc:
            if tok.pos_ == "NOUN":
                nouns.append(tok.text)
            elif tok.pos_ == "VERB" or tok.dep_ == "cop" or tok.tag_ in ["VBD", "VBN"]:
                verbs.append(tok.text)
            elif tok.dep_ == "det":
                determiners.append(tok.text)
            elif tok.dep_ == "prep":
                prepositions.append(tok.text)
            elif tok.pos_ == "ADJ":
                adjectives.append(tok.text)
            elif tok.pos_ == "ADV":
                adverbs.append(tok.text)
                
        for tok in doc:
            # possible percentage of keeping respective POS
            curr_ratio_nv = get_curr_nv_ratio(nouns, verbs)
            
            # Handle nouns
            if tok.pos_ == "NOUN": 
                # if possible noun percent in sentence less than current
                # percent or if current n/v ratio is too big, remove noun 
                # from sentence
                x = np.random.uniform(0, 1)
                if (percent_noun <= (len(nouns)/length) * 100 
                        or curr_ratio_nv > ratio_nv) :
                    utt += ' '
                    nouns.remove(tok.text)
                # m:0s:a and m:+s errors (50% times)
                elif x <= 0.5:
                    if "Plur" in tok.morph.get("Number"):
                        utt += singularize(tok.text) + ' '
                    elif "Sing" in tok.morph.get("Number"):
                        utt += pluralize(tok.text) + ' '  
                else:
                    utt += tok.text + ' '
                    
            # Handle verbs (copula and gerund/participles counted as verb)
            elif  tok.pos_ == "VERB" or tok.dep_ == "cop" or tok.tag_ in ["VBD", "VBN"]:
                x = np.random.uniform(0, 1)
                # if possible verb percent in sentence less than current 
                # percent or if current n/v ratio too big remove noun 
                # from sentence
                if (percent_verb <= (len(verbs)/length) * 100 
                        or curr_ratio_nv > ratio_nv):
                   utt += ' '
                   verbs.remove(tok.text)
                # m:03s:a and m:vsg:a error 50% 
                # lemmatize reg+irr 3rd sing
                elif x <= 0.5:
                    if '3' in tok.morph.get("Person") and 'Sing' in tok.morph.get("Number"):
                        utt += tok.lemma_ + " "
                    else:
                        utt += tok.text + " "
                else:
                    utt += tok.text + ' '
            
            # Handle determiners (pronouns are not determiners)
            elif tok.dep_ == "det":
                x = np.random.uniform(0,1)
                if percent_det <= (len(determiners)/length) * 100:
                   utt += ' '
                   determiners.remove(tok.text)
                # s:r:gc:pro error 50% times
                elif x <= 0.5:
                    if tok.pos_ == "DET" or "Dem" in tok.morph.get('PronType') or "Yes" in tok.morph.get('Poss'):
                        utt += pro_sub(tok.text) + " "
                    else:
                        utt += tok.text + " "
                else:
                    utt += tok.text + ' '

            # Handle prepositions
            elif tok.dep_ == "prep":
                if percent_prep <= (len(prepositions)/length) * 100:
                   utt += ' '
                   prepositions.remove(tok.text)
                else:
                    utt += tok.text + ' '
            
            # Handle adjectives                 
            elif tok.pos_ == "ADJ":
                if percent_adj <= (len(adjectives)/length) * 100:
                   utt += ' '
                   adjectives.remove(tok.text)
                else:
                    utt += tok.text + ' '
            
            # Handle adverbs
            elif tok.pos_ == "ADV":
                if percent_adv <= (len(adverbs)/length) * 100:
                   utt += ' '
                   adverbs.remove(tok.text)
                else:
                    utt += tok.text + ' '
            
            # Handle particles ('s, not etc)
            elif tok.pos_ == "PART":
                x = np.random.uniform(0,1)
                # m:0s error 50% times
                # missing plural suffix
                if tok.text.startswith("'") or tok.text.startswith("n't"):
                    if x <= 50:
                        utt = utt[:-1] + tok.text + ' ' 
                    else:
                        utt = utt[:-1] + ' ' 
                else:
                    utt += tok.text + ' '
            
            # Handle auxillaries ('ve in i have)
            elif tok.pos_ == "AUX":
                if tok.text.startswith("'"):
                    utt = utt[:-1] + tok.text+ ' '
                else:
                    utt += tok.text + ' '
            
            # Handling punctuation (like :, .)
            elif tok.pos == "PUNCT":                
                utt = utt[:-1] + tok.text+ ' '
                
            # all other words with respective POS unaffected
            else:
                utt += tok.text + ' '

        utt = " ".join(utt.split()) # remove trailing whitespaces
        utt = re.sub(r'\s+([?.!",])', r'\1', utt)
        # print("Possible utt: ", utt)
        
        # only return sentences which are short enough and are changed
        if (np.exp(n) >= len(re.findall("[a-zA-Z_]+", utt)) 
                and len(re.findall("[a-zA-Z_]+", utt)) <= 27):
            # print("Possible utt1: ", utt)
            if utt != text:
                return utt, True
            else: 
                return '', False
        else:
            return '', False
        
    # skipped sentence due to length = 0
    else:
        return "", False
    
def augment(filepath, save_path, include_canonical=False):
    pass

# Test area

In [95]:
para = """I received the brass that you sent me, and all looks well. Thank you very much for all your trouble and the extra 3 pieces. I feel that you have an outstanding company and are striving the best that you can to achieve customer satisfaction. I will be certain to tell my friends about US Reloading Supply. ~ Michelle Admin Note: By accident we shorted them two shells, so we sent them replacements."""

from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(para)
for sent in sentences:
    print("Original sentence: ", sent)
    aphasic, changed = aphasic_speech(sent)
    if changed:
        print("Final utter:", " ".join(aphasic.split()).strip())
        print("-----------------------------------------------")
    else:
        print("-----------------------------------------------")

Original sentence:  I received the brass that you sent me, and all looks well.
-----------------------------------------------
Original sentence:  Thank you very much for all your trouble and the extra 3 pieces.
Final utter: you your trouble and 3 piece.
-----------------------------------------------
Original sentence:  I feel that you have an outstanding company and are striving the best that you can to achieve customer satisfaction.
-----------------------------------------------
Original sentence:  I will be certain to tell my friends about US Reloading Supply.
Final utter: I will be to tell my friends US Reloading Supply.
-----------------------------------------------
Original sentence:  ~ Michelle Admin Note: By accident we shorted them two shells, so we sent them replacements.
-----------------------------------------------


In [96]:
sent = "She could'nt do it"
utt, changed = aphasic_speech(sent)
if changed:
    print(utt)

She do it


In [97]:
for tok in nlp("She couldn't do it"):
    print(tok.pos_, tok.text, tok.text.startswith("n't"))

PRON She False
AUX could False
PART n't True
VERB do False
PRON it False


## Test on some of the IMDB dataset

In [98]:
from datasets import load_dataset
from nltk.tokenize import sent_tokenize

ds = load_dataset("imdb")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [99]:
texts = ds["train"]["text"]
sents = []
for text in texts:
    t = re.sub(r'\<.*?\>', " ", text)
    sentences = sent_tokenize(t)
    for sent in sentences:
        sents.append(" ".join(sent.split()))
test_sents = sents[:2000]

In [100]:
test_sents

['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967.',
 'I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.',
 'The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life.',
 'In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States.',
 'In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.',
 'What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic.',
 "Really, the sex and nudity scenes are few and far between,

In [101]:
aphasic_sents = []
for sent in test_sents:
    s = preprocess(sent)
    aphasic, changed = aphasic_speech(s)
    if changed and aphasic !=".":
        print(aphasic)
        aphasic_sents.append(aphasic)

and stockholm their, she sex her drama teachers, classmates, and man.
while my it, and nudity are staple cinemas.
ingmar bergman, arguably their old john ford, had his film.
, because they don't.
and, which we're vincent gallo's throbbing johnson, but not visible on chloes sevignies.
, you won't genital an films anything porn or explicit eroticas.
if to films futures.
is but story.
one might one's times out growing.
two elements and those are, 1 actings 2, good, photo.
its, and places.
a the museums i could this films, i was as as my parent were when they schlepped to see it!!
, its, couldn't a rabbits.
is school sophomore amateur night marxism.
is and.
sound is!
and call art?
whoever any book lucille ball, especially her autobiography.
all, i that lucille is one those who can not be portrayed by anyone than themselves.
to like lucille being she the movies is horrendous.
pino does not qualify to play.
he's and, his is, and, his acting is unbelievable.
although and ethel were not, they 

In [102]:
pd.DataFrame(data={"modified": aphasic_sents}).to_csv("synthetic.csv", sep=",", index=False)

# Post process of aphasic sentences
Also adding some "control" sentences

In [103]:
import pandas as pd
from datasets import load_dataset

# ds = load_dataset('stas/c4-en-10k')
ds = load_dataset("imdb")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [104]:
df = pd.read_csv("synthetic.csv")

In [105]:
sentences = df['modified']

In [106]:
sentences

0      and stockholm their, she sex her drama teacher...
1            while my it, and nudity are staple cinemas.
2      ingmar bergman, arguably their old john ford, ...
3                                  , because they don't.
4      and, which we're vincent gallo's throbbing joh...
                             ...                        
722     , american pie's,, no, story, refuted character.
723                                              screen.
724                                             and not.
725    it, men to write and direct flick, and it fail...
726    there, to any, and to the film any ways or bel...
Name: modified, Length: 727, dtype: object

In [107]:
import re
new_sents = []
for sent in sentences:
    if isinstance(sent, str):
        sent = re.sub(r'\s+([?.!",;\'])', r'\1', sent)
        sent = re.sub(r"^\W+", "", sent)
        sent = re.sub(r"'\W+'", "", sent)
        sent = re.sub(r"\(\s*(.*?)\s*\)",r'(\1)', sent)
        new_sents.append(sent)

In [108]:
new_sents

['and stockholm their, she sex her drama teachers, classmates, and man.',
 'while my it, and nudity are staple cinemas.',
 'ingmar bergman, arguably their old john ford, had his film.',
 "because they don't.",
 "and, which we're vincent gallo's throbbing johnson, but not visible on chloes sevignies.",
 "you won't genital an films anything porn or explicit eroticas.",
 'if to films futures.',
 'is but story.',
 "one might one's times out growing.",
 'two elements and those are, 1 actings 2, good, photo.',
 'its, and places.',
 'a the museums i could this films, i was as as my parent were when they schlepped to see it!!',
 "its, couldn't a rabbits.",
 'is school sophomore amateur night marxism.',
 'is and.',
 'sound is!',
 'and call art?',
 'whoever any book lucille ball, especially her autobiography.',
 'all, i that lucille is one those who can not be portrayed by anyone than themselves.',
 'to like lucille being she the movies is horrendous.',
 'pino does not qualify to play.',
 "he's 

In [109]:
# preprocess and post process in same way
from nltk.tokenize import sent_tokenize
texts = ds["train"]["text"]
sents = []
for text in texts:
    t = re.sub(r'\<.*?\>', " ", text)
    sentences = sent_tokenize(text)
    for sent in sentences:
        if isinstance(sent, str):
            t = re.sub(r'\<.*?\>', " ", text) # remove <br/> <br/>
            sent = preprocess(sent)
            sent = re.sub(r'\s+([?.!",;\'])', r'\1', sent)
            sent = re.sub(r'\s+([?.!",;\']){2,}|([?.!",;\']){2,}', r'\2', sent)
            sent = re.sub(r"^\W+", "", sent)
            sent = re.sub(r"'\W+'", "", sent)
            sent = re.sub(r"\(\s*(.*?)\s*\)",r'(\1)', sent)
            sents.append(" ".join(sent.split()))

In [110]:
control_sents = sents[2000:2000+len(new_sents)]

In [111]:
control_sents

['to not live in montana and especially not to live there at the end of the 19th century.br br a river runs through it certainly is a well made movie from a cineastic standpoint.',
 'great landscapes, redford acting well.br br unfortunately, the story is bad if there is a story at all.br br i felt sorry for the narrator author, who is as dry, narrowminded a character as his father, a preacher.',
 'being driven, not driving his own life, he is left to watch his brother, who is also caged in the small town environment, losing his life.',
 "the author never even comes close to undestand his brother's motivations, but at least realizes, that he is lacking the slightest amount of homour fun.",
 'all there is, is flyfishing, where he follows even as an old man the style of his father.br br the end is not surprising, it is forseeable from the very beginning.br br definitely not a mustsee 3 10br br',
 'i always felt that a good film should have a plot.',
 'this particular film was missing one,

In [112]:
print(len(new_sents), len(control_sents))

724 724


In [116]:
broca_data = pd.DataFrame(data={"modified": new_sents, "label": [1]*len(new_sents)})
control_data = pd.DataFrame(data={"modified": control_sents, "label": [0]*len(control_sents)})
data_full_scenario = pd.concat([broca_data, control_data], ignore_index=True)
data_full_scenario = data_full_scenario.sample(frac=1).reset_index(drop=True)
data_full_scenario.to_csv("gen_data.csv", sep=",", index=False)

In [117]:
data_full_scenario

Unnamed: 0,modified,label
0,"with valentine's day approaching, the women be...",0
1,the plot is stupid.,0
2,they all share spotlights.,1
3,turner when she be bad.,1
4,there isn't one.,0
...,...,...
1443,this was made even worse by the fact that it s...,0
1444,"i will just simply say, please do not ever see...",0
1445,do not watch this movie.,0
1446,"there's some good songs in the soundtrack, inc...",0
