##### NOTE: In CLAN the C-NNLA command was used to get the distribution parameters. However, error outputs from C-NNLA were not very useful (like % correct ..., % incorrect, % grammatical etc) since some CHA files do not annotate errors like [* p], [* s] (but just replace it with xxx, yyy, zzz), making those measures unreliable

##### However, I am trying to include the 5 most common word errors from the C-NNLA measures, basing their occurrence on the Salis and Edwards (2004) paper

##### This approach also does not consider any phonetic errors/substitutions (marked as @u or [* p])

In [120]:
import spacy
from spacy.matcher import Matcher
import numpy as np
import pandas as pd
from canonical_sents import get_canonical_sentences
import os
import scipy.stats as stats
import random
import re
nlp = spacy.load("en_core_web_sm")

In [121]:
# Substitution classes used by pro_sub(): articles, demonstratives and
# possessives. The empty string in each class is a valid substitute, so a
# "substitution" can also yield nothing at all.
pronouns = {
    'Art': ['a', 'an', 'the', ''],
    'Dem': ['this', 'that', 'these', 'those', ''],
    'Poss': ['my', 'your', 'his', 'her', 'its', 'our', 'their', ''],
}

In [122]:
def pro_sub(pro):
    """Pick a random substitute from the same class as ``pro``.

    The word is looked up case-insensitively in the module-level
    ``pronouns`` table. A different entry of the first class that contains
    it is returned; this may be the empty string, which is a member of
    every class.

    Args:
        pro: Surface form of the determiner/pronoun to substitute.

    Returns:
        A lowercase substitute from the same class (possibly ``""``), or
        ``""`` when ``pro`` is not in any class.
    """
    word = pro.lower()
    for candidates in pronouns.values():
        if word in candidates:
            # Exclude the word itself using the lowercased form; comparing the
            # raw token would let "The" be "substituted" by "the".
            alternatives = [alt for alt in candidates if alt != word]
            return random.choice(alternatives)
    return ""

In [123]:
def get_truncnorm(mean, std, min, max):
    """Draw a single sample from a normal distribution truncated to [min, max].

    Args:
        mean: Location of the underlying normal distribution.
        std: Scale (standard deviation) of the underlying normal distribution.
        min: Lower truncation bound, in the same units as ``mean``.
        max: Upper truncation bound, in the same units as ``mean``.

    Returns:
        One sampled value (first element of a size-1 draw).
    """
    # scipy's truncnorm takes its clip points relative to loc, in units of
    # scale, so convert the absolute bounds first.
    lower = (min - mean) / std
    upper = (max - mean) / std
    distribution = stats.truncnorm(lower, upper, loc=mean, scale=std)
    return distribution.rvs(size=1)[0]

In [124]:
def get_curr_nv_ratio(nouns, verbs):
    """Return the noun-to-verb count ratio, or 0 if either collection is empty.

    Args:
        nouns: Sized collection of the nouns currently counted.
        verbs: Sized collection of the verbs currently counted.

    Returns:
        ``len(nouns) / len(verbs)`` when both are non-empty, otherwise ``0``.
    """
    noun_count, verb_count = len(nouns), len(verbs)
    if noun_count == 0 or verb_count == 0:
        return 0
    return noun_count / verb_count

In [125]:
def _pos_bucket(tok):
    """Return the thinning bucket for a spaCy token, or None if it has none.

    The checks run in a fixed order, so a token matching several conditions
    lands in the first matching bucket.
    """
    if tok.pos_ == "NOUN":
        return "noun"
    if tok.pos_ == "VERB" or tok.dep_ == "cop" or tok.tag_ in ["VBD", "VBN"]:
        return "verb"
    if tok.dep_ == "det":
        return "det"
    if tok.dep_ == "prep":
        return "prep"
    if tok.pos_ == "ADJ":
        return "adj"
    if tok.pos_ == "ADV":
        return "adv"
    return None


# TODO: add verb form handling
def aphasic_speech(text):
    """Generate a synthetic aphasic-style utterance from a sentence.

    Target percentages for nouns, verbs, determiners, prepositions,
    adjectives and adverbs (plus a target noun/verb ratio) are sampled from
    truncated normal distributions. Tokens of a category are dropped while
    that category's share of the original word count is at or above its
    target; nouns and verbs are also dropped while the current noun/verb
    ratio exceeds the sampled ratio. Remaining DET/PRON tokens are randomly
    substituted or omitted (based on Salis and Edwards (2004)). All other
    tokens are kept unchanged.

    Args:
        text: The source sentence.

    Returns:
        ``(utterance, True)`` when the result fits the sampled length
        budget, otherwise ``('', False)``. ``('', False)`` is also returned
        when ``text`` contains no alphabetic words.
    """
    doc = nlp(text)
    # Length budget: n is redrawn until it is at most 3.295 (about log(27));
    # exp(n) is then the maximum number of words the result may have.
    n = np.random.lognormal(mean=0.9162787, sigma=0.7350323)
    while n > 3.295:
        n = np.random.lognormal(mean=0.9162787, sigma=0.7350323)

    # number of words in the original text
    length = len(re.findall("[a-zA-Z_]+", text))
    if length == 0:
        # skipped sentence due to length = 0
        return "", False

    # target noun/verb ratio for this sentence
    ratio_nv = get_truncnorm(1.461698, 1, 0.228, 6.818)
    # target percentage of each POS bucket (drawn in this fixed order)
    target_percent = {
        "noun": get_truncnorm(20.68179, 9.794099, 5.991, 50),
        "verb": get_truncnorm(16.59888, 5.036936, 0, 26.267),
        "det": get_truncnorm(7.494686, 5.927496, 0, 27.79661),
        "prep": get_truncnorm(3.104844, 2.325505, 0, 15.05682),
        "adj": get_truncnorm(4.34104, 3.509376, 0, 21.05263),
        "adv": get_truncnorm(5.751157, 2.958618, 0, 15.88448),
    }

    # collect the tokens of each bucket; entries are removed as tokens are dropped
    remaining = {bucket: [] for bucket in target_percent}
    for tok in doc:
        bucket = _pos_bucket(tok)
        if bucket is not None:
            remaining[bucket].append(tok.text)

    kept = []
    for tok in doc:
        bucket = _pos_bucket(tok)
        if bucket is not None:
            # Each bucket is compared against its own target. The adverb
            # branch previously used the adjective target by mistake.
            current_percent = (len(remaining[bucket]) / length) * 100
            drop = target_percent[bucket] <= current_percent
            if bucket in ("noun", "verb"):
                # nouns and verbs are also dropped while the n/v ratio is too big
                curr_ratio_nv = get_curr_nv_ratio(remaining["noun"],
                                                  remaining["verb"])
                drop = drop or curr_ratio_nv > ratio_nv
            if drop:
                remaining[bucket].remove(tok.text)
            else:
                kept.append(tok.text)

        elif tok.pos_ in ["DET", "PRON"]:
            # based on Salis and Edwards (2004)
            # CLAN error code s:r:gc:pro
            x = np.random.uniform(0, 1)

            if x >= 0.97:
                # top 3%: substitute within the same class
                if (tok.pos_ == "DET"
                        or "Dem" in tok.morph.get('PronType')
                        or "Yes" in tok.morph.get('Poss')):
                    kept.append(pro_sub(tok.text))
                # NOTE(review): any other pronoun landing here is dropped,
                # as in the original code - confirm this is intended.
            elif x >= 0.81:
                # omitted. NOTE(review): this band is 16% wide, while the
                # original comment said 19% were omitted - confirm threshold.
                pass
            else:
                kept.append(tok.text)

        else:
            # all other words with respective POS unaffected
            kept.append(tok.text)

    # normalise whitespace, then re-attach punctuation to the preceding word
    utt = " ".join(" ".join(kept).split())
    utt = re.sub(r'\s+([?.!",])', r'\1', utt)

    # only return sentences which are short enough
    n_words = len(re.findall("[a-zA-Z_]+", utt))
    if np.exp(n) >= n_words and n_words <= 27:
        return utt, True
    return '', False
    
def augment(filepath, save_path, include_canonical=False):
    """Placeholder: not implemented yet.

    All arguments are accepted but unused; the function does nothing and
    returns ``None``.
    """

In [126]:
# Smoke test: run the generator over each sentence of a sample paragraph.
para = """I received the brass that you sent me, and all looks well. Thank you very much for all your trouble and the extra 3 pieces. I feel that you have an outstanding company and are striving the best that you can to achieve customer satisfaction. I will be certain to tell my friends about US Reloading Supply. ~ Michelle Admin Note: By accident we shorted them two shells, so we sent them replacements."""

from nltk.tokenize import sent_tokenize

separator = "-----------------------------------------------"
sentences = sent_tokenize(para)
for sent in sentences:
    aphasic, changed = aphasic_speech(sent)
    if changed:
        print("Final utter:", " ".join(aphasic.split()))
    # a separator is printed for every sentence, accepted or not
    print(separator)

Final utter: I the brass that you sent me, and all looks.
-----------------------------------------------
Final utter: you much your trouble and extra 3 pieces.
-----------------------------------------------
-----------------------------------------------
-----------------------------------------------
-----------------------------------------------


In [127]:
# Inspect spaCy's annotations for a sample sentence: for every token print
# its text, dependency label, coarse POS and fine-grained tag.
sent = "Her flowers are pretty"

for tok in nlp(sent):
    print(tok.text, tok.dep_, tok.pos_, tok.tag_)

Her poss PRON PRP$
flowers nsubj NOUN NNS
are ROOT AUX VBP
pretty acomp ADV RB


## Test on 10k C4

In [1]:
from datasets import load_dataset
from nltk.tokenize import sent_tokenize

# Load the 'stas/c4-en-10k' dataset and print its structure.
ds = load_dataset('stas/c4-en-10k')
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 10000
    })
})


In [2]:
# Split every document of the train split into sentences and collapse runs
# of whitespace inside each sentence to single spaces.
texts = ds["train"]["text"]
sents = []
for text in texts:
    sentences = sent_tokenize(text)
    for sent in sentences:
        sents.append(" ".join(sent.split()))

KeyboardInterrupt: 

In [None]:
len(sents)

In [None]:
# Use only the first 1000 sentences as input for the synthetic generation.
test_sents = sents[:1000]

In [None]:
# Generate one synthetic utterance per source sentence and keep the accepted ones.
aphasic_sents = []
for sent in test_sents:
    # aphasic_speech is stochastic and not cheap, so call it exactly once per
    # sentence (a discarded duplicate call only doubled the runtime).
    aphasic, changed = aphasic_speech(sent)
    # Skip rejected sentences and outputs with no words left. An empty string
    # written to CSV is read back by pandas as NaN, not as a string.
    if changed and aphasic not in ("", "."):
        print(aphasic)
        aphasic_sents.append(aphasic)

In [None]:
# Persist the accepted synthetic utterances as a one-column CSV ("modified").
pd.DataFrame(data={"modified": aphasic_sents}).to_csv("synthetic.csv", sep=",", index=False)

In [None]:
aphasic_sents

# Post-processing of aphasic sentences
Also adding some "control" sentences

In [1]:
import pandas as pd
from datasets import load_dataset

# Load the 'stas/c4-en-10k' dataset and print its structure.
ds = load_dataset('stas/c4-en-10k')
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 10000
    })
})


In [2]:
# Read the synthetic utterances from "synthetic.csv".
df = pd.read_csv("synthetic.csv")

In [3]:
# The "modified" column holds the generated utterances.
sentences = df['modified']

In [4]:
sentences

0             Beginners BBQ Class Taking Place Missoula!
1                          You will, this your calendar.
2      Thursday, September 22nd join World Class BBQ ...
3                        I 've 500 drive and 240 gb SSD.
4                          I 've SSD and I up SSD drive.
                             ...                        
377                                            Grace is.
378                    the stomach, and it takes breath.
379    I God ’s our, getting married, and living our ...
380    it feels a raging fight going on me, making de...
381                                    Where does Grace?
Name: modified, Length: 382, dtype: object

In [6]:
import re
# NOTE(review): remove_startswiths is not used anywhere in the visible cells.
remove_startswiths = [" ", ",", "!", ".", "?", ".", "/"]

# Clean-up rules applied to every utterance, in this order:
#   1. drop whitespace before punctuation and quote characters
#   2. strip leading non-word characters
#   3. trim whitespace just inside parentheses
cleanup_rules = (
    (r'\s+([?.!",\'])', r'\1'),
    (r"^\W+", ""),
    (r"\(\s*(.*?)\s*\)", r'(\1)'),
)

new_sents = []
for sent in sentences:
    for pattern, replacement in cleanup_rules:
        sent = re.sub(pattern, replacement, sent)
    new_sents.append(sent)

In [7]:
from nltk.tokenize import sent_tokenize
# Rebuild the whitespace-normalised sentence list from the train split.
# Note: this loop rebinds `sentences`, replacing the utterance column
# assigned to that name in an earlier cell.
texts = ds["train"]["text"]
sents = []
for text in texts:
    sentences = sent_tokenize(text)
    for sent in sentences:
        sents.append(" ".join(sent.split()))

In [8]:
# Unmodified "control" sentences: the 1000 sentences at positions 1000-1999.
control_sents = sents[1000:2000]

In [9]:
# Label the synthetic utterances 1 and the control sentences 0.
broca_data = pd.DataFrame(
    data={"modified": new_sents, "label": [1] * len(new_sents)}
)
control_data = pd.DataFrame(
    data={"modified": control_sents, "label": [0] * len(control_sents)}
)

# Combine both sets, shuffle the rows (unseeded) and renumber the index.
data_full_scenario = (
    pd.concat([broca_data, control_data], ignore_index=True)
    .sample(frac=1)
    .reset_index(drop=True)
)
data_full_scenario.to_csv("gen_data.csv", sep=",", index=False)

In [11]:
data_full_scenario

Unnamed: 0,modified,label
0,A tooth-colored composite placed on the lingua...,0
1,Serve in bowls topped with toasted sesame seed...,0
2,in Anthropology from Loyola University of Chic...,0
3,"Copper Chef is, and we are a look at their bak...",1
4,Disturbia Clothing coupon code gift !,0
...,...,...
1377,It’s effective both for people who simply want...,0
1378,13.,0
1379,EPA 21 MPG Hwy/15 MPG City!,0
1380,The Security Council has adopted such statemen...,0
