## NOTE: In CLAN, the C-NNLA command was used to get the distribution parameters. However, the error outputs from C-NNLA (e.g. % correct, % incorrect, % grammatical) were not very useful, since some CHA files do not annotate errors such as [* p] or [* s] (they just replace the word with xxx, yyy or zzz), which makes those measures unreliable.

## This approach also does not consider any phonetic errors/substitutions (marked as @u or [* p]).

In [1]:
import spacy
from spacy.matcher import Matcher
import numpy as np
import pandas as pd
from canonical_sents import get_canonical_sentences
import os
import scipy.stats as stats
import random
import re
nlp = spacy.load("en_core_web_sm")

In [2]:
def get_truncnorm(mean, std, min, max):
    """Draw a single sample from a normal distribution truncated to [min, max].

    Args:
        mean: Location of the underlying normal distribution.
        std: Scale (standard deviation) of the underlying normal distribution.
        min: Lower truncation bound, in the same units as ``mean``.
        max: Upper truncation bound, in the same units as ``mean``.

    Returns:
        One value sampled from the truncated distribution.
    """
    # scipy's truncnorm takes its clip points as offsets from ``loc``
    # measured in multiples of ``scale``, so convert the raw bounds first.
    lower = (min - mean) / std
    upper = (max - mean) / std
    distribution = stats.truncnorm(lower, upper, loc=mean, scale=std)
    samples = distribution.rvs(size=1)
    return samples[0]

In [3]:
def get_curr_nv_ratio(nouns, verbs):
    """Return the noun-to-verb count ratio, or 0 if either collection is empty.

    Args:
        nouns: Sized collection of nouns.
        verbs: Sized collection of verbs.

    Returns:
        ``len(nouns) / len(verbs)`` when both are non-empty, otherwise ``0``.
    """
    noun_count, verb_count = len(nouns), len(verbs)
    # Guard first: an empty side means there is no meaningful ratio.
    if noun_count == 0 or verb_count == 0:
        return 0
    return noun_count / verb_count

In [4]:
# DONE noun and verb handling?
# TODO: check distributions of sentence types and only allow those, handle determiners, articles, copulas, prepositions, adjectives, adverbs 
# Matches the "words" that are counted when measuring sentence length.
_WORD_RE = re.compile(r"[a-zA-Z_]+")

# Buckets whose removal is additionally driven by the noun/verb ratio.
_NV_BUCKETS = ("noun", "verb")


def _pos_bucket(tok):
    """Return the POS bucket name for a spaCy token, or None if unaffected."""
    if tok.pos_ == "NOUN":
        return "noun"
    if tok.pos_ == "VERB" or tok.dep_ == "cop" or tok.tag_ in ("VBD", "VBN"):
        return "verb"
    if tok.dep_ == "det":
        return "det"
    if tok.dep_ == "prep":
        return "prep"
    if tok.pos_ == "ADJ":
        return "adj"
    if tok.pos_ == "ADV":
        return "adv"
    return None


# DONE noun and verb handling?
# TODO: check distributions of sentence types and only allow those, handle
# determiners, articles, copulas, prepositions, adjectives, adverbs
def aphasic_speech(text):
    """Drop words from a sentence according to sampled POS-share thresholds.

    For every sentence a maximum share (in percent of the sentence's word
    count) is sampled for nouns, verbs, determiners, prepositions, adjectives
    and adverbs, plus a maximum noun/verb ratio. Tokens are then visited in
    order; a token of one of these categories is dropped while the share of
    still-remaining tokens of its category is at or above the sampled
    threshold (nouns and verbs are also dropped while the current noun/verb
    ratio exceeds the sampled ratio). All other tokens are kept.

    The candidate utterance is printed to stdout as a side effect.

    Args:
        text: A single sentence as a string.

    Returns:
        A tuple ``(utterance, changed)``. ``changed`` is ``True`` and
        ``utterance`` holds the reduced sentence when the input has at most
        27 words and the result is short enough for the sampled length cap;
        otherwise ``("", False)`` is returned.
    """
    doc = nlp(text)

    # Sample the length cap for the modified sentence.
    # NOTE(review): 3.295 is approximately ln(27), presumably mirroring the
    # 27-word limit below. The draw is already log-normally distributed and is
    # exponentiated again in the final length check - confirm this is intended.
    n = np.random.lognormal(mean=0.9162787, sigma=0.7350323)
    while n > 3.295:
        n = np.random.lognormal(mean=0.9162787, sigma=0.7350323)

    # length of original text
    length = len(_WORD_RE.findall(text))

    # TODO: remove when whole stuff implemented: max no. words in sentence in
    # aphasiabank for brocas was 27
    if length > 27:
        return "", False

    # Sample the admissible n/v ratio, then the admissible percentage of each
    # POS. The draw order is kept stable so seeded runs stay reproducible.
    ratio_nv = get_truncnorm(1.461698, 1, 0.228, 6.818)
    max_percent = {
        "noun": get_truncnorm(20.68179, 9.794099, 5.991, 50),
        "verb": get_truncnorm(16.59888, 5.036936, 0, 26.267),
        "det": get_truncnorm(7.494686, 5.927496, 0, 27.79661),
        "prep": get_truncnorm(3.104844, 2.325505, 0, 15.05682),
        "adj": get_truncnorm(4.34104, 3.509376, 0, 21.05263),
        "adv": get_truncnorm(5.751157, 2.958618, 0, 15.88448),
    }

    # First pass: collect the tokens of every tracked POS bucket.
    buckets = [_pos_bucket(tok) for tok in doc]
    remaining = {name: [] for name in max_percent}
    for tok, bucket in zip(doc, buckets):
        if bucket is not None:
            remaining[bucket].append(tok.text)

    # Second pass: decide token by token whether it survives.
    kept = []
    for tok, bucket in zip(doc, buckets):
        # all other words with respective POS unaffected
        if bucket is None:
            kept.append(tok.text)
            continue

        if length:
            share = (len(remaining[bucket]) / length) * 100
        else:
            # No countable words at all: avoid dividing by zero and treat the
            # bucket as over-represented.
            share = float("inf")

        # Each bucket is compared against its own sampled threshold (the
        # adverb branch previously reused the adjective threshold).
        drop = max_percent[bucket] <= share
        if bucket in _NV_BUCKETS:
            # NOTE(review): verbs are also dropped when the n/v ratio is too
            # high, although that raises the ratio further - confirm intent.
            curr_ratio_nv = get_curr_nv_ratio(remaining["noun"],
                                              remaining["verb"])
            drop = drop or curr_ratio_nv > ratio_nv

        if drop:
            remaining[bucket].remove(tok.text)
        else:
            kept.append(tok.text)

    # Normalise whitespace and re-attach punctuation to the preceding word.
    utt = " ".join(" ".join(kept).split())
    utt = re.sub(r'\s+([?.!",])', r'\1', utt)
    print("Possible utt: ", utt)

    # only return sentences which are short enough
    if np.exp(n) >= len(_WORD_RE.findall(utt)):
        return utt, True
    return '', False
    
def augment(filepath, save_path, include_canonical=False):
    """Placeholder that is not implemented yet.

    The arguments are accepted but currently ignored, and nothing is
    read or written.

    Returns:
        None.
    """
    return None

In [14]:
# Demo: split a sample paragraph into sentences and show the reduced version
# of every sentence for which aphasic_speech reports a usable result.
para = """I received the brass that you sent me, and all looks well. Thank you very much for all your trouble and the extra 3 pieces. I feel that you have an outstanding company and are striving the best that you can to achieve customer satisfaction. I will be certain to tell my friends about US Reloading Supply. ~ Michelle Admin Note: By accident we shorted them two shells, so we sent them replacements."""

from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(para)
for sent in sentences:
    aphasic, changed = aphasic_speech(sent)
    if not changed:
        # Sentence was rejected (too long before or after reduction).
        continue
    # Joining the split pieces already leaves no leading/trailing whitespace.
    print("Final utter:", " ".join(aphasic.split()))
    print("-----------------------------------------------")

Possible utt:  I brass that you me, and all looks.
-----------------------------------------------
Possible utt:  you your trouble and 3 pieces.
Final utter: you your trouble and 3 pieces.
-----------------------------------------------
Possible utt:  I that you have outstanding company and are striving the best that you can to achieve customer satisfaction.
-----------------------------------------------
Possible utt:  I will be to tell my friends US Reloading Supply.
-----------------------------------------------
Possible utt:  ~ Michelle Admin Note : we them two, we them replacements.
-----------------------------------------------


In [6]:
# Inspect the dependency label, coarse POS and fine-grained tag that spaCy
# assigns to each token of a short sentence.
sent = "All flowers are pretty"

for tok in nlp(sent):
    fields = (tok.text, tok.dep_, tok.pos_, tok.tag_)
    print(*fields)

All det DET DT
flowers nsubj NOUN NNS
are ROOT AUX VBP
pretty acomp ADV RB
