In [37]:
import spacy
import numpy as np
import pandas as pd
import random
import re
from pattern.text.en import singularize, pluralize
import enchant
from preprocess import preprocess
from spacy.matcher import Matcher
from string import printable
from nltk.tokenize import sent_tokenize

d = enchant.Dict("en_US")
nlp = spacy.load("en_core_web_sm")

In [None]:
broca_save = "../datafiles/generated_output/bnc_broca.csv"
generated_save = "../datafiles/generated_output/bnc_all.csv"
dataset_filename = "../datafiles/spoken corpus/bnc/preprocessed_bnc.csv"

In [38]:
extra = '!"#$%&\'()*+-/:;<=>?@[\\]^_`{|}~'

for x in extra:
    printable = printable.replace(x,'',)

In [39]:
printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,. \t\n\r\x0b\x0c'

In [40]:
dets = {'Art': ['a', 'an', 'the'],
           'Dem': ['this', 'that', 'these', 'those'],
           'Poss': ['my', 'your', 'his', 'her', 'its', 'our', 'their']}

def det_sub(x):
    for _, det in dets.items():
        if x.lower() in det:
            y = [j for j in det if x!=j]
            return random.choice(y)
    return ""

In [41]:
ds = pd.read_csv(dataset_filename, encoding='utf8', index_col=False).drop(['Unnamed: 0'], axis=1)

In [42]:
ds

Unnamed: 0,preprocessed_text
0,thank you for cooking.
1,rice is good.
2,it's a mighty green.
3,misses the greens. I have been missing them.
4,that wasn't even me that said that that was.
...,...
819390,yeah. it was weird and like coming back and be...
819391,Sha Li yeah er but then when I've like. I don'...
819392,but. anyway. erm he was like Sha Li that's rea...
819393,I'll just stick with it yeah. this was like ha...


In [43]:
texts = ds["preprocessed_text"]
sents = []
for text in texts:
    text = re.sub(r'\<.*?\>', " ", text)
    sentences = sent_tokenize(text)
    for sent in sentences:
        if isinstance(sent, str):
            sent = re.sub(r'\<.*?\>', " ", sent)
            if not sent.rstrip().isdigit() and len(sent.rstrip()) >= 1:
                sents.append(sent)

In [44]:
len(sents)

1219315

In [45]:
total_num_sents = len(sents) # 1000
test_sents = sents[:total_num_sents]

In [46]:
m0sa_lim = 0.7      # m:0s:a             (30%) done
ms_lim = 0.7        # m:+s(:a)           (30%) done
sgc_lim = 0.6       # s:r:gc             (40%) done 
rep_lim = 0.9       # repetition         (10%) done

In [47]:
def aphasic_speech(text):
    doc = nlp(text)
    vp_pattern = [[{'POS': 'VERB', 'OP': '?'},
                   {'POS': 'ADV', 'OP': '*'},
                   {'POS': 'AUX', 'OP': '*'},
                   {'POS': 'VERB', 'OP': '+'}]]
    matcher = Matcher(nlp.vocab)
    matcher.add("Verb phrase", vp_pattern)
    n = 15
    aphasic_utt = ""
    length = len(re.findall("[a-zA-Z_]+", text))
    
    nouns = []
    verbs = []
    determiners = []
    prepositions = []
    adjectives = []
    adverbs = []
    interjections = []
    open_close = np.random.gamma(shape=4.99415, scale=1/3.558095)
    add = False
    
    # count no. of respective POS
    for tok in doc:
        if tok.pos_ == "NOUN":
            nouns.append(tok.text)
        elif tok.pos_ == "VERB" or tok.dep_ == "cop" or tok.tag_ in ["VBD", "VBN"]:
            verbs.append(tok.text)
        # det:art and det:dem only
        elif tok.dep_ == "det" and ("Dem" in tok.morph.get('PronType') or "Art" in tok.morph.get('PronType')):
            determiners.append(tok.text)
        elif tok.dep_ == "prep":
            prepositions.append(tok.text)
        elif tok.pos_ == "ADJ":
            adjectives.append(tok.text)
        elif tok.pos_ == "ADV":
            adverbs.append(tok.text)
        elif tok.pos_ == "INTJ":
            interjections.append(tok.pos_)
            
    open_class_num = len(nouns) + len(verbs) + len(adjectives) + len(adverbs)
    closed_class_num = length - open_class_num - len(interjections)
    
    # acc to frank, not removing only adding
    if closed_class_num != 0:
        if open_close > open_class_num/closed_class_num:
            add = True
    
    # discard sentences of 15 and above length
    # and with symbols
    if length <= n and not set(text).difference(printable):
        m0sa_prob = random.uniform(0,1)     # m:0s:a
        ms_prob = random.uniform(0,1)       # m:+s(:a)
        sgc_prob = random.uniform(0,1)      # s:r:gc
        rep_prob = random.uniform(0,1)      # repetitions
        
        # get NPs
        noun_phrases = set()

        for nc in doc.noun_chunks:
            for nop in [nc, doc[nc.root.left_edge.i:nc.root.right_edge.i + 1]]:
                noun_phrases.add(nop.text.strip())
                # get VPs
        verb_phrases = matcher(doc)
        verb_phrases = [doc[start:end] for _, start, end in verb_phrases]

        try:
            ratio = len(noun_phrases) / len(verb_phrases)
        except:
            return False, aphasic_utt

        X = np.random.uniform(0, 1)
        
        if ratio > 2 and X < 0.8:
            # skip sentence if np/vp too big with prob of 80%
            return False, aphasic_utt
        else:
            # dont skip sentence
            
            # Handle nouns
            for tok in doc:
                if tok.pos_ == "NOUN":
                    # m:0s:a and m:+s(:a) errors
                    if m0sa_prob >= m0sa_lim or ms_prob >= ms_lim:
                        if "Plur" in tok.morph.get("Number"):
                            aphasic_utt += singularize(tok.text) + ' '
                        elif "Sing" in tok.morph.get("Number"):
                            aphasic_utt += pluralize(tok.text) + ' ' 
                    # keep noun as is
                    else:
                        aphasic_utt += tok.text + " "
                
                # Handle pronouns
                elif tok.pos_ == "PRON":
                    # s:r:gc:pro error 
                    if sgc_prob >= sgc_lim:
                        if tok.pos_ == "DET" or "Dem" in tok.morph.get('PronType') or "Yes" in tok.morph.get('Poss'):
                            sub = det_sub(tok.text) 
                            aphasic_utt += sub + " "
                            # repetition of s:r:gc:pro error
                            if rep_prob >= rep_lim:
                                aphasic_utt += sub + " "
                        else:
                            # repetition or keep as is
                            if rep_prob >= rep_lim:
                                aphasic_utt += tok.text + " "
                            aphasic_utt += tok.text + " "
                    else:
                        # repetition or keep as is
                        aphasic_utt += tok.text + " "
                        if rep_prob >= rep_lim:
                            aphasic_utt += tok.text + " "
                            
                # Handle determiners, prepositions, particle       
                elif tok.pos_ in ["DET", "PART"] or tok.dep_ in ["prep"]:
                    # discard with 60-70%
                    Y = np.random.uniform(0, 1)
                    prob = np.random.uniform(0.6,0.7)
                    if Y > prob:
                        # keep if not discarding
                        aphasic_utt += tok.text + " "
    
                # Handle adjectives, adverbs 
                elif tok.pos_ in ["ADJ", "ADV"]:
                    # discard with 30%
                    # TODO: maybe not drop it at all
                    Z = np.random.uniform(0, 1)
                    if Z < 0.3:
                        # keep if not discarding
                        aphasic_utt += tok.text + " "
                        
                # Handle verbs     
                elif tok.pos_ == "VERB":
                    # lemmatize with 50%
                    Z = np.random.uniform(0, 1)
                    if Z < 0.5:
                        aphasic_utt += tok.lemma_ + " "
                    else:
                        # keep as is
                        aphasic_utt += tok.text + " "
                 
                # Handle interjections        
                elif tok.pos == "INTJ":
                    # close class from PC analysis OR repetition error
                    if rep_prob >= rep_lim or add:
                        aphasic_utt += tok.text + " "
                    aphasic_utt += tok.text + " "
                
                else:
                    # all other POS remain
                    aphasic_utt += tok.text + " "
                
        # exclusion criterias
        lower = round(length * (1/3))
        higher = round(length * (2/3))
        aphasic_utt = postprocess(aphasic_utt)
        new_length = int(len(re.findall("[a-zA-Z_]+", aphasic_utt)))
        
        if lower <= new_length <= higher:
            return True, aphasic_utt
        else:
            return False, aphasic_utt
    
    else:
        return False, aphasic_utt

In [48]:
import nltk

def tuple_to_str(tuple):
    output_str = ""
    for tup in tuple:
        try:
            output_str += " " + tup[0]
            output_str += " " + tup[1]
        except:
            continue
    return output_str[1:]


def remove_single_repetitions(text):
    """
    Removes duplicated words (stuttering) and duplicated pauses from utterance.
    e.g: I I I I I I wanted --> I wanted.
    :param line: input text containing dupes.
    :return: unduped string containing text.
    """
    utterance = text.split(" ")

    newlist = []
    newlist.append(utterance[0])
    for i, element in enumerate(utterance):
        if i > 0 and utterance[i - 1] != element:
            newlist.append(element)

    return ' '.join(newlist)


def remove_bigram_repetitions(text):
    """
    Removes bigram stuttering from text. I went I went to the to the doctor --> I went to the doctor.
    :param text: input text containing a string
    :return: string without duplicates.
    """
    bigram = list(nltk.bigrams(text.split()))
    grams = []

    for i in range(0, len(bigram)):
        if i % 2 == 0:
            grams.append(bigram[i])

    result = []
    prev_item = None
    for item in grams:
        if item != prev_item:
            result.append(item)
            prev_item = item

    if result[-1][-1] != bigram[-1][-1]:
        result.append(tuple((bigram[-1][-1]).split(" ")))

    return tuple_to_str(result)

def remove_all_repetitions(text):
    """
    Removes bigram repetitions and stuttering from text.
    :return: Clean text.
    """
    try:
        output_text = remove_single_repetitions(text)
        output_text2 = remove_bigram_repetitions(output_text)
    except:
        return text
    return output_text2

In [49]:
aphasic_sents = []
normal_sents = []
new_sents = []

import inflect
from preprocess import postprocess
p = inflect.engine()
count = 0
num_sents = 20000 # how many aphasic sentences?

for sent in test_sents:
    # no digits like in aphasiabank
    # print(sent)
    b = re.findall("[0-9]+", sent)
    for i in b:
        sent = sent.replace(i, p.number_to_words(i))
    
    s = preprocess(sent)
    
    if len(aphasic_sents) >= num_sents:
        changed = False
        aphasic = ""
    else:
        changed, aphasic = aphasic_speech(s)
        count += 1
    
    if len(aphasic_sents) >= num_sents and len(new_sents) >= num_sents * 2.5911:
        break
        
    if count % 1000 == 0:
        print(f"Processed {count} sentences, broca utts: {len(aphasic_sents)}")
    
    # min length is 3?
    if changed and aphasic !="." and (postprocess(s) != aphasic): #and 3 <= len(re.findall("[a-zA-Z_]+", aphasic)):
        # print(sent)
        # print(postprocess(s))
        # print(postprocess(aphasic))
        # print()
        normal_sents.append(remove_all_repetitions(postprocess(s)))
        aphasic_sents.append(aphasic)
    if not changed:
        new_sents.append(remove_all_repetitions(postprocess(s)))

Processed 1000 sentences, broca utts: 151
Processed 2000 sentences, broca utts: 303
Processed 3000 sentences, broca utts: 463
Processed 4000 sentences, broca utts: 621
Processed 5000 sentences, broca utts: 791
Processed 6000 sentences, broca utts: 953
Processed 7000 sentences, broca utts: 1097
Processed 8000 sentences, broca utts: 1305
Processed 9000 sentences, broca utts: 1432
Processed 10000 sentences, broca utts: 1582
Processed 11000 sentences, broca utts: 1766
Processed 12000 sentences, broca utts: 1912
Processed 13000 sentences, broca utts: 2047
Processed 14000 sentences, broca utts: 2195
Processed 15000 sentences, broca utts: 2314
Processed 16000 sentences, broca utts: 2455
Processed 17000 sentences, broca utts: 2619
Processed 18000 sentences, broca utts: 2797
Processed 19000 sentences, broca utts: 2962
Processed 20000 sentences, broca utts: 3130
Processed 21000 sentences, broca utts: 3288
Processed 22000 sentences, broca utts: 3485
Processed 23000 sentences, broca utts: 3666
Pro

In [50]:
sentences = aphasic_sents
original = normal_sents

In [51]:
import re
import string
from preprocess import postprocess
broca_sents = []
original_sents = []
for sent, o in zip(sentences, original):
    x = postprocess(sent)
    if x != "":
        broca_sents.append(x)
        original_sents.append(o)

In [52]:
pd.DataFrame(data={"modified": broca_sents, "original": original_sents}).to_csv(broca_save, sep=",", index=False)

In [53]:
broca_sents = pd.read_csv(broca_save, sep=",")["modified"]

In [54]:
broca_sents = broca_sents[:num_sents]

In [55]:
round(len(broca_sents)*2.5911)

51503

In [56]:
control_sents = new_sents[:round(len(broca_sents)*2.5911)]

In [57]:
broca_data = pd.DataFrame(data={"preprocessed_text": broca_sents, "label": [1]*len(broca_sents)})
control_data = pd.DataFrame(data={"preprocessed_text": control_sents, "label": [0]*len(control_sents)})
data_full_scenario = pd.concat([broca_data, control_data], ignore_index=True)
data_full_scenario = data_full_scenario.sample(frac=1).reset_index(drop=True)
data_full_scenario["preprocessed_text"] = [re.sub(r'[^\w\s]','',x) for x in data_full_scenario["preprocessed_text"]]
data_full_scenario.to_csv(generated_save, sep=",", index=False)

In [58]:
data_full_scenario

Unnamed: 0,preprocessed_text,label
0,and then there were two girls that fancied him...,0
1,yeah they are nice yeah,0
2,mm yeah it is not strong is it,0
3,it is mums go,1
4,he is his fathers son you see in certain eleme...,0
...,...,...
71375,not sure,0
71376,i do not like up fusses,1
71377,i do not it that,1
71378,mm yes it,0
