In [17]:
import string

import spacy
import numpy as np
import pandas as pd
import scipy.stats as stats
import random
import re
from pattern.text.en import singularize, pluralize, conjugate
import enchant
from preprocess import preprocess
from spacy.matcher import Matcher
from string import printable
from preprocess import postprocess
from nltk.tokenize import sent_tokenize

# PyEnchant dictionary for US English.
# NOTE(review): `d` is not referenced anywhere else in the visible notebook — confirm it is still needed.
d = enchant.Dict("en_US")
# spaCy pipeline shared by the whole notebook; `aphasic_speech` calls `nlp(text)` and
# builds its Matcher from `nlp.vocab`.
nlp = spacy.load("en_core_web_sm")

In [18]:
# Punctuation that must NOT appear in a sentence kept by `aphasic_speech`
# (every ASCII punctuation mark except ',' and '.').
extra = '!"#$%&\'()*+-/:;<=>?@[\\]^_`{|}~'

# Rebind `printable` to the allowed character set: string.printable minus `extra`.
printable = ''.join(ch for ch in printable if ch not in extra)

In [19]:
printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,. \t\n\r\x0b\x0c'

In [20]:
# Determiner groups used for substitution errors: a word is only ever swapped
# with another member of its own group.
dets = {'Art': ['a', 'an', 'the'],
           'Dem': ['this', 'that', 'these', 'those'],
           'Poss': ['my', 'your', 'his', 'her', 'its', 'our', 'their']}

def det_sub(x):
    """Return a random *different* determiner from the same group as ``x``.

    The lookup is case-insensitive, and the word itself (in any casing) is
    never returned as its own substitute.

    :param x: the determiner/pronoun text to substitute.
    :return: a lowercase replacement from the same group in ``dets``, or the
             empty string when ``x`` is not listed in any group.
    """
    word = x.lower()
    for group in dets.values():
        if word in group:
            # Compare against the lowercased word so e.g. "The" cannot be
            # "substituted" with "the".
            alternatives = [candidate for candidate in group if candidate != word]
            return random.choice(alternatives)
    return ""

In [21]:
# Corpus selection: exactly one group of three paths should be active.
#   broca_save       - CSV receiving the (modified, original) paragraph pairs
#   generated_save   - CSV receiving the labelled aphasic + control paragraphs
#   dataset_filename - preprocessed input corpus read into `ds`
#
# BNC configuration (currently disabled):
# broca_save = "../datafiles/generated output/bnc_broca.csv"
# generated_save = "../datafiles/generated output/bnc_all.csv"
# dataset_filename = "../datafiles/spoken corpus/bnc/preprocessed_bnc.csv"

# Boston configuration (currently active):
broca_save = "../datafiles/generated output/boston_broca.csv"
generated_save = "../datafiles/generated output/boston_all.csv"
dataset_filename = "../datafiles/spoken corpus/boston/preprocessed_boston.csv"

In [22]:
# Load the preprocessed corpus and discard the index column that was
# written out as 'Unnamed: 0' when the CSV was saved.
ds = (
    pd.read_csv(dataset_filename, encoding='utf8', index_col=False)
    .drop(columns=['Unnamed: 0'])
)

In [23]:
ds

Unnamed: 0,source_file,speaker,preprocessed_text,label
0,SBC008.cha,*REBE,the way that your testimony is coming in i do ...,0
1,SBC008.cha,*REBE,to expose himself to a person for sexual arous...,0
2,SBC008.cha,*REBE,in at least one of the cases. then we are allo...,0
3,SBC008.cha,*REBE,things like that that is why we are able to ha...,0
4,SBC008.cha,*REBE,well i have made two one a year from almost a ...,0
...,...,...,...,...
7713,SBC060.cha,*ALAN,on areas score score higher on the test than t...,0
7714,SBC060.cha,*ALAN,i am told i have not seen any of them but the ...,0
7715,SBC060.cha,*JON,but they are well educated people down there. ...,0
7716,SBC060.cha,*ALAN,i am s. i know that this fellow goldstone who ...,0


In [24]:
# Error thresholds used by `aphasic_speech`: an error fires when a fresh
# uniform(0, 1) draw is >= the threshold, so the firing rate is (1 - threshold).
m0sa_lim = 0.7      # m:0s:a error on nouns        -> fires ~30% of the time
ms_lim = 0.7        # m:+s(:a) error on nouns      -> fires ~30% of the time
sgc_lim = 0.6       # s:r:gc pronoun substitution  -> fires ~40% of the time
rep_lim = 0.98       # repetition                   -> fires ~2% of the time; NOTE(review): earlier comment said 10% (would need 0.9) — confirm intended rate

In [25]:
def aphasic_speech(text):
    """Turn one sentence into a synthetic aphasic-style utterance.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and
    rewritten token by token:

    * nouns: number is flipped (singular <-> plural) when either the
      ``m0sa_lim`` or the ``ms_lim`` draw fires;
    * pronouns: demonstrative/possessive ones may be swapped via ``det_sub``
      (``sgc_lim``) and any pronoun may be repeated (``rep_lim``);
    * determiners, particles, auxiliaries and prepositions: kept only when a
      uniform draw exceeds a cutoff sampled from [0.6, 0.7];
    * adjectives/adverbs: kept when a uniform draw exceeds 0.3;
    * verbs: replaced by their lemma when a uniform draw is <= 0.5;
    * interjections: always kept, and doubled when the repetition draw fires
      or when the sampled open/closed-class ratio exceeds the sentence's own;
    * everything else is copied unchanged.

    A sentence is rejected when it has more than 15 word tokens, contains a
    character outside the module-level ``printable`` set, has a noun-phrase /
    verb-phrase ratio above 2 (rejected with probability 0.8), or when the
    rewritten utterance does not contain between one third and two thirds of
    the original word count.

    :param text: a single sentence.
    :return: ``(accepted, utterance)``. ``utterance`` is the empty string when
             the sentence is rejected before rewriting, otherwise the
             post-processed rewritten sentence (also returned, with
             ``accepted`` False, when the length criterion fails).
    """
    doc = nlp(text)
    vp_pattern = [[{'POS': 'VERB', 'OP': '?'},
                   {'POS': 'ADV', 'OP': '*'},
                   {'POS': 'AUX', 'OP': '*'},
                   {'POS': 'VERB', 'OP': '+'}]]
    matcher = Matcher(nlp.vocab)
    matcher.add("Verb phrase", vp_pattern)
    n = 15                      # maximum number of word tokens accepted
    aphasic_utt = ""
    length = len(re.findall("[a-zA-Z_]+", text))

    # Target open/closed-class ratio for this sentence.
    # NOTE(review): the gamma parameters look like a fit to reference data — confirm their source.
    open_close = np.random.gamma(shape=4.99415, scale=1/3.558095)

    # Count open-class words and interjections. The branch order matters: a
    # token is classified by the first test it satisfies, so determiners and
    # prepositions are explicitly skipped before the ADJ/ADV/INTJ tests.
    open_class_num = 0
    interjection_num = 0
    for tok in doc:
        if tok.pos_ == "NOUN":
            open_class_num += 1
        elif tok.pos_ == "VERB" or tok.dep_ == "cop" or tok.tag_ in ["VBD", "VBN"]:
            open_class_num += 1
        # det:art and det:dem only
        elif tok.dep_ == "det" and ("Dem" in tok.morph.get('PronType') or "Art" in tok.morph.get('PronType')):
            pass
        elif tok.dep_ == "prep":
            pass
        elif tok.pos_ == "ADJ":
            open_class_num += 1
        elif tok.pos_ == "ADV":
            open_class_num += 1
        elif tok.pos_ == "INTJ":
            interjection_num += 1

    closed_class_num = length - open_class_num - interjection_num

    # acc to frank, not removing only adding
    add = closed_class_num != 0 and open_close > open_class_num / closed_class_num

    # Discard sentences longer than n words and sentences containing any
    # character outside the allowed set.
    if length > n or set(text).difference(printable):
        return False, aphasic_utt

    # get NPs (both the chunk itself and the full subtree span of its root)
    noun_phrases = set()
    for nc in doc.noun_chunks:
        for nop in [nc, doc[nc.root.left_edge.i:nc.root.right_edge.i + 1]]:
            noun_phrases.add(nop.text.strip())

    # get VPs
    verb_phrases = [doc[start:end] for _, start, end in matcher(doc)]

    # With no verb phrase at all the ratio is treated as neutral (1).
    ratio = len(noun_phrases) / len(verb_phrases) if verb_phrases else 1

    X = np.random.uniform(0, 1)
    if ratio > 2 and X < 0.8:
        # skip sentence if np/vp too big with prob of 80%
        return False, aphasic_utt

    for tok in doc:
        # Handle nouns
        if tok.pos_ == "NOUN":
            # m:0s:a and m:+s(:a) errors
            m0sa_prob = random.uniform(0,1)     # m:0s:a
            ms_prob = random.uniform(0,1)       # m:+s(:a)
            if m0sa_prob >= m0sa_lim or ms_prob >= ms_lim:
                number = tok.morph.get("Number")
                if "Plur" in number:
                    aphasic_utt += singularize(tok.text) + ' '
                elif "Sing" in number:
                    aphasic_utt += pluralize(tok.text) + ' '
                else:
                    # No number feature to flip: keep the noun instead of
                    # silently dropping it.
                    aphasic_utt += tok.text + " "
            # keep noun as is
            else:
                aphasic_utt += tok.text + " "

        # Handle pronouns
        elif tok.pos_ == "PRON":
            sgc_prob = random.uniform(0,1)      # s:r:gc
            rep_prob = random.uniform(0,1)      # repetitions
            word = tok.text
            # s:r:gc:pro error, only for demonstrative / possessive pronouns
            if sgc_prob >= sgc_lim and ("Dem" in tok.morph.get('PronType') or "Yes" in tok.morph.get('Poss')):
                # det_sub returns "" for words it does not know (e.g. "mine");
                # fall back to the original word rather than deleting it.
                word = det_sub(tok.text) or tok.text
            aphasic_utt += word + " "
            # repetition (of the substituted form when a substitution happened)
            if rep_prob >= rep_lim:
                aphasic_utt += word + " "

        # Handle determiners, prepositions, particle, aux
        elif tok.pos_ in ["DET", "PART", "AUX"] or tok.dep_ in ["prep"]:
            # keep with 30-40%
            Y = np.random.uniform(0, 1)
            prob = np.random.uniform(0.6, 0.7)
            if Y > prob:
                aphasic_utt += tok.text + " "

        # Handle adjectives, adverbs
        elif tok.pos_ in ["ADJ", "ADV"]:
            # keep with 70%
            # TODO: maybe not drop it at all
            Z = np.random.uniform(0, 1)
            if Z > 0.3:
                aphasic_utt += tok.text + " "

        # Handle verbs
        elif tok.pos_ == "VERB":
            # lemmatize with 50%
            Z = np.random.uniform(0, 1)
            if Z <= 0.5:
                aphasic_utt += tok.lemma_ + " "
            else:
                aphasic_utt += tok.text + " "

        # Handle interjections. `pos_` is the string tag; `pos` is spaCy's
        # integer ID and never equals "INTJ".
        elif tok.pos_ == "INTJ":
            rep_prob = random.uniform(0,1)      # repetitions
            # close class from PC analysis OR repetition error
            if rep_prob >= rep_lim or add:
                aphasic_utt += tok.text + " "
            aphasic_utt += tok.text + " "

        else:
            # all other POS remain
            aphasic_utt += tok.text + " "

    # Exclusion criterion: the rewritten utterance must retain between one
    # third and two thirds of the original word count.
    lower = round(length * (1/3))
    higher = round(length * (2/3))
    aphasic_utt = postprocess(aphasic_utt)
    new_length = len(re.findall("[a-zA-Z_]+", aphasic_utt))

    if lower <= new_length <= higher:
        return True, aphasic_utt
    # Too much / too little removed.
    return False, aphasic_utt

# Remove repetition in data

In [26]:
import nltk

def tuple_to_str(tuple):
    """Flatten a sequence of word tuples into one space-separated string.

    Only the first two items of each entry are used. Entries that are too
    short (e.g. the 1-tuple holding a dangling last word) or whose items are
    not strings contribute whatever was appended before the failure and are
    otherwise skipped.

    :param tuple: iterable of tuples of strings (typically bigrams).
    :return: the words joined by single spaces; "" for an empty input.
    """
    output_str = ""
    for tup in tuple:
        try:
            output_str += " " + tup[0]
            output_str += " " + tup[1]
        except (IndexError, TypeError):
            # Short or malformed entry: best-effort skip. Anything else
            # (KeyboardInterrupt, SystemExit, ...) is allowed to propagate.
            continue
    # Drop the leading separator added before the first word.
    return output_str[1:]


def remove_single_repetitions(text):
    """
    Collapse runs of an identical word (stuttering, repeated pauses) into a
    single occurrence.
    e.g: I I I I I I wanted --> I wanted.
    :param text: input text, with words separated by single spaces.
    :return: the text with immediately repeated words removed.
    """
    words = text.split(" ")

    # The first word is always kept; every later word is kept only when it
    # differs from the word directly before it.
    kept = words[:1] + [
        current for previous, current in zip(words, words[1:]) if current != previous
    ]

    return ' '.join(kept)


def remove_bigram_repetitions(text):
    """
    Removes bigram stuttering from text. I went I went to the to the doctor --> I went to the doctor.

    A two-word sequence that is immediately repeated is collapsed to a single
    occurrence, wherever in the sentence it starts (the repeat does not have
    to begin at an even word position). Texts with fewer than two words are
    returned as-is (whitespace-normalised) instead of raising.

    :param text: input text containing a string
    :return: string without duplicates, words joined by single spaces.
    """
    result = []
    for word in text.split():
        result.append(word)
        # Whenever the last two words repeat the two words before them,
        # drop the repeat. `while` handles chains such as "a b a b a b".
        while len(result) >= 4 and result[-2:] == result[-4:-2]:
            del result[-2:]

    return " ".join(result)

def remove_all_repetitions(text):
    """
    Removes bigram repetitions and stuttering from text.

    Best-effort: if a step fails, the result of the last successful step is
    returned (the untouched input when even the first step fails), so a
    failure in the bigram pass no longer discards the single-word clean-up.

    :param text: input text.
    :return: Clean text.
    """
    try:
        output_text = remove_single_repetitions(text)
    except Exception:
        # e.g. a non-string value such as NaN: hand it back unchanged.
        return text

    try:
        return remove_bigram_repetitions(output_text)
    except Exception:
        # Keep the work already done by the single-word pass.
        return output_text

In [27]:
#### PARAGRAPH LEVEL ####
# For every corpus entry: split it into sentences, try to turn each sentence
# into an aphasic utterance, and keep the entry as an aphasic paragraph only
# when *all* of its sentences were accepted. Entries with at least one
# rejected sentence go to `control_texts` instead.
texts = ds["preprocessed_text"]
aphasic_texts = []
original_texts = []
control_texts = []

total_paras = 10000     # stop once this many aphasic paragraphs are collected
import inflect
from preprocess import postprocess
count = 0

# One engine is enough; it does not need to be rebuilt for every sentence.
p = inflect.engine()

for text in texts:
    count += 1
    # Strip <...> markup and normalise sentence-final punctuation to '.'.
    text = re.sub(r'\<.*?\>', " ", text)
    text = text.replace('?', '.')
    text = text.replace('!', '.')
    sentences = sent_tokenize(text)
    aphasic_text = ""
    original_text = ""
    # NOTE: `count` is the number of corpus entries seen so far.
    if count % 1000 == 0:
        print(f"Processed {count} sentences, broca paras: {len(aphasic_texts)}")

    if len(aphasic_texts) >= total_paras:
        break

    for sent in sentences:
        # no digits like in aphasiabank: spell out every number. Each match
        # is substituted in place, so "1 and 10" becomes "one and ten"
        # (a plain str.replace of "1" would also corrupt the "10").
        sent = re.sub("[0-9]+", lambda match: p.number_to_words(match.group()), sent)

        s = preprocess(sent)

        changed, aphasic = aphasic_speech(s)

        if changed:
            aphasic = postprocess(aphasic)
            aphasic_text += aphasic.strip() + " "
            s = postprocess(s)
            s = remove_all_repetitions(s)
            original_text += s.strip() + " "

        else:
            # only get example where all sentences are modified
            aphasic_text = ''
            control_texts.append(text.strip())
            break

    # Keep the pair only when the aphasic paragraph contains at least one word.
    if len(re.findall("[a-zA-Z_]+", aphasic_text)) != 0:
        aphasic_texts.append(aphasic_text.strip())
        original_texts.append(original_text.strip())
        
      

Processed 1000 sentences, broca paras: 32
Processed 2000 sentences, broca paras: 62
Processed 3000 sentences, broca paras: 98
Processed 4000 sentences, broca paras: 130
Processed 5000 sentences, broca paras: 153
Processed 6000 sentences, broca paras: 177
Processed 7000 sentences, broca paras: 209


In [28]:
# Persist the parallel (aphasic, original) paragraphs, one pair per row.
pd.DataFrame({"modified": aphasic_texts, "original": original_texts}).to_csv(
    broca_save, index=False
)

In [29]:
# Cap the control set at 2.5911 control paragraphs per aphasic paragraph.
# NOTE(review): the 2.5911 ratio is not derived anywhere in this notebook — confirm its source.
control_sents = control_texts[:round(2.5911 * len(aphasic_texts))]

# Label 1 = generated aphasic paragraph, label 0 = untouched control paragraph.
broca_data = pd.DataFrame({
    "preprocessed_text": aphasic_texts,
    "label": [1 for _ in aphasic_texts],
})
control_data = pd.DataFrame({
    "preprocessed_text": control_sents,
    "label": [0 for _ in control_sents],
})
data_full_scenario = pd.concat([broca_data, control_data], ignore_index=True)
# data_full_scenario = data_full_scenario.sample(frac=1).reset_index(drop=True)
#data_full_scenario["preprocessed_text"] = [re.sub(r'[^\w\s]','',x) for x in data_full_scenario["preprocessed_text"]]
data_full_scenario.to_csv(generated_save, index=False)

In [30]:
data_full_scenario

Unnamed: 0,preprocessed_text,label
0,even for one. for one. i talk this one.,1
1,less or exes. or equal. which xx uh less or to...,1
2,i know this one so. you know how to do ones. w...,1
3,so get math over. i got it over out of high sc...,1
4,to side. to sides. to sides.,1
...,...,...
799,how mad were you. oops that will not work. oop...,0
800,what is everybody waiting for. do not do it. y...,0
801,well we are. i will just take my gifts up to m...,0
802,there is more to tonight. well i have f a fun ...,0


In [33]:
broca_data

Unnamed: 0,preprocessed_text,label
0,even for one. for one. i talk this one.,1
1,less or exes. or equal. which xx uh less or to...,1
2,i know this one so. you know how to do ones. w...,1
3,so get math over. i got it over out of high sc...,1
4,to side. to sides. to sides.,1
...,...,...
219,i did person do like big piece. oh it good. th...,1
220,where our toffee. the basements. i get up.,1
221,she dead after we marry. a year. how old she.,1
222,but he real. yeah he. well you know he go to d...,1


# merge

In [39]:
# Stack the per-corpus Broca outputs (BNC first, then Boston) into one frame.
# Both files must already exist, i.e. the generation cells must have been run
# once with each corpus configuration.
# df = pd.concat([pd.read_csv("../datafiles/generated output/boston_all.csv"), pd.read_csv("../datafiles/generated output/bnc_all.csv")])
df = pd.concat([
    pd.read_csv(csv_path)
    for csv_path in (
        "../datafiles/generated output/bnc_broca.csv",
        "../datafiles/generated output/boston_broca.csv",
    )
])

In [40]:
# Write the merged frame. Unlike the other saves in this notebook, no
# `index=False` is passed here, so the index is written as the first column.
# NOTE(review): confirm downstream readers expect that extra column.
df.to_csv("../datafiles/generated output/merge_broca.csv")

In [32]:
# Empirical check of the function-word keep rate used in `aphasic_speech`:
# out of 100 trials, count how often a uniform(0, 1) draw exceeds a cutoff
# that is itself drawn from uniform(0.6, 0.7).
ans = [
    1
    for _ in range(100)
    # keep if not discarding
    if np.random.uniform(0, 1) > np.random.uniform(0.6, 0.7)
]
len(ans)

35