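# TODO: add a title and summary (this notebook generates synthetic aphasic utterances plus a labelled control set from a spoken corpus)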

In [21]:
import string

import spacy
import numpy as np
import pandas as pd
import scipy.stats as stats
import random
import re
from pattern.text.en import singularize, pluralize, conjugate
import enchant
from preprocess import preprocess
from spacy.matcher import Matcher
from string import printable
from preprocess import postprocess
from nltk.tokenize import sent_tokenize

# US-English spell-check dictionary (pyenchant).
# NOTE(review): `d` is not referenced in any cell visible here - confirm it is still needed.
d = enchant.Dict("en_US")
# Small English spaCy pipeline; used by aphasic_speech for POS tags, morphology,
# dependency labels, lemmas and noun chunks.
nlp = spacy.load("en_core_web_sm")

In [22]:
# Punctuation that must NOT appear in an eligible sentence. Comma and period are
# deliberately absent from this list, so they stay in the whitelist.
extra = '!"#$%&\'()*+-/:;<=>?@[\\]^_`{|}~'

# Shadow string.printable with a reduced whitelist: strip every character listed
# in `extra` in a single pass.
printable = printable.translate(str.maketrans("", "", extra))

In [23]:
# Show the reduced whitelist: digits, ASCII letters, comma, period and whitespace.
printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,. \t\n\r\x0b\x0c'

In [24]:
# Closed sets of determiners / possessives, grouped by type. A substitution
# always stays inside the group of the word being replaced.
dets = {'Art': ['a', 'an', 'the'],
           'Dem': ['this', 'that', 'these', 'those'],
           'Poss': ['my', 'your', 'his', 'her', 'its', 'our', 'their']}

def det_sub(x):
    """Return a random, different member of the determiner group containing ``x``.

    The lookup is case-insensitive and the result is always lower-case.

    Parameters
    ----------
    x : str
        The word to substitute (e.g. ``"the"``, ``"This"``, ``"my"``).

    Returns
    -------
    str
        Another word from the same group in ``dets``, or ``""`` when ``x``
        belongs to no group (or its group offers no alternative).
    """
    # Normalise once so the membership test and the exclusion filter agree;
    # comparing the raw `x` would let "The" be replaced by "the".
    word = x.lower()
    for group in dets.values():
        if word in group:
            alternatives = [candidate for candidate in group if candidate != word]
            return random.choice(alternatives) if alternatives else ""
    return ""

In [25]:
# Source corpus of preprocessed transcripts. The saved index column and the
# existing label column are discarded because labels are re-created further down.
dataset_filename = "../linguistic_model/data/spoken corpus/preprocessed_test_merge.csv"
ds = (
    pd.read_csv(dataset_filename, encoding='utf8', index_col=False)
    .drop(columns=['Unnamed: 0', "label"])
)

In [26]:
# Split every transcript into sentences, blanking out <...> markup and skipping
# sentences that are empty or consist only of digits.
texts = ds["preprocessed_text"]
markup = re.compile(r'\<.*?\>')

sents = []
for text in texts:
    for sent in sent_tokenize(markup.sub(" ", text)):
        if not isinstance(sent, str):
            continue
        sent = markup.sub(" ", sent)
        trimmed = sent.rstrip()
        # The untrimmed sentence is stored; trimming is only used for the checks.
        if trimmed and not trimmed.isdigit():
            sents.append(sent)

In [27]:
# Number of candidate sentences extracted from the corpus.
len(sents)

23207

In [28]:
# How many extracted sentences to feed into the generator. Currently all of
# them; the trailing "1000" is a smaller value for quick trial runs.
total_num_sents = len(sents) # 1000
test_sents = sents[:total_num_sents]

In [29]:
# Thresholds for the error types injected by aphasic_speech. Each error type gets
# one uniform(0, 1) draw per sentence and is applied when draw >= limit, i.e. with
# probability (1 - limit).
# The two noun-number errors are combined with `or` in aphasic_speech, so noun
# number is flipped when either draw passes its limit.
m0sa_lim = 0.7      # m:0s:a             (30%) done
ms_lim = 0.7        # m:+s(:a)           (30%) done
sgc_lim = 0.6       # s:r:gc             (40%) done 
rep_lim = 0.9       # repetition         (10%) done

In [30]:
def aphasic_speech(text):
    """Try to turn one sentence into a synthetic aphasic (Broca-style) utterance.

    The sentence is parsed with spaCy and rewritten token by token: noun number
    may be flipped, demonstrative/possessive pronouns may be substituted,
    pronouns and interjections may be repeated, function words, adjectives,
    adverbs and verbs may be dropped, and kept verbs are lemmatised. All
    decisions are random (``random`` and ``np.random``), so the function is not
    deterministic unless both generators are seeded by the caller.

    Parameters
    ----------
    text : str
        A single preprocessed sentence.

    Returns
    -------
    tuple[bool, str]
        ``(accepted, utterance)``. ``accepted`` is True only when the sentence
        was eligible and the rewritten utterance has between one third and two
        thirds (rounded) of the original word count. ``utterance`` is ``""``
        when the sentence was rejected before rewriting; otherwise it is the
        post-processed rewritten text, even when ``accepted`` is False.
    """
    doc = nlp(text)
    # Verb phrase: optional verb, any adverbs, any auxiliaries, one or more verbs.
    vp_pattern = [[{'POS': 'VERB', 'OP': '?'},
                   {'POS': 'ADV', 'OP': '*'},
                   {'POS': 'AUX', 'OP': '*'},
                   {'POS': 'VERB', 'OP': '+'}]]
    matcher = Matcher(nlp.vocab)
    matcher.add("Verb phrase", vp_pattern)
    n = 15  # maximum word count of an eligible sentence
    aphasic_utt = ""
    # Word count = runs of ASCII letters/underscores.
    length = len(re.findall("[a-zA-Z_]+", text))

    nouns = []
    verbs = []
    determiners = []
    prepositions = []
    adjectives = []
    adverbs = []
    interjections = []
    # Target open/closed class ratio sampled from a gamma distribution.
    open_close = np.random.gamma(shape=4.99415, scale=1/3.558095)
    add = False

    # count no. of respective POS
    # (the branch order matters: a token is counted in the first matching class,
    # so determiners/prepositions are never counted as open-class words)
    for tok in doc:
        if tok.pos_ == "NOUN":
            nouns.append(tok.text)
        elif tok.pos_ == "VERB" or tok.dep_ == "cop" or tok.tag_ in ["VBD", "VBN"]:
            verbs.append(tok.text)
        # det:art and det:dem only
        elif tok.dep_ == "det" and ("Dem" in tok.morph.get('PronType') or "Art" in tok.morph.get('PronType')):
            determiners.append(tok.text)
        elif tok.dep_ == "prep":
            prepositions.append(tok.text)
        elif tok.pos_ == "ADJ":
            adjectives.append(tok.text)
        elif tok.pos_ == "ADV":
            adverbs.append(tok.text)
        elif tok.pos_ == "INTJ":
            interjections.append(tok.text)

    open_class_num = len(nouns) + len(verbs) + len(adjectives) + len(adverbs)
    closed_class_num = length - open_class_num - len(interjections)

    # acc to frank, not removing only adding:
    # when the sampled ratio exceeds the sentence's own open/closed ratio,
    # interjections are duplicated in the rewrite loop below.
    if closed_class_num != 0:
        if open_close > open_class_num/closed_class_num:
            add = True

    # discard sentences longer than n words
    # and sentences containing characters outside the `printable` whitelist
    if length > n or set(text).difference(printable):
        return False, aphasic_utt

    # One draw per error type for the whole sentence.
    m0sa_prob = random.uniform(0,1)     # m:0s:a
    ms_prob = random.uniform(0,1)       # m:+s(:a)
    sgc_prob = random.uniform(0,1)      # s:r:gc
    rep_prob = random.uniform(0,1)      # repetitions

    # get NPs (both the chunk itself and the full subtree span of its root)
    noun_phrases = set()
    for nc in doc.noun_chunks:
        for nop in [nc, doc[nc.root.left_edge.i:nc.root.right_edge.i + 1]]:
            noun_phrases.add(nop.text.strip())

    # get VPs
    verb_phrases = [doc[start:end] for _, start, end in matcher(doc)]

    # A sentence without any verb phrase has no NP/VP ratio; reject it
    # explicitly instead of swallowing every exception with a bare `except`.
    if not verb_phrases:
        return False, aphasic_utt
    ratio = len(noun_phrases) / len(verb_phrases)

    X = np.random.uniform(0, 1)

    if ratio > 2 and X < 0.8:
        # skip sentence if np/vp too big with prob of 80%
        return False, aphasic_utt

    # dont skip sentence
    for tok in doc:

        if tok.pos_ == "NOUN":
            # m:0s:a and m:+s(:a): flip grammatical number
            if m0sa_prob >= m0sa_lim or ms_prob >= ms_lim:
                number = tok.morph.get("Number")
                if "Plur" in number:
                    aphasic_utt += singularize(tok.text) + ' '
                elif "Sing" in number:
                    aphasic_utt += pluralize(tok.text) + ' '
                else:
                    # No number feature: keep the noun instead of losing it.
                    aphasic_utt += tok.text + " "
            else:
                aphasic_utt += tok.text + " "

        # Handle pronouns
        elif tok.pos_ == "PRON":
            word = tok.text
            # s:r:gc:pro - substitute demonstratives / possessives within their group
            if sgc_prob >= sgc_lim and (
                "Dem" in tok.morph.get('PronType') or "Yes" in tok.morph.get('Poss')
            ):
                # det_sub returns "" for words it does not know (e.g. "mine");
                # keep the original word in that case.
                word = det_sub(tok.text) or tok.text
            aphasic_utt += word + " "
            if rep_prob >= rep_lim:
                aphasic_utt += word + " "

        elif tok.pos_ in ["DET", "PART"] or tok.dep_ in ["prep"]:
            # determiners, prepositions, particle discard with 60-70%
            Y = np.random.uniform(0, 1)
            prob = np.random.uniform(0.6,0.7)
            if Y > prob:
                aphasic_utt += tok.text + " "

        elif tok.pos_ in ["ADJ", "ADV"]:
            # adjectives, adverbs discard with 50%
            Z = np.random.uniform(0, 1)
            if Z < 0.5:
                aphasic_utt += tok.text + " "

        elif tok.pos_ == "VERB":
            # verbs lemmatize with 50%
            # NOTE(review): the other 50% of verbs are dropped entirely, not kept
            # in their inflected form - confirm this is intended.
            Z = np.random.uniform(0, 1)
            if Z < 0.5:
                aphasic_utt += tok.lemma_ + " "

        # `pos_` is the string tag; `pos` is an integer id and never equals "INTJ".
        elif tok.pos_ == "INTJ":
            # close class from PC analysis
            if rep_prob >= rep_lim or add:
                aphasic_utt += tok.text + " "
            aphasic_utt += tok.text + " "

        else:
            # all other pos remain
            aphasic_utt += tok.text + " "

    # exclusion criterias: the rewrite must keep between 1/3 and 2/3 of the words
    lower = round(length * (1/3))
    higher = round(length * (2/3))
    aphasic_utt = postprocess(aphasic_utt)
    new_length = len(re.findall("[a-zA-Z_]+", aphasic_utt))

    return (lower <= new_length <= higher), aphasic_utt

In [31]:
import inflect
from preprocess import postprocess

# Output pools filled by the loop below.
aphasic_sents = []   # accepted rewritten utterances
normal_sents = []    # the original sentence for each entry in aphasic_sents
new_sents = []       # sentences that were not changed -> control pool

p = inflect.engine()
count = 0            # sentences actually passed to aphasic_speech
num_sents = 9000 # how many aphasic sentences?
control_ratio = 2.5911  # control sentences wanted per aphasic sentence

for sent in test_sents:
    # Stop once both pools are full. The pools only grow at the end of an
    # iteration, so checking up front is equivalent and skips wasted work.
    if len(aphasic_sents) >= num_sents and len(new_sents) >= num_sents * control_ratio:
        break

    # no digits like in aphasiabank
    # Replace each number in a single pass; chained str.replace calls corrupt
    # longer numbers that contain a shorter one ("2 and 12" -> "two and 1two").
    sent = re.sub(r"[0-9]+", lambda match: p.number_to_words(match.group()), sent)

    s = preprocess(sent)

    if len(aphasic_sents) >= num_sents:
        # Aphasic quota reached: the remaining sentences only feed the control pool.
        changed = False
        aphasic = ""
    else:
        changed, aphasic = aphasic_speech(s)
        count += 1
        # Report only when the counter has just advanced, so the line is not
        # repeated on every iteration after the quota is reached.
        if count % 1000 == 0:
            print(f"Processed {count} sentences, broca utts: {len(aphasic_sents)}")

    clean = postprocess(s)
    if not changed:
        new_sents.append(clean)
    # min length is 3?
    # Accepted rewrites that are a lone "." or identical to the source are
    # discarded and go into neither pool.
    elif aphasic != "." and clean != aphasic:
        normal_sents.append(clean)
        aphasic_sents.append(aphasic)

Processed 1000 sentences, broca utts: 193
Processed 2000 sentences, broca utts: 412
Processed 3000 sentences, broca utts: 592
Processed 4000 sentences, broca utts: 803
Processed 5000 sentences, broca utts: 989
Processed 6000 sentences, broca utts: 1169
Processed 7000 sentences, broca utts: 1382
Processed 8000 sentences, broca utts: 1603
Processed 9000 sentences, broca utts: 1798
Processed 10000 sentences, broca utts: 2008
Processed 11000 sentences, broca utts: 2209
Processed 12000 sentences, broca utts: 2420
Processed 13000 sentences, broca utts: 2610
Processed 14000 sentences, broca utts: 2766
Processed 15000 sentences, broca utts: 2984
Processed 16000 sentences, broca utts: 3140
Processed 17000 sentences, broca utts: 3346
Processed 18000 sentences, broca utts: 3589
Processed 19000 sentences, broca utts: 3793
Processed 20000 sentences, broca utts: 3976
Processed 21000 sentences, broca utts: 4170
Processed 22000 sentences, broca utts: 4411
Processed 23000 sentences, broca utts: 4618


In [32]:
# Aliases for the paired pools: rewritten utterances and their source sentences.
sentences = aphasic_sents
original = normal_sents

In [33]:
import re
import string
from preprocess import postprocess

# Post-process every rewritten utterance and keep only the pairs whose
# utterance is still non-empty afterwards.
cleaned_pairs = [
    (postprocess(modified), source)
    for modified, source in zip(sentences, original)
]
broca_sents = [modified for modified, _ in cleaned_pairs if modified != ""]
original_sents = [source for modified, source in cleaned_pairs if modified != ""]

In [34]:
# Persist the (modified, original) sentence pairs as a two-column CSV.
pd.DataFrame(data={"modified": broca_sents, "original": original_sents}).to_csv("data/new_3.csv", sep=",", index=False)

In [35]:
# Reload the modified utterances from disk; broca_sents is a pandas Series from here on.
# NOTE(review): read_csv's default NA handling turns cells such as "nan" or "null"
# into NaN, which would break the later re.sub cleanup - confirm no utterance
# consists solely of such a token.
broca_sents = pd.read_csv("data/new_3.csv", sep=",")["modified"]

In [36]:
# Cap the aphasic set at the requested number of sentences.
broca_sents = broca_sents[:num_sents]

In [37]:
# Number of control sentences needed: 2.5911 per aphasic sentence.
round(len(broca_sents)*2.5911)

12033

In [38]:
# Take that many unchanged sentences as the control class (fewer if the pool is smaller).
control_sents = new_sents[:round(len(broca_sents)*2.5911)]

In [39]:
# Label the two classes: 1 = synthetic aphasic utterance, 0 = unchanged control.
broca_data = pd.DataFrame(
    data={"preprocessed_text": broca_sents, "label": [1] * len(broca_sents)}
)
control_data = pd.DataFrame(
    data={"preprocessed_text": control_sents, "label": [0] * len(control_sents)}
)

# Merge, shuffle the rows (unseeded, so the order differs between runs) and
# renumber the index.
data_full_scenario = (
    pd.concat([broca_data, control_data], ignore_index=True)
    .sample(frac=1)
    .reset_index(drop=True)
)

# Remove everything that is neither a word character nor whitespace.
data_full_scenario["preprocessed_text"] = data_full_scenario["preprocessed_text"].map(
    lambda text: re.sub(r'[^\w\s]', '', text)
)
data_full_scenario.to_csv("data/new_3_merge.csv", sep=",", index=False)

In [40]:
# Preview the final shuffled, labelled dataset.
data_full_scenario

Unnamed: 0,preprocessed_text,label
0,i can give you a rice cake,0
1,he used the four,0
2,god has created us made us like we are,0
3,you smoked it down into the cork did not you,0
4,ll demos cratos ll,0
...,...,...
16672,what shall i the righteousness god,1
16673,what i mean,0
16674,well i do not think henry was the horrible per...,0
16675,that is where we bought the car at,0
