In [132]:
# Based on results from "Adaptation theory and non-fluent aphasia in English" by Salis and Edwards (2004)

In [133]:
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")
import numpy as np
import random
import re
from pattern.en import conjugate, lemma, lexeme, PRESENT, SG
from spacy.matcher import Matcher


In [134]:
# Input CSV of preprocessed sentences (the file name suggests control-group
# data); the path is relative to the notebook's working directory.
filepath = "../preprocessing/data_control_preprocessed.csv"

In [135]:
# Load the input, discard any row containing a missing value, and renumber the
# surviving rows from 0 (reset_index() keeps the old labels as an "index" column).
df = (
    pd.read_csv(filepath)
    .dropna()
    .reset_index()
)

In [136]:
# Sentences to transform: one entry per CSV row that survived dropna().
sentences = df["preprocessed_text"]

In [137]:
# Determiner classes used when substituting one determiner for another.
# Each class also lists the empty string, so a "substitution" may come out
# as no determiner at all.
dets = {
    "Art": ["a", "an", "the", ""],
    "Dem": ["this", "that", "these", "those", ""],
    "Poss": ["my", "your", "his", "her", "its", "our", "their", ""],
}

In [138]:
def det_sub(det):
    """Pick a replacement for ``det`` from its own determiner class.

    The lookup is case-insensitive. The first class in ``dets`` that contains
    the lower-cased determiner supplies the candidates, and the determiner
    itself is excluded so the result always differs from the input.

    Parameters
    ----------
    det : str
        Surface form of the determiner, in any casing.

    Returns
    -------
    str
        A randomly chosen other member of the same class (the classes in
        ``dets`` include ``''``, so the result may be empty), or ``''`` when
        ``det`` belongs to no class.
    """
    # Normalise once so that membership and exclusion use the same form;
    # comparing the raw text let "The" be replaced by "the".
    needle = det.lower()
    for detrms in dets.values():
        if needle in detrms:
            candidates = [alt for alt in detrms if alt != needle]
            return random.choice(candidates)
    return ""

In [139]:
def handle_determiner(tok):
    """Randomly substitute, omit or keep a determiner-like token.

    One uniform draw on [0, 1) selects the outcome:

    * ``x >= 0.97`` (3%): substitution via ``det_sub``, but only if the token
      is tagged ``DET`` or carries ``PronType=Dem`` / ``Poss=Yes``; any other
      token in this band is kept unchanged.
    * ``0.78 <= x < 0.97`` (19%): the token is omitted.
    * otherwise the token is kept.

    Parameters
    ----------
    tok : token-like object
        Must expose ``text``, ``pos_`` and ``morph.get(field)`` returning a
        list of values.

    Returns
    -------
    str
        The emitted text followed by one space (a single space on omission).
    """
    x = np.random.uniform(0, 1)
    # 3% of determiners were substituted
    if x >= 0.97:
        if tok.pos_ == "DET" or "Dem" in tok.morph.get('PronType') or "Yes" in tok.morph.get('Poss'):
            return det_sub(tok.text) + " "
    # 19% of determiners were omitted. The omission band sits directly below
    # the substitution band, so its lower edge is 0.97 - 0.19 = 0.78 (a
    # threshold of 0.81 left only 16% for omission).
    elif x >= 0.78:
        return " "
    return tok.text + " "

In [140]:
def handle_verb(tok):
    """Randomly re-inflect, omit or keep a verb token.

    One uniform draw on [0, 1) selects the outcome:

    * ``x >= 0.96`` (4%): tense substitution, where the verb is re-conjugated
      as present singular with ``conjugate``.
    * ``0.79 <= x < 0.96`` (17%): the verb is omitted.
    * otherwise the verb is kept.

    Parameters
    ----------
    tok : token-like object
        Must expose ``text``.

    Returns
    -------
    str
        The emitted text followed by one space (a single space on omission).
    """
    # TODO handle the copula, auxiliary and lexical tense/bound morphemes
    x = np.random.uniform(0, 1)
    # 4% of verbs were substituted (tense error)
    if x >= 0.96:
        # TODO tense sub based on frequency?
        # conjugate() may return None when it has no form for the verb; fall
        # back to the original text instead of failing on None + " ".
        substituted = conjugate(verb=tok.text, tense=PRESENT, number=SG)
        return (substituted or tok.text) + " "
    # 17% of verbs were omitted. The omission band sits directly below the
    # substitution band, so its lower edge is 0.96 - 0.17 = 0.79 (a threshold
    # of 0.83 left only 13% for omission).
    elif x >= 0.79:
        return " "
    return tok.text + " "

In [141]:
def handle_preposition(tok):
    """Randomly omit or keep a preposition token.

    One uniform draw on [0, 1) selects the outcome:

    * ``x >= 0.98`` (2%): reserved for substitution, which is not implemented
      yet, so the token is kept unchanged.
    * ``0.61 <= x < 0.98`` (37%): the preposition is omitted.
    * otherwise the preposition is kept.

    Parameters
    ----------
    tok : token-like object
        Must expose ``text``.

    Returns
    -------
    str
        The emitted text followed by one space (a single space on omission).
    """
    x = np.random.uniform(0, 1)
    # 2% of prepositions were substituted
    if x >= 0.98:
        # TODO substitute; until then the token passes through unchanged
        return tok.text + " "
    # 37% of prepositions were omitted. The omission band sits directly below
    # the substitution band, so its lower edge is 0.98 - 0.37 = 0.61 (a
    # threshold of 0.63 left only 35% for omission).
    elif x >= 0.61:
        return " "
    return tok.text + " "

In [142]:
def handle_person_pron(tok):
    """Drop a personal pronoun with probability 0.27, otherwise keep it.

    Parameters
    ----------
    tok : token-like object
        Must expose ``text``.

    Returns
    -------
    str
        ``tok.text`` plus a trailing space when kept, a single space when
        omitted.
    """
    # 27% of personal pronouns were omitted: draws below the cutoff keep the token.
    keep_cutoff = 0.73
    draw = np.random.uniform(0, 1)
    return tok.text + " " if draw < keep_cutoff else " "

In [143]:
def aphasic_speech(sentence):
    """Turn a sentence into a simulated non-fluent aphasic utterance.

    Steps:

    1. Sentences with more than 15 whitespace-separated words are dropped.
    2. The sentence is parsed with ``nlp``; noun phrases are collected from
       the noun chunks and the subtree span of each chunk root, and verb
       phrases from a ``Matcher`` pattern (optional verb, adverbs,
       auxiliaries, one or more verbs).
    3. A sentence with no verb-phrase match is dropped. A sentence whose
       noun-phrase / verb-phrase ratio exceeds 2 is dropped with
       probability 0.8.
    4. Otherwise each token goes to the matching ``handle_*`` function and
       the results are concatenated; every other token is copied unchanged.

    Parameters
    ----------
    sentence : str
        The input sentence.

    Returns
    -------
    str
        The transformed utterance, each emitted token followed by a space,
        or ``""`` when the sentence is dropped.
    """
    n = 15
    aphasic_utt = ""

    # Length filter first: rejected sentences never need the parse or matcher.
    if len(sentence.split()) > n:
        return aphasic_utt

    vp_pattern = [[{'POS': 'VERB', 'OP': '?'},
                   {'POS': 'ADV', 'OP': '*'},
                   {'POS': 'AUX', 'OP': '*'},
                   {'POS': 'VERB', 'OP': '+'}]]
    matcher = Matcher(nlp.vocab)
    matcher.add("Verb phrase", vp_pattern)
    doc = nlp(sentence)

    # get NPs: each noun chunk plus the full subtree span of its root
    noun_phrases = set()
    for nc in doc.noun_chunks:
        for nop in [nc, doc[nc.root.left_edge.i:nc.root.right_edge.i + 1]]:
            noun_phrases.add(nop.text.strip())

    # get VPs: every matcher hit counts, overlapping ones included
    verb_phrases = [doc[start:end] for _, start, end in matcher(doc)]
    if not verb_phrases:
        # No verb phrase means the NP/VP ratio is undefined; drop the sentence.
        return aphasic_utt

    ratio = len(noun_phrases) / len(verb_phrases)
    X = np.random.uniform(0, 1)
    if ratio > 2 and X <= 0.8:
        # skip sentence
        return aphasic_utt

    # don't skip sentence
    for tok in doc:
        # morph.get() returns a list of string values, so test membership.
        # This check must precede the DET/PRON branch, which would otherwise
        # claim every pronoun and leave handle_person_pron unreachable.
        if "Nom" in tok.morph.get('Case') and "1" in tok.morph.get('Person'):
            aphasic_utt += handle_person_pron(tok)
        elif tok.pos_ in ["DET", "PRON"]:
            aphasic_utt += handle_determiner(tok)
        elif tok.pos_ == "VERB":
            aphasic_utt += handle_verb(tok)
        elif tok.dep_ == "prep" or tok.pos_ == "ADP":
            aphasic_utt += handle_preposition(tok)
        else:
            aphasic_utt += tok.text + " "

    return aphasic_utt

In [144]:
# Collect the non-empty simulated utterances, tidying their spacing.
aphasic = []
for sent in sentences:
    modify = aphasic_speech(sent)
    # Dropped sentences come back empty; leave them out of the output.
    if modify is None or modify == '':
        continue
    # Pull closing punctuation back onto the preceding word, then collapse
    # the whitespace runs left behind by omitted tokens.
    modify = re.sub(r'\s([?.!"](?:\s|$))', r'\1', modify)
    modify = " ".join(modify.split())
    aphasic.append(modify)

In [145]:
# Write the simulated utterances as a one-column CSV (comma-separated, no index).
df = pd.DataFrame({"preprocessed_text": aphasic})
df.to_csv("../classifiers/test.csv", index=False)

In [146]:
# Fraction of input sentences that produced a non-empty aphasic utterance.
len(aphasic)/len(sentences)

0.6059646987218503