In [537]:
# Based on results from "Adaptation theory and non-fluent aphasia in English" by Salis and Edwards (2004).

In [538]:
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")
import numpy as np
import random
import re
from pattern.en import conjugate, lemma, lexeme, PRESENT, SG

In [539]:
# Input CSV loaded by the next cell; the path is relative to the working directory.
filepath = "../preprocessing/data_control_preprocessed.csv"

In [540]:
# Load the input CSV, discard every row with a missing value in any column,
# then renumber the remaining rows. reset_index() is called without drop=True,
# so the previous row labels are kept as an extra "index" column.
df = (
    pd.read_csv(filepath)
    .dropna()
    .reset_index()
)

In [541]:
# Text column that is parsed and corrupted token by token further down.
sentences = df["preprocessed_text"]

In [542]:
# Determiner classes used by det_sub(): a determiner is only ever replaced by
# another member of its own class. The empty string in each class lets a
# "substitution" come out as a dropped determiner.
dets = dict(
    Art=['a', 'an', 'the', ''],
    Dem=['this', 'that', 'these', 'those', ''],
    Poss=['my', 'your', 'his', 'her', 'its', 'our', 'their', ''],
)

In [543]:
def det_sub(det, determiners=None):
    """Pick a random replacement for ``det`` from its own determiner class.

    Matching is case-insensitive. The candidate list excludes the lowercased
    input, so a capitalised determiner such as "The" can never be "replaced"
    by "the" (the original compared the raw, un-lowercased text here).

    Parameters
    ----------
    det : str
        Surface form of the determiner to replace.
    determiners : dict[str, list[str]], optional
        Mapping of class name -> member determiners. Defaults to the
        module-level ``dets`` table.

    Returns
    -------
    str
        Another member of the first class that contains ``det`` (possibly the
        empty string if the class lists one), or ``""`` when ``det`` belongs
        to no class or its class has no other member.
    """
    if determiners is None:
        determiners = dets

    needle = det.lower()
    for members in determiners.values():
        if needle in members:
            candidates = [member for member in members if member != needle]
            # random.choice raises IndexError on an empty sequence.
            return random.choice(candidates) if candidates else ""
    return ""

In [544]:
def handle_determiner(tok, sub_rate=0.03, omit_rate=0.19):
    """Randomly substitute, omit or keep a determiner token.

    Tokens that are not determiners (neither tagged ``DET`` nor marked as a
    demonstrative or possessive in their morphology) are returned unchanged.
    Previously only the substitution branch made this check, so any other
    pronoun passed in was dropped at the determiner omission rate.

    The omission band now starts directly below the substitution band, so the
    share of omitted determiners equals ``omit_rate`` (19%). The old hard-coded
    cut-off of 0.81 under a 0.97 substitution cut-off gave only 16%.

    Parameters
    ----------
    tok : spacy.tokens.Token
        Token to process; ``text``, ``pos_`` and ``morph`` are read.
    sub_rate : float, optional
        Probability of replacing the determiner via ``det_sub`` (default 3%).
    omit_rate : float, optional
        Probability of dropping the determiner (default 19%).

    Returns
    -------
    str
        The resulting text followed by one space; a lone space when omitted.
    """
    is_determiner = (
        tok.pos_ == "DET"
        or "Dem" in tok.morph.get('PronType')
        or "Yes" in tok.morph.get('Poss')
    )
    if not is_determiner:
        return tok.text + " "

    x = np.random.uniform(0, 1)
    if x >= 1 - sub_rate:
        return det_sub(tok.text) + " "
    if x >= 1 - sub_rate - omit_rate:
        return " "
    return tok.text + " "

In [545]:
def handle_verb(tok, sub_rate=0.04, omit_rate=0.17):
    """Randomly substitute (tense error), omit or keep a verb token.

    The omission band now starts directly below the substitution band, so the
    share of omitted verbs equals ``omit_rate`` (17%). The old hard-coded
    cut-off of 0.83 under a 0.96 substitution cut-off gave only 13%.

    Parameters
    ----------
    tok : spacy.tokens.Token
        Verb token; only ``text`` is read.
    sub_rate : float, optional
        Probability of replacing the verb with its present-singular
        conjugation (default 4%).
    omit_rate : float, optional
        Probability of dropping the verb (default 17%).

    Returns
    -------
    str
        The resulting text followed by one space; a lone space when omitted.
    """
    # TODO handle the copula, auxiliary and lexical tense/bound morphemes
    x = np.random.uniform(0, 1)
    # 4% of verbs were substituted (tense error)
    if x >= 1 - sub_rate:
        # TODO tense sub based on frequency?
        substituted = conjugate(verb=tok.text, tense=PRESENT, number=SG)
        # Keep the original form if no conjugation comes back, instead of
        # failing on `None + " "`.
        return (substituted or tok.text) + " "
    # 17% of verbs were omitted
    if x >= 1 - sub_rate - omit_rate:
        return " "
    return tok.text + " "

In [546]:
def handle_preposition(tok, sub_rate=0.02, omit_rate=0.37):
    """Randomly omit or keep a preposition token.

    Substitution is not implemented yet: draws that land in the substitution
    band keep the token unchanged. The omission band now starts directly below
    the substitution band, so the share of omitted prepositions equals
    ``omit_rate`` (37%). The old hard-coded cut-off of 0.63 under a 0.98
    substitution cut-off gave only 35%.

    Parameters
    ----------
    tok : spacy.tokens.Token
        Preposition token; only ``text`` is read.
    sub_rate : float, optional
        Probability reserved for substitution (default 2%).
    omit_rate : float, optional
        Probability of dropping the preposition (default 37%).

    Returns
    -------
    str
        The token text followed by one space; a lone space when omitted.
    """
    x = np.random.uniform(0, 1)
    # 2% of prepositions were substituted
    if x >= 1 - sub_rate:
        # TODO substitute; until then the preposition is kept as is
        return tok.text + " "
    # 37% of prepositions were omitted
    if x >= 1 - sub_rate - omit_rate:
        return " "
    return tok.text + " "

In [547]:
def handle_person_pron(tok):
    """Randomly omit or keep a personal pronoun token.

    One uniform draw on [0, 1) decides the outcome: a draw of 0.73 or more
    (27% of personal pronouns) drops the pronoun, anything lower keeps it.

    Returns the token text followed by one space, or a lone space when the
    pronoun is omitted.
    """
    draw = np.random.uniform(0, 1)
    kept_text = tok.text if draw < 0.73 else ""
    return kept_text + " "

In [548]:
# Seed both random number generators used by the handlers (`random` in
# det_sub, `np.random` in the handle_* functions) so that re-running this cell
# produces the same corrupted corpus.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Build one corrupted ("aphasic") utterance per input sentence by sending each
# token to the handler for its word class.
aphasic = []
for sentence in sentences:
    pieces = []
    doc = nlp(sentence)
    for tok in doc:
        # tok.morph.get() returns a list of strings, so membership has to be
        # tested with `in`; comparing the list to 'Nom' or to the integer 1 is
        # always False. This check must also come before the DET/PRON branch,
        # which would otherwise capture every pronoun first.
        is_first_person_subject = (
            "Nom" in tok.morph.get('Case')
            and "1" in tok.morph.get('Person')
        )
        if is_first_person_subject:
            pieces.append(handle_person_pron(tok))
        elif tok.pos_ in ["DET", "PRON"]:
            # PRON is included because the determiner handler also looks for
            # demonstrative / possessive morphology on the token.
            pieces.append(handle_determiner(tok))
        elif tok.pos_ == "VERB":
            pieces.append(handle_verb(tok))
        elif tok.dep_ == "prep" or tok.pos_ == "ADP":
            pieces.append(handle_preposition(tok))
        else:
            pieces.append(tok.text + " ")

    # Every handler returns its text with a trailing space (a lone space for
    # an omitted token), so the pieces are concatenated without a separator.
    aphasic_utt = "".join(pieces)

    # remove excess whitespaces (and capitalize first word?)
    # First drop the space in front of sentence punctuation, then collapse any
    # remaining runs of whitespace.
    aphasic_utt = re.sub(r'\s([?.!"](?:\s|$))', r'\1', aphasic_utt)
    aphasic_utt = " ".join(aphasic_utt.split())
    aphasic.append(aphasic_utt)

In [549]:
# Save the corrupted utterances as a one-column CSV. Note that `df` is rebound
# here to the new single-column frame.
df = pd.DataFrame({"preprocessed_text": aphasic})
df.to_csv("../classifiers/test.csv", index=False, sep=',')

In [550]:
aphasic

['so I have two children.',
 'and one of children is adopted.',
 'and one is not.',
 'so I already the story about son being born.',
 'so I guess we can talk about my daughter and the adoption.',
 'tried to have children for many years and could not get pregnant.',
 'and went through lots of fertility treatments and did not they did not really find anything that would because us not to get pregnant.',
 'but I was bound and determined to be mom.',
 'so then started pursuing the adoption route.',
 'and very interesting I never thought how complicated it was to do that and and all that goes into that.',
 'and it was a long process and probably took close to year.',
 'we went domestic adoption agency here South Carolina and actually had a couple birth mothers lined up who changes minds at last minute.',
 'so it that was kind of heartbreaking at the time.',
 'and then finally adopted my daughter when she was about two months old.',
 'so you know we always say we were just for her.',
 'and a