In [78]:
# Based on results from Adaptation theory and non-fluent aphasia in english by Salis and Edwards (2004)

In [79]:
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")
import numpy as np
import random
import re
from pattern.en import conjugate, lemma, lexeme, PRESENT, SG
from spacy.matcher import Matcher


In [80]:
# filepath = "../preprocessing/data_control_preprocessed.csv"
filepath = "canonical.csv"

In [81]:
df = pd.read_csv(filepath).dropna().reset_index()

In [82]:
sentences = df["preprocessed_text"]

In [83]:
dets = {'Art': ['a', 'an', 'the', ''],
           'Dem': ['this', 'that', 'these', 'those', ''],
           'Poss': ['my', 'your', 'his', 'her', 'its', 'our', 'their', '']}

In [84]:
def det_sub(det):
    for _, detrms in dets.items():
        if det.lower() in detrms:
            y = [j for j in detrms if det!=j]
            return random.choice(y)
    return ""

In [85]:
def handle_determiner(tok):
    x = np.random.uniform(0,1)
    utt = tok.text + " "
    
    # 3% of determiners were substituted
    if x >= 0.97:
        if tok.pos_ == "DET" or "Dem" in tok.morph.get('PronType') or "Yes" in tok.morph.get('Poss'):
            utt = det_sub(tok.text) + " "
        
    # 19% determiners were omitted
    if x >= 0.81:
        utt = " "
        
    return utt

In [86]:
def handle_verb(tok):
    x = np.random.uniform(0,1)
    utt = tok.text + " "

    # 5% of copula were substituted
    if x >= 0.95 and tok.pos_ == "AUX" and tok.dep_ == "ROOT":
        utt = conjugate(verb=tok.text,tense=PRESENT,number=SG) + " " 
        
    # 6% of lexical bound morphemes were removed
    if x >= 0.94 and tok.pos_ == "VERB":
        utt = tok.lemma_ + " "
    
    # 8% of auxiliary verbs were substituted
    if x >= 0.92 and tok.pos_ == "AUX" and tok.dep_ == "aux":
        utt = conjugate(verb=tok.text,tense=PRESENT,number=SG) + " " 
    # 12% of lexical verbs were substituted (tense error) 
    if x >= 0.88 and tok.pos_ == "VERB":
        utt = conjugate(verb=tok.text,tense=PRESENT,number=SG) + " " 
    # 14% of auxiliary verbs were omitted
    if x >= 0.95 and tok.pos_ == "AUX" and tok.dep_ == "aux":
        utt = " " 
    # 17% verbs were omitted
    if x >= 0.83:
        utt = " "
    return utt

In [87]:
def handle_preposition(tok):
    x = np.random.uniform(0,1)
    utt = tok.text + " "
    # 2% of prepositions were substituted
    if x >= 0.98:
        # TODO substitute
        pass
    # 37% of prepositions were omitted
    if x >= 0.63:
        utt = " "
    return utt

In [88]:
def handle_person_pron(tok):
    utt = tok.text + " "
    x = np.random.uniform(0,1)
    # 27% of personal pronouns were omitted
    if x >= 0.73:
        utt = " "
    return utt

In [89]:
def aphasic_speech(sentence):
    vp_pattern = [[{'POS': 'VERB', 'OP': '?'},
                   {'POS': 'ADV', 'OP': '*'},
                   {'POS': 'AUX', 'OP': '*'},
                   {'POS': 'VERB', 'OP': '+'}]]
    matcher = Matcher(nlp.vocab)
    matcher.add("Verb phrase", vp_pattern)
    n = 15
    aphasic_utt = ""
    doc = nlp(sentence)
    # print(sentence, len(sentence.split()))
    if len(sentence.split()) <= n:
        # get NPs
        noun_phrases = set()

        for nc in doc.noun_chunks:
            for nop in [nc, doc[nc.root.left_edge.i:nc.root.right_edge.i + 1]]:
                noun_phrases.add(nop.text.strip())
                # get VPs
        verb_phrases = matcher(doc)
        verb_phrases = [doc[start:end] for _, start, end in verb_phrases]

        try:
            ratio = len(noun_phrases) / len(verb_phrases)
        except:
            # print("Division by zero")
            return aphasic_utt

        X = np.random.uniform(0, 1)
        # print(ratio, X)
        if ratio > 2 and X <= 0.8:
            # skip sentence
            return aphasic_utt
    
        else:
            # don't skip sentence
            for tok in doc:
                if tok.pos_ in ["DET", "PRON"]:
                    aphasic_utt += handle_determiner(tok)      
                elif tok.pos_ in ["VERB", "AUX"]:
                    aphasic_utt += handle_verb(tok)
                elif tok.dep_ == "prep" or tok.pos_ == "ADP":
                    aphasic_utt += handle_preposition(tok)
                elif tok.morph.get('Case') == 'Nom' and tok.morph.get('Person') == 1:
                    aphasic_utt += handle_person_pron(tok)
                else:
                    aphasic_utt += tok.text + " "
    
    return aphasic_utt

In [90]:
aphasic = []
for sent in sentences:
    modify = aphasic_speech(sent)
    if modify is not None and modify != '':
        # remove excess whitespaces (and capitalize first word?)
        modify = re.sub(r'\s([?.!"](?:\s|$))', r'\1', modify)
        modify = " ".join(modify.split())
        if modify.strip() != sent.strip():
            print(modify.strip())
            # print(sent.strip())
            # print(modify.strip()!=sent.strip())
            # print()
            aphasic.append(modify.strip())

So i already told story my son born.
But i bound and determined to be a mom.
So then started pursuing the adoption route.
So you know we always say we just waiting for her.
Also i adopted child.
But you know a lot of people .
There is difference between them.
That she would like to adopt a child from foster system.
The little boy was playing soccer his yard.
His dad was the living room.
ball came through the window and knocked over lamp.
Johnny you need to take umbrella.
I not an umbrella.
Sure enough as got halfway school it started raining.
So he ended taking the umbrella and walking back to school.
So the little girl was riding on her tricycle in her yard.
She was with her cat and her dog.
said.
Dad sparky chased peanut up the tree.
Well let me if i can go get cat.
She not get to participate in the same things that her stepsisters did.
Oh well can not the ball because have nothing to wear.
stepsisters off the ball.
She .
Cinderella went into carriage and went on to the ball.
She dan

In [91]:
df = pd.DataFrame(data={"preprocessed_text": aphasic})
df.to_csv("../classifiers/test.csv", sep=',',index=False)

In [92]:
len(aphasic)/len(sentences)

0.4521013297097933