In [61]:
import pandas as pd
import spacy
import glob
import re
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [62]:
# Glob patterns for the Cookie Theft transcripts of each group.
path_control = '../data/Pitt/Control/cookie/*.cha'
path_dementia = '../data/Pitt/Dementia/cookie/*.cha'

# Resolve both patterns; sorting makes the file order independent of the
# order in which the file system happens to list the directory.
files_control, files_dementia = (
    sorted(glob.glob(pattern)) for pattern in (path_control, path_dementia)
)

In [63]:
def extract_participant_from_file(file):
    """Return the participant (``*PAR``) utterance lines of a CHAT transcript.

    A line starting with ``*PAR:`` opens a participant utterance; the
    tab-indented lines that immediately follow it are its continuations.
    Any other line (another speaker, a dependent tier, a header) closes it.
    A trailing span delimited by two 0x15 control characters (presumably the
    CHAT media time marks) is removed from every collected line.

    Parameters
    ----------
    file : str or path-like
        Path of the ``.cha`` file to read (decoded as UTF-8).

    Returns
    -------
    list of str
        The collected utterance lines, in file order.
    """
    par_re = re.compile(r'^\*PAR:\s(.*)')
    cont_re = re.compile(r'^\t(.*)')
    bullet_re = re.compile('\\x15.*\\x15$')

    # Context manager so the handle is closed even if reading fails.
    with open(file, encoding='utf-8') as handle:
        lines = handle.read().split('\n')

    pre_list = []
    in_par = False
    for line in lines:
        # Always try the speaker pattern first: a *PAR line that directly
        # follows another participant line must start a new utterance
        # instead of being tested (and rejected) as a continuation.
        match = par_re.match(line)
        if match is None and in_par:
            match = cont_re.match(line)

        if match is None:
            in_par = False
            continue

        pre_list.append(bullet_re.sub('', match.group(1)))
        in_par = True

    return pre_list


In [64]:
# Parse every transcript into its list of participant utterance lines.
utterances_symbol_control = [extract_participant_from_file(f) for f in files_control]
utterances_symbol_dementia = [extract_participant_from_file(f) for f in files_dementia]

# One row per transcript; the scalar label is broadcast to every row.
control_df = pd.DataFrame(
    {'label': 0,   # Control = 0
     'sentence': utterances_symbol_control
     })

dementia_df = pd.DataFrame(
    {'label': 1,    # Dementia = 1
     'sentence': utterances_symbol_dementia
     })

# Fixed seed so the shuffled row order (and therefore the exported CSV) is
# the same on every run of the notebook.
RANDOM_SEED = 42

# Create shuffled dataframes
df_temp = pd.concat([control_df, dementia_df])
df_temp = shuffle(df_temp, random_state=RANDOM_SEED).reset_index(drop=True)
# Join each transcript's utterance lines into one string. Series.map works
# on the single column needed and also handles an empty frame.
df_temp['text'] = df_temp['sentence'].map(' '.join)

df = df_temp.copy()
#df['text'][0]

### POS TAGGING

In [65]:
# spaCy pipeline whose token.pos_ values are used by the tagging helpers below.
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)

In [66]:
def tagged_dialogue(dialogue):
    """Return `dialogue` with every token followed by its POS tag.

    The text is run through the module-level ``nlp`` pipeline and rendered
    as ``"token TAG token TAG ..."``, everything separated by single spaces.
    To get tag or dep labels instead, replace ``pos_`` by ``tag_`` or
    ``dep_`` (see the spaCy documentation for their meanings).
    """
    pieces = []
    for token in nlp(dialogue):
        pieces.append(token.text + ' ' + token.pos_)
    return ' '.join(pieces)

# Only the 'text' column is needed, so map over that Series instead of
# applying row-wise over the whole frame (which also fails on an empty frame).
df['pos_text'] = df['text'].map(tagged_dialogue)

In [67]:
def only_tags(dialogue):
    """Return the space-separated POS tags of `dialogue`, without the tokens.

    The text is run through the module-level ``nlp`` pipeline and only each
    token's ``pos_`` value is kept, in token order.
    """
    return ' '.join(token.pos_ for token in nlp(dialogue))

# Only the 'text' column is needed, so map over that Series instead of
# applying row-wise over the whole frame (which also fails on an empty frame).
df['pos_'] = df['text'].map(only_tags)


#### Replace abbreviations with full words

In [68]:
# POS tag abbreviation -> full English name, used by pos_complete().
dictionary = {
    "ADJ": "adjective",
    "ADP": "adposition",
    "ADV": "adverb",
    "AUX": "auxiliary",
    "CONJ": "conjunction",
    "CCONJ": "coordinating conjunction",
    "DET": "determiner",
    "INTJ": "interjection",
    "NOUN": "noun",
    "NUM": "numeral",
    "PART": "particle",
    "PRON": "pronoun",
    "PROPN": "proper noun",
    "PUNCT": "punctuation",
    "SCONJ": "subordinating conjunction",
    "SYM": "symbol",
    "VERB": "verb",
    "X": "other",
    "SPACE": "space",
}

def pos_complete(dialogue):
    """Replace POS tag abbreviations in `dialogue` with their full names.

    Every whitespace-delimited token that is exactly a key of the
    module-level ``dictionary`` is replaced by the corresponding value; all
    other text is returned unchanged.

    Matching whole tokens in a single pass (instead of chained substring
    replacements) keeps ``CCONJ`` and ``SCONJ`` from being rewritten through
    the shorter ``CONJ`` entry, and leaves tokens that merely contain a tag
    (such as ``X-ray``) untouched.  A transcript word that is itself spelled
    exactly like a tag is still replaced.

    Parameters
    ----------
    dialogue : str
        Space-separated tags, or interleaved ``token TAG`` pairs.

    Returns
    -------
    str
        The text with every tag abbreviation spelled out.
    """
    if not dictionary:
        return dialogue

    # Longest tags first, so an alternative can never shadow a longer tag.
    tags = sorted(dictionary, key=len, reverse=True)
    alternatives = '|'.join(re.escape(tag) for tag in tags)
    # (?<!\S) / (?!\S): the tag must be delimited by whitespace or the
    # string boundaries on both sides.
    pattern = r'(?<!\S)(?:' + alternatives + r')(?!\S)'
    return re.sub(pattern, lambda match: dictionary[match.group(0)], dialogue)
    
# Each result depends on one column only, so map over that Series instead of
# applying row-wise over the whole frame (which also fails on an empty frame).
df['pos_text_complete'] = df['pos_text'].map(pos_complete)
df['pos_complete'] = df['pos_'].map(pos_complete)



### Replace transcription symbols with words that can be understood by pre-trained embeddings 

In [69]:
# Ordered (pattern, replacement) rules applied by special_tags().
# The order matters: bracketed and parenthesised codes are rewritten before
# the bare brackets are stripped, "&..." codes before the bare "&", and the
# single-character clean-up runs last.
_SPECIAL_TAG_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Repetition and retraction markers.
        (r'\[/\]', '[ repetition ]'),
        (r'\[//\]', '[ retraction ]'),
        # Pauses of increasing length.
        (r'\(\.\.\)', '[ pause ]'),
        (r'\(\.\)', '[ short_pause ]'),
        (r'\(\.\.\.\)', '[ long_pause ]'),
        # Omitted part of a word, e.g. "goin(g)".
        (r'\([a-zA-Z0-9_]+\)', ' [ incomplete_word ]'),
        # "[: target]" replacement annotations.
        (r'\[:\s.*?\]', '[ assimilation ]'),
        # Remaining parentheses and angle brackets, then empty brackets.
        (r'[()<>]', ''),
        (r'\[\]', ''),
        # "[x N]" markers.
        # NOTE(review): [x 2] and [x 4] are dropped while [x 3] and [x 6] are
        # expanded, and other counts are left untouched - confirm intent.
        (r'\[x\s2\]', ''),
        (r'\[x\s3\]', '[ repetition_repetition_repetition ]'),
        (r'\[x\s4\]', ''),
        (r'\[x\s6\]', '[ repetition_repetition_repetition_repetition_repetition_repetition ]'),
        # Trailing-off terminators.
        (r'\+\.\.\.', '[ incomplete_sentence ]'),
        (r'\+\.\.\?', '[ incomplete_sentence ]'),
        # "[+ code]" postcodes.
        (r'\[\+\sgram\]', '[ grammatical_error ]'),
        (r'\[\+\sjar\]', '[ jargon_error ]'),
        (r'\[\+\ses\]', '[ meaningless_error ]'),
        (r'\[\+\scir\]', '[ circumlocution_error ]'),
        # "+exc" (exclusion) is removed and its occurrences are not counted.
        (r'\[\+\sexc\]', ''),
        # "[* code]" word-level error codes.
        (r'\[\*\s.*?\]', '[ word_error ]'),
        # Whitespace, "+", and the one non-word character after it.
        (r'\s\+\W', ''),
        (r'\s\+\s\W', ''),
        # "&=event" codes; ":" is part of the code ("&=clears:throat"), so
        # it must be consumed too or "throat" is left behind as a word.
        (r'\B&=[\w:]+', '[ action ]'),
        (r'xxx', '[ unintelligible ]'),
        # "&"-prefixed fillers, as whole codes only: longer fragments such
        # as "&umb" fall through to the disfluency rule below.
        (r'\B&(?:uh|um|hm|mm)\b', '[ hesitation ]'),
        # Bare fillers, as whole words only, so that "mhm" is matched as a
        # unit and words such as "human" or "uhhuh" are not corrupted.
        (r'\b(?:mhm|huh|hum|hm|mm)\b', '[ hesitation ]'),
        # Any other "&" code, then stray "&" characters.
        (r'\B&\w+', '[ disfluency ]'),
        (r'\B&', ''),
        # Left-over single-character symbols.
        (r'[:/+‡]', ''),
        # "@x" special-form suffixes and "www" markers.
        (r'@\w+', ''),
        (r'www', ''),
    )
)


def special_tags(line):
    """Rewrite CHAT transcription symbols in `line` as bracketed word tags.

    The rules of ``_SPECIAL_TAG_RULES`` are applied in order.  Symbols are
    either replaced by a ``[ tag ]`` token (for example ``[/]`` becomes
    ``[ repetition ]``) or removed.

    Hesitation fillers are matched as whole words / whole codes: "mhm" maps
    to one ``[ hesitation ]`` token, ordinary words that merely contain
    "hm", "hum" or "huh" are left alone, and the space before "mm" is kept.

    Parameters
    ----------
    line : str
        Raw transcript text.

    Returns
    -------
    str
        The text with the transcription symbols rewritten.
    """
    for pattern, replacement in _SPECIAL_TAG_RULES:
        line = pattern.sub(replacement, line)
    return line
  
    

# Only the 'text' column is needed, so map over that Series instead of
# applying row-wise over the whole frame (which also fails on an empty frame).
df['new_text'] = df['text'].map(special_tags)

   
# Column without the "[ ... ]" annotations in square brackets, so that they
# do not skew the POS tagging.
# NOTE(review): the POS columns in the cells visible here are computed from
# 'text', not from this column - confirm which input the tagger should get.
# Series.map is used because only 'new_text' is needed; it also works on an
# empty frame, where the row-wise DataFrame.apply fails.
df['text_for_POS'] = df['new_text'].map(lambda text: re.sub(r'(\[\s.*?\])', '', text))

# Display only the header (zero data rows) to check the resulting columns.
df.head(0)

Unnamed: 0,label,sentence,text,pos_text,pos_,pos_text_complete,pos_complete,new_text,text_for_POS


In [70]:
# Export the annotated frame as a semicolon-separated CSV, without the index.
df.to_csv('cookie_tagged.csv', sep=';', index=False)