# Ground Truth Cleaning #

### Pipeline processed most recently: Mar 07, 2024 ###

**_All data collected from TalkBank's AphasiaBank under permission. Data are not public._**

Output data are located in `'../../data/'`

---

In [1]:
import os
import re
import pylangacq as pla
import pandas as pd
from datetime import datetime

# Date stamp (YYYY-MM-DD) of this run. Every CSV path below embeds it, so the
# input files must carry the same date as the day this notebook is executed.
current_date = f'{datetime.now():%Y-%m-%d}'
print(current_date)

2024-03-07


---

## 3. Clean `raw_transcript` and create `clean_groundtruth`. Create `aphasia/control_concat_fix.csv` file. ##

## 3-1. Clean `raw_transcript` and create `clean` and `clean_original`. ##

`clean_original` is the programmatically cleaned ground truth, before the manual corrections applied below. Use `clean` in the later final output files.

`clean_transcription_error` generates a version of the ground truth with the intended "target" speech, i.e. with speech errors corrected.

In [2]:
def clean_transcription(transcription: str) -> str:
    """Reduce one CHAT-annotated utterance to plain ground-truth text.

    The steps run in a fixed order; later steps rely on what earlier ones
    removed, so do not reorder them.

    What the output keeps:
      * ``&-`` / ``&+`` prefixes become the prefixes ``FILLER`` / ``FRAGMENT``
        glued to the following word (e.g. ``&-um`` -> ``FILLERum``).
      * ``xxx`` and phonetic ``@u`` forms become ``UNK``.
      * For ``word [: target] [* code]`` with an ASCII ``word``, the produced
        ``word`` is kept and the target/error code are dropped.
      * Every other ``&...`` token, ``$...`` code, bracketed annotation and
        punctuation mark handled below is removed; ``_`` and ``-`` become
        spaces; any word containing ``0`` is dropped.

    Args:
        transcription: raw utterance text.

    Returns:
        The cleaned utterance, words separated by single spaces, with no
        leading or trailing whitespace.
    """
    # Rename filler / fragment prefixes first so the generic "&..." removal
    # further down does not delete them.
    transcription = transcription.replace('&-', 'FILLER')
    transcription = transcription.replace('&+', 'FRAGMENT')

    # Literal markers deleted verbatim. Order matters: longer spellings come
    # before their truncated variants (e.g. '[+ gram]' before '[+ g').
    for marker in (
        # overlap / pause / linker / terminator marks
        '[<]', '[.]', '[>]',
        '[>1]', '[>2]', '[>3]', '[<1]', '[<2]', '[<3]',
        '+<', '+,', '+..?', '+..', '+/?', '[?]',
        '(.)', '(..)', '(...)',
        '<', '>',
        '(', ')',
        '‡', '[/]', '[//]', '[///]', '[/?]', '[/-]',
        # [+ ~] postcodes, including truncated spellings
        '[+ gram]', '[+ gram', '[+ gra', '[+ gr', '[+ g',
        '[+ exc]', '[+ exc', '[+ ex',
        '[+ esc]', '[+ esc', '[+ es]', '[+ es', '[+ e',
        '[+ jar]', '[+ per]', '[+ jar', '[+ ja', '[+ j',
        '[+ circ]', '[+ cir]', '[+ cir', '[+ ci', '[+ c',
        '[+ ', '[+',
    ):
        transcription = transcription.replace(marker, '')

    # Unintelligible speech
    transcription = transcription.replace('xxx', 'UNK')

    # --- Error words -------------------------------------------------------
    # NOTE: inside the character classes below, `+-=` is a *range* from '+'
    # to '=', so it also admits , - . / digits : ; <  (this is what lets
    # hyphenated codes such as "n:k-ret" match). Left unchanged on purpose.

    # on [: and] [* p:w] -> on
    transcription = re.sub(r"(?:^|\s)([a-zA-Z'_-]+)\s\[\:\s([a-zA-Z'\s_-]+)\]\s\[\*\s*([*:a-zA-Z\d\s'+-=]*)\]", r' \1', transcription)

    # ain't [: are not] (without error code) -> ain't
    transcription = re.sub(r"(?:^|\s)([a-zA-Z'_-]+)\s\[\:\s([a-zA-Z'\s_-]+)\](?!\s\[)", r' \1', transcription)

    # honli@u [: only] [* p:n] -> honli
    transcription = re.sub(r"(?:^|\s)([a-zA-Z'_-]+)@u\s\[\:\s([a-zA-Z'_@\s]*)\]\s\[\*\s*([*:a-zA-Z\d\s'+-=]*)\]", r' \1', transcription)

    # kotəgəl@u [: comfortable] [* n:k] -> UNK
    # hɑspəlɪd@u [: hospital] [* n:k-ret] -> UNK
    # ðæɾɪ@u [: x@n] [* n:uk] -> UNK
    # mɔ@u [: x@n] [* n:uk-rep] -> UNK
    # fɪŋks@u [: sphinx] [*] -> UNK
    transcription = re.sub(r"(\S*@u)\s\[\:\s([a-zA-Z'_@\s]*)\]\s\[\*\s*([*:a-zA-Z\d\s'+-=]*)\]", r'UNK', transcription)

    # ʌt [: up] [* p:n] -> UNK
    # iʔi [: x@n] [* n:uk] -> UNK
    transcription = re.sub(r"([a-zA-Z'_]*[^a-zA-Z'_\s]+[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*)\s\[\:\s([a-zA-Z'_@\s]+)\]\s\[\*\s*([*:a-zA-Z\s'+-=]*)\]", r'UNK', transcription)
    transcription = re.sub(r"\S*\s\[\:\s([a-zA-Z'_@$\s]+)\]\s\[\*\s*([*:a-zA-Z\s'+-=]*)\]", r'UNK', transcription)

    # sɪnrɛlə@u [: Cinderella] (without error code) -> UNK
    transcription = re.sub(r"([a-zA-Z'_]*[^a-zA-Z'_\s]+[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*)\s\[\:\s([a-zA-Z'_@\s]+)\]", r'UNK', transcription)

    # walked [: s:uk-ret] (without replacement, error code typo) -> walked
    transcription = re.sub(r"(?:^|\s)([a-zA-Z'_-]+)\s\[\:\s([a-zA-Z':\s-]+)\](?!\s\[\*)", r' \1', transcription)

    # remaining error codes
    transcription = re.sub(r'\[\*\s[^\]]+\]', '', transcription)
    transcription = transcription.replace('[*]', '')

    # untackled cases
    transcription = re.sub(r"(\S*@u)\s\[\:\s(\S*)\]", 'UNK', transcription)
    transcription = re.sub(r"([a-zA-Z'_]*(?!@)[^a-zA-Z'_\s]+[a-zA-Z'_]*)\s\[\:\s([a-zA-Z'_@\s]+)\]", 'UNK', transcription)

    # --- Word-level codes --------------------------------------------------
    # $-prefixed codes
    transcription = re.sub(r'\$\S+', '', transcription)
    # single letter tagged @l -> upper-case letter (b@l -> B)
    transcription = re.sub(r'\b([a-zA-Z])@l\b', lambda m: m.group(1).upper(), transcription)
    # Suffix tags that are simply dropped (plain literals, same order as before).
    for suffix_tag in ('@o', '@b', '@q', '@k', '@i', '@si'):
        transcription = transcription.replace(suffix_tag, '')

    transcription = re.sub(r'([a-zA-Z]+)@n', r'\1', transcription)
    transcription = re.sub(r'\S+@u', r'UNK', transcription)

    # INV
    transcription = re.sub(r'&\*INV\S+', '', transcription)

    # Remove words with &-, &+, &=, [=! ], [= ], [!], [% ]
    # A bare '&' is included because of typos in the source. After this
    # substitution no '&' is left, so the former second pass (r'&[^ ]*') could
    # never match and has been dropped.
    transcription = re.sub(r'&\S*', '', transcription)
    transcription = transcription.replace('[!]', '')
    transcription = re.sub(r'\[=[^\]]+\]', '', transcription)
    transcription = transcription.replace('[=! laughin', '')
    # Stop at the first ']' so only the comment itself is removed. The previous
    # greedy '(.*)' swallowed everything up to the LAST ']' in the utterance.
    transcription = re.sub(r"\[%\s[^\]]*\]", '', transcription)

    # Remove unnecessary chars
    for junk in ('+', '"', '...', '//', '/', '^', '„'):
        transcription = transcription.replace(junk, '')

    # Replace _ with space
    transcription = transcription.replace('_', ' ')

    # Remove punctuation
    for mark in ('.', '?', '!'):
        transcription = transcription.replace(mark, ' ')
    transcription = transcription.replace('”', '').replace('“', '')

    # Remove :
    transcription = transcription.replace(':', '')

    # Replace - with whitespace
    transcription = transcription.replace('-', ' ')

    # Strip invisible control characters (C0/C1 controls that are not
    # whitespace). The original line here was `.replace('', '')`, a no-op as
    # written: the character it targeted is not visible in the source.
    # Presumably it was the U+0015 bullet delimiter used in CHAT files - TODO
    # confirm. Whitespace-like controls are handled by split() below.
    transcription = re.sub(r'[\x00-\x08\x0e-\x1b\x7f-\x84\x86-\x9f]', '', transcription)

    # Remove words containing '0' (error codes carry a '0'). split()/join also
    # collapses every whitespace run to one space and trims both ends, so no
    # separate whitespace normalisation or strip() is needed afterwards.
    return ' '.join(word for word in transcription.split() if '0' not in word)

In [14]:
# cleaning function for speech errors - in progress
def clean_transcription_error(transcription: str) -> str:
    """Variant of the ground-truth cleaner aimed at speech errors (work in progress).

    Runs the same ordered cleaning steps as the plain cleaner, with these
    differences in the error-word stage:
      * words tagged with a semantic (``[* s:...]``) or morphological
        (``[* m:...]``) error code are handled first and keep the produced word;
      * ``word@u [: target] [* code]`` with an ASCII ``word`` is replaced by
        the *target* (``honli@u [: only] [* p:n]`` -> ``only``).
    All other ``word [: target] [* code]`` cases still keep the produced word,
    and phonetic ``@u`` forms still become ``UNK``.

    Args:
        transcription: raw utterance text.

    Returns:
        The cleaned utterance, words separated by single spaces, with no
        leading or trailing whitespace.
    """
    # Rename filler / fragment prefixes first so the generic "&..." removal
    # further down does not delete them.
    transcription = transcription.replace('&-', 'FILLER')
    transcription = transcription.replace('&+', 'FRAGMENT')

    # Literal markers deleted verbatim. Order matters: longer spellings come
    # before their truncated variants (e.g. '[+ gram]' before '[+ g').
    for marker in (
        # overlap / pause / linker / terminator marks
        '[<]', '[.]', '[>]',
        '[>1]', '[>2]', '[>3]', '[<1]', '[<2]', '[<3]',
        '+<', '+,', '+..?', '+..', '+/?', '[?]',
        '(.)', '(..)', '(...)',
        '<', '>',
        '(', ')',
        '‡', '[/]', '[//]', '[///]', '[/?]', '[/-]',
        # [+ ~] postcodes, including truncated spellings
        '[+ gram]', '[+ gram', '[+ gra', '[+ gr', '[+ g',
        '[+ exc]', '[+ exc', '[+ ex',
        '[+ esc]', '[+ esc', '[+ es]', '[+ es', '[+ e',
        '[+ jar]', '[+ per]', '[+ jar', '[+ ja', '[+ j',
        '[+ circ]', '[+ cir]', '[+ cir', '[+ ci', '[+ c',
        '[+ ', '[+',
    ):
        transcription = transcription.replace(marker, '')

    # Unintelligible speech
    transcription = transcription.replace('xxx', 'UNK')

    # --- Error words -------------------------------------------------------
    # NOTE: inside the character classes below, `+-=` is a *range* from '+'
    # to '=', so it also admits , - . / digits : ; <  (this is what lets
    # hyphenated codes such as "n:k-ret" match). Left unchanged on purpose.

    # keep semantic errors (the optional [: target] is dropped with the code)
    transcription = re.sub(r"(?:^|\s)([a-zA-Z'_-]+)(?:\s\[\:\s([a-zA-Z'\s_-]+)\])?\s\[\*\s*s:\s*([*:a-zA-Z\d\s'+-=]*)\]", r' \1', transcription)

    # keep morphological errors
    transcription = re.sub(r"(?:^|\s)([a-zA-Z'_-]+)(?:\s\[\:\s([a-zA-Z'\s_-]+)\])?\s\[\*\s*m:\s*([*:a-zA-Z\d\s'+-=]*)\]", r' \1', transcription)

    # on [: and] [* p:w] -> on
    transcription = re.sub(r"(?:^|\s)([a-zA-Z'_-]+)\s\[\:\s([a-zA-Z'\s_-]+)\]\s\[\*\s*([*:a-zA-Z\d\s'+-=]*)\]", r' \1', transcription)

    # ain't [: are not] (without error code) -> ain't
    transcription = re.sub(r"(?:^|\s)([a-zA-Z'_-]+)\s\[\:\s([a-zA-Z'\s_-]+)\](?!\s\[)", r' \1', transcription)

    # honli@u [: only] [* p:n] -> only  (target word, group 2)
    transcription = re.sub(r"(?:^|\s)([a-zA-Z'_-]+)@u\s\[\:\s([a-zA-Z'\s]*)\]\s\[\*\s*([*:a-zA-Z\d\s'+-=]*)\]", r' \2', transcription)

    # kotəgəl@u [: comfortable] [* n:k] -> UNK
    # hɑspəlɪd@u [: hospital] [* n:k-ret] -> UNK
    # ðæɾɪ@u [: x@n] [* n:uk] -> UNK
    # mɔ@u [: x@n] [* n:uk-rep] -> UNK
    # fɪŋks@u [: sphinx] [*] -> UNK
    transcription = re.sub(r"(\S*@u)\s\[\:\s([a-zA-Z'_@\s]*)\]\s\[\*\s*([*:a-zA-Z\d\s'+-=]*)\]", r'UNK', transcription)

    # ʌt [: up] [* p:n] -> UNK
    # iʔi [: x@n] [* n:uk] -> UNK
    transcription = re.sub(r"([a-zA-Z'_]*[^a-zA-Z'_\s]+[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*)\s\[\:\s([a-zA-Z'_@\s]+)\]\s\[\*\s*([*:a-zA-Z\s'+-=]*)\]", r'UNK', transcription)
    transcription = re.sub(r"\S*\s\[\:\s([a-zA-Z'_@$\s]+)\]\s\[\*\s*([*:a-zA-Z\s'+-=]*)\]", r'UNK', transcription)

    # sɪnrɛlə@u [: Cinderella] (without error code) -> UNK
    transcription = re.sub(r"([a-zA-Z'_]*[^a-zA-Z'_\s]+[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*[a-zA-Z'_]*[^a-zA-Z'_\s]*)\s\[\:\s([a-zA-Z'_@\s]+)\]", r'UNK', transcription)

    # walked [: s:uk-ret] (without replacement, error code typo) -> walked
    transcription = re.sub(r"(?:^|\s)([a-zA-Z'_-]+)\s\[\:\s([a-zA-Z':\s-]+)\](?!\s\[\*)", r' \1', transcription)

    # remaining error codes
    transcription = re.sub(r'\[\*\s[^\]]+\]', '', transcription)
    transcription = transcription.replace('[*]', '')

    # untackled cases
    transcription = re.sub(r"(\S*@u)\s\[\:\s(\S*)\]", 'UNK', transcription)
    transcription = re.sub(r"([a-zA-Z'_]*(?!@)[^a-zA-Z'_\s]+[a-zA-Z'_]*)\s\[\:\s([a-zA-Z'_@\s]+)\]", 'UNK', transcription)

    # --- Word-level codes --------------------------------------------------
    # $-prefixed codes
    transcription = re.sub(r'\$\S+', '', transcription)
    # single letter tagged @l -> upper-case letter (b@l -> B)
    transcription = re.sub(r'\b([a-zA-Z])@l\b', lambda m: m.group(1).upper(), transcription)
    # Suffix tags that are simply dropped (plain literals, same order as before).
    for suffix_tag in ('@o', '@b', '@q', '@k', '@i', '@si'):
        transcription = transcription.replace(suffix_tag, '')

    transcription = re.sub(r'([a-zA-Z]+)@n', r'\1', transcription)
    transcription = re.sub(r'\S+@u', r'UNK', transcription)

    # INV
    transcription = re.sub(r'&\*INV\S+', '', transcription)

    # Remove words with &-, &+, &=, [=! ], [= ], [!], [% ]
    # A bare '&' is included because of typos in the source. After this
    # substitution no '&' is left, so the former second pass (r'&[^ ]*') could
    # never match and has been dropped.
    transcription = re.sub(r'&\S*', '', transcription)
    transcription = transcription.replace('[!]', '')
    transcription = re.sub(r'\[=[^\]]+\]', '', transcription)
    transcription = transcription.replace('[=! laughin', '')
    # Stop at the first ']' so only the comment itself is removed. The previous
    # greedy '(.*)' swallowed everything up to the LAST ']' in the utterance.
    transcription = re.sub(r"\[%\s[^\]]*\]", '', transcription)

    # Remove unnecessary chars
    for junk in ('+', '"', '...', '//', '/', '^', '„'):
        transcription = transcription.replace(junk, '')

    # Replace _ with space
    transcription = transcription.replace('_', ' ')

    # Remove punctuation
    for mark in ('.', '?', '!'):
        transcription = transcription.replace(mark, ' ')
    transcription = transcription.replace('”', '').replace('“', '')

    # Remove :
    transcription = transcription.replace(':', '')

    # Replace - with whitespace
    transcription = transcription.replace('-', ' ')

    # Strip invisible control characters (C0/C1 controls that are not
    # whitespace). The original line here was `.replace('', '')`, a no-op as
    # written: the character it targeted is not visible in the source.
    # Presumably it was the U+0015 bullet delimiter used in CHAT files - TODO
    # confirm. Whitespace-like controls are handled by split() below.
    transcription = re.sub(r'[\x00-\x08\x0e-\x1b\x7f-\x84\x86-\x9f]', '', transcription)

    # Remove words containing '0' (error codes carry a '0'). split()/join also
    # collapses every whitespace run to one space and trims both ends, so no
    # separate whitespace normalisation or strip() is needed afterwards.
    return ' '.join(word for word in transcription.split() if '0' not in word)

In [5]:
# Programmatic cleaning pass: read each corpus table, derive `clean` from
# `raw_transcript` (every cell is stringified first), and save the result as
# `*_concat_fix_<date>.csv` without the index column.
# The `clean_original` and `clean_error` columns are currently not produced.

# --- aphasia ---
aphasia_df = pd.read_csv(f'../../data/aphasia_concat_{current_date}.csv')
aphasia_df['clean'] = [
    clean_transcription(str(raw_utterance))
    for raw_utterance in aphasia_df['raw_transcript']
]
aphasia_df.to_csv(f'../../data/aphasia_concat_fix_{current_date}.csv', index=False)

# --- control ---
control_df = pd.read_csv(f'../../data/control_concat_{current_date}.csv')
control_df['clean'] = [
    clean_transcription(str(raw_utterance))
    for raw_utterance in control_df['raw_transcript']
]
control_df.to_csv(f'../../data/control_concat_fix_{current_date}.csv', index=False)

---

## 3-2. Additional cleaning performed on `clean` using a regex search for non-alphabetic characters `[^a-zA-Z',\s]` and manually updating a list ##

The search was done in a spreadsheet, using the regex above as a search filter.

In [9]:
# Reload the programmatically cleaned aphasia table for manual corrections.
# NOTE(review): this reads from `../../data/Old data/`, while the cleaning cell
# above writes `aphasia_concat_fix_<date>.csv` to `../../data/` - confirm the
# file is expected to be moved before this cell runs.
aphasia_df = pd.read_csv(f'../../data/Old data/aphasia_concat_fix_{current_date}.csv')
# Manual corrections for the aphasia `clean` column.
# Maps segment_name -> list of (old, new) pairs; each pair is applied as a
# literal (non-regex) substring replacement, in list order, to that segment only.
# Where a leftover "produced [ target]" annotation is fixed, the produced word
# is kept (the alternative of keeping the target was tried and rejected).
aphasia_replacements = {
    # leftover "[ target]" annotations and phonetic residue
    "NEURAL67-2_1792832_2031274.wav": [("him [ her]", "him")],
    "kurland01c_132769_370511.wav": [("ɑbɑbɑbɑbɑ", "UNK")],
    "kurland25b_681268_917950.wav": [("haute@s", "haute"), ("bourgeoisie@s", "bourgeoisie")],
    "NEURAL59-1_44865_280647.wav": [("collet [ kɑlɪt]", "UNK")],
    "tcu03a_1073314_1307547.wav": [("bard[ ball]", "bard")],
    "MSU04a_1239644_1419449.wav": [("he[ she]", "he")],
    "adler17a_1651920_1812484.wav": [(" ]", "")],
    "NEURAL42-1_546470_673027.wav": [(" exc]", "")],
    "UNH06a_2135010_2235876.wav": [("↫h h↫halve", "FRAGMENTh FRAGMENTh halve")],
    "UNH10a_1360383_1433940.wav": [("FRAGMENTfɪ", "UNK")],
    "MSU04a_912746_984244.wav": [(" ,]", "")],
    "MSU02a_232000_240450.wav": [("Firstname r", "FirstnameR"), ("Firstname d", "FirstnameD")],
    "thompson07b_128276_199076.wav": [("∬", "")],
    "UNH01a_675064_743878.wav": [(" ram]", "")],
    "BU08a_279598_343252.wav": [(" am]", "")],
    "BU02a_486009_548010.wav": [(" ]", "")],
    "kurland12b_136053_193875.wav": [("↑", "")],
    "ACWT09a_1673873_1726242.wav": [("tabeta [ volcano]", "volcano")],
    "fridriksson02a_475028_525966.wav": [(" ]", "")],
    "kurland02b_605125_655010.wav": [(" FRAGMENThɛ FRAGMENThɛ", " UNK UNK")],
    "kansas16a_1022310_1067896.wav": [(" am]", "")],
    "UNH06a_2240715_2266423.wav": [(" ↫lem lem lem↫lemonade", " FRAGMENTlem FRAGMENTlem FRAGMENTlem lemonade")],
    "ACWT12a_1252133_1277726.wav": [(" FRAGMENTbæ FRAGMENTbæ FRAGMENTbeɪ FRAGMENTbeɪ", " UNK UNK UNK UNK"), (" FRAGMENTbæ FRAGMENTbæ", " UNK UNK")],
    "UNH06a_1928325_1950276.wav": [(" ↫vu vu mu↫", " FRAGMENTvu FRAGMENTvu FRAGMENTmu ")],
    "kurland29e_704228_724804.wav": [("beaucoup@sfra", "beaucoup")],
    "UNH06a_1146121_1153861.wav": [("↫s s↫", "FRAGMENTs FRAGMENTs ")],
    # Fixed: the replacement used to end in "FRAGMENTte] ", which wrote a stray
    # ']' into the ground truth (the sibling ↫…↫ entries have none).
    "UNH06a_1269070_1276487.wav": [("↫te te↫", "FRAGMENTte FRAGMENTte ")],
    "UNH06a_2831334_2836658.wav": [("↫ru ru↫", "FRAGMENTru FRAGMENTru ")],
    "UNH06a_1532990_1536660.wav": [("↫b b↫", "FRAGMENTb FRAGMENTb ")],
    "UNH05a_2911821_2915027.wav": [("≠", "")],
    # normalise the anonymised-name spelling
    "UNH17a_470998_481334.wav": [("FirstNameG", "FirstnameG")],
    "UNH17a_656203_676521.wav": [("FirstNameJ", "FirstnameJ"), ("FirstNameA", "FirstnameA"), ("FirstNameL", "FirstnameL")],
    "UNH17a_782685_791881.wav": [("FirstNameN", "FirstnameN")],
    "UNH17a_823164_834384.wav": [("FirstNameB", "FirstnameB")],
    # utterances extended with additional words
    "scale18a_200827_211307.wav": [("and FILLERum Kansas the FRAGMENTs station in Kansas", "and FILLERum Kansas the FRAGMENTs station in Kansas do you remember the first time")],
    "elman14a_2356292_2364616.wav": [("dicshinery[ dictionary]", "dicshinery")],
    "tap09a_275377_278447.wav": [("yeah that FRAGMENTw FRAGMENTmo FRAGMENTme", "yeah that FRAGMENTw FRAGMENTmo FRAGMENTme give me a shaker")],
}
                    
# Apply each segment's (old, new) literal replacements, in order, to the
# `clean` text of the rows whose `segment_name` matches; then overwrite the CSV.
for segment_name, replacements in aphasia_replacements.items():
    condition = aphasia_df['segment_name'].eq(segment_name)
    patched_text = aphasia_df.loc[condition, 'clean']
    for old, new in replacements:
        patched_text = patched_text.str.replace(old, new, regex=False)
    aphasia_df.loc[condition, 'clean'] = patched_text
aphasia_df.to_csv(f'../../data/Old data/aphasia_concat_fix_{current_date}.csv', index=False)
                                   
# Reload the programmatically cleaned control table for manual corrections.
# NOTE(review): read from `../../data/Old data/`, whereas the cleaning cell
# writes `control_concat_fix_<date>.csv` to `../../data/` - confirm location.
control_df = pd.read_csv(f'../../data/Old data/control_concat_fix_{current_date}.csv')
# Manual corrections for the control `clean` column.
# Maps segment_name -> list of (old, new) pairs, applied below as literal
# (non-regex) substring replacements to that segment only.
control_replacements = {
    # Tagged words (`@s...`, `@wp`): keep the word, drop the tag.
    "wright42a_467440_706986.wav": [("federales@sspa", "federales")],
    "richardson21_1668821_1908210.wav": [("hoing@wp", "hoing")],
    "richardson21_405805_643550.wav": [("bibity@wp bobity@wp boo@wp", "bibity bobity boo")],
    "NEURAL2-2_666955_902459.wav": [("konoko@sjpn", "konoko")],
    "UNH1051_1722235_1957113.wav": [("libero@sita", "libero")],
    "NEURAL6-1_1321076_1555428.wav": [("verre@sfra", "verre")],
    "wright29a_350_232930.wav": [(" [ ]", "")],
    "wright69a_345680_545780.wav": [("carabinieri@sita", "carabinieri")],
    "NEURAL39-1_699293_880658.wav": [("bon@sfra appetit@sfra", "bon appetit")],
    "NEURAL2-1_890927_1058781.wav": [("otokonoko@sjpn", "otokonoko")],
    "NEURAL2-2_903204_1031394.wav": [("akachon@sjpn akachon@sjpn watashi@sjpn no alachon@sjpn", "akachon akachon watashi no alachon")],
    "capilouto80a_408260_523451.wav": [("koumon@s ou@s ye@s", "koumon ou ye")],
    # From here on the tagged words are deleted entirely (replacement is '').
    "capilouto21a_268870_370494.wav": [("merci@sfra obrigado@spor gracias@sspa", "")],
    "capilouto09a_151107_223399.wav": [("jambo@sswa", "")],
    "capilouto59a_307038_378432.wav": [("haben@sdeu sie@sdeu eis@sdeu", ""), ("wurfel@sdeu eis@sdeu wurfel@sdeu", ""), ("nein@sdeu", ""), ("wurfel@sdeu", "")],
    "UNH1018_1893871_1944504.wav": [("ruck@seng", "")],
    "wright45a_167196_201841.wav": [("buona@sita sera@sita", "")],
    "UNH1018_1858187_1892376.wav": [("ruck@seng", "")],
    "wright45a_260500_289118.wav": [("hors@sfra d'oeuvres@sfra", "")],
    "UNH1037_329826_348156.wav": [("ditziest@sdeu", "")],
    "capilouto39a_220584_230404.wav": [("comprende@sspa", "")],
    # Utterances extended with additional words.
    "UNH1051_685561_689353.wav": [("like carrying a tent around with you", "like carrying a tent around with you, literally it was so big")],
    "UNH1034_2482928_2489098.wav": [("yeah she's like I'm fighting for my own hand I remember seeing that I'm like", "yeah she's like I'm fighting for my own hand I remember seeing that I'm like you should definitely watch it")],
    # NOTE(review): this swaps the text for an unrelated sentence that also
    # contains capitals and a full stop - confirm it is intended.
    "wright18a_266347_272519.wav": [("and consequently from that FILLERuh I had ototoxic drugs so I lost my hearing that way", "Were much larger than Cinderellas and it looked like the footman and the prince were going to leave.")],
    
    
}

# Apply each segment's (old, new) literal replacements, in order, to the
# `clean` text of the rows whose `segment_name` matches; then overwrite the CSV.
for segment_name, replacements in control_replacements.items():
    condition = control_df['segment_name'].eq(segment_name)
    patched_text = control_df.loc[condition, 'clean']
    for old, new in replacements:
        patched_text = patched_text.str.replace(old, new, regex=False)
    control_df.loc[condition, 'clean'] = patched_text
control_df.to_csv(f'../../data/Old data/control_concat_fix_{current_date}.csv', index=False)

## 3-3. Different versions of cleaning are done. Save everything to `aphasia/control_all_fix.csv`. ##
- `clean_v1` includes both fillers and phonological fragments.
- `clean_v2` includes phonological fragments, but fillers are removed.
- `clean_v3` removes both fillers and phonological fragments.

In [10]:
def clean_version1(text):
    """Keep fillers and phonological fragments; only strip their marker prefixes.

    The input is stringified, then every occurrence of 'FILLER' and of
    'FRAGMENT' is deleted, leaving the spoken material itself in place.
    """
    stripped = str(text)
    for marker in ('FILLER', 'FRAGMENT'):
        stripped = stripped.replace(marker, '')
    return stripped

def clean_version2(text):
    """Drop fillers; keep phonological fragments without their marker prefix.

    The input is stringified and split on whitespace. Words starting with
    'FILLER' are discarded, the rest are re-joined with single spaces, and
    every remaining 'FRAGMENT' marker is deleted.
    """
    kept_words = []
    for word in str(text).split():
        if word.startswith('FILLER'):
            continue
        kept_words.append(word)
    return ' '.join(kept_words).replace('FRAGMENT', '')

def clean_version3(text):
    """Drop both fillers and phonological fragments.

    The input is stringified and split on whitespace; every word starting
    with 'FRAGMENT' or 'FILLER' is discarded and the remaining words are
    re-joined with single spaces.
    """
    marker_prefixes = ('FRAGMENT', 'FILLER')
    kept_words = filter(lambda word: not word.startswith(marker_prefixes), str(text).split())
    return ' '.join(kept_words)

# Reload the manually corrected tables and derive the three text variants from
# the `clean` column, then overwrite the same CSV files.
aphasia_df = pd.read_csv(f'../../data/Old data/aphasia_concat_fix_{current_date}.csv')
control_df = pd.read_csv(f'../../data/Old data/control_concat_fix_{current_date}.csv')

# v1: markers stripped, fillers + fragments kept
# v2: fillers removed, fragments kept
# v3: fillers and fragments removed
aphasia_df['clean_v1'] = aphasia_df['clean'].apply(clean_version1)
aphasia_df['clean_v2'] = aphasia_df['clean'].apply(clean_version2)
aphasia_df['clean_v3'] = aphasia_df['clean'].apply(clean_version3)

control_df['clean_v1'] = control_df['clean'].apply(clean_version1)
control_df['clean_v2'] = control_df['clean'].apply(clean_version2)
control_df['clean_v3'] = control_df['clean'].apply(clean_version3)

# index=False matches every other save in this notebook. Without it the row
# index is written as an extra unnamed column, which comes back as
# "Unnamed: 0" on the next read and piles up each time the file is rewritten.
aphasia_df.to_csv(f'../../data/Old data/aphasia_concat_fix_{current_date}.csv', index=False)
control_df.to_csv(f'../../data/Old data/control_concat_fix_{current_date}.csv', index=False)

In [8]:
# Corpus summary: number of distinct interviews (`filename`) and number of
# snippets (rows) in each `*_concat_fix_<date>.csv` table.
aphasia_df = pd.read_csv(f'../../data/aphasia_concat_fix_{current_date}.csv')
control_df = pd.read_csv(f'../../data/control_concat_fix_{current_date}.csv')

# Distinct interviews per corpus
unique_aphasia_filenames = aphasia_df['filename'].nunique()
unique_control_filenames = control_df['filename'].nunique()

# One row per snippet
total_aphasia_snippets = len(aphasia_df)
total_control_snippets = len(control_df)

for summary_line in (
    f"Total number of unique aphasia interviews: {unique_aphasia_filenames}",
    f"Total number of unique control interviews: {unique_control_filenames}",
    f"Total number of aphasia snippets: {total_aphasia_snippets}",
    f"Total number of control snippets: {total_control_snippets}",
):
    print(summary_line)

Total number of unique aphasia interviews: 550
Total number of unique control interviews: 347
Total number of aphasia snippets: 23057
Total number of control snippets: 5342
