# ADReSS 2020 - CHA to Text

By: Jimuel Celeste, Jr. 

Objective: Extract the participant's (PAR) utterances from CHAT transcript (.cha) files and save them as plain-text (.txt) files.

In [9]:
import re
import os
from chamd import ChatReader

In [12]:
def get_speaker(line):
    """Return the speaker code ('INV' or 'PAR') of a transcript line.

    The speaker is read from ``line.metadata['speaker']`` and converted to a
    string; the code is taken from the end of that string.

    Args:
        line: Object exposing a ``metadata`` mapping with a ``'speaker'`` key.

    Returns:
        'INV' or 'PAR' when the speaker string ends with one of those codes,
        otherwise None (rather than failing with TypeError on a missed match).
    """
    speaker = str(line.metadata['speaker'])
    # Anchored at the end so any prefix in front of the code is ignored.
    matches = re.search(r'.*(INV|PAR)$', speaker)
    if matches is None:
        return None
    return matches[1]

def get_lines(chat):
    """Return the text of every line in ``chat.lines``, in order."""
    texts = []
    for entry in chat.lines:
        texts.append(entry.text)
    return texts

def get_lines_of(chat, speaker):
    """Return the text of each line in ``chat.lines`` spoken by ``speaker``.

    A line is kept when the code returned by ``get_speaker`` for it equals
    ``speaker`` (e.g. 'PAR' or 'INV'). The original order is preserved.
    """
    return [entry.text for entry in chat.lines if get_speaker(entry) == speaker]

def clean_cha_file(file):
    """Read a .cha file and return the participant's lines as one string.

    The file is parsed with ``ChatReader``; only lines whose speaker is 'PAR'
    are kept, and they are joined with single spaces.
    """
    chat = ChatReader().read_file(file)
    # PAR - Participant; INV - Investigator
    participant_lines = get_lines_of(chat=chat, speaker='PAR')
    return " ".join(participant_lines)

def clean_cha_files(source_dir, destination_dir):
    """
    Clean every .cha file under the source directory (source_dir) and
    write the cleaned data as .txt files in the destination directory
    (destination_dir), recreating the subdirectory structure of the
    source directory inside the destination directory.

    Args:
        source_dir: Directory that is walked recursively for .cha files.
        destination_dir: Directory that receives the .txt files; it and
            any needed subdirectories are created if missing.

    Each output file keeps the source file's base name with a .txt
    extension and is written as UTF-8. Files without a .cha extension are
    skipped. Progress is reported with print().
    """
    for root, _, files in os.walk(source_dir):
        # Location of `root` relative to source_dir. relpath is used instead
        # of a regex built from source_dir so that paths containing regex
        # metacharacters (e.g. '+', '(', '.') are handled correctly.
        relative_dir = os.path.relpath(root, source_dir)
        print("Cleaning in", root)

        # Make the saving directory (os.curdir means `root` is source_dir
        # itself, so files go straight into destination_dir).
        if relative_dir == os.curdir:
            saving_dir = destination_dir
        else:
            saving_dir = os.path.join(destination_dir, relative_dir)
        os.makedirs(saving_dir, exist_ok=True)

        # Data cleaning loop
        for file in files:
            # Filters out files with extensions other than .cha
            if not file.strip().endswith(".cha"):
                continue

            # Clean data
            file_path = os.path.join(root, file)
            print("Cleaning file:", file_path)
            cleaned_data = clean_cha_file(file_path)
            print("\tfile cleaned...")

            # Save: derive the name from the stripped file name so the
            # extension swap agrees with the filter above.
            base_name = os.path.splitext(file.strip())[0]
            saving_path = os.path.join(saving_dir, base_name + ".txt")
            with open(saving_path, 'w', encoding='utf-8') as saving_file:
                saving_file.write(cleaned_data)
            print("\tfile saved:", saving_path)

In [13]:
# Input: ADReSS 2020 training transcripts in .cha format (machine-specific path).
source_dir = '/Users/jimuelcelestejr/Documents/codebook/MLSpeech4MH/data/ADReSS2020/train_transcription_cha'
# Output: where the participant-only .txt transcripts are written.
destination_dir = '/Users/jimuelcelestejr/Documents/codebook/MLSpeech4MH/data/ADReSS2020/train_transcription_txt'
clean_cha_files(source_dir=source_dir, destination_dir=destination_dir)

Cleaning in /Users/jimuelcelestejr/Documents/codebook/MLSpeech4MH/data/ADReSS2020/train_transcription_cha
Cleaning file: /Users/jimuelcelestejr/Documents/codebook/MLSpeech4MH/data/ADReSS2020/train_transcription_cha/S032.cha
	file cleaned...
	file saved: /Users/jimuelcelestejr/Documents/codebook/MLSpeech4MH/data/ADReSS2020/train_transcription_txt//S032.txt
Cleaning file: /Users/jimuelcelestejr/Documents/codebook/MLSpeech4MH/data/ADReSS2020/train_transcription_cha/S033.cha
	file cleaned...
	file saved: /Users/jimuelcelestejr/Documents/codebook/MLSpeech4MH/data/ADReSS2020/train_transcription_txt//S033.txt
Cleaning file: /Users/jimuelcelestejr/Documents/codebook/MLSpeech4MH/data/ADReSS2020/train_transcription_cha/S027.cha
	file cleaned...
	file saved: /Users/jimuelcelestejr/Documents/codebook/MLSpeech4MH/data/ADReSS2020/train_transcription_txt//S027.txt
Cleaning file: /Users/jimuelcelestejr/Documents/codebook/MLSpeech4MH/data/ADReSS2020/train_transcription_cha/S153.cha
	file cleaned...
	fi