# Transcript Collection #

### Pipeline processed most recently: Mar 07, 2024 ###

**_All data collected from TalkBank's AphasiaBank under permission. Data are not public._**

Input transcripts are located in `'../../../Aphasia_transcript/'` and `'../../../Control_transcript/'`.

Output data are located in `'../../data/'`

---

In [1]:
import os
import re
import pylangacq as pla
import pandas as pd
from datetime import datetime

current_date = datetime.now().strftime('%Y-%m-%d')
print(current_date)

2024-03-07


## 1. Download transcript and audio files from AphasiaBank’s website. ###

In [2]:
def count_files_in_directory(directory_path, file_extension=".cha"):
    return len([f for f in os.listdir(directory_path) 
                if os.path.isfile(os.path.join(directory_path, f)) and f.endswith(file_extension)])

aphasia_count = count_files_in_directory('../../../Aphasia_transcript/')
control_count = count_files_in_directory('../../../Control_transcript/')

print(f"Total number of aphasia interviews: {aphasia_count}")
print(f"Total number of control interviews: {control_count}")


Total number of aphasia interviews: 551
Total number of control interviews: 347


---

## 2. Collect `raw_transcription` from CHAT transcript files. ##

## 2-1. Create `aphasia/control_all.csv` file. ##



In [3]:
def process_directory(directory_path, output_csv):
    data = []

    for filename in os.listdir(directory_path):
        if filename.endswith('.cha'):
            file_path = os.path.join(directory_path, filename)

            with open(file_path, "r", encoding="utf-8") as file:
                lines = file.readlines()

            previous_line = ""
            file_id = os.path.basename(file_path)

            for line in lines:
                if line.startswith("\t"):
                    previous_line += " " + line.strip()
                else:
                    if previous_line and (previous_line.startswith("*PAR") or previous_line.startswith("*IN")):
                        speaker_id = previous_line.split(":")[0].strip()
                        speaker_tag = file_id.split('.')[0] + "_" + speaker_id[1:]

                        raw_transcript = previous_line.split("\t")[1].split("\x15")[0].strip()

                        if speaker_id.startswith("INV") and raw_transcript == "www .":
                            data.append([file_id, speaker_tag, raw_transcript, "", ""])
                        else:
                            time_mark_parts = previous_line.split("\x15")
                            if len(time_mark_parts) > 1:
                                start_time = time_mark_parts[1].split("_")[0].strip()
                                end_time = time_mark_parts[1].split("_")[1].strip()

                                data.append([file_id, speaker_tag, raw_transcript, start_time, end_time])

                    previous_line = line.strip()

    df = pd.DataFrame(data, columns=['filename', 'speaker_id', 'raw_transcript', 'start_time', 'end_time'])

    df.to_csv(output_csv, index=False)
    print(f"Saved to {output_csv}")

process_directory('../../../Aphasia_transcript', f'../../data/aphasia_all_{current_date}.csv')
process_directory('../../../Control_transcript', f'../../data/control_all_{current_date}.csv')


Saved to ../../data/aphasia_all_2024-03-07.csv
Saved to ../../data/control_all_2024-03-07.csv


## 2-2. Make edits to the `aphasia/control_all.csv` file. ##

`Baycrest9336a.cha` removed for having multiple participants.\
Change wrong INV tags to PAR tags. 


In [4]:
aphasia_df = pd.read_csv(f'../../data/aphasia_all_{current_date}.csv')
control_df = pd.read_csv(f'../../data/control_all_{current_date}.csv')

def filter_rows(df):
    return df[
        df['speaker_id'].str.contains('_INV', case=False, na=False) & 
        df['raw_transcript'].str.contains('\+"\s*', na=False)
    ]

aphasia_filtered = filter_rows(aphasia_df)
control_filtered = filter_rows(control_df)
combined_df = pd.concat([aphasia_filtered, control_filtered], ignore_index=True)

combined_df.to_csv(f'../../data/combined_filtered_{current_date}.csv', index=False)

In [5]:
wrong_INV = pd.read_csv('../../data/Old data/wrong_INV_2023-09-27.csv')
aphasia_df = pd.read_csv(f'../../data/aphasia_all_{current_date}.csv')
control_df = pd.read_csv(f'../../data/control_all_{current_date}.csv')

for index, row in wrong_INV.iterrows():
    
    mask_aphasia = aphasia_df.eq(row).all(axis=1)
    aphasia_df.loc[mask_aphasia, 'speaker_id'] = aphasia_df.loc[mask_aphasia, 'speaker_id'].str.replace("_INV", "_PAR")

    mask_control = control_df.eq(row).all(axis=1)
    control_df.loc[mask_control, 'speaker_id'] = control_df.loc[mask_control, 'speaker_id'].str.replace("_INV", "_PAR")

aphasia_df = aphasia_df[aphasia_df['filename'] != 'Baycrest9336a.cha']    

aphasia_df.to_csv(f'../../data/aphasia_all_{current_date}.csv', index=False)
control_df.to_csv(f'../../data/control_all_{current_date}.csv', index=False)

## 2-3. Extract paticipant speech and create `aphasia/control_concat.csv` file. ##


In [6]:
def concatenate_rows(input_csv, output_csv):
    df = pd.read_csv(input_csv)
    
    # df = df[~((df['speaker_id'].str.contains('_PAR')) & (df['raw_transcript'] == 'www .'))]

    concatenated_rows = []
    is_par = False
    start_time = None
    current_filename = None
    inv_present = False 
    transcript = ""
    previous_end_time = None
    
    for idx, row in df.iterrows():
        if current_filename != row['filename']:
            if is_par:
                concatenated_rows.append({
                    'filename': current_filename,
                    'speaker_id': 'concat_PAR',
                    'raw_transcript': transcript,
                    'start_time': start_time,
                    'end_time': end_time
                })

            is_par = False
            inv_present = False
            current_filename = row['filename']
            transcript = ""
            previous_end_time = None

        if "_IN" in row['speaker_id']:
            inv_present = True
            if is_par:  
                concatenated_rows.append({
                    'filename': current_filename,
                    'speaker_id': 'concat_PAR',
                    'raw_transcript': transcript,
                    'start_time': start_time,
                    'end_time': end_time
                })
                is_par = False
            transcript = ""
            previous_end_time = None
        elif "_PAR" in row['speaker_id']:
            if is_par:  

                if (row['end_time'] - start_time) > 240000:
                    potential_split_idx = -1
                    for i in range(idx-1, 0, -1):
                        if (df.iloc[i]['end_time'] - df.iloc[i-1]['start_time']) >= 1:
                            potential_split_idx = i
                            break

                    if potential_split_idx != -1:
                        split_transcripts = transcript.split(' ')
                        first_transcript = ' '.join(split_transcripts[:potential_split_idx])
                        second_transcript = ' '.join(split_transcripts[potential_split_idx:])
                        
                        concatenated_rows.append({
                            'filename': current_filename,
                            'speaker_id': 'concat_PAR',
                            'raw_transcript': first_transcript,
                            'start_time': start_time,
                            'end_time': df.iloc[potential_split_idx]['end_time']
                        })

                        start_time = df.iloc[potential_split_idx+1]['start_time']
                        transcript = second_transcript
                    else:

                        concatenated_rows.append({
                            'filename': current_filename,
                            'speaker_id': 'concat_PAR',
                            'raw_transcript': transcript,
                            'start_time': start_time,
                            'end_time': previous_end_time
                        })
                        start_time = row['start_time']
                        transcript = row['raw_transcript']

                transcript += ' ' + row['raw_transcript']
                end_time = row['end_time']
            else:  
                is_par = True
                start_time = row['start_time']
                end_time = row['end_time']
                transcript = row['raw_transcript']
            previous_end_time = row['end_time']

    if is_par:
        concatenated_rows.append({
            'filename': current_filename,
            'speaker_id': 'concat_PAR',
            'raw_transcript': transcript,
            'start_time': start_time,
            'end_time': end_time
        })

    concatenated_df = pd.DataFrame(concatenated_rows)
    concatenated_df = concatenated_df.drop(columns=['speaker_id'])
    concatenated_df.to_csv(output_csv, index=False)
    
    print(f"Saved to {output_csv}")

concatenate_rows(f'../../data/aphasia_all_{current_date}.csv', f'../../data/aphasia_concat_{current_date}.csv')
concatenate_rows(f'../../data/control_all_{current_date}.csv', f'../../data/control_concat_{current_date}.csv')


Saved to ../../data/aphasia_concat_2024-03-07.csv
Saved to ../../data/control_concat_2024-03-07.csv


## 2-4. Give segment ID to snippets and create `duration` column. ##

In [7]:
def add_segment_names(df, original_filename):
    
    df['file_base'] = df['filename'].str.split('.').str[0]
    df['segment_name'] = df['file_base'] + "_" + df['start_time'].astype(str) + "_" + df['end_time'].astype(str) + '.wav'
    # df['segment_name'] = df['filename'] + "_" + df['start_time'].astype(str) + "_" + df['end_time'].astype(str) + '.wav'
    df.drop('file_base', axis=1, inplace=True)
    df.to_csv(original_filename, index=False)

    return df

aphasia_df = pd.read_csv(f"../../data/aphasia_concat_{current_date}.csv")
control_df = pd.read_csv(f"../../data/control_concat_{current_date}.csv")

aphasia_df = add_segment_names(aphasia_df, f"../../data/aphasia_concat_{current_date}.csv")
control_df = add_segment_names(control_df, f"../../data/control_concat_{current_date}.csv")


In [8]:
aphasia_df = pd.read_csv(f'../../data/aphasia_concat_{current_date}.csv')
control_df = pd.read_csv(f'../../data/control_concat_{current_date}.csv')

aphasia_df['duration'] = aphasia_df['end_time'] - aphasia_df['start_time']
control_df['duration'] = control_df['end_time'] - control_df['start_time']

aphasia_df = aphasia_df.sort_values(by='duration', ascending=False)
control_df = control_df.sort_values(by='duration', ascending=False)

aphasia_df.to_csv(f'../../data/aphasia_concat_{current_date}.csv', index=False)
control_df.to_csv(f'../../data/control_concat_{current_date}.csv', index=False)

aphasia_df.head()

Unnamed: 0,filename,raw_transcript,start_time,end_time,segment_name,duration
526,fridriksson09b.cha,&=chuckles well ‡ sɪndə˞ɛʔɛt@u [: Cinderella] ...,730492,970447,fridriksson09b_730492_970447.wav,239955
58,scale18d.cha,&-um (..) Cinderella . [+ gram] one [: once] [...,1111586,1351478,scale18d_1111586_1351478.wav,239892
13232,fridriksson01a.cha,okay . [+ exc] &+st &-um &=lips:smack &-um sɪn...,853763,1093628,fridriksson01a_853763_1093628.wav,239865
2414,elman01b.cha,okay ‡ there's a: (.) <I think> [//] for is a ...,604018,843878,elman01b_604018_843878.wav,239860
20479,thompson11a.cha,<and the [/] the> [//] &-um your [//] she [//]...,3423199,3663054,thompson11a_3423199_3663054.wav,239855


In [9]:
aphasia_df = pd.read_csv(f'../../data/aphasia_concat_{current_date}.csv')
control_df = pd.read_csv(f'../../data/control_concat_{current_date}.csv')

unique_aphasia_filenames = set(aphasia_df['filename'])
unique_control_filenames = set(control_df['filename'])

def check_missing_files(directory, unique_filenames):
    all_files_in_directory = set(os.listdir(directory))

    missing_files = all_files_in_directory - unique_filenames
    
    return missing_files

missing_aphasia_files = check_missing_files('../../../Aphasia_transcript/', unique_aphasia_filenames)
missing_control_files = check_missing_files('../../../Control_transcript/', unique_control_filenames)

print(f"Missing files from Aphasia_transcript: {missing_aphasia_files}")
print(f"Missing files from Control_transcript: {missing_control_files}")

print(f"Number of unique filenames in aphasia_df: {len(unique_aphasia_filenames)}")
print(f"Number of unique filenames in control_df: {len(unique_control_filenames)}")

Missing files from Aphasia_transcript: {'Baycrest9336a.cha', '.ipynb_checkpoints', '.DS_Store'}
Missing files from Control_transcript: {'.ipynb_checkpoints', '.DS_Store'}
Number of unique filenames in aphasia_df: 550
Number of unique filenames in control_df: 347
