# Transcript Preparation

By: Jimuel Celeste, Jr. 

Objective: To prepare a clean dataset for the transcripts.

In [1]:
import os 

import json
import pandas as pd

## Check: completeness

In [2]:
# Metadata table for the combined dataset.
metadata = '/Users/jimuelcelestejr/Downloads/Dataset/Thesis - Combined/metadata.csv'
metadata_df = pd.read_csv(filepath_or_buffer=metadata)
# Preview the first rows.
metadata_df.head(n=5)

Unnamed: 0,filename,record_id,subject_id,age,sex,educ,mmse,dx,dx_binary,dataset
0,S002.wav,S002,S002,62.0,F,,30.0,NC,0,ADReSS
1,S003.wav,S003,S003,69.0,F,,29.0,NC,0,ADReSS
2,S004.wav,S004,S004,71.0,F,,30.0,NC,0,ADReSS
3,S005.wav,S005,S005,74.0,F,,30.0,NC,0,ADReSS
4,S006.wav,S006,S006,67.0,F,,29.0,NC,0,ADReSS


In [3]:
# Confirm that every file listed in the metadata has a JSON transcript in input_dir.
input_dir = '/Users/jimuelcelestejr/Downloads/Dataset/Thesis - Whisper Transcripts'
i = 0  # number of JSON transcripts found on disk
for file in metadata_df['filename'].values:
    # Same stem as the metadata entry, with a .json extension.
    json_file = os.path.join(input_dir, os.path.splitext(file)[0] + '.json')
    if not os.path.exists(json_file):
        print('File does not exist:', json_file)
        continue
    i += 1
print(i)

1023


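Complete dataset: no missing files were reported, and all 1023 metadata entries have a matching Whisper JSON transcript.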

## Metadata

In [4]:
# new_metadata
def format_ext(file, new_ext):
    """Return ``file`` with its extension replaced by ``new_ext``.

    Parameters
    ----------
    file : str
        File name or path whose final extension is swapped out.
    new_ext : str
        Replacement extension, including the leading dot (e.g. ``'.json'``).

    Returns
    -------
    str
        ``file`` without its final extension, followed by ``new_ext``.
    """
    stem = os.path.splitext(file)[0]
    return stem + new_ext
    
# Preview the renamed files; the result is only displayed, not stored.
metadata_df['filename'].apply(format_ext, new_ext='.json').head()

0    S002.json
1    S003.json
2    S004.json
3    S005.json
4    S006.json
Name: filename, dtype: object

In [5]:
# Independent (deep) copy so that later edits do not touch metadata_df.
new_metadata_df = metadata_df.copy()
new_metadata_df.head()

Unnamed: 0,filename,record_id,subject_id,age,sex,educ,mmse,dx,dx_binary,dataset
0,S002.wav,S002,S002,62.0,F,,30.0,NC,0,ADReSS
1,S003.wav,S003,S003,69.0,F,,29.0,NC,0,ADReSS
2,S004.wav,S004,S004,71.0,F,,30.0,NC,0,ADReSS
3,S005.wav,S005,S005,74.0,F,,30.0,NC,0,ADReSS
4,S006.wav,S006,S006,67.0,F,,29.0,NC,0,ADReSS


In [6]:
# Point each row at its JSON transcript instead of the original file name.
new_metadata_df['filename'] = new_metadata_df['filename'].apply(format_ext, new_ext='.json')
new_metadata_df.head()

Unnamed: 0,filename,record_id,subject_id,age,sex,educ,mmse,dx,dx_binary,dataset
0,S002.json,S002,S002,62.0,F,,30.0,NC,0,ADReSS
1,S003.json,S003,S003,69.0,F,,29.0,NC,0,ADReSS
2,S004.json,S004,S004,71.0,F,,30.0,NC,0,ADReSS
3,S005.json,S005,S005,74.0,F,,30.0,NC,0,ADReSS
4,S006.json,S006,S006,67.0,F,,29.0,NC,0,ADReSS


In [7]:
# Store the JSON-oriented metadata alongside the transcripts in input_dir.
new_metadata_csv = os.path.join(input_dir, 'metadata.csv')
new_metadata_df.to_csv(path_or_buf=new_metadata_csv, index=False)

## Generate text metadata

In [8]:
# Start from a fresh copy of the original metadata and switch the file names to .txt.
text_metadata_df = metadata_df.copy()
text_metadata_df['filename'] = text_metadata_df['filename'].apply(format_ext, new_ext='.txt')
text_metadata_df.head()

Unnamed: 0,filename,record_id,subject_id,age,sex,educ,mmse,dx,dx_binary,dataset
0,S002.txt,S002,S002,62.0,F,,30.0,NC,0,ADReSS
1,S003.txt,S003,S003,69.0,F,,29.0,NC,0,ADReSS
2,S004.txt,S004,S004,71.0,F,,30.0,NC,0,ADReSS
3,S005.txt,S005,S005,74.0,F,,30.0,NC,0,ADReSS
4,S006.txt,S006,S006,67.0,F,,29.0,NC,0,ADReSS


## Generate text files

In [9]:
def load_json(file):
    """Read a JSON file and return the parsed object.

    Parameters
    ----------
    file : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file's contents.
    """
    # Pin the encoding: the transcripts include non-ASCII text (Chinese and
    # Greek), and the default encoding of open() depends on the platform locale.
    with open(file, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load a single transcript to inspect the structure of the JSON files.
sample = '/Users/jimuelcelestejr/Downloads/Dataset/Thesis - Whisper Transcripts/taukdial-169-3.json'
sample_json = load_json(sample)
# Display the parsed object as the cell output.
sample_json

{'text': " trouble okay but this one tells a story and i'd like you to look at everything that's happening and tell me a story about what you see with a beginning middle and an end a little girl sees a cat in a tree um our father our element",
 'segments': [{'id': 0,
   'seek': 0,
   'start': 0.0,
   'end': 7.640000000000001,
   'text': " trouble okay but this one tells a story and i'd like you to look at everything that's happening",
   'tokens': [50365,
    5253,
    1392,
    457,
    341,
    472,
    5112,
    257,
    1657,
    293,
    741,
    1116,
    411,
    291,
    281,
    574,
    412,
    1203,
    300,
    311,
    2737,
    50747],
   'temperature': 0.0,
   'avg_logprob': -0.12715835391350513,
   'compression_ratio': 1.511111111111111,
   'no_speech_prob': 8.309671310421929e-13},
  {'id': 1,
   'seek': 0,
   'start': 7.640000000000001,
   'end': 13.3,
   'text': ' and tell me a story about what you see with a beginning middle and an end',
   'tokens': [50747,
    293

In [10]:
# Top-level keys of the sample transcript.
sample_json.keys()

dict_keys(['text', 'segments', 'language'])

In [11]:
# Full transcript text of the sample.
sample_json['text']

" trouble okay but this one tells a story and i'd like you to look at everything that's happening and tell me a story about what you see with a beginning middle and an end a little girl sees a cat in a tree um our father our element"

In [12]:
# Language code stored with the sample transcript.
sample_json['language']

'en'

In [13]:
# Per-segment entries (timing, text, tokens and decoding statistics).
sample_json['segments']

[{'id': 0,
  'seek': 0,
  'start': 0.0,
  'end': 7.640000000000001,
  'text': " trouble okay but this one tells a story and i'd like you to look at everything that's happening",
  'tokens': [50365,
   5253,
   1392,
   457,
   341,
   472,
   5112,
   257,
   1657,
   293,
   741,
   1116,
   411,
   291,
   281,
   574,
   412,
   1203,
   300,
   311,
   2737,
   50747],
  'temperature': 0.0,
  'avg_logprob': -0.12715835391350513,
  'compression_ratio': 1.511111111111111,
  'no_speech_prob': 8.309671310421929e-13},
 {'id': 1,
  'seek': 0,
  'start': 7.640000000000001,
  'end': 13.3,
  'text': ' and tell me a story about what you see with a beginning middle and an end',
  'tokens': [50747,
   293,
   980,
   385,
   257,
   1657,
   466,
   437,
   291,
   536,
   365,
   257,
   2863,
   2808,
   293,
   364,
   917,
   51030],
  'temperature': 0.0,
  'avg_logprob': -0.12715835391350513,
  'compression_ratio': 1.511111111111111,
  'no_speech_prob': 8.309671310421929e-13},
 {'id': 2,


In [14]:
# Both dictionaries are keyed by the .txt file name of each transcript.
text_transcripts = {}  # .txt file name -> transcript text
languages = {}         # .txt file name -> language code

for file in new_metadata_df['filename']:
    data = load_json(os.path.join(input_dir, file))
    text_filename = format_ext(file, '.txt')

    text_transcripts[text_filename] = data['text']
    languages[text_filename] = data['language']

In [15]:
# One row per transcript: the dict keys become 'filename', the values 'language'.
languages_df = (
    pd.DataFrame.from_dict(languages, orient='index')
    .reset_index()
    .rename(columns={'index': 'filename', 0: 'language'})
)
languages_df

Unnamed: 0,filename,language
0,S002.txt,en
1,S003.txt,en
2,S004.txt,en
3,S005.txt,en
4,S006.txt,en
...,...,...
1018,taukdial-168-2.txt,zh
1019,taukdial-168-3.txt,zh
1020,taukdial-169-1.txt,en
1021,taukdial-169-2.txt,en


In [16]:
# Attach the language stored in each transcript's JSON to its metadata row.
# Dropping any existing 'language' column first keeps this cell idempotent:
# without it, re-running the cell merges a second time and yields
# 'language_x' / 'language_y' columns instead of a single 'language' column.
text_metadata_df = (
    text_metadata_df
    .drop(columns='language', errors='ignore')
    .merge(languages_df, on='filename', how='left')
)
text_metadata_df.head()

Unnamed: 0,filename,record_id,subject_id,age,sex,educ,mmse,dx,dx_binary,dataset,language
0,S002.txt,S002,S002,62.0,F,,30.0,NC,0,ADReSS,en
1,S003.txt,S003,S003,69.0,F,,29.0,NC,0,ADReSS,en
2,S004.txt,S004,S004,71.0,F,,30.0,NC,0,ADReSS,en
3,S005.txt,S005,S005,74.0,F,,30.0,NC,0,ADReSS,en
4,S006.txt,S006,S006,67.0,F,,29.0,NC,0,ADReSS,en


In [17]:
# Number of transcripts per language code.
text_metadata_df['language'].value_counts()

language
en    708
zh    261
el     54
Name: count, dtype: int64

- 708 English 
- 261 Chinese 
- 54 Greek

In [18]:
output_dir = '/Users/jimuelcelestejr/Downloads/Dataset/Thesis - Text Transcripts'
# Create the output folder if needed: to_csv does not create missing
# directories and would fail on a fresh machine.
os.makedirs(output_dir, exist_ok=True)
text_metadata_csv = os.path.join(output_dir, 'metadata.csv')
text_metadata_df.to_csv(text_metadata_csv, index=False)

## Save text files

In [19]:
def save_text(text, file_path):
    """Write ``text`` to ``file_path``, replacing any existing file.

    Parameters
    ----------
    text : str
        Content to write.
    file_path : str or path-like
        Destination file.

    Returns
    -------
    None
    """
    # Pin the encoding: the transcripts include non-ASCII text (Chinese and
    # Greek), and the default encoding of open() depends on the platform locale.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(text)
    return None
    
# Write every collected transcript into output_dir under its .txt name.
for file, text in text_transcripts.items():
    file_path = os.path.join(output_dir, file)
    save_text(text, file_path)

## Check for completeness

In [20]:
# Confirm that every file listed in the text metadata was written to disk.
# NOTE: this rebinds input_dir to the text-transcript folder, so earlier cells
# that read JSON from input_dir must not be re-run after this one without
# re-running the cell that first defines input_dir.
input_dir = '/Users/jimuelcelestejr/Downloads/Dataset/Thesis - Text Transcripts'
i = 0  # number of text transcripts found on disk
for file in text_metadata_df['filename'].values:
    text_file = os.path.join(input_dir, file)

    if os.path.exists(text_file):
        i += 1
    else:
        # Previously referenced the undefined name `textfile`, which raised
        # NameError on the first missing file instead of reporting it.
        print('File does not exist:', text_file)
print(i)

1023


Complete list: all 1023 text transcripts listed in the metadata exist in the output directory.