# Transcription

By: Jimuel Celeste, Jr.

Objective: To generate transcripts for audio files from DementiaBank.

In [1]:
import whisper
import json
import os

In [2]:
def transcribe(audio_file, model):
    """Transcribe a single audio file with the given model.

    Args:
        audio_file: Path to the audio file; forwarded unchanged to
            ``model.transcribe``.
        model: Any object exposing a ``transcribe(audio_file)`` method
            (in this notebook, the object returned by ``whisper.load_model``).

    Returns:
        Whatever ``model.transcribe(audio_file)`` returns, unmodified.
    """
    result = model.transcribe(audio_file)
    return result

# Load the Whisper "turbo" checkpoint once; the same `model` object is reused
# by the batch transcription cell further down.
model = whisper.load_model("turbo")
# NOTE(review): absolute, machine-specific path — adjust before running on
# another machine.
sample = "C:\\Users\\Jimuel Celeste\\Documents\\thesis\\data\\Thesis - Combined\\adrsdt1.wav"
# Smoke test on one recording. As the last expression in the cell, the full
# result returned by transcribe() is displayed as the cell output.
transcribe(sample, model)



{'text': " I want you to tell me everything you see happening in that picture. Everything? Everything you see happening. Happening. Everything that's going on. Mm-hmm. What's happening in that picture? Well, this is the boy trying to get the cookie jar done for his sister, I would imagine. And he's going to fall off the ladder, off the stool of where we sat. And the mother's over here. She just dropped some water or whatever on the floor. And she looks pretty mad. And the water's still running. And it looks like the girl is laughing at the boy because the stool is going to fall over. And there's water on the floor there. And some cups and so forth. Okay. Yeah. Thank you. Thank you. Thank you.",
 'segments': [{'id': 0,
   'seek': 0,
   'start': 0.0,
   'end': 3.72,
   'text': ' I want you to tell me everything you see happening in that picture.',
   'tokens': [50365,
    286,
    528,
    291,
    281,
    980,
    385,
    1203,
    291,
    536,
    2737,
    294,
    300,
    3036,
 

In [3]:
def save_result(result, output_file):
    """Serialize ``result`` as indented JSON to ``output_file`` atomically.

    The JSON is first written to ``<output_file>.tmp`` and only moved onto
    ``output_file`` once serialization has fully succeeded. This matters
    because the batch loop treats any existing output file as "already
    processed": a half-written file left behind by a crash or an interrupted
    kernel would otherwise be skipped forever.

    Args:
        result: JSON-serializable object (e.g. the dict returned by
            ``transcribe``).
        output_file: Destination path of the JSON file. Its parent directory
            must already exist.

    Returns:
        None

    Raises:
        TypeError: If ``result`` contains values that are not JSON-serializable.
        OSError: If the file cannot be created or replaced.
    """
    tmp_file = f"{output_file}.tmp"
    try:
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=4)
        # Atomic swap: readers only ever see a complete file or no file at all.
        os.replace(tmp_file, output_file)
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt): never leave a
        # partial temp file behind, then let the original error propagate.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
        raise
    return None

In [4]:
def batch_transcribe(input_dir, output_dir, file_format, model):
    """Transcribe every matching audio file under ``input_dir``.

    Walks ``input_dir`` recursively. For each file whose extension is in
    ``file_format``, calls ``transcribe`` and stores the result with
    ``save_result`` as ``<output_dir>/<basename>.json``. Files whose output
    already exists are skipped, so an interrupted run can be resumed by simply
    calling the function again.

    Args:
        input_dir: Root directory searched (recursively) for audio files.
        output_dir: Directory receiving the JSON transcripts; created if it
            does not exist.
        file_format: Accepted extension(s), either a single string (``"wav"``)
            or an iterable of strings (``["wav", "mp3"]``). Matching is
            case-insensitive and a leading dot is ignored.
        model: Model object forwarded to ``transcribe``.

    Returns:
        None

    Note:
        Outputs are keyed by base name only, so ``a.wav`` and ``a.mp3`` (or
        same-named files in different subdirectories) map to the same JSON
        file; the first one processed wins and the others are skipped.
    """
    # A bare string would otherwise be matched by substring (e.g. '' in 'wav'
    # is True, so extensionless files would be accepted).
    if isinstance(file_format, str):
        file_format = [file_format]
    allowed_formats = {fmt.lstrip('.').lower() for fmt in file_format}

    os.makedirs(output_dir, exist_ok=True)

    for root, _directories, files in os.walk(input_dir):
        n = len(files)
        # The counter is per directory, matching the "i/n" progress display.
        for i, file in enumerate(files, start=1):
            print(f"{i}/{n}: {file}")
            filename, ext = os.path.splitext(file)
            if ext.lstrip('.').lower() not in allowed_formats:
                print("Skipping file: file does not have the set format")
                continue

            # Normalize Windows separators so paths print and pass uniformly.
            input_file = os.path.join(root, file).replace('\\', '/')
            print(f"input file: {input_file}")
            output_file = os.path.join(output_dir, filename + ".json").replace('\\', '/')
            print(f"output file: {output_file}")

            if os.path.isfile(output_file):
                print("Skipping transcription: file is previously processed!")
                continue

            result = transcribe(input_file, model)
            save_result(result, output_file)
            print("Transcription saved")

# Batch configuration.
# NOTE(review): absolute, machine-specific paths — adjust before running on
# another machine.
input_dir = "C:\\Users\\Jimuel Celeste\\Documents\\thesis\\data\\Thesis - Combined"
output_dir = "C:\\Users\\Jimuel Celeste\\Documents\\thesis\\results\\Thesis - Whisper Transcripts"
# Extensions (without the dot) that batch_transcribe will process.
file_format = ['wav', 'mp3']
# Reuses the `model` loaded earlier; files whose JSON output already exists
# in output_dir are skipped, so this cell can be re-run to resume.
batch_transcribe(input_dir, output_dir, file_format, model)

1/1025: .DS_Store
Skipping file: file does not have the set format
2/1025: adrsdt1.wav
input file: C:/Users/Jimuel Celeste/Documents/thesis/data/Thesis - Combined/adrsdt1.wav
output file: C:/Users/Jimuel Celeste/Documents/thesis/results/Thesis - Whisper Transcripts/adrsdt1.json
Skipping transcription: file is previously processed!
3/1025: adrsdt10.wav
input file: C:/Users/Jimuel Celeste/Documents/thesis/data/Thesis - Combined/adrsdt10.wav
output file: C:/Users/Jimuel Celeste/Documents/thesis/results/Thesis - Whisper Transcripts/adrsdt10.json
Skipping transcription: file is previously processed!
4/1025: adrsdt11.wav
input file: C:/Users/Jimuel Celeste/Documents/thesis/data/Thesis - Combined/adrsdt11.wav
output file: C:/Users/Jimuel Celeste/Documents/thesis/results/Thesis - Whisper Transcripts/adrsdt11.json
Skipping transcription: file is previously processed!
5/1025: adrsdt12.wav
input file: C:/Users/Jimuel Celeste/Documents/thesis/data/Thesis - Combined/adrsdt12.wav
output file: C:/Use