In [31]:
import os
import datetime
import time
import yaml
from pathlib import Path
from huggingface_hub import login

def format_time(seconds):
    """Render a duration given in seconds as "<M> minutes and <S> seconds".

    The duration is split into whole minutes and the remaining seconds with
    ``divmod``; both parts are truncated to integers before formatting.
    """
    whole_minutes, leftover = divmod(seconds, 60)
    return "{} minutes and {} seconds".format(int(whole_minutes), int(leftover))

# User-editable settings for this run:
#   INPUT_FILENAME  - media file name, joined onto config['storage']['content'] below
#   OUTPUT_FILENAME - VTT file name, joined onto config['storage']['output'] below
#   MODEL_SIZE      - value passed to the transcription script via --model
INPUT_FILENAME = "testv2.mp4"
OUTPUT_FILENAME = "output.vtt"
MODEL_SIZE = "small"

# Path of the YAML configuration file. This cell reads its 'storage' section
# (temp, content, output, hf_cache) and its 'tokens' section (hugging_face).
CONFIG_FILE_PATH = "/home/idies/workspace/Storage/tsande16/persistent/archive/archive_config.yaml"

# Start the wall-clock timer used for the total-runtime report later in this cell.
start_time = time.time()
print(f"Starting notebook transcription and diarization at {datetime.datetime.now().strftime('%H:%M:%S')}")

# Load the YAML configuration. safe_load accepts the open stream directly, so
# there is no need to read the whole file into a string first; the encoding is
# set explicitly so the result does not depend on the locale default.
with open(CONFIG_FILE_PATH, 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Resolve the working paths from the 'storage' section of the configuration.
storage = config['storage']
script_path = storage['temp'] + "/transcribe_diarize.py"
input_file = storage['content'] + "/" + INPUT_FILENAME
output_file = storage['output'] + "/" + OUTPUT_FILENAME

# Make sure the Hugging Face cache directory exists before anything uses it.
cache_dir = Path(storage['hf_cache'])
cache_dir.mkdir(parents=True, exist_ok=True)

# Export the cache location and the token (stored in the configuration file)
# through the environment so the shell commands launched from this notebook
# inherit them. The token is looked up once and reused.
os.environ['HF_HOME'] = str(cache_dir)
print(f"Hugging Face cache directory set to: {cache_dir}")
hf_token = config['tokens']['hugging_face']
os.environ['HF_TOKEN'] = hf_token

# Test that the login worked.
!hf auth whoami

# Get the most recent transcribe_diarize script
!wget https://raw.githubusercontent.com/jhu-sheridan-libraries/whisper_testing/main/transcribe_diarize.py -O {script_path}

if os.path.exists(config['storage']['temp']+"/transcribe_diarize.py"):
  print("Success: The transcribe_diarize.py file downloaded.")
else:
  print("Error: The transcribe_diarize.py file didn't download.")

!python {script_path} {input_file} \
    --whisper_cache_dir {cache_dir} \
    --output {output_file} \
    --model {MODEL_SIZE}

total_time = time.time() - start_time
print(f"\nTotal notebook runtime: {format_time(total_time)}")

# Verify that the VTT file was written. The check uses output_file (derived
# from OUTPUT_FILENAME) rather than a hard-coded "output.vtt", so it stays
# correct when the output name is changed.
# NOTE(review): a file left over from an earlier run also passes this check.
if os.path.exists(output_file):
    print("VTT file was successfully generated and stored here: ", output_file)
else:
    print("Error: The VTT file was not generated.")

Starting notebook transcription and diarization at 16:06:49
Hugging Face cache directory set to: /home/idies/workspace/Storage/tsande16/persistent/archive/hf_cache
tim-sanders
[1morgs: [0m JohnsHopkins
--2025-09-08 16:06:50--  https://raw.githubusercontent.com/jhu-sheridan-libraries/whisper_testing/main/transcribe_diarize.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 35047 (34K) [text/plain]
Saving to: ‘/home/idies/workspace/Temporary/tsande16/scratch/transcribe_diarize.py’


2025-09-08 16:06:50 (37.0 MB/s) - ‘/home/idies/workspace/Temporary/tsande16/scratch/transcribe_diarize.py’ saved [35047/35047]

Success: The transcribe_diarize.py file downloaded.
--- Verifying environment variables for the shell command ---
HF_HOME is set to: /home/idies/workspa

In [30]:
!pip show torchaudio
!pip show pyannote.audio
!pip show speechbrain

[33mDEPRECATION: Loading egg at /home/idies/miniforge/envs/py312/lib/python3.12/site-packages/SciServer-2.2.0-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0mName: torchaudio
Version: 2.8.0
Summary: An audio package for PyTorch
Home-page: https://github.com/pytorch/audio
Author: Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough, Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang
Author-email: soumith@pytorch.org
License: 
Location: /home/idies/miniforge/envs/py312/lib/python3.12/site-packages
Requires: torch
Required-by: pyannote.audio, speechbrain, torch-audiomentations, torch_pitch_shift
[33mDEPRECATION: Loading egg at /home/idies/miniforge/envs/py312/lib/python3.12/site-packages/SciServer-2.2.0-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is