In [None]:
# --- Environment setup (shell commands run through the notebook's `!` magic) ---
# System packages: ffmpeg plus the "extra" codec/format packages.
!apt-get update -y && apt-get install -y ffmpeg libavcodec-extra libavformat-extra
# Python packages installed for the transcription/diarization script.
# NOTE(review): both faster-whisper and openai-whisper are installed; which one
# transcribe_diarize.py actually imports is not visible from this notebook.
!pip install torch torchaudio
!pip install faster-whisper
!pip install pyannote-audio
!pip install openai-whisper
!pip install huggingface_hub
!pip install webvtt-py
!pip install pydub
!pip install python-dotenv
# Fetch the transcription + diarization script and save it as
# /content/transcribe_diarize.py (the path checked and executed further down).
!wget https://raw.githubusercontent.com/jhu-sheridan-libraries/whisper_testing/main/transcribe_diarize.py -O /content/transcribe_diarize.py

# Sanity check: list the container formats this ffmpeg build knows about and
# keep only the lines that mention mp4.
!ffmpeg -formats | grep mp4
# In ffmpeg's listing "D" marks demuxing (decode) support and "E" marks muxing
# (encode) support. You should see mp4 with both flags; on typical builds they
# appear on separate lines (e.g. "D  mov,mp4,m4a,..." and "E mp4") rather than
# as a single "DE mp4" entry -- verify against your build's output.

import os
import datetime
import time
from huggingface_hub import login
from google.colab import userdata, files


def format_time(seconds):
    """Render a duration given in seconds as "<M> minutes and <S> seconds".

    Both parts are truncated to whole numbers; minutes are not rolled over
    into hours, so 3600 seconds is reported as 60 minutes.
    """
    whole_minutes = int(seconds // 60)
    leftover_seconds = int(seconds % 60)
    return f"{whole_minutes} minutes and {leftover_seconds} seconds"

# Start the wall-clock timer used for the runtime report at the end of the
# notebook, and announce the local time at which the run began.
start_time = time.time()
clock_now = datetime.datetime.now().strftime('%H:%M:%S')
print("Starting notebook transcription and diarization at " + clock_now)

# Authenticate with Hugging Face using a personal access token.
# You can generate one at https://huggingface.co/settings/tokens

# The token is not pasted into this notebook: it is read from a Colab secret
# named HF_TOKEN. Add that secret in the panel on the left side of Colab.
login(token=userdata.get('HF_TOKEN'))

# Test that the login worked (prints the account the token belongs to).
!huggingface-cli whoami

# Choose the audio/video file to transcribe and store its bare name in FILENAME.
#
# Preference order:
#   1. an existing media file in /content/data,
#   2. a file uploaded interactively through the Colab upload widget,
#   3. the default name "testv2.mp4" if nothing was uploaded.
os.makedirs("/content/data", exist_ok=True)

# Candidate inputs already present in /content/data.
#  - sorted(): os.listdir() returns entries in arbitrary order, so sorting makes
#    "the first file" deterministic across runs.
#  - .vtt files are skipped: this notebook writes its transcript to
#    /content/data/output.vtt, and on a re-run that transcript must not be
#    mistaken for the media file to transcribe.
data_files = sorted(
    name
    for name in os.listdir("/content/data")
    if os.path.isfile(os.path.join("/content/data", name))
    and not name.lower().endswith(".vtt")
)

if data_files:
    # Use the first (alphabetically) file found in /content/data
    FILENAME = data_files[0]
    print(f"Using existing file from /content/data: {FILENAME}")
else:
    print("No files found in /content/data. Please upload an audio/video file:")
    uploaded = files.upload()
    if uploaded:
        # Only the first uploaded file is used, even if several were selected.
        FILENAME = next(iter(uploaded))
        print(f"Using uploaded file: {FILENAME}")
    else:
        print("No file was uploaded. Using default filename.")
        FILENAME = "testv2.mp4"

print(f"Selected audio/video file: {FILENAME}")

# Confirm that the helper script fetched during setup is actually on disk
# before trying to run it.
script_found = os.path.exists("/content/transcribe_diarize.py")
print(
    "Success: The transcribe_diarize.py file downloaded."
    if script_found
    else "Error: The transcribe_diarize.py file didn't download."
)

!python transcribe_diarize.py /content/{FILENAME} \
  --output /content/data/output.vtt \
  --model small

# Report the elapsed wall-clock time since start_time was recorded.
total_time = time.time() - start_time
print("\nTotal notebook runtime: " + format_time(total_time))

# Download results: hand the transcript to the browser if it exists,
# otherwise report that no transcript file is present.
vtt_path = "/content/data/output.vtt"
if not os.path.exists(vtt_path):
    print("Error: The VTT file was not generated.")
else:
    files.download(vtt_path)
