In [3]:
!pip install transformers accelerate faster-whisper python-dotenv numpy>=2.0.2 matplotlib huggingface-hub moviepy==1.0.3 assemblyai

In [None]:
import os
import tempfile
from google.colab import files

import torch
from faster_whisper import WhisperModel
from transformers import pipeline
from moviepy.editor import VideoFileClip
import assemblyai as aai

# --- Configuration ---------------------------------------------------------
# The AssemblyAI key is read from the environment so the credential never lives
# in the notebook source. Set it before running this cell, e.g.
#     os.environ["ASSEMBLYAI_API_KEY"] = "<your key>"
# An empty value leaves diarization disabled: get_speaker_labels() checks the
# configured key for truthiness, which a non-empty placeholder string would
# always pass.
ASSEMBLYAI_API_KEY = os.environ.get("ASSEMBLYAI_API_KEY", "")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_SIZE = "tiny"

# Label sets handed to the zero-shot classifier in get_metadata().
CANDIDATE_EMOTIONS = ["joy", "anger", "sadness", "excitement", "calmness", "interest", "confusion"]
CANDIDATE_TONES = ["enthusiastic", "confident", "inquisitive", "hesitant", "professional", "sarcastic", "neutral"]

# --- Model loading ---------------------------------------------------------
print(f"Using device: {DEVICE}")
print(f"Loading faster-whisper ({MODEL_SIZE})...")
# float16 is requested on GPU and int8 on CPU.
whisper_model = WhisperModel(MODEL_SIZE, device=DEVICE, compute_type="float16" if DEVICE == "cuda" else "int8")

print("Loading Zero-Shot BART for metadata...")
# The pipeline takes a device index: 0 when CUDA is available, -1 otherwise.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=0 if DEVICE == "cuda" else -1)

aai.settings.api_key = ASSEMBLYAI_API_KEY

def format_timestamp(seconds: float) -> str:
    """Render an offset in seconds as a bracketed ``[HH:MM:SS]`` stamp.

    Fractional seconds are truncated. The hour field is zero-padded to two
    digits but is not wrapped, so offsets of 100 hours or more produce a
    wider hour field.
    """
    whole_hours, within_hour = divmod(seconds, 3600)
    whole_minutes = within_hour // 60
    leftover_seconds = seconds % 60
    return "[{:02d}:{:02d}:{:02d}]".format(
        int(whole_hours), int(whole_minutes), int(leftover_seconds)
    )

def detect_reaction(text: str) -> str:
    """Classify an utterance by simple keyword spotting.

    The text is lower-cased and searched for fixed cue phrases (substring
    match). Positive cues are checked before concern cues, so an utterance
    containing both is reported as positive.

    Returns:
        "Positive Acknowledgment", "Concern/Disagreement", or the string
        "None" when no cue phrase is present.
    """
    normalized = text.lower()
    cue_table = (
        ("Positive Acknowledgment", ("great job", "fantastic", "excellent", "i agree")),
        ("Concern/Disagreement", ("i don't think so", "not sure", "problematic")),
    )
    for reaction, cues in cue_table:
        for cue in cues:
            if cue in normalized:
                return reaction
    return "None"

def get_metadata(text: str) -> dict:
    """Build emotion / tone / reaction metadata for one utterance.

    Emotion and tone are the top-ranked labels from the zero-shot classifier
    over CANDIDATE_EMOTIONS and CANDIDATE_TONES respectively (capitalized);
    the reaction comes from detect_reaction().

    Args:
        text: The utterance text.

    Returns:
        A dict with the keys "Emotion", "Tone" and "Reaction". For empty or
        whitespace-only text the classifier is skipped and the placeholder
        values "Unknown" / "Unknown" / "None" are returned.
    """
    # The zero-shot pipeline rejects empty sequences with an error, which
    # would abort the whole transcript because of a single blank segment.
    if not text or not text.strip():
        return {"Emotion": "Unknown", "Tone": "Unknown", "Reaction": "None"}

    emotion_res = classifier(text, candidate_labels=CANDIDATE_EMOTIONS, multi_label=False)
    tone_res = classifier(text, candidate_labels=CANDIDATE_TONES, multi_label=False)
    return {
        # The first entry of "labels" is taken as the best match.
        "Emotion": emotion_res["labels"][0].capitalize(),
        "Tone": tone_res["labels"][0].capitalize(),
        "Reaction": detect_reaction(text),
    }

def extract_audio_from_video(video_path: str) -> str:
    """Write the audio track of a video to a temporary ``.mp3`` file.

    Args:
        video_path: Path of the video file to read.

    Returns:
        Path of the newly created audio file. The caller is responsible for
        deleting it.

    Raises:
        ValueError: If the video has no audio track.
        Exception: Any error raised while opening the video or writing the
            audio is propagated after the temporary file has been removed.
    """
    print(f"\nExtracting audio from: {video_path}")

    # Only the unique path is needed here; the handle is closed immediately
    # and delete=False keeps the file so the audio writer can target it by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        audio_path = tmp.name

    try:
        video = VideoFileClip(video_path)
        try:
            if video.audio is None:
                raise ValueError(f"No audio track found in: {video_path}")
            video.audio.write_audiofile(audio_path, verbose=False, logger=None)
        finally:
            # Close the clip even when extraction fails.
            video.close()
    except BaseException:
        # Do not leave an orphaned temp file behind on failure or interrupt.
        if os.path.exists(audio_path):
            os.unlink(audio_path)
        raise

    print(f"Audio saved to: {audio_path}")
    return audio_path

def transcribe_audio(audio_path: str):
    """Transcribe ``audio_path`` with the module-level Whisper model.

    Returns:
        A list of the segments produced by ``whisper_model.transcribe``
        (beam size 5). The second value returned by the model is discarded.
    """
    print("\nStarting transcription...")
    segment_stream, _info = whisper_model.transcribe(audio_path, beam_size=5)
    # Materialize into a list so callers can iterate over it more than once.
    return [segment for segment in segment_stream]

def get_speaker_labels(audio_path: str):
    """Request speaker-labelled utterances for ``audio_path`` from AssemblyAI.

    This is best-effort: every failure mode is reported with a printed
    message and signalled by returning None rather than raising.

    Returns:
        A list of dicts with the keys "start", "end" and "speaker", where
        start/end are the utterance times divided by 1000 (presumably
        milliseconds converted to seconds — confirm against the AssemblyAI
        docs). None is returned when no API key is configured, the
        transcript reports an error status, no utterances come back, or an
        exception is raised during the request.
    """
    if not aai.settings.api_key:
        print("No ASSEMBLYAI_API_KEY set; skipping diarization.")
        return None

    print("\nRunning AssemblyAI diarization...")
    client = aai.Transcriber()
    try:
        diarization_config = aai.TranscriptionConfig(speaker_labels=True)
        result = client.transcribe(audio_path, config=diarization_config)

        if result.status == aai.TranscriptStatus.error:
            print("AssemblyAI error:", result.error)
            return None

        utterances = result.utterances
        if not utterances:
            print("No utterances from AssemblyAI.")
            return None

        turns = [
            {
                "start": utterance.start / 1000.0,
                "end": utterance.end / 1000.0,
                "speaker": utterance.speaker,
            }
            for utterance in utterances
        ]
        print("AssemblyAI speakers:", {turn["speaker"] for turn in turns})
        return turns
    except Exception as e:
        # Diarization is optional; report the problem and let the caller fall back.
        print("AssemblyAI exception:", e)
        return None

def align_speakers(whisper_segments, speaker_labels):
    """Assign each transcript segment the speaker whose turn overlaps it most.

    Args:
        whisper_segments: Iterable of objects exposing ``start`` and ``end``.
        speaker_labels: Sequence of dicts with "start", "end" and "speaker",
            or a falsy value when no diarization is available.

    Returns:
        A dict mapping a segment's ``start`` to the chosen speaker. Segments
        with no positive overlap with any turn are left out. When several
        turns tie for the longest overlap, the earliest one in
        ``speaker_labels`` wins.
    """
    if not speaker_labels:
        return {}

    speaker_by_start = {}
    for segment in whisper_segments:
        winner = None
        longest = 0.0
        for turn in speaker_labels:
            # Length of the intersection of the two intervals; zero or
            # negative when they do not overlap.
            shared = min(segment.end, turn["end"]) - max(segment.start, turn["start"])
            if shared > longest:
                longest, winner = shared, turn["speaker"]
        if winner is not None:
            speaker_by_start[segment.start] = winner
    return speaker_by_start

def annotate_segment(segment, speaker_label: str) -> str:
    """Format one transcript segment as an annotated line.

    Args:
        segment: Object exposing ``text`` and ``start``.
        speaker_label: Display name to put in front of the utterance.

    Returns:
        A line of the form
        ``[HH:MM:SS] <speaker>: "<text>" [Emotion: X, Tone: Y]``.
        When the metadata carries a reaction other than "None", a trailing
        ``, Reaction: Z`` entry is added inside the brackets; otherwise the
        line has exactly the two-entry form shown above.
    """
    text = segment.text.strip()
    meta = get_metadata(text)
    ts = format_timestamp(segment.start)

    tags = [f"Emotion: {meta['Emotion']}", f"Tone: {meta['Tone']}"]
    # The reaction is computed alongside emotion and tone; surface it only
    # when something was detected so ordinary lines stay unchanged.
    reaction = meta.get("Reaction", "None")
    if reaction != "None":
        tags.append(f"Reaction: {reaction}")

    return f"{ts} {speaker_label}: \"{text}\" [{', '.join(tags)}]"

def process_video_call(video_path: str):
    """Run the full pipeline on one video and return annotated transcript lines.

    Steps: extract the audio to a temporary file, transcribe it, request
    speaker turns, align the turns to the transcript segments, and format
    each segment with annotate_segment(). The temporary audio file is removed
    before returning, whether or not the pipeline succeeded.

    Returns:
        A list of annotated lines, one per transcript segment, in order.
    """
    audio_path = extract_audio_from_video(video_path)
    try:
        segments = transcribe_audio(audio_path)
        speaker_labels = get_speaker_labels(audio_path)

        alignment = {}
        if speaker_labels:
            alignment = align_speakers(segments, speaker_labels)

        # Raw diarization labels are renumbered 1, 2, ... in order of first appearance.
        display_index = {}
        # NOTE(review): segments without an aligned speaker are labelled by
        # alternating between "Speaker 1" and "Speaker 2"; this is a guess,
        # not a diarization result.
        guessed_speaker = 1
        annotated = []

        for segment in segments:
            raw_speaker = alignment.get(segment.start)
            if raw_speaker is not None:
                number = display_index.setdefault(raw_speaker, len(display_index) + 1)
                label = f"Speaker {number}"
            else:
                label = f"Speaker {guessed_speaker}"
                guessed_speaker = 3 - guessed_speaker  # flips between 1 and 2

            annotated.append(annotate_segment(segment, label))

        return annotated
    finally:
        if os.path.exists(audio_path):
            os.unlink(audio_path)

# --- Run the pipeline on an uploaded video ---------------------------------
print("Upload a video file (e.g. sample2.mp4)...")
uploaded = files.upload()
# Without this check an empty upload surfaces as a bare StopIteration from
# next(); fail with an actionable message instead.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose a video file.")
# Only the first uploaded file is processed.
video_path = next(iter(uploaded.keys()))

lines = process_video_call(video_path)

# Persist the full transcript, then show the first few lines as a preview.
os.makedirs("output", exist_ok=True)
out_path = "output/final_annotated_transcript.txt"
with open(out_path, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print("\n--- SAMPLE OUTPUT ---")
print("\n".join(lines[:5]))
print(f"\nFull transcript saved to: {out_path}")

Using device: cuda
Loading faster-whisper (tiny)...
Loading Zero-Shot BART for metadata...


Device set to use cuda:0


Upload a video file (e.g. sample2.mp4)...


Saving rag.mp4 to rag (1).mp4

Extracting audio from: rag (1).mp4
Audio saved to: /tmp/tmpd681vwqg.mp3

Starting transcription...

Running AssemblyAI diarization...
AssemblyAI speakers: {'A', 'B'}

--- SAMPLE OUTPUT ---
[00:00:00] Speaker 1: "So, first of all, can you tell us in simple terms what RAG is and why is it important" [Emotion: Interest, Tone: Inquisitive]
[00:00:06] Speaker 1: "in the development of AI systems?" [Emotion: Interest, Tone: Inquisitive]
[00:00:09] Speaker 2: "Right, let me start by full form of the RAG." [Emotion: Interest, Tone: Inquisitive]
[00:00:14] Speaker 2: "It's retrieval augmented generation, so now I'll break down each part of the RAG." [Emotion: Interest, Tone: Inquisitive]
[00:00:19] Speaker 2: "Retrieval augmented generation and explain you in very simple words of what it is," [Emotion: Interest, Tone: Confident]

Full transcript saved to: output/final_annotated_transcript.txt
