In [9]:
# ===========================
# Colab setup
# ===========================

!pip install -q openai-whisper pydub requests iso8601 reportlab
!apt-get update -qq && apt-get install -y -qq ffmpeg

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [10]:
# ===========================
# Imports
# ===========================
import whisper
import json
import datetime
from datetime import timedelta, timezone
from pathlib import Path
import torch
from pydub import AudioSegment
import numpy as np
import requests
import re
import iso8601
import csv
import subprocess
from IPython.display import clear_output
from tqdm.notebook import tqdm  # For Jupyter/Colab

# ReportLab
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch

In [11]:
# ===========================
# Helpers — cURL, paths, time
# ===========================
def extract_ids_from_curl(curl_string: str):
    """
    Pull course_id, section_id and class_id out of a pasted cURL command.

    The ids are read from the Referer header, which may be wrapped in either
    single or double quotes. If no class id is found there, any
    ``/classes/<digits>`` occurrence anywhere in the command is used instead.

    Args:
        curl_string: Raw cURL command text. ``None`` or "" is tolerated.

    Returns:
        dict with keys "course_id", "section_id", "class_id" and "referer";
        each value is a string, or None when it could not be determined.
    """
    out = {"course_id": None, "section_id": None, "class_id": None, "referer": None}
    if not curl_string:
        return out

    # (['\"]) ... \1 makes the closing quote match the opening one, so both
    # -H 'referer: ...' and -H "referer: ..." are recognised.
    ref_match = re.search(r"-H\s+(['\"])referer:\s*(.*?)\1", curl_string, re.IGNORECASE)
    if ref_match:
        ref = ref_match.group(2)
        out["referer"] = ref
        m = re.search(r"/courses/(\d+)/sections/(\d+)/classes/(\d+)", ref)
        if m:
            out["course_id"], out["section_id"], out["class_id"] = m.group(1), m.group(2), m.group(3)

    # Fallback: the request URL itself usually contains /classes/<id>.
    if not out["class_id"]:
        m2 = re.search(r"/classes/(\d+)", curl_string)
        if m2:
            out["class_id"] = m2.group(1)
    return out

def derive_class_link_from_curl(curl_string: str, course_id: str = None, section_id: str = None, class_id: str = None):
    """
    Work out a human-facing link to the class.

    Resolution order:
      1. the Referer header of the cURL command (single- or double-quoted);
      2. a full course/section/class URL when all three ids are given;
      3. a class-only URL when just class_id is given;
      4. "" when nothing is available.

    Args:
        curl_string: Raw cURL command text. ``None`` or "" is tolerated.
        course_id, section_id, class_id: Optional ids used to build a link
            when the cURL command carries no Referer.

    Returns:
        The link as a string (possibly empty).
    """
    # The backreference \1 ties the closing quote to the opening one so that a
    # double-quoted header is matched as well as a single-quoted one.
    ref_match = re.search(r"-H\s+(['\"])referer:\s*(.*?)\1", curl_string or "", re.IGNORECASE)
    if ref_match:
        return ref_match.group(2)
    if course_id and section_id and class_id:
        return f"https://forum.minerva.edu/app/courses/{course_id}/sections/{section_id}/classes/{class_id}"
    if class_id:
        return f"https://forum.minerva.edu/app/classes/{class_id}"
    return ""

def _safe_date(date_str):
    # return YYYY-MM-DD from ISO8601, or ""
    if not date_str:
        return ""
    try:
        return date_str.split('T')[0]
    except Exception:
        return ""

def _fmt_mmss(seconds_float):
    if seconds_float is None:
        return ""
    seconds = max(0, int(seconds_float))
    m, s = divmod(seconds, 60)
    return f"{m:02d}:{s:02d}"

def normalize_sentence_spacing(text: str) -> str:
    """
    Fix missing spaces after sentence punctuation and collapse over-spacing.

    Examples:
        "taught CS51.So you've"  -> "taught CS51. So you've"
        "grew 3.5 percent"       -> unchanged (decimal numbers are not split)
        'She said "Stop!" twice' -> unchanged (no space before a closing quote)

    Args:
        text: Text to clean. Falsy values (None, "") are returned as-is.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    if not text:
        return text
    # Insert a space after . ! ? when the next character starts a new word:
    #   (?!(?<=\d\.)\d)      skip a period sitting between two digits ("3.5")
    #   [A-Za-z0-9]          a letter or digit follows directly
    #   ["\'](?=\w)          an *opening* quote follows (quote + word character);
    #                        a closing quote (quote + space/end) is left alone
    text = re.sub(r'([.!?])(?!(?<=\d\.)\d)(?=[A-Za-z0-9]|["\'](?=\w))', r'\1 ', text)
    # Collapse runs of whitespace into a single space
    text = re.sub(r'\s{2,}', ' ', text)
    # Remove whitespace that precedes punctuation
    text = re.sub(r'\s+([,.!?;:])', r'\1', text)
    return text.strip()

_ZWSP = "\u200b"
def soft_break_long_token(s: str, every: int = 14) -> str:
    """
    Give long words optional break points by inserting zero-width spaces.

    The string is split into alternating runs of whitespace and non-whitespace.
    Whitespace runs are kept verbatim; each non-whitespace run gets a
    zero-width space after every `every` characters.

    Falsy input (None, "") is returned unchanged.
    """
    if not s:
        return s

    def _with_breaks(run):
        # Slice the run into fixed-size pieces and glue them with the break char.
        pieces = (run[pos:pos + every] for pos in range(0, len(run), every))
        return _ZWSP.join(pieces)

    return "".join(
        piece if (not piece or piece.isspace()) else _with_breaks(piece)
        for piece in re.split(r"(\s+)", s)
    )

def clean_curl(curl_string):
    """
    Turn a pasted cURL command into a headers dict usable with `requests`.

    - Every ``-H '<name>: <value>'`` (single- or double-quoted) becomes an entry.
      The closing quote must be the same character as the opening one, so a
      value that contains the other quote character (for example
      ``-H 'sec-ch-ua: "Chromium";v="120"'``) is kept intact.
    - The argument of ``-b '<cookies>'`` is stored under "Cookie".
    - "accept" and "x-requested-with" defaults are added only when no header
      with that name is already present, compared case-insensitively.

    Args:
        curl_string: Raw cURL command text.

    Returns:
        dict mapping header names (as written in the command) to values.
    """
    headers = {}

    # Extract headers with -H flag; \1 forces matching open/close quotes.
    for _quote, name, value in re.findall(r"-H\s+(['\"])(.*?): (.*?)\1", curl_string):
        headers[name] = value

    # Extract cookies with -b flag
    cookie_match = re.search(r"-b\s+(['\"])(.*?)\1", curl_string)
    if cookie_match:
        headers['Cookie'] = cookie_match.group(2)

    # Ensure we look like XHR JSON. HTTP header names are case-insensitive, so
    # compare in lower case to avoid adding "accept" next to an existing "Accept".
    present = {name.lower() for name in headers}
    if "accept" not in present:
        headers["accept"] = "application/json, text/javascript, */*; q=0.01"
    if "x-requested-with" not in present:
        headers["x-requested-with"] = "XMLHttpRequest"
    return headers

In [12]:
# ===========================
# Audio preprocessor
# ===========================
class AudioPreprocessor:
    """Static helpers that turn a local media file or URL into a 16 kHz mono WAV file."""

    @staticmethod
    def validate_and_fix_file(file_path: str) -> str:
        """
        Validate a media source and convert it to a WAV file for transcription.

        Args:
            file_path: Local path, or an http(s) URL. URLs are downloaded to
                /content/input_downloaded.<mp4|mp3|bin> first.

        Returns:
            Path of the generated 16 kHz mono WAV file.

        Raises:
            RuntimeError: the download or the conversion failed.
            FileNotFoundError: the (downloaded) file does not exist.
        """
        print(f"Validating file or URL: {file_path}")

        # If URL, download
        if isinstance(file_path, str) and re.match(r"^https?://", file_path.strip(), re.IGNORECASE):
            try:
                print("Detected URL — downloading...")
                url = file_path.strip()
                lowered = url.lower()
                suffix = ".mp4" if ".mp4" in lowered else (".mp3" if ".mp3" in lowered else ".bin")
                dl_path = "/content/input_downloaded" + suffix
                # The with-block releases the streamed connection even if writing fails.
                with requests.get(url, stream=True, timeout=60) as resp:
                    resp.raise_for_status()
                    with open(dl_path, "wb") as f:
                        for chunk in resp.iter_content(chunk_size=1024 * 1024):
                            if chunk:
                                f.write(chunk)
                print(f"Downloaded to: {dl_path}")
                file_path = dl_path
            except Exception as e:
                raise RuntimeError(f"Failed to download media: {e}") from e

        # Local/Downloaded path must exist now
        if not Path(file_path).exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            lowered_path = file_path.lower()
            if lowered_path.endswith('.mp4'):
                print("Converting MP4 → Whisper-optimized WAV...")
                return AudioPreprocessor._convert_to_whisper_wav(file_path)
            elif lowered_path.endswith(('.mp3', '.m4a', '.aac', '.ogg', '.wav')):
                print("Normalizing to Whisper-optimized WAV...")
                return AudioPreprocessor._convert_to_whisper_wav(file_path)
            else:
                print("Unknown format — attempting Python fallback decode...")
                return AudioPreprocessor._python_extract_audio(file_path)
        except Exception as e:
            print(f"Error during file processing: {str(e)}")
            raise

    @staticmethod
    def _convert_to_whisper_wav(audio_path: str) -> str:
        """
        Convert a media file to 16 kHz mono 16-bit PCM WAV with ffmpeg.

        The output sits next to the input with a ``.wav`` extension. When the
        input already has that name (a ``.wav`` input), ``_16k`` is appended to
        the stem, because ffmpeg cannot write to the file it is reading.

        Returns:
            Path of the WAV file as a string.

        Raises:
            RuntimeError: ffmpeg could not be started or exited with an error.
        """
        source = Path(audio_path)
        target = source.with_suffix('.wav')
        if str(target).lower() == str(source).lower():
            target = source.with_name(source.stem + '_16k.wav')
        wav_path = str(target)

        try:
            result = subprocess.run([
                'ffmpeg', '-y', '-i', audio_path,
                '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', wav_path
            ], capture_output=True, text=True, check=False)
        except Exception as e:
            # e.g. the ffmpeg binary is not installed
            raise RuntimeError(f"Failed to convert {audio_path} → WAV: {e}") from e
        if result.returncode != 0:
            raise RuntimeError(f"Failed to convert {audio_path} → WAV: {result.stderr or 'ffmpeg failed'}")
        print(f"Created: {wav_path}")
        return wav_path

    @staticmethod
    def _python_extract_audio(file_path: str) -> str:
        """
        Fallback: decode with PyDub and write a 16 kHz mono 16-bit WAV.

        The output is ``<input stem>_extracted.wav`` in the input's directory.

        Raises:
            RuntimeError: the exported file is missing or empty.
        """
        print("Attempting Python-based audio extraction...")
        source = Path(file_path)
        # with_name keeps dots in directory names out of the extension logic.
        wav_path = str(source.with_name(source.stem + '_extracted.wav'))
        audio = AudioSegment.from_file(file_path)
        audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
        audio.export(wav_path, format="wav")
        if not Path(wav_path).exists() or Path(wav_path).stat().st_size == 0:
            raise RuntimeError("Python audio extraction produced empty file")
        print(f"Created: {wav_path}")
        return wav_path

In [13]:
# ===========================
# Whisper transcription
# ===========================
class TranscriptionProcessor:
    """Loads the Whisper "medium" model and transcribes an audio file window by window."""

    def __init__(self, segment_length=14400):
        """
        Pick the device, apply CUDA speed settings when a GPU is present and load the model.

        Args:
            segment_length: Length in seconds of each window passed to Whisper.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        if self.device == "cuda":
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.benchmark = True
            torch.backends.cudnn.allow_tf32 = True
            torch.backends.cudnn.deterministic = False
            torch.cuda.empty_cache()
        print("Loading Whisper model...")
        self.model = whisper.load_model("medium").to(self.device)
        if self.device == "cuda":
            self.model = self.model.half()
        self.segment_length = segment_length

    def _autocast_context(self):
        """Return the context manager for a model call: CUDA autocast on GPU, a no-op otherwise."""
        # Local import so this method does not rely on an import placed elsewhere in the notebook.
        from contextlib import nullcontext
        if self.device != "cuda":
            return nullcontext()
        try:
            return torch.amp.autocast("cuda")
        except Exception:
            # torch.amp is unavailable in this torch build: run without autocast.
            return nullcontext()

    def transcribe(self, audio_path, class_id):
        """
        Transcribe `audio_path` and save the result as JSON.

        The audio is cut into windows of `self.segment_length` seconds. Each
        window is exported to a temporary WAV under /content, transcribed with
        word timestamps, and its segment/word times are shifted by the window
        start so they are relative to the whole recording.

        Args:
            audio_path: Path of the audio file to transcribe.
            class_id: Used to name the output file.

        Returns:
            "/content/session_<class_id>_transcript.json", or None when no
            segment was produced.

        Raises:
            Any exception from decoding, transcription or writing is printed
            and re-raised.
        """
        print("Processing audio to generate transcript JSON...")
        try:
            audio = AudioSegment.from_file(audio_path)
            total_duration = len(audio) / 1000  # len() of the pydub segment is used as milliseconds
            print(f"Total duration: {timedelta(seconds=int(total_duration))}")

            all_segments = []
            segment_times = range(0, int(total_duration), self.segment_length)

            for start_time in tqdm(segment_times, desc="Processing segments", unit="segment"):
                duration = min(self.segment_length, total_duration - start_time)
                segment = audio[start_time*1000:(start_time+duration)*1000]
                temp_path = f"/content/temp_segment_{start_time}.wav"

                try:
                    # Export inside the try so a failed/partial export is cleaned up too.
                    segment.export(temp_path, format="wav")

                    with self._autocast_context():
                        result = self.model.transcribe(
                            temp_path,
                            word_timestamps=True,
                            language='en',
                            task='transcribe',
                            fp16=(self.device=="cuda"),
                            condition_on_previous_text=True,
                            initial_prompt="This is a university lecture."
                        )

                    for seg in result["segments"]:
                        words = [
                            {
                                "word": w["word"].strip(),
                                "start": float(w["start"]) + start_time,
                                "end": float(w["end"]) + start_time
                            }
                            for w in seg.get("words", [])
                        ]
                        all_segments.append({
                            "start": float(seg["start"]) + start_time,
                            "end": float(seg["end"]) + start_time,
                            "text": seg["text"].strip(),
                            "words": words
                        })

                finally:
                    try:
                        Path(temp_path).unlink(missing_ok=True)
                    except OSError:
                        # Best-effort cleanup: a leftover temp file must not abort the run.
                        pass
                    if self.device == "cuda":
                        torch.cuda.empty_cache()

            if not all_segments:
                print("Warning: No segments were transcribed.")
                return None

            transcript_path = f"/content/session_{class_id}_transcript.json"
            with open(transcript_path, 'w', encoding='utf-8') as f:
                json.dump({"segments": sorted(all_segments, key=lambda x: x["start"])}, f, indent=2)

            print(f"Transcript JSON saved to: {transcript_path}")
            return transcript_path

        except Exception as e:
            print(f"Error in transcription process: {str(e)}")
            raise

# tiny helper for autocast fallback
from contextlib import nullcontext

In [14]:
# ===========================
# Forum data (events, voice, attendance)
# ===========================
def get_forum_events(class_id, headers, raw_curl):
    """
    Fetch class metadata and class events from the Forum API and save them as JSON.

    Two GET requests are made (class data, then class events). From them this
    builds:
      - class_meta: titles, course info, recording window, ids and class link;
      - voice_events: speaking turns of at least one second, with start/end in
        seconds relative to the recording start;
      - timeline_segments: timeline markers sorted by offset from recording start;
      - attendance: one entry per class user whose role is "student".
    The combined dict is written to /content/session_<class_id>_events.json.

    Args:
        class_id: Forum class id used in the API URLs and output file name.
        headers: HTTP headers (including authentication) sent with both requests.
        raw_curl: Original cURL text; used to recover course/section ids and the class link.

    Returns:
        The events dict that was written to disk.

    Raises:
        RuntimeError: an API call returned a non-200 status.
        ValueError: the events payload is not a list.
        KeyError: the class data has no recording start time.
        Any error is printed before being re-raised.
    """
    print("Fetching class and event data from Forum...")
    try:
        # Class meta
        class_url = f'https://forum.minerva.edu/api/v1/class_grader/classes/{class_id}'
        r = requests.get(class_url, headers=headers, timeout=30)
        if r.status_code != 200:
            print(f"Class data error: {r.status_code}\n{r.text[:400]}")
            raise RuntimeError(f"Failed to access class data. Status code: {r.status_code}")
        data = r.json()

        section_obj = data.get('section') or {}
        session_title = data.get('title') or f"Session {class_id}"
        section_title = section_obj.get('title', '')  # e.g. "Terrana, MW@09:00AM San Francisco"
        course_obj = section_obj.get('course') or {}
        course_code  = course_obj.get('course-code', '')
        course_title = course_obj.get('title', '')
        class_type   = data.get('type', '')

        # Recording window (first recording session only)
        rec = (data.get('recording-sessions') or [{}])[0]
        recording_start = rec.get('recording-started')
        recording_end   = rec.get('recording-ended')

        # Guess schedule from section_title "Terrana, MW@09:00AM San Francisco":
        # everything after the first comma.
        schedule_guess = ''
        if isinstance(section_title, str) and ',' in section_title:
            schedule_guess = section_title.split(',', 1)[1].strip()

        ids = extract_ids_from_curl(raw_curl)
        course_id  = ids.get("course_id")
        section_id = ids.get("section_id")
        class_link = derive_class_link_from_curl(raw_curl, course_id, section_id, str(class_id))

        # ---- Events ----
        events_url = f'https://forum.minerva.edu/api/v1/class_grader/classes/{class_id}/class-events'
        r = requests.get(events_url, headers=headers, timeout=30)
        if r.status_code != 200:
            print(f"Class events error: {r.status_code}\n{r.text[:400]}")
            raise RuntimeError(f"Failed to access class events. Status code: {r.status_code}")

        events = r.json()
        if not isinstance(events, list):
            raise ValueError("No valid class events returned from API")

        # Parse voice & timeline
        voice_events = []
        timeline_segments = []

        if not recording_start:
            raise KeyError("No recording-started found in class data")
        ref_time = iso8601.parse_date(recording_start)

        for ev in events:
            et = ev.get('event-type')
            try:
                if et == 'voice':
                    # "duration" may be absent or null; treat both as 0 ms.
                    duration_ms = (ev.get('event-data') or {}).get('duration') or 0
                    duration = duration_ms / 1000.0
                    if duration >= 1:
                        start_time = iso8601.parse_date(ev['start-time'])
                        end_time   = iso8601.parse_date(ev['end-time'])
                        actor = ev.get('actor') or {}
                        voice_events.append({
                            'start': (start_time - ref_time).total_seconds(),
                            'end': (end_time - ref_time).total_seconds(),
                            'duration': duration,
                            'speaker': {
                                'id': actor.get('id'),
                                'first_name': actor.get('first-name'),
                                'last_name':  actor.get('last-name')
                            }
                        })
                elif et == 'timeline-segment':
                    start_time = iso8601.parse_date(ev['start-time'])
                    seg = (ev.get('event-data') or {})
                    timeline_segments.append({
                        'abs_start': ev['start-time'],
                        'offset_seconds': (start_time - ref_time).total_seconds(),
                        'section': seg.get('timeline-section-title', ''),
                        'title':   seg.get('timeline-segment-title', ''),
                    })
            except (KeyError, iso8601.ParseError):
                # Skip a single malformed event (missing or unparseable timestamp)
                # rather than failing the whole fetch.
                continue

        timeline_segments.sort(key=lambda x: x['offset_seconds'])

        # Attempt attendance from class-users (best-effort)
        attendance = []
        for cu in (data.get('class-users') or []):
            role = (cu.get('role') or "").lower()
            if role != 'student':
                continue
            u = cu.get('user') or {}
            name = f"{(u.get('first-name') or '').strip()} {(u.get('last-name') or '').strip()}".strip()
            uid  = u.get('id') or cu.get('user-id')
            # infer absence if possible; default = present
            abs_flag = cu.get('absent')
            if abs_flag is None:
                att = cu.get('attended')
                abs_flag = (False if att is None else (not bool(att)))
            attendance.append({'name': name or f"ID {uid}", 'id': uid, 'absent': bool(abs_flag)})

        # Build class meta (NO instructor field)
        # Also include class_id, course/section IDs and link for headers
        class_meta = {
            'class_id': str(class_id),
            'session_title': session_title,
            'course_code': course_code,
            'course_title': course_title,
            'section_title': section_title,   # "Terrana, MW@09:00AM San Francisco"
            'schedule': schedule_guess,
            'class_type': class_type,
            'recording_start': recording_start,
            'recording_end': recording_end,
            'course_id': course_id,
            'section_id': section_id,
            'class_link': class_link
        }

        events_data = {
            'class_id': class_id,
            'class_meta': class_meta,
            'voice_events': voice_events,
            'timeline_segments': timeline_segments,
            'attendance': attendance
        }
        temp_events_path = f"/content/session_{class_id}_events.json"
        with open(temp_events_path, 'w', encoding='utf-8') as f:
            json.dump(events_data, f, indent=2)

        print(f"Processed voice events: {len(voice_events)}; timeline segments: {len(timeline_segments)}; attendance: {len(attendance)}")
        return events_data

    except Exception as e:
        print(f"Error fetching Forum events: {e}")
        raise

In [15]:
# ===========================
# Compile (PDF / CSV)
# ===========================
def _build_speaker_window_map(events_data, privacy_mode: str):
    """
    Build a map of (start,end) -> display_name based on privacy mode (names/ids).
    """
    speaker_map = {}
    for ev in events_data.get('voice_events', []):
        fn = (ev['speaker'].get('first_name') or '').strip()
        ln = (ev['speaker'].get('last_name') or '').strip()
        uid = ev['speaker'].get('id')
        name = (f"{fn} {ln}".strip() or "Professor")
        if privacy_mode == "ids" and uid:
            disp = f"ID {uid}"
        else:
            disp = name
        speaker_map[(ev['start'], ev['end'])] = disp
    return speaker_map

def _pdf_escape(text):
    """Escape &, < and > so ReportLab's Paragraph markup parser renders the text literally."""
    from xml.sax.saxutils import escape  # stdlib; imported locally because only the PDF output needs it
    return escape(text or "")


def _pdf_grid_style():
    """Return a fresh TableStyle with the grey-header grid look shared by all tables in the PDF."""
    return TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
        ('ALIGN', (0, 0), (-1, 0), 'CENTER'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTNAME', (0, 1), (-1, -1), 'Helvetica'),
        ('FONTSIZE', (0, 1), (-1, -1), 10),
        ('VALIGN', (0, 0), (-1, -1), 'TOP'),
        ('GRID', (0, 0), (-1, -1), 1, colors.black),
        ('LEFTPADDING', (0, 0), (-1, -1), 6),
        ('RIGHTPADDING', (0, 0), (-1, -1), 6),
        ('TOPPADDING', (0, 0), (-1, -1), 3),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 3),
    ])


def _pdf_merge_segments(segments, find_speaker):
    """Merge consecutive non-empty segments by the same speaker (gap of at most 2 s) into entries."""
    merged = []
    current = None
    for segment in segments:
        start_time, end_time = segment['start'], segment['end']
        raw_text = (segment['text'] or "").strip()
        if not raw_text:
            continue
        speaker = find_speaker(start_time)

        continues_turn = (
            current is not None
            and current['speaker'] == speaker
            and not (current['end_time'] is not None and start_time - current['end_time'] > 2)
        )
        if continues_turn:
            current['text'].append(raw_text)
            current['end_time'] = end_time
        else:
            if current is not None:
                merged.append(current)
            current = {'speaker': speaker, 'start_time': start_time,
                       'text': [raw_text], 'end_time': end_time}
    if current is not None:
        merged.append(current)
    return merged


def _pdf_split_into_chunks(text, max_chars):
    """Split text at sentence ends into chunks of at most max_chars (a longer single sentence stays whole)."""
    chunks, current = [], ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        # Sentences are re-joined with exactly one space.
        candidate = f"{current} {sentence}" if current else sentence
        if len(candidate) <= max_chars:
            current = candidate
        else:
            if current:
                chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks


def _pdf_event_windows(timeline_segments):
    """Turn sorted timeline segments into labelled [start, end) windows, with a preamble if needed."""
    if not timeline_segments:
        return [{'start': 0, 'end': float('inf'), 'label': "Transcript"}]

    windows = []
    first_start = max(0, (timeline_segments[0].get('offset_seconds') or 0))
    if first_start > 0:
        windows.append({'start': 0, 'end': first_start, 'label': f"{_fmt_mmss(0)} — Before first event"})
    for idx, seg in enumerate(timeline_segments):
        start = max(0, (seg.get('offset_seconds') or 0))
        end = float('inf')
        if idx + 1 < len(timeline_segments):
            next_offset = timeline_segments[idx + 1].get('offset_seconds')
            # Only a missing offset means "open-ended"; an offset of 0 is a real boundary.
            if next_offset is not None:
                end = max(0, next_offset)
        bits = []
        if seg.get('section'): bits.append(seg['section'])
        if seg.get('title'):   bits.append(seg['title'])
        label = f"{_fmt_mmss(start)} — " + (' · '.join(bits) if bits else 'Event')
        windows.append({'start': start, 'end': end, 'label': label})
    return windows


def compile_transcript_to_pdf(class_id, headers, privacy_mode="names"):
    """
    Build the PDF transcript for a class from the previously saved JSON files.

    Reads /content/session_<class_id>_transcript.json and
    /content/session_<class_id>_events.json and writes
    /content/session_<class_id>_transcript_<names|ids>.pdf containing:
      - a header (session title, section/schedule line, class id, UTC date, link);
      - an attendance table (when attendance data exists);
      - a class-events table (when timeline segments exist);
      - the transcript, one table per class-event window, with consecutive
        segments of the same speaker merged and long turns split into chunks.

    All dynamic text is XML-escaped before being given to ReportLab Paragraphs.

    Args:
        class_id: Class id used to locate the input files and name the PDF.
        headers: Unused here; kept so existing callers keep working.
        privacy_mode: "names" shows speaker/student names, "ids" shows "ID <id>".

    Returns:
        Path of the generated PDF.

    Raises:
        Any error is printed and re-raised with its original traceback.
    """
    try:
        # Load JSONs
        with open(f"/content/session_{class_id}_transcript.json", 'r', encoding='utf-8') as f:
            transcript_data = json.load(f)
        with open(f"/content/session_{class_id}_events.json", 'r', encoding='utf-8') as f:
            events_data = json.load(f)

        class_meta = events_data.get('class_meta', {})
        timeline_segments = events_data.get('timeline_segments', [])
        attendance = events_data.get('attendance', [])

        # Speaker mapping
        speaker_map = _build_speaker_window_map(events_data, privacy_mode)

        def find_speaker_at_time(time_point):
            for (start, end), speaker in speaker_map.items():
                if start <= time_point <= end:
                    return speaker
            return "Professor"  # generic fallback

        # Combine consecutive segments by same speaker
        compiled_entries = _pdf_merge_segments(transcript_data['segments'], find_speaker_at_time)

        # Styles
        styles = getSampleStyleSheet()
        contribution_style = ParagraphStyle('ContributionStyle', parent=styles['Normal'],
                                            fontName='Helvetica', fontSize=10, leading=12, wordWrap='CJK')
        header_style = ParagraphStyle('HeaderStyle', parent=styles['Normal'], fontName='Helvetica-Bold',
                                      fontSize=12, textColor=colors.whitesmoke, alignment=1)
        speaker_style = ParagraphStyle('SpeakerStyle', parent=styles['Normal'],
                                       fontName='Helvetica', fontSize=10, leading=12, wordWrap='CJK')

        # PDF
        suffix = "names" if privacy_mode == "names" else "ids"
        output_path = f"/content/session_{class_id}_transcript_{suffix}.pdf"
        doc = SimpleDocTemplate(output_path, pagesize=letter, rightMargin=60, leftMargin=60, topMargin=60, bottomMargin=60)

        elements = []

        # ====== HEADER / TITLE ======
        session_line = class_meta.get('session_title') or f"Session {class_id}"
        elements.append(Paragraph(_pdf_escape(session_line), styles['Title']))

        # Centered schedule line (sec_sched)
        sec_sched = class_meta.get('section_title', '') or class_meta.get('schedule', '')
        if sec_sched:
            centered_info_style = ParagraphStyle('CenteredInfo', parent=styles['Heading3'], alignment=1)
            elements.append(Paragraph(_pdf_escape(sec_sched), centered_info_style))
        elements.append(Spacer(1, 10))

        # Left-aligned meta lines (bold labels)
        # Class ID, Class Date/Time from recording_start (UTC), Class Link
        rec_start = class_meta.get('recording_start')
        dt_str = ""
        if rec_start:
            try:
                dt = iso8601.parse_date(rec_start).astimezone(timezone.utc)
                dt_str = dt.strftime("%Y-%m-%d %H:%M UTC")
            except Exception:
                # Unparseable timestamp: fall back to the raw date part.
                dt_str = _safe_date(rec_start)
        class_id_text = _pdf_escape(str(class_meta.get('class_id', '')))
        elements.append(Paragraph(f"<b>Class ID:</b> {class_id_text}", styles['Normal']))
        if dt_str:
            elements.append(Paragraph(f"<b>Class Date/Time:</b> {_pdf_escape(dt_str)}", styles['Normal']))
        link = class_meta.get('class_link') or ""
        if link:
            # Links may contain '&' in a query string, which must be escaped.
            elements.append(Paragraph(f"<b>Class Link:</b> {_pdf_escape(link)}", styles['Normal']))
        elements.append(Spacer(1, 14))

        # ====== ATTENDANCE TABLE ======
        if attendance:
            elements.append(Paragraph("Attendance", styles['Heading3']))
            att_rows = [[Paragraph('Student', header_style), Paragraph('Status', header_style)]]
            att_style = _pdf_grid_style()
            for row_idx, a in enumerate(attendance, start=1):
                display_name = a.get('name', '')
                if privacy_mode == "ids" and a.get('id'):
                    display_name = f"ID {a['id']}"
                # Soft-break first, escape second, so an entity like &amp; is never split.
                att_rows.append([
                    Paragraph(_pdf_escape(soft_break_long_token(display_name, 14)), speaker_style),
                    Paragraph("Absent" if a.get('absent') else "Present", speaker_style)
                ])
                color = colors.red if a.get('absent') else colors.green
                att_style.add('TEXTCOLOR', (1, row_idx), (1, row_idx), color)
            att_table = Table(att_rows, colWidths=[4.5*inch, 1.5*inch], repeatRows=1)
            att_table.setStyle(att_style)
            elements.append(att_table)
            elements.append(Spacer(1, 16))

        # ====== CLASS EVENTS TABLE ======
        if timeline_segments:
            elements.append(Paragraph("Class Events", styles['Heading3']))
            events_data_rows = [[Paragraph('Time', header_style),
                                 Paragraph('Section', header_style),
                                 Paragraph('Event', header_style)]]
            for seg in timeline_segments:
                events_data_rows.append([
                    _fmt_mmss(seg.get('offset_seconds')),
                    Paragraph(_pdf_escape(seg.get('section', '') or ''), styles['Normal']),
                    Paragraph(_pdf_escape(seg.get('title', '') or ''), styles['Normal']),
                ])
            # Adjusted widths to reduce overlap, text wrapped via Paragraph
            events_table = Table(events_data_rows, colWidths=[0.9*inch, 2.2*inch, 3.9*inch], repeatRows=1)
            events_table.setStyle(_pdf_grid_style())
            elements.append(events_table)
            elements.append(Spacer(1, 16))

        # ====== TRANSCRIPT (break only by class events) ======
        elements.append(Paragraph("Transcript", styles['Heading3']))
        elements.append(Spacer(1, 6))

        # Flatten and normalize
        all_items = []
        for entry in compiled_entries:
            text = normalize_sentence_spacing(' '.join(entry['text']).strip())
            if text in ['...', '.', '', 'Mm-hmm.'] or len(text) < 3:
                continue
            timestamp = _fmt_mmss(entry['start_time'])
            # split into printable chunks to avoid super long cells
            chunks = _pdf_split_into_chunks(text, 500)
            for i, chunk in enumerate(chunks or [text]):
                all_items.append({
                    'start_time': entry['start_time'],
                    'end_time': entry['end_time'],
                    'timestamp': "(cont.)" if i > 0 else timestamp,
                    'speaker': entry['speaker'],
                    'text': chunk
                })
        all_items.sort(key=lambda x: x['start_time'])

        # Build event windows; include preamble
        seg_windows = _pdf_event_windows(timeline_segments)

        for win in seg_windows:
            bucket = [it for it in all_items if win['start'] <= it['start_time'] < win['end']]
            if not bucket:
                continue
            elements.append(Paragraph(_pdf_escape(win['label']), styles['Heading4']))
            elements.append(Spacer(1, 4))

            data = [[Paragraph('Time', header_style),
                     Paragraph('Speaker', header_style),
                     Paragraph('Contribution', header_style)]]
            for item in bucket:
                spk_txt = soft_break_long_token(item['speaker'], 14)
                data.append([
                    item['timestamp'],
                    Paragraph(_pdf_escape(spk_txt), speaker_style),
                    Paragraph(_pdf_escape(normalize_sentence_spacing(item['text'])), contribution_style)
                ])

            table = Table(data, colWidths=[0.75*inch, 2.1*inch, 4.25*inch], repeatRows=1)
            table.setStyle(_pdf_grid_style())
            elements.append(table)
            elements.append(Spacer(1, 12))

        doc.build(elements)
        print(f"Created PDF transcript: {output_path}")
        return output_path

    except Exception as e:
        print(f"Error processing transcript: {str(e)}")
        raise

def compile_transcript_to_csv(class_id, headers, privacy_mode="names"):
    """
    Write the speaker-attributed transcript for one class session to a CSV file.

    Reads /content/session_<class_id>_transcript.json and
    /content/session_<class_id>_events.json, then writes
    /content/session_<class_id>_transcript_<names|ids>.csv containing, in order:
      - Header block (session title, section/schedule, class ID, date/time, link)
      - Attendance (only when the events file lists any)
      - Class Events (only when the events file has timeline segments)
      - Transcript rows, grouped under "--- <event label> ---" separator rows

    Args:
        class_id: Identifier used to build the input and output file names.
        headers: Accepted but not used inside this function.
        privacy_mode: "names" selects the "_names" file suffix; any other value
            selects "_ids". Only the exact value "ids" replaces attendance names
            with "ID <id>". The value is also forwarded to
            _build_speaker_window_map.

    Returns:
        The output CSV path, or None if any step raised (the error is printed,
        not re-raised).
    """
    try:
        with open(f"/content/session_{class_id}_transcript.json", 'r') as f:
            transcript_data = json.load(f)
        with open(f"/content/session_{class_id}_events.json", 'r') as f:
            events_data = json.load(f)

        class_meta = events_data.get('class_meta', {})
        timeline_segments = events_data.get('timeline_segments', [])
        attendance = events_data.get('attendance', [])

        # Speaker mapping, iterated below as {(start, end): speaker label}
        speaker_map = _build_speaker_window_map(events_data, privacy_mode)

        def find_speaker_at_time(t):
            # First window containing t wins; anything unmatched is attributed
            # to the professor.
            for (start, end), spk in speaker_map.items():
                if start <= t <= end:
                    return spk
            return "Professor"

        # Combine consecutive segments by speaker. A new entry starts when the
        # speaker changes or when more than 2 seconds separate two segments.
        compiled_entries = []
        current = {'speaker': None, 'start_time': None, 'text': [], 'end_time': None}
        for seg in transcript_data['segments']:
            st, en, tx = seg['start'], seg['end'], (seg['text'] or '').strip()
            if not tx:
                continue
            spk = find_speaker_at_time(st)

            start_new = (
                (not current['speaker'])
                or (current['speaker'] != spk)
                or (current['end_time'] is not None and st - current['end_time'] > 2)
            )
            if start_new:
                if current['speaker']:
                    compiled_entries.append(current)
                current = {'speaker': spk, 'start_time': st, 'text': [tx], 'end_time': en}
            else:
                current['text'].append(tx)
                current['end_time'] = en
        if current['speaker']:
            compiled_entries.append(current)

        # Flatten into printable rows, dropping filler-only contributions.
        all_items = []
        for entry in compiled_entries:
            text = normalize_sentence_spacing(' '.join(entry['text']).strip())
            if text in ['...', '.', '', 'Mm-hmm.'] or len(text) < 3:
                continue
            all_items.append({
                'timestamp': _fmt_mmss(entry['start_time']),
                'speaker': entry['speaker'],
                'text': text,
                'start_time': entry['start_time'],
                'end_time': entry['end_time']
            })
        all_items.sort(key=lambda x: x['start_time'])

        # Build event windows: half-open [start, end) ranges with a label.
        segmented_rows = []
        seg_windows = []
        if timeline_segments:
            first_start = max(0, (timeline_segments[0].get('offset_seconds') or 0))
            if first_start > 0:
                seg_windows.append({'start': 0, 'end': first_start, 'label': f"{_fmt_mmss(0)} — Before first event"})
            for idx, seg in enumerate(timeline_segments):
                start = max(0, (seg.get('offset_seconds') or 0))
                # A window ends where the next event starts; the last window, or
                # one whose successor has a missing/zero offset, is open-ended.
                end = (timeline_segments[idx+1].get('offset_seconds') if idx+1 < len(timeline_segments) else float('inf')) or float('inf')
                bits = []
                if seg.get('section'):
                    bits.append(seg['section'])
                if seg.get('title'):
                    bits.append(seg['title'])
                label = f"{_fmt_mmss(start)} — " + (' / '.join(bits) if bits else 'Event')
                seg_windows.append({'start': start, 'end': end, 'label': label})
        else:
            seg_windows.append({'start': 0, 'end': float('inf'), 'label': "Transcript"})

        for win in seg_windows:
            bucket = [it for it in all_items if win['start'] <= it['start_time'] < win['end']]
            if not bucket:
                continue
            # Separator row carrying the event label, then the rows it covers.
            segmented_rows.append({'timestamp': '', 'speaker': '', 'text': f"--- {win['label']} ---"})
            segmented_rows.extend(bucket)

        # Fall back to the flat list if no window captured any row.
        out_rows = segmented_rows or all_items

        # Write CSV
        suffix = "names" if privacy_mode == "names" else "ids"
        output_path = f"/content/session_{class_id}_transcript_{suffix}.csv"
        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
            w = csv.writer(csvfile)

            # Header block
            w.writerow(["Session", class_meta.get('session_title','')])
            sec_sched = class_meta.get('section_title') or class_meta.get('schedule') or ''
            if sec_sched:
                w.writerow([sec_sched])
            # Bold not supported in CSV, but include meta lines:
            w.writerow(["Class ID", class_meta.get('class_id','')])
            rec_start = class_meta.get('recording_start')
            dt_str = ""
            if rec_start:
                try:
                    dt = iso8601.parse_date(rec_start).astimezone(timezone.utc)
                    dt_str = dt.strftime("%Y-%m-%d %H:%M UTC")
                except Exception:
                    # Unparseable timestamp: fall back to the lenient formatter.
                    # (Was a bare `except:`, which also trapped KeyboardInterrupt
                    # and SystemExit.)
                    dt_str = _safe_date(rec_start)
            if dt_str:
                w.writerow(["Class Date/Time", dt_str])
            link = class_meta.get('class_link') or ""
            if link:
                w.writerow(["Class Link", link])
            w.writerow([])

            # Attendance
            if attendance:
                w.writerow(["Attendance"])
                w.writerow(["Student", "Status"])
                for a in attendance:
                    display_name = a.get('name','')
                    if privacy_mode == "ids" and a.get('id'):
                        display_name = f"ID {a['id']}"
                    w.writerow([display_name, "Absent" if a.get('absent') else "Present"])
                w.writerow([])

            # Class Events
            if timeline_segments:
                w.writerow(["Class Events"])
                w.writerow(["Time", "Section", "Event"])
                for seg in timeline_segments:
                    w.writerow([_fmt_mmss(seg.get('offset_seconds')), seg.get('section',''), seg.get('title','')])
                w.writerow([])

            # Transcript
            w.writerow(['Time', 'Speaker', 'Contribution'])
            for row in out_rows:
                w.writerow([row.get('timestamp',''), row.get('speaker',''), row.get('text','')])

        print(f"Created CSV transcript: {output_path}")
        return output_path

    except Exception as e:
        # Best-effort output: report the problem and signal failure with None.
        print(f"Error creating CSV transcript: {str(e)}")
        return None

def create_simplified_csv(class_id, transcript_path):
    """
    Minimal CSV fallback: a "Time", "Text" header plus one row per raw segment.

    Reads /content/session_<class_id>_transcript.json and writes
    /content/session_<class_id>_transcript_simple.csv. `transcript_path` is
    accepted but not used. Returns the output path, or None (after printing
    the error) if anything fails.
    """
    def _segment_rows(segments):
        # Yield [MM:SS, text] lazily so each row is written as it is produced.
        for segment in segments:
            whole_minutes, leftover_seconds = divmod(segment['start'], 60)
            stamp = f"{int(whole_minutes):02d}:{int(leftover_seconds):02d}"
            yield [stamp, normalize_sentence_spacing(segment['text'])]

    try:
        with open(f"/content/session_{class_id}_transcript.json", 'r') as source:
            transcript = json.load(source)

        output_path = f"/content/session_{class_id}_transcript_simple.csv"
        with open(output_path, 'w', newline='', encoding='utf-8') as sink:
            writer = csv.writer(sink)
            writer.writerow(['Time', 'Text'])
            writer.writerows(_segment_rows(transcript['segments']))

        print(f"Created simplified CSV transcript: {output_path}")
        return output_path
    except Exception as err:
        print(f"Error creating simplified CSV transcript: {str(err)}")
        return None

def create_simplified_transcript(class_id, transcript_path):
    """
    Minimal PDF fallback: a title, a "Generated on" line and a two-column
    (Time / Text) table with one row per raw transcript segment. No events
    and no speaker attribution.

    Reads /content/session_<class_id>_transcript.json and writes
    /content/session_<class_id>_transcript_simple.pdf. `transcript_path` is
    accepted but not used. Returns the output path, or None (after printing
    the error) if anything fails.
    """
    def _mmss(offset_seconds):
        # Segment offset in seconds -> zero-padded "MM:SS".
        whole_minutes, leftover_seconds = divmod(offset_seconds, 60)
        return f"{int(whole_minutes):02d}:{int(leftover_seconds):02d}"

    try:
        output_path = f"/content/session_{class_id}_transcript_simple.pdf"

        sheet = getSampleStyleSheet()
        body_style = ParagraphStyle(
            'TextStyle', parent=sheet['Normal'],
            fontName='Helvetica', fontSize=10, leading=12,
            spaceAfter=0, spaceBefore=0, wordWrap='CJK',
        )
        heading_cell_style = ParagraphStyle(
            'HeaderStyle', parent=sheet['Normal'],
            fontName='Helvetica-Bold', fontSize=12,
            textColor=colors.whitesmoke, alignment=1,
        )

        with open(f"/content/session_{class_id}_transcript.json", 'r') as source:
            transcript = json.load(source)

        pdf = SimpleDocTemplate(
            output_path, pagesize=letter,
            rightMargin=72, leftMargin=72, topMargin=72, bottomMargin=72,
        )

        # Page heading: session title, generation date, then a small gap.
        heading = Paragraph(f"Session {class_id}", sheet['Title'])
        generated_on = datetime.datetime.now().strftime("%Y-%m-%d")
        story = [
            heading,
            Paragraph(f"Generated on {generated_on}", sheet['Heading2']),
            Spacer(1, 12),
        ]

        # Header row first, then one [timestamp, text] row per segment.
        rows = [[Paragraph('Time', heading_cell_style), Paragraph('Text', heading_cell_style)]]
        rows.extend(
            [_mmss(seg['start']), Paragraph(normalize_sentence_spacing(seg['text']), body_style)]
            for seg in transcript['segments']
        )

        grid_style = TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, 0), 'CENTER'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTNAME', (0, 1), (-1, -1), 'Helvetica'),
            ('FONTSIZE', (0, 1), (-1, -1), 10),
            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
            ('GRID', (0, 0), (-1, -1), 1, colors.black),
            ('LEFTPADDING', (0, 0), (-1, -1), 6),
            ('RIGHTPADDING', (0, 0), (-1, -1), 6),
            ('TOPPADDING', (0, 0), (-1, -1), 3),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 3),
        ])
        segment_table = Table(rows, colWidths=[0.75*inch, 6.25*inch], repeatRows=1)
        segment_table.setStyle(grid_style)
        story.append(segment_table)

        pdf.build(story)
        print(f"Created simplified PDF transcript: {output_path}")
        return output_path
    except Exception as err:
        print(f"Error creating simplified transcript: {str(err)}")
        return None

In [16]:
# ===========================
# Main pipeline
# ===========================
# Interactive setup: collects the cURL, media location, privacy mode and
# optional custom terms, and stores them in module-level names (raw_curl,
# CLASS_ID, AUDIO_PATH, PRIVACY_MODE, USER_TERMS) for the rest of this cell.
# clear_output() is called after each prompt so the previous prompt (and what
# was typed) does not stay in the saved cell output.

# 1) cURL
# NOTE(review): the pasted cURL presumably carries session cookies/auth headers;
# clearing the output right after reading it keeps them out of the notebook.
print("1) Paste your Forum cURL (right-click in Chrome DevTools → Copy as cURL)")
raw_curl = input("cURL: ").strip()
clear_output()

# Auto-derive Class ID from the cURL (uses your existing helper)
# If the helper cell has not been run, NameError is swallowed and CLASS_ID ends
# up as None rather than stopping the cell.
try:
    _ids = extract_ids_from_curl(raw_curl)
except NameError:
    _ids = {}
CLASS_ID = _ids.get("class_id")

# 2) Media path or URL
# The value is passed through unvalidated; process_lecture hands it to the
# audio preprocessor.
print("2) Provide your media file")
print("   • Enter a local/Drive path like /content/your_file.mp3")
print("   • OR paste a https:// URL to the media file (mp3/mp4/wav)")
AUDIO_PATH = input("Path or URL: ").strip()
clear_output()

# 3) Privacy mode (with a real input box + validation loop)
# Loops until one of the three accepted values is typed (case-insensitive).
print("3) Student name privacy mode")
print("   Type one of: names  (show names)  |  ids  (anonymize to IDs)  |  both  (generate both)")
while True:
    PRIVACY_MODE = input("Privacy mode [names/ids/both]: ").strip().lower()
    if PRIVACY_MODE in ("names", "ids", "both"):
        break
    print("Please type exactly: names, ids, or both.")

clear_output()

# (Optional) custom terms to preserve spellings
# Split on commas; blank entries are discarded.
# NOTE(review): USER_TERMS is not passed to process_lecture in this cell;
# presumably it is read as a global elsewhere — TODO confirm it is used at all.
print("4) Optional: comma-separated custom terms to preserve spellings (press Enter to skip)")
USER_TERMS_RAW = input("Custom terms: ").strip()
USER_TERMS = [t.strip() for t in USER_TERMS_RAW.split(",") if t.strip()]
clear_output()

# Summary
# Echoes the collected settings; the media value is printed verbatim.
print("Thanks! Summary:\n")
print(f"Class ID (detected): {CLASS_ID or '(will fetch from API if needed)'}")
print(f"Media: {AUDIO_PATH}")
print(f"Privacy mode: {PRIVACY_MODE}")
if USER_TERMS:
    print(f"Custom terms: {USER_TERMS}")
print("\nStarting the transcript generation process…")
print("⏳ This can take a while depending on file length and model size...")

def process_lecture(audio_path, class_id, curl_string, privacy_mode="names"):
    """
    Run the full pipeline: fetch Forum events, preprocess the media,
    transcribe it, then compile PDF and CSV transcripts.

    Args:
        audio_path: Local path or URL of the media, as entered by the user.
        class_id: Forum class identifier, passed to the events fetch,
            the transcriber and the compile helpers.
        curl_string: Raw cURL text; cleaned into request headers.
        privacy_mode: "names" or "ids" produce one output set; any other
            value (the prompt offers "both") produces both sets.

    Returns:
        A list of (mode, pdf_path, csv_path) tuples, one per generated mode,
        or [] if any step raised (the error and a hint are printed).
        csv_path can be None because compile_transcript_to_csv reports its
        own failures that way.
    """
    fixed_path = None  # set once preprocessing succeeds; drives cleanup below
    try:
        print("Step 1/4: Fetching Forum class events...")
        headers = clean_curl(curl_string)
        # Return value is not needed here. NOTE(review): presumably this call
        # writes the session events JSON that the compile helpers read — confirm.
        get_forum_events(class_id, headers, curl_string)

        print("\nStep 2/4: Preprocessing audio...")
        preprocessor = AudioPreprocessor()
        fixed_path = preprocessor.validate_and_fix_file(audio_path)

        print("\nStep 3/4: Transcribing...")
        tp = TranscriptionProcessor()
        tp.transcribe(fixed_path, class_id)

        print("\nStep 4/4: Preparing outputs...")

        # Choose output modes
        modes = [privacy_mode] if privacy_mode in ("names", "ids") else ["names", "ids"]

        outputs = []
        for mode in modes:
            pdf_path = compile_transcript_to_pdf(class_id, headers, privacy_mode=mode)
            csv_path = compile_transcript_to_csv(class_id, headers, privacy_mode=mode)
            outputs.append((mode, pdf_path, csv_path))

        # Accuracy caution
        print("\n⚠️  Accuracy caution: Do not rely solely on this transcript. Manually verify key information.")

        return outputs

    except Exception as e:
        print(f"\nERROR: {str(e)}")
        # Signed media URLs end in "...mp4?Expires=...", so drop any query or
        # fragment before checking the extension.
        media_name = str(audio_path).split('?', 1)[0].split('#', 1)[0].lower()
        if "MP4" in str(e) and media_name.endswith('.mp4'):
            print("\nThere was a problem with your MP4 file. Suggestions:")
            print("1. Try converting it to MP3 or WAV locally before uploading")
            print("2. Use a screen recorder to re-record the audio")
            print("3. Contact Forum support about MP4 download issues")
        else:
            print("\nTranscription failed. Please try again with a different file or path.")
        return []

    finally:
        # Clean up temp WAV (but keep original). Runs on failure and on
        # interruption too, so a failed run does not leave the converted file
        # behind.
        try:
            if fixed_path and Path(fixed_path).exists() and (str(fixed_path) != str(audio_path)):
                Path(fixed_path).unlink()
                print(f"Cleaned up temporary file: {fixed_path}")
        except Exception as cleanup_error:
            print(f"Note: Could not clean up temporary files: {str(cleanup_error)}")

# Run
outs = process_lecture(AUDIO_PATH, CLASS_ID, raw_curl, PRIVACY_MODE)

# CUDA cleanup (best effort): a failure here must not hide the pipeline results,
# but it should not be reported as a success either.
import gc
try:
    torch.cuda.empty_cache()
except Exception as cuda_error:
    # Was a bare `except: pass` followed by an unconditional success message.
    print(f"Note: Could not clear CUDA cache: {cuda_error}")
else:
    print("CUDA cache cleared")
gc.collect()

# Pretty print results
# `outs` is expected to be a list of (mode, pdf_path, csv_path) tuples and is
# empty when the pipeline failed, in which case nothing more is printed.
if outs:
    if len(outs) == 1:
        mode, pdfp, csvp = outs[0]
        print(f"\nSuccess! Your transcripts are ready ({mode}):")
        print(f"PDF: {pdfp}")
        print(f"CSV: {csvp}")
    else:
        # both
        print("\nSuccess! Your transcripts are ready (both privacy modes):")
        for mode, pdfp, csvp in outs:
            print(f"PDF ({mode}): {pdfp}")
            print(f"CSV ({mode}): {csvp}")

Thanks! Summary:

Class ID (detected): 87534
Media: https://d2r3twttwe1v7v.cloudfront.net/6b061a86-d83e-4cbe-8c29-4775affe50ac/zencoder-recording-600k.mp4?Expires=1755391364&Signature=baFisiJ86laVhhz7rBNBbwnYhF4hDhZjr5cCMpYnPjSl29-Jzza8JRD9TUaCwuzirjtLtx3D2Q7ucPv9n0eqSz1XeJMnzBXq2G5r8dy31JW1NZwzT98PkC2yYnEujS7c-5m1IM1NTTtB~kHPJl0g~v0mKRIc2Cc75HFjKcr-DzR-QUXNpvsCtyfF17bHXAj0WBG6UlW5vPEDUymDnTxuYQX7p6BdnQI7pNEXfAq6vatwoP9OwAhVOluy3gENyBRxgYqW7hN63HfeQBvF1erVuEevGqSGOG~unQNNBOwqSOZaufSLznW6oAnw0OJSyVWXH96a7lyId8EZh0-xSkCVLw__&Key-Pair-Id=K2PZFX9H2TL8YQ
Privacy mode: both

Starting the transcript generation process…
⏳ This can take a while depending on file length and model size...
Step 1/4: Fetching Forum class events...
Fetching class and event data from Forum...
Processed voice events: 160; timeline segments: 7; attendance: 23

Step 2/4: Preprocessing audio...
Validating file or URL: https://d2r3twttwe1v7v.cloudfront.net/6b061a86-d83e-4cbe-8c29-4775affe50ac/zencoder-recording-600k.mp4?E

100%|█████████████████████████████████████| 1.42G/1.42G [00:21<00:00, 69.7MiB/s]


Processing audio to generate transcript JSON...
Total duration: 1:41:00


Processing segments:   0%|          | 0/1 [00:00<?, ?segment/s]

KeyboardInterrupt: 

In [None]:
# @title
def process_lecture(audio_path, class_id, curl_string):
    """
    End-to-end pipeline with graceful fallbacks.

    Steps: fetch Forum events, resolve/preprocess the media, transcribe,
    optionally run the Ollama cleanup, then emit PDF/CSV pairs. If output
    preparation raises, the simplified PDF/CSV writers are used instead.

    Reads these module-level settings: PRIVACY_MODE, USE_OLLAMA,
    PRODUCE_BOTH_VARIANTS, CUSTOM_TERMS, OLLAMA_MODEL, OLLAMA_URL, OUTPUT_DIR.

    Returns:
        (output_pdf, output_csv) for the primary variant, or (None, None) on
        failure.

    NOTE(review):
      - This redefines process_lecture; once this cell runs it replaces the
        earlier four-parameter version that returns a list of
        (mode, pdf, csv) tuples. The "Run" cell passes four arguments and
        iterates such a list, so it does not match this signature/return.
      - The compile_* helpers are called with name_mode / file_suffix /
        entries_override / output_dir keywords, while the
        compile_transcript_to_csv defined earlier in this notebook only
        accepts privacy_mode. With that definition live, the call raises
        TypeError and this function drops to the simplified fallback.
      - download_if_url, build_compiled_entries_from_json and
        postprocess_with_ollama are presumably defined in another cell —
        TODO confirm.
    """
    output_pdf, output_csv = None, None  # NEW: ensure defined for the return
    # Filled in as the steps succeed; the `finally` block uses them for cleanup.
    resolved_media_path, fixed_path = None, None
    try:
        # 1) Forum events
        print("Step 1/4: Processing Forum class events...")
        headers = clean_curl(curl_string)
        events_data = get_forum_events(class_id, headers)

        # 2) Audio preprocess
        print("\nStep 2/4: Preprocessing audio file...")
        # Resolve remote URLs to a local temp file before conversion
        resolved_media_path = download_if_url(audio_path, headers)

        preprocessor = AudioPreprocessor()
        fixed_path = preprocessor.validate_and_fix_file(resolved_media_path)


        # 3) Transcribe
        print("\nStep 3/4: Generating transcript...")
        tp = TranscriptionProcessor()
        transcript_path = tp.transcribe(fixed_path, class_id)

        # 4/4: Optional LLM cleanup (Ollama), then compile outputs
        print("\nStep 4/4: Preparing outputs...")

        try:
            # Build compiled entries once from JSON + Forum events
            compiled_entries_raw = build_compiled_entries_from_json(class_id, events_data)

            def _emit_variant(entries, variant_label, name_mode, suffix_extra=""):
                """Emit one PDF/CSV pair for a given entries set + privacy mode."""
                suffix = f"_{variant_label}{suffix_extra}" if variant_label else f"{suffix_extra}"
                pdf = compile_transcript_to_pdf(
                    class_id, headers, name_mode=name_mode, file_suffix=suffix,
                    entries_override=entries, output_dir=OUTPUT_DIR
                )
                csvp = compile_transcript_to_csv(
                    class_id, headers, name_mode=name_mode, file_suffix=suffix,
                    entries_override=entries, output_dir=OUTPUT_DIR
                )
                return pdf, csvp

            if USE_OLLAMA:
                print("Running Ollama cleanup (spellings/terms)...")
                corrected_entries = postprocess_with_ollama(
                    compiled_entries_raw, events_data, CUSTOM_TERMS,
                    model=OLLAMA_MODEL, url=OLLAMA_URL
                )

                if PRODUCE_BOTH_VARIANTS:
                    # Original (orig) + Corrected (corr)
                    if PRIVACY_MODE == "both":
                        on_pdf, on_csv = _emit_variant(compiled_entries_raw, "orig", "names", "_names")
                        oi_pdf, oi_csv = _emit_variant(compiled_entries_raw, "orig", "ids",   "_ids")
                        cn_pdf, cn_csv = _emit_variant(corrected_entries,    "corr", "names", "_names")
                        ci_pdf, ci_csv = _emit_variant(corrected_entries,    "corr", "ids",   "_ids")
                        print("\nSuccess! Your transcripts are ready (both variants & both privacy modes):")
                        print(f"PDF (orig, names): {on_pdf}"); print(f"CSV (orig, names): {on_csv}")
                        print(f"PDF (orig, ids):   {oi_pdf}"); print(f"CSV (orig, ids):   {oi_csv}")
                        print(f"PDF (corr, names): {cn_pdf}"); print(f"CSV (corr, names): {cn_csv}")
                        print(f"PDF (corr, ids):   {ci_pdf}"); print(f"CSV (corr, ids):   {ci_csv}")
                        output_pdf, output_csv = cn_pdf, cn_csv
                    else:
                        variant_suffix = "" if PRIVACY_MODE == "names" else "_ids"
                        o_pdf, o_csv = _emit_variant(compiled_entries_raw, "orig", PRIVACY_MODE, variant_suffix)
                        c_pdf, c_csv = _emit_variant(corrected_entries,    "corr", PRIVACY_MODE, variant_suffix)
                        print("\nSuccess! Your transcripts are ready (original + corrected):")
                        print(f"PDF (original):  {o_pdf}"); print(f"CSV (original):  {o_csv}")
                        print(f"PDF (corrected): {c_pdf}"); print(f"CSV (corrected): {c_csv}")
                        output_pdf, output_csv = c_pdf, c_csv
                else:
                    # Corrected only (default)
                    if PRIVACY_MODE == "both":
                        cn_pdf, cn_csv = _emit_variant(corrected_entries, "corr", "names", "_names")
                        ci_pdf, ci_csv = _emit_variant(corrected_entries, "corr", "ids",   "_ids")
                        print("\nSuccess! Your transcripts are ready (corrected, both privacy modes):")
                        print(f"PDF (names): {cn_pdf}"); print(f"CSV (names): {cn_csv}")
                        print(f"PDF (ids):   {ci_pdf}"); print(f"CSV (ids):   {ci_csv}")
                        output_pdf, output_csv = cn_pdf, cn_csv
                    else:
                        variant_suffix = "" if PRIVACY_MODE == "names" else "_ids"
                        out_pdf, out_csv = _emit_variant(corrected_entries, "corr", PRIVACY_MODE, variant_suffix)
                        print("\nSuccess! Your transcripts are ready (corrected):")
                        print(f"PDF: {out_pdf}"); print(f"CSV: {out_csv}")
                        output_pdf, output_csv = out_pdf, out_csv
            else:
                # No Ollama: emit original (raw Whisper) only
                if PRIVACY_MODE == "both":
                    on_pdf, on_csv = _emit_variant(compiled_entries_raw, "", "names", "_names")
                    oi_pdf, oi_csv = _emit_variant(compiled_entries_raw, "", "ids",   "_ids")
                    print("\nSuccess! Your transcripts are ready (original, both privacy modes):")
                    print(f"PDF (names): {on_pdf}"); print(f"CSV (names): {on_csv}")
                    print(f"PDF (ids):   {oi_pdf}"); print(f"CSV (ids):   {oi_csv}")
                    output_pdf, output_csv = on_pdf, on_csv
                else:
                    variant_suffix = "" if PRIVACY_MODE == "names" else "_ids"
                    out_pdf, out_csv = _emit_variant(compiled_entries_raw, "", PRIVACY_MODE, variant_suffix)
                    print("\nSuccess! Your transcripts are ready (original):")
                    print(f"PDF: {out_pdf}"); print(f"CSV: {out_csv}")
                    output_pdf, output_csv = out_pdf, out_csv

            print("\n⚠️  Accuracy caution: Do not rely solely on this transcript. Manually verify key information.")

        except Exception as e:
            print(f"Error during output preparation: {e}")
            print("Proceeding to simplified fallback (no LLM corrections)...")
            output_pdf = create_simplified_transcript(class_id, transcript_path)
            output_csv = create_simplified_csv(class_id, transcript_path)
            if output_pdf and output_csv:
                print(f"\nCreated simplified transcripts:")
                print(f"PDF: {output_pdf}"); print(f"CSV: {output_csv}")
                print("\n⚠️  Accuracy caution: Do not rely solely on this transcript. Manually verify key information.")
            else:
                print("Failed to create simplified transcripts.")
                return None, None

        return output_pdf, output_csv

    except Exception as e:
        print(f"\nERROR: {str(e)}")
        # Signed media URLs end in "...mp4?Expires=...", so drop any query or
        # fragment before checking the extension.
        media_name = str(audio_path).split('?', 1)[0].split('#', 1)[0].lower()
        if "MP4" in str(e) and media_name.endswith('.mp4'):
            print("\nThere was a problem with your MP4 file. Suggestions:")
            print("1. Convert it to MP3 on your computer before uploading")
            print("2. Use a screen recorder to record Forum while playing back the class")
            print("3. Contact Forum support about MP4 download issues")
        else:
            print("\nTranscription failed. Please try again with a different file.")
        return None, None

    finally:
        # Clean up temporary files. Runs on every exit path so failed runs do
        # not leave large media files behind.
        try:
            temp_files = []

            # Converted WAV. Skipped when the preprocessor handed back the
            # caller's own file, which must never be deleted.
            if fixed_path and str(fixed_path) != str(audio_path):
                temp_files.append(fixed_path)

            # If we downloaded from a URL into /content, also clean that file
            if (isinstance(resolved_media_path, str)
                    and resolved_media_path != audio_path
                    and resolved_media_path.startswith("/content/")
                    and resolved_media_path not in temp_files):
                temp_files.append(resolved_media_path)

            for temp_file in temp_files:
                try:
                    if Path(temp_file).exists():
                        Path(temp_file).unlink()
                        print(f"Cleaned up temporary file: {temp_file}")
                except Exception as e:
                    print(f"Note: Could not delete temp file {temp_file}: {e}")
        except Exception as cleanup_error:
            print(f"Note: Could not clean up temporary files: {str(cleanup_error)}")

In [None]:
# Run
# NOTE(review): this call passes four arguments and the code below iterates a
# list of (mode, pdf_path, csv_path) tuples, i.e. it matches the
# process_lecture defined in the main-pipeline cell. The definition in the cell
# directly above takes three parameters and returns a (pdf, csv) pair, so this
# cell fails if that definition is the live one.
outs = process_lecture(AUDIO_PATH, CLASS_ID, raw_curl, PRIVACY_MODE)

# CUDA cleanup (best effort): a failure here must not hide the pipeline results,
# but it should not be reported as a success either.
import gc
try:
    torch.cuda.empty_cache()
except Exception as cuda_error:
    # Was a bare `except: pass` followed by an unconditional success message.
    print(f"Note: Could not clear CUDA cache: {cuda_error}")
else:
    print("CUDA cache cleared")
gc.collect()

# Pretty print results
if outs:
    if len(outs) == 1:
        mode, pdfp, csvp = outs[0]
        print(f"\nSuccess! Your transcripts are ready ({mode}):")
        print(f"PDF: {pdfp}")
        print(f"CSV: {csvp}")
    else:
        print("\nSuccess! Your transcripts are ready (both privacy modes):")
        for mode, pdfp, csvp in outs:
            print(f"PDF ({mode}): {pdfp}")
            print(f"CSV ({mode}): {csvp}")