In [3]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta
import csv
from datetime import datetime


In [None]:

# Index page that main() downloads and scans for links to individual meetings.
BASE_URL = "https://www.adminmonitor.com/tx/puct/open_meeting/"

# File that main() passes to save_links_to_csv() as the output path.
CSV_FILE = "puct_meeting_links.csv"

def fetch_meeting_page(url, timeout=10):
    """Fetch the HTML content from the given URL.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text, or an empty string when the request
        fails (connection error, timeout, or an HTTP error status). The
        failure is printed rather than raised.
    """
    try:
        # requests has no default timeout; without one a stalled server
        # would block this call (and the notebook cell) indefinitely.
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they hit the handler below.
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return ""

def parse_meeting_links(html, cutoff_date=datetime(2021, 1, 1)):
    """Extract dated meeting links from the index page HTML.

    Only root-relative hrefs of the form ``/tx/puct/open_meeting/YYYYMMDD``
    (with or without a trailing slash) are recognised.

    Args:
        html: HTML text of the meeting index page.
        cutoff_date: Earliest meeting date to keep (inclusive). Defaults to
            1 January 2021.

    Returns:
        A list of ``(date, url)`` tuples, where ``date`` is formatted as
        ``YYYY-MM-DD`` and ``url`` is absolute, sorted newest first. A link
        that appears several times on the page is returned several times.
    """
    soup = BeautifulSoup(html, "html.parser")
    meeting_links = []

    # loop through all <a> tags with an href attribute
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        # check if the href matches the meeting URL pattern
        if "/tx/puct/open_meeting/" not in href:
            continue

        parts = href.strip("/").split("/")
        if len(parts) != 4:  # expecting something like: tx/puct/open_meeting/YYYYMMDD
            continue

        try:
            meeting_date = datetime.strptime(parts[-1], "%Y%m%d")
        except ValueError:
            # if the date string isn't in the expected format, skip it
            continue

        if meeting_date >= cutoff_date:
            full_url = f"https://www.adminmonitor.com{href}"
            meeting_links.append((meeting_date.strftime("%Y-%m-%d"), full_url))

    # ISO dates sort chronologically as plain strings; newest first
    return sorted(meeting_links, key=lambda x: x[0], reverse=True)

def save_links_to_csv(links, filename):
    """Save the list of meeting links to a CSV file.

    Args:
        links: Iterable of ``(date, url)`` pairs.
        filename: Path of the CSV file to create; an existing file is
            overwritten.

    The file gets a ``Date,URL`` header row followed by one row per link,
    and a one-line summary is printed when writing is done.
    """
    # Write UTF-8 explicitly so the file does not depend on the platform's
    # default encoding; newline="" lets the csv module control line endings.
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Date", "URL"])
        for date, url in links:
            writer.writerow([date, url])
    print(f"Saved {len(links)} meeting links to {filename}")

def main():
    """Download the meeting index, extract the dated links and store them as CSV.

    Prints a message and stops when the page cannot be fetched or when no
    matching links are found; otherwise the links are written to CSV_FILE.
    """
    page_html = fetch_meeting_page(BASE_URL)
    if page_html:
        found_links = parse_meeting_links(page_html)
        if not found_links:
            print("No meeting links found.")
        else:
            save_links_to_csv(found_links, CSV_FILE)
    else:
        # fetch_meeting_page signals failure with an empty string
        print("Failed to retrieve the meeting page.")

if __name__ == "__main__":
    main()

Saved 315 meeting links to puct_meeting_links.csv


In [None]:
# Input of filter_meetings(); same filename as CSV_FILE written by the scraping cell.
INPUT_CSV = 'puct_meeting_links.csv'
# Output of filter_meetings(): the de-duplicated rows.
OUTPUT_CSV = 'puct_meeting_links_filtered.csv'

def filter_meetings(input_csv, output_csv):
    """Copy the meeting list to a new CSV without duplicates or malformed dates.

    Args:
        input_csv: Path of a CSV file with at least ``Date`` (``YYYY-MM-DD``)
            and ``URL`` columns.
        output_csv: Path of the CSV file to create; an existing file is
            overwritten.

    Rows whose ``Date`` is not a valid ``YYYY-MM-DD`` date are dropped, and
    only the first row for each ``(Date, URL)`` pair is kept. The output has
    exactly the ``Date`` and ``URL`` columns, sorted by date with the latest
    first; rows sharing a date keep their input order.
    """
    fieldnames = ['Date', 'URL']
    unique_meetings = {}

    # read from the input CSV
    with open(input_csv, newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            date_str = row['Date']  # expecting format YYYY-MM-DD
            url = row['URL']
            try:
                # parsed only to validate the format; the string itself is kept
                datetime.strptime(date_str, '%Y-%m-%d')
            except (TypeError, ValueError):
                # skip rows with a missing or invalid date
                continue

            # Store just the two output columns so that extra input columns
            # cannot make DictWriter raise when the row is written.
            unique_meetings.setdefault((date_str, url), {'Date': date_str, 'URL': url})

    # sort by date descending (latest first); ISO dates sort correctly as text
    sorted_meetings = sorted(unique_meetings.values(), key=lambda x: x['Date'], reverse=True)

    # write the filtered, unique meetings to a new CSV file
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(sorted_meetings)

    print(f"Filtered meetings written to {output_csv}")

# Run the de-duplication step; the guard keeps it from running if this code is imported.
if __name__ == '__main__':
    filter_meetings(INPUT_CSV, OUTPUT_CSV)

Filtered meetings written to puct_meeting_links_filtered.csv


In [None]:
import requests
import csv
import re
import time

# NOTE: these rebind INPUT_CSV / OUTPUT_CSV from the filtering cell; in a shared
# kernel, re-running that cell's filter_meetings(INPUT_CSV, OUTPUT_CSV) call
# afterwards would pick up the values below.

# Input read by main(); same filename the filtering cell writes.
INPUT_CSV = 'puct_meeting_links_filtered.csv'
# Output written by main(): the input rows plus an 'm3u8_url' column.
OUTPUT_CSV = 'puct_meeting_links_with_m3u8.csv'

def extract_m3u8(url):
    """
    Download the page at `url` and return the first absolute http(s) URL in
    its HTML that ends in 'master.m3u8'.

    Returns an empty string when the page contains no such URL, or when
    anything goes wrong while fetching it (the error is printed).
    """
    playlist_pattern = r'(https?://[^\'" >]+master\.m3u8)'
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        first_hit = re.search(playlist_pattern, page.text)
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return ""

    if first_hit is None:
        return ""
    return first_hit.group(1)

def main():
    """Look up the master.m3u8 stream URL for every meeting in INPUT_CSV.

    Reads all rows from INPUT_CSV, calls extract_m3u8() on each row's 'URL',
    and writes the rows to OUTPUT_CSV with the result in an 'm3u8_url'
    column (empty when no stream URL was found). Progress is printed per
    row, and there is a one-second pause between requests.

    If INPUT_CSV has no header row, a message is printed and no output file
    is written.
    """
    with open(INPUT_CSV, newline='', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)
        rows = list(reader)
        # Read the header while the file is still open: on an empty file
        # DictReader has not cached it, and asking again after the file is
        # closed raises instead of returning None.
        header = reader.fieldnames

    if not header:
        print(f"{INPUT_CSV} has no header row; nothing to process.")
        return

    total = len(rows)
    print(f"Found {total} meeting rows to process.")

    fieldnames = list(header)
    # Avoid a duplicated column when the input already carries 'm3u8_url'.
    if 'm3u8_url' not in fieldnames:
        fieldnames.append('m3u8_url')

    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for index, row in enumerate(rows, start=1):
            meeting_url = row['URL']
            print(f"[{index}/{total}] Processing: {meeting_url}", end=" ... ")

            m3u8_url = extract_m3u8(meeting_url)
            if m3u8_url:
                print("Found master.m3u8.")
            else:
                print("No master.m3u8 found.")

            row['m3u8_url'] = m3u8_url
            writer.writerow(row)
            # be polite to the server with a short delay
            time.sleep(1)

    print(f"Finished processing. Results saved to {OUTPUT_CSV}")

if __name__ == "__main__":
    main()

Found 102 meeting rows to process.
[1/102] Processing: https://www.adminmonitor.com/tx/puct/open_meeting/20250313/ ... Found master.m3u8.
[2/102] Processing: https://www.adminmonitor.com/tx/puct/open_meeting/20250220/ ... Found master.m3u8.
[3/102] Processing: https://www.adminmonitor.com/tx/puct/open_meeting/20250213/ ... Found master.m3u8.
[4/102] Processing: https://www.adminmonitor.com/tx/puct/open_meeting/20250131/ ... Found master.m3u8.
[5/102] Processing: https://www.adminmonitor.com/tx/puct/open_meeting/20250116/ ... Found master.m3u8.
[6/102] Processing: https://www.adminmonitor.com/tx/puct/open_meeting/20241219/ ... Found master.m3u8.
[7/102] Processing: https://www.adminmonitor.com/tx/puct/open_meeting/20241212/ ... Found master.m3u8.
[8/102] Processing: https://www.adminmonitor.com/tx/puct/open_meeting/20241121/ ... Found master.m3u8.
[9/102] Processing: https://www.adminmonitor.com/tx/puct/open_meeting/20241114/ ... Found master.m3u8.
[10/102] Processing: https://www.admin

In [None]:
import os
import csv
import json
import subprocess
from pydub import AudioSegment
import openai



# Take the API key from the environment so it is never written into the notebook.
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    # Stop here instead of reaching the transcription calls without a key.
    raise ValueError("OpenAI API key not set. Please set the OPENAI_API_KEY environment variable.")
else:   
    print("OpenAI API key has been set.\n")


# Input table read by main(); it looks up the "Date" and "m3u8_url" keys of each row.
csv_file = "puct_meeting_links_with_m3u8.csv"  # csv with columns Date, URL, m3u8_url
# Folder where main() stores "<date>.mp3" for each meeting.
mp3_dir = "mp3_files"
# Folder where main() stores "<date>_transcript.json" for each meeting.
transcripts_dir = "transcripts"


# Create both output folders up front; exist_ok=True makes re-running the cell harmless.
os.makedirs(mp3_dir, exist_ok=True)
os.makedirs(transcripts_dir, exist_ok=True)


# Largest size, in megabytes, that split_audio() allows for one exported chunk.
CHUNK_SIZE_MB = 25


def convert_m3u8_to_mp3(m3u8_url, output_mp3_path):
    """
    Uses ffmpeg to download the audio from the m3u8 stream and converts it to an MP3 file.

    The audio is written as 16 kHz mono. ffmpeg writes to a temporary
    "<name>.part<ext>" file next to the target, which is renamed to
    `output_mp3_path` only after ffmpeg exits successfully. A failed or
    interrupted run therefore never leaves a truncated file at
    `output_mp3_path` that a later run could mistake for a finished download.

    Args:
        m3u8_url: URL of the HLS playlist to read.
        output_mp3_path: Destination path of the MP3 file (overwritten if present).

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        FileNotFoundError: the ffmpeg executable could not be found.
    """
    # Keep the original extension on the temporary name so ffmpeg still
    # infers the output format from it.
    stem, ext = os.path.splitext(output_mp3_path)
    partial_path = f"{stem}.part{ext}"

    print(f"Extracting audio from m3u8 URL:\n{m3u8_url}")
    ffmpeg_cmd = [
        "ffmpeg", "-y",               # Overwrite without prompting
        "-i", m3u8_url,               # Input m3u8 URL
        "-vn",                        # No video
        "-ar", "16000",               # Set audio sample rate to 16kHz
        "-ac", "1",                   # Mono audio
        partial_path
    ]
    print("Running ffmpeg command:")
    print(" ".join(ffmpeg_cmd))
    try:
        subprocess.run(ffmpeg_cmd, check=True)
        # Publish the finished file in one step.
        os.replace(partial_path, output_mp3_path)
    finally:
        # Still present only if ffmpeg failed or the run was interrupted.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Audio extracted and saved as {output_mp3_path}\n")


def split_audio(file_path):
    """
    Splits an MP3 file into chunks whose MP3 export is at most CHUNK_SIZE_MB.

    Each candidate chunk is exported to a scratch file to measure its encoded
    size; a chunk that is too large is cut in half by duration and both
    halves are checked again.

    Args:
        file_path: Path of the MP3 file to load.

    Returns:
        A list of AudioSegment chunks in chronological order. Callers rely on
        that order to compute timestamp offsets.
    """
    import tempfile

    max_size_bytes = CHUNK_SIZE_MB * 1024 * 1024
    audio = AudioSegment.from_mp3(file_path)
    chunks = []
    # Stack of pieces still to check, earliest piece on top. Working
    # depth-first keeps the output chronological; a FIFO queue would emit a
    # later piece before the halves of an earlier one that needed another split.
    pending = [audio]

    # A private scratch directory avoids clobbering files in the working
    # directory and is removed even if an export fails.
    with tempfile.TemporaryDirectory() as scratch_dir:
        probe_path = os.path.join(scratch_dir, "temp_check.mp3")

        while pending:
            chunk = pending.pop()
            # export to disk only to learn the encoded size
            chunk.export(probe_path, format="mp3")
            chunk_size = os.path.getsize(probe_path)

            # len(chunk) is in milliseconds; a 1 ms piece cannot be halved further.
            if chunk_size > max_size_bytes and len(chunk) > 1:
                midpoint = len(chunk) // 2
                # push the later half first so the earlier half is popped next
                pending.append(chunk[midpoint:])
                pending.append(chunk[:midpoint])
            else:
                chunks.append(chunk)

    print(f"Total chunks created: {len(chunks)}\n")
    return chunks


def transcribe_audio(audio_chunk, chunk_index):
    """
    Transcribes an in-memory audio chunk using the OpenAI Whisper API.

    The chunk is exported to 'temp_chunk.mp3' in the working directory,
    uploaded, and the file is removed again, including when the export or
    the API call fails.

    Args:
        audio_chunk: AudioSegment to transcribe.
        chunk_index: Number used only in the progress messages.

    Returns:
        The "segments" entry of the verbose JSON response.

    Raises:
        Any exception from the export or the API call is propagated to the caller.
    """
    temp_file = "temp_chunk.mp3"
    try:
        audio_chunk.export(temp_file, format="mp3")
        print(f"Transcribing chunk {chunk_index} (temporary file: {temp_file})...")
        with open(temp_file, "rb") as audio_file:
            response = openai.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                response_format="verbose_json",
                timestamp_granularities=["segment"]
            )
    finally:
        # Clean up on failure too, so a failed request leaves no stray file behind.
        if os.path.exists(temp_file):
            os.remove(temp_file)
    print(f"Finished transcribing chunk {chunk_index}\n")

    return response.model_dump()["segments"]


def process_meeting(mp3_path, transcript_json_path):
    """
    Processes an MP3 file:
      - If its size exceeds 25MB, split it into chunks.
      - Transcribe each chunk.
      - Merge the transcriptions (adjusting timestamps by chunk offset).
      - Save the final transcript as JSON.

    Args:
        mp3_path: Path of the meeting audio.
        transcript_json_path: Where to write the transcript, a JSON list of
            {"text", "start", "duration"} objects with times in seconds
            from the start of the meeting.

    A chunk whose transcription fails is reported and left out of the
    transcript, but its duration still counts towards the offset so the
    timestamps of later chunks stay aligned with the audio. A failure to
    write the JSON file is printed, not raised.
    """
    print(f"Processing MP3 file: {mp3_path}")
    if os.path.getsize(mp3_path) > 25_000_000:
        print("File exceeds 25MB; splitting into chunks...")
        chunks = split_audio(mp3_path)
    else:
        print("File is within size limits; processing as a single chunk.\n")
        chunks = [AudioSegment.from_mp3(mp3_path)]

    full_transcript = []
    failed_chunks = 0
    cumulative_offset = 0.0  # in seconds
    for idx, chunk in enumerate(chunks):
        print(f"Transcribing chunk {idx+1}/{len(chunks)}...")
        # duration of this chunk in seconds (len() of an AudioSegment is in ms)
        chunk_duration_sec = len(chunk) / 1000.0
        try:
            segments = transcribe_audio(chunk, idx)
        except Exception as e:
            print(f"Error transcribing chunk {idx}: {e}")
            failed_chunks += 1
            # The audio of this chunk still exists; skipping the offset here
            # would shift every later segment earlier by this chunk's length.
            cumulative_offset += chunk_duration_sec
            continue
        print(f"Chunk {idx+1} duration: {chunk_duration_sec:.2f} seconds. Applying offset: {cumulative_offset:.2f} seconds.")
        for seg in segments:
            seg_start = float(seg.get("start", 0)) + cumulative_offset
            seg_end = float(seg.get("end", 0)) + cumulative_offset
            full_transcript.append({
                "text": seg.get("text", "").strip(),
                "start": round(seg_start, 4),
                "duration": round(seg_end - seg_start, 4)
            })
        cumulative_offset += chunk_duration_sec

    if failed_chunks:
        print(f"Warning: {failed_chunks} of {len(chunks)} chunks could not be transcribed; the transcript is incomplete.")

    try:
        with open(transcript_json_path, "w", encoding="utf-8") as f:
            json.dump(full_transcript, f, indent=4)
        print(f"Transcript saved to {transcript_json_path}\n")
    except Exception as e:
        print(f"Failed to write transcript JSON for {mp3_path}: {e}")


def main():
    """Download and transcribe every meeting listed in csv_file.

    For each row: skip it when it has no m3u8 URL or when its transcript
    already exists; download the audio unless the MP3 is already on disk;
    then transcribe it. Errors are printed and the loop moves on to the
    next meeting.
    """
    with open(csv_file, newline="", encoding="utf-8") as handle:
        meetings = list(csv.DictReader(handle))

    print(f"Found {len(meetings)} meetings\n")

    for idx, row in enumerate(meetings, start=1):
        meeting_date = row.get("Date", "unknown").replace("/", "-")
        stream_url = row.get("m3u8_url", "").strip()
        if not stream_url:
            print(f"[{idx}] No URL for {meeting_date}, skipping.\n")
            continue

        # Both output files are named after the meeting date.
        mp3_path = os.path.join(mp3_dir, f"{meeting_date}.mp3")
        transcript_json_path = os.path.join(transcripts_dir, f"{meeting_date}_transcript.json")

        if os.path.exists(transcript_json_path):
            print(f"[{idx}] Transcript already exists for {meeting_date}, skipping transcription.\n")
            continue

        if not os.path.exists(mp3_path):
            print(f"[{idx}] Converting m3u8 stream for {meeting_date} to MP3...")
            try:
                convert_m3u8_to_mp3(stream_url, mp3_path)
            except Exception as e:
                print(f"Error converting m3u8 for {meeting_date}: {e}\n")
                continue
        else:
            print(f"[{idx}] MP3 already exists for {meeting_date}, skipping conversion.\n")

        try:
            process_meeting(mp3_path, transcript_json_path)
            print(f"[{idx}] Completed {meeting_date}\n")
        except Exception as e:
            print(f"Error processing meeting on {meeting_date}: {e}\n")

# Downloading and transcribing every meeting is slow and calls the paid
# OpenAI API, so the pipeline runs only when this switch is turned on.
# It defaults to off, matching the previously commented-out main() call.
RUN_TRANSCRIPTION = False

if __name__ == "__main__":
    if RUN_TRANSCRIPTION:
        main()
        print("----- All meetings processed -----")
    else:
        # Do not claim the meetings were processed when nothing was run.
        print("Transcription skipped: set RUN_TRANSCRIPTION = True to process the meetings.")

✅ OpenAI API key has been set.

----- All meetings processed -----
