# Scraping City Council Meeting Videos

## Setup

In [1]:
import re
import json
import math
import bisect
import requests
import datetime
from pathlib import Path

import yt_dlp
import ffmpeg
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

## Helper Functions
efore doing anything, define some helper functions that will be useful

In [2]:
def get_video_duration(vidpath: Path): 
    """ Returns the length of a video """
    try:
        probe = ffmpeg.probe(vidpath, v="error", show_entries="format=duration")
        duration = float(probe["format"]["duration"])
        return duration
    except ffmpeg.Error as e:
        print(e.stderr.decode())
        return None

def closest_different(lst, x, mode="smaller"): 
    """ Find the closest different element in a list """
    lst = sorted(lst)

    if mode=="smaller":
        i = bisect.bisect_left(lst, x)
        return lst[i-1] if i > 0 else lst[0]
    elif mode=="bigger": 
        i = bisect.bisect_right(lst, x)
        return lst[i+1] if i < len(lst) - 1 else lst[-1]
    else: 
        raise ValueError("mode must be 'bigger' or 'smaller'")

def split_if_longer(times: tuple[int], filename: Path, mx: int = 1800):
    """ Split a clip longer than 30 minutes in half. Times are in seconds """
    start, end = times
    duration = start-end

    if duration > mx: 
        mid = start + duration // 2
        clips = split_if_longer(times, filename)
    else: 
        return [(times, filename)]
    
def clip_video(inpath: Path, outpath: Path, start: int, end: int):
    """ Clips a video using ffmpeg """
    outpath = str(outpath)

    try:
        ffmpeg.input(inpath, ss=start, to=end).output(outpath, c="copy").run(overwrite_output=True, capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e: 
        print(e.stderr.decode())

def concat_videos(v1, v2, outpath: Path):
    """ Concatenates two video files using demux """

    if v1.parent != v2.parent: 
        raise ValueError("Input files must be in the same directory")
    
    tmp_file = v1.parent / "filelist.txt"
    
    with open(tmp_file, "w") as f:
        f.write(f"file '{v1.name}'\nfile '{v2.name}'\n")

    outpath = str(outpath)

    #concat video
    try:
        ffmpeg.input(tmp_file, format='concat', safe=0).output(outpath, c='copy').run(overwrite_output=True, capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e:
        print(e.stderr.decode())

    #remove temp files
    tmp_file.unlink()   

    

## Downloading video data

First we need to extract all unique meetings. I use groupby and just keep the first elements (since the agenda items don't actually matter here), I then drop any column related to voting

In [3]:
dataset = pd.read_csv("./toronto_city_council_2462.csv")

In [4]:
meeting_df = dataset.copy()
meeting_df = meeting_df.groupby("meetingId", as_index=False).first()

# Drop unecessary 
meeting_df = meeting_df[[c for c in meeting_df.columns if "item" not in c]]
meeting_df = meeting_df[[c for c in meeting_df.columns if "vote" not in c]]

To make my life simpler, I will only look at the three most recent meetings

In [5]:
meeting_df = meeting_df.sort_values("meetingStartDate", ascending=False).iloc[1:4]
meeting_df

Unnamed: 0,meetingId,meetingCd,meetingStartDate,meetingEndDate,meetingSpecialFlag,meetingYear,meetingUrl
17,24390,2024.CC22,1728446400000,1728532800000,False,2024,https://secure.toronto.ca/council/api/individu...
20,24393,2024.CC21,1725508800000,1725508800000,True,2024,https://secure.toronto.ca/council/api/individu...
16,24389,2024.CC20,1721793600000,1721880000000,False,2024,https://secure.toronto.ca/council/api/individu...


Then we loop through all meetings and download the corresponding videos

In [6]:
output_dir = Path("./full_videos/")
output_dir.mkdir(parents=True, exist_ok=True)

In [12]:
for _, r in meeting_df.iterrows():
    response = requests.get(r["meetingUrl"])
    meeting_json = response.json()

    for video in meeting_json["Record"]["videoArchives"]:
        start = video["startDateTime"]
        
        video_url = video["url"]

        # Get the end time of the video
        ydl_opts = {
            'format': 'worst',
            'quiet': True,
            'no_warnings': False,
            # Extract metadata first without downloading
            'skip_download': True
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl: 
                info = ydl.extract_info(video_url, download=False)
                duration = info.get('duration')  # Duration in seconds

        end = start + duration*1000

        # Download and name the files
        ydl_opts.update({
            'skip_download': False,
            'outtmpl': f"{str(output_dir)}/{r["meetingId"]}-{start}-{end}.%(ext)s"
        })

        with yt_dlp.YoutubeDL(ydl_opts) as ydl: 
            ydl.download([video_url])
        

                                                            

### Splitting and labelling video data
The videos have been downloaded, but they are full videos! The tricky thing is cutting them to match the agenda items.

First, we get a dataframe of all agenda items that have consideration start times (i.e. they show up in the youtube videos).

In [7]:
subset_ids = meeting_df["meetingId"].to_list()
item_df = dataset[dataset["meetingId"].isin(subset_ids)].copy()

# We don't need any vote information,
item_df = item_df.groupby("itemId", as_index=False).first()
item_df = item_df[[c for c in item_df.columns if "vote" not in c]]

# We don't need to consider items that don't show up in the youtube videos
item_df = item_df[item_df["itemConsiderStartTime"] > 0].copy()

The next step is to loop through every meeting and cut the videos according to agenda items

In [8]:
input_dir = Path("./full_videos/")

output_dir = Path("./agenda_clips/")
output_dir.mkdir(exist_ok=True)

In [9]:
for m_id, i_df in tqdm(item_df.groupby("meetingId"), leave=False):
    
    # Get all relevant agenda items and sort them earliest to latest
    items = list(zip(i_df["itemId"], i_df["itemCd"], i_df["itemConsiderStartTime"]))
    items.sort(key=lambda x: x[2])

    # Collect metadata of relevant videos
    vid_files = {}
    for f in input_dir.glob(f"{m_id}*.mp4"):
        metadata = f.stem.split("-") # meetingId-startTime-endTime
        vid_files[int(metadata[1])] = {
            "path": f,
            "end_time": int(metadata[2])
        }

    # loop through all items
    for j, item in enumerate(tqdm(items, leave=False)):
        
        # Get the video the agenda item starts in
        item_start = item[2]
        v1_start = closest_different(vid_files.keys(), item_start)
        v1_file = vid_files[v1_start]["path"]
        
        # Get the video the agenda item ends in
        if j == len(items)-1: # if its the last agenda item of the meeting then the end is just the end of the videos
            v2_start = sorted(vid_files.keys())[-1]
            item_end = vid_files[v2_start]["end_time"] 
        else: # otherwise the end time is the start time of the next item
            item_end = closest_different([x[2] for x in items], item[2], mode="bigger")
            v2_start = closest_different(vid_files.keys(), item_end)
        v2_file = vid_files[v2_start]["path"]

            
        # Generate the clips for every video
        if v2_file != v1_file: # If the agenda item is spread over two video files
            v1_end = vid_files[v1_start]["end_time"]
            
            # Convert from datetime milliseconds to the reference frame of the video
            times_1 = (item_start - v1_start, v1_end - v1_start)
            times_2 = (0, item_end - v2_start)      

            times_1 = tuple(int(x / 1000) for x in times_1)
            times_2 = tuple(int(x / 1000) for x in times_2)

            clips = [(times_1, v1_file), (times_2, v2_file)]
        else: 
            times = (item_start - v1_start, item_end - v1_start)
            times = tuple(int(x/1000) for x in times)

            clips = [(times, v1_file)]

        
        # Gemini can only handle file up to 50 min. Lets clip to a max of 30min to be safe
        final_clips = []
        max_duration = 1800
        for t, file in clips: 
            duration = t[1]-t[0]
            n = math.ceil(duration / max_duration)
            new_duration = duration / n
            for i in range(n):
                start_time = math.floor(t[0] + i * new_duration)
                end_time = math.ceil(t[0] + min((i+1) * new_duration, duration))
        
                final_clips.append(((start_time, end_time), file))

        # Finally, generate the clips
        clips = final_clips
        for t, input_file in clips:
            clip_start = item[2] + t[0]
            output_file = output_dir / f"{item[0]}-{item[1]}-{item[2]}-{clip_start}{v1_file.suffix}"
            clip_video(input_file, output_file, t[0], t[1])

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/77 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

In [16]:
clips = [((0,2000), "f")]

final_clips = []
max_duration = 1800
for t, file in clips: 
    duration = t[1]-t[0]
    n = math.ceil(duration / max_duration)
    new_duration = duration / n
    for i in range(n):
        start_time = math.floor(i * new_duration)
        end_time = math.ceil(min((i+1) * new_duration, duration))

        final_clips.append(((start_time, end_time), file))

print(final_clips)

[((0, 1000), 'f'), ((1000, 2000), 'f')]


### Checking clip lengths

In [23]:
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'iframe'

In [25]:
filename = []
duration = []
for file in Path("./agenda_clips/").glob("*.mp4"): 
    probe = ffmpeg.probe(str(file))

    filename.append(str(file))
    duration.append(float(probe['format']['duration']) / 60)

d = pd.DataFrame(data = zip(filename, duration), columns = ["filename", "duration"])

In [26]:
fig = px.histogram(duration, nbins=20)
fig.show()

In [54]:
subset_ids=[24390]

item_df = dataset[dataset["meetingId"].isin(subset_ids)].copy()

# We don't need any vote information,
item_df = item_df.groupby("itemId", as_index=False).first()
item_df = item_df[[c for c in item_df.columns if "vote" not in c]]

# We don't need to consider items that don't show up in the youtube videos
item_df = item_df[item_df["itemConsiderStartTime"] > 0].copy()
item_df

Unnamed: 0,itemId,meetingId,meetingCd,meetingStartDate,meetingEndDate,meetingSpecialFlag,meetingYear,meetingUrl,itemCd,itemCategory,...,itemNativeStatus,itemWards,itemInCameraFlag,itemStatutoryFlag,itemConsiderStartTime,itemUrl,itemVoteFlag,itemVoteType,itemVoteDescription,itemVoteResult
0,136118,24390,2024.CC22,1728446400000,1728532800000,False,2024,https://secure.toronto.ca/council/api/individu...,2024.EC14.11,Deferred Item,...,ADOPTED,All,False,False,1728507776000,https://secure.toronto.ca/council/agenda-item....,True,,,
1,136502,24390,2024.CC22,1728446400000,1728532800000,False,2024,https://secure.toronto.ca/council/api/individu...,2024.RM22.1,Routine Matters,...,ADOPTED,All,False,False,1728481191000,https://secure.toronto.ca/council/agenda-item....,True,,,
2,136503,24390,2024.CC22,1728446400000,1728532800000,False,2024,https://secure.toronto.ca/council/api/individu...,2024.RM22.2,Routine Matters,...,ADOPTED,All,False,False,1728482685000,https://secure.toronto.ca/council/agenda-item....,True,,,
3,136505,24390,2024.CC22,1728446400000,1728532800000,False,2024,https://secure.toronto.ca/council/api/individu...,2024.RM22.3,Routine Matters,...,ADOPTED,All,False,False,1728482720000,https://secure.toronto.ca/council/agenda-item....,True,,,
4,136506,24390,2024.CC22,1728446400000,1728532800000,False,2024,https://secure.toronto.ca/council/api/individu...,2024.RM22.4,Routine Matters,...,RECEIVED,All,False,False,1728483661000,https://secure.toronto.ca/council/agenda-item....,True,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,137117,24390,2024.CC22,1728446400000,1728532800000,False,2024,https://secure.toronto.ca/council/api/individu...,2024.MM22.35,Member Motions,...,ADOPTED,10,False,False,1728603189000,https://secure.toronto.ca/council/agenda-item....,True,,,
158,137122,24390,2024.CC22,1728446400000,1728532800000,False,2024,https://secure.toronto.ca/council/api/individu...,2024.MM22.31,Member Motions,...,ADOPTED,24,False,False,1728586087000,https://secure.toronto.ca/council/agenda-item....,False,Adopt Item,Majority required - MM22.31 - Adopt the item,"Carried, 23-1"
159,137125,24390,2024.CC22,1728446400000,1728532800000,False,2024,https://secure.toronto.ca/council/api/individu...,2024.MM22.32,Member Motions,...,AMENDED,10,False,False,1728586181000,https://secure.toronto.ca/council/agenda-item....,True,,,
160,137132,24390,2024.CC22,1728446400000,1728532800000,False,2024,https://secure.toronto.ca/council/api/individu...,2024.MM22.33,Member Motions,...,ADOPTED,All,False,False,1728586242000,https://secure.toronto.ca/council/agenda-item....,True,,,


In [72]:
for m_id, i_df in item_df.groupby("meetingId"):
    
    # Get all relevant agenda items and sort them earliest to latest
    items = list(zip(i_df["itemId"], i_df["itemCd"], i_df["itemConsiderStartTime"]))
    items.sort(key=lambda x: x[2])

    l = [(j, x) for j, x in enumerate(items) if x[1]=="2024.RM22.4"]

    # Collect metadata of relevant videos
    vid_files = {}
    for f in input_dir.glob(f"{m_id}*.mp4"):
        metadata = f.stem.split("-") # meetingId-startTime-endTime
        vid_files[int(metadata[1])] = {
            "path": f,
            "end_time": int(metadata[2])
        }


    for j, item in enumerate(items):
        # Get the video the agenda item starts in
        item_start = item[2]
        v1_start = closest_different(vid_files.keys(), item_start)
        v1_file = vid_files[v1_start]["path"]
        
        # Get the video the agenda item ends in
        if j == len(items)-1: # if its the last agenda item of the meeting then the end is just the end of the videos
            v2_start = sorted(vid_files.keys())[-1]
            item_end = vid_files[v2_start]["end_time"] 
        else: # otherwise the end time is the start time of the next item
            item_end = closest_different([x[2] for x in items], item[2], mode="bigger")
            v2_start = closest_different(vid_files.keys(), item_end)
        v2_file = vid_files[v2_start]["path"]
    
            
        # Generate the clips for every video
        if v2_file != v1_file: # If the agenda item is spread over two video files
            v1_end = vid_files[v1_start]["end_time"]
            
            # Convert from datetime milliseconds to the reference frame of the video
            times_1 = (item_start - v1_start, v1_end - v1_start)
            times_2 = (0, item_end - v2_start)      
    
            times_1 = tuple(int(x / 1000) for x in times_1)
            times_2 = tuple(int(x / 1000) for x in times_2)
    
            clips = [(times_1, v1_file), (times_2, v2_file)]
        else: 
            times = (item_start - v1_start, item_end - v1_start)
            times = tuple(int(x/1000) for x in times)
    
            clips = [(times, v1_file)]
    
        print(clips)
    
        # Gemini can only handle file up to 50 min. Lets clip to a max of 30min to be safe
        final_clips = []
        max_duration = 1800
        for t, file in clips: 
            duration = t[1]-t[0]
            n = math.ceil(duration / max_duration)
            new_duration = duration / n
            for i in range(n):
                start_time = math.floor(i * new_duration)
                end_time = math.ceil(min((i+1) * new_duration, duration))
        
                final_clips.append(((start_time, end_time), file))
    
        print(clips)
        
        for t, input_file in clips:
            print(t, input_file)
            clip_start = item[2] + t[0]
            output_file = Path("./temp/")
            output_file.mkdir(exist_ok = True)
            output_file = output_file / f"{item[0]}-{item[1]}-{item[2]}-{clip_start}{v1_file.suffix}"
            clip_video(input_file, output_file, t[0], t[1])

[((591, 2085), WindowsPath('full_videos/24390-1728480600000-1728490489000.mp4'))]
[((591, 2085), WindowsPath('full_videos/24390-1728480600000-1728490489000.mp4'))]
(591, 2085) full_videos\24390-1728480600000-1728490489000.mp4
[((826, 2120), WindowsPath('full_videos/24390-1728480600000-1728490489000.mp4'))]
[((826, 2120), WindowsPath('full_videos/24390-1728480600000-1728490489000.mp4'))]
(826, 2120) full_videos\24390-1728480600000-1728490489000.mp4
[((2085, 3061), WindowsPath('full_videos/24390-1728480600000-1728490489000.mp4'))]
[((2085, 3061), WindowsPath('full_videos/24390-1728480600000-1728490489000.mp4'))]
(2085, 3061) full_videos\24390-1728480600000-1728490489000.mp4
[((2120, 3077), WindowsPath('full_videos/24390-1728480600000-1728490489000.mp4'))]
[((2120, 3077), WindowsPath('full_videos/24390-1728480600000-1728490489000.mp4'))]
(2120, 3077) full_videos\24390-1728480600000-1728490489000.mp4
[((3061, 3089), WindowsPath('full_videos/24390-1728480600000-1728490489000.mp4'))]
[((3061

In [None]:
l = [x for x in 