<a href="https://colab.research.google.com/github/henricountevans/henricountevans/blob/main/YouTube_Transcript_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1: Install required libraries (if not already installed)
!pip install youtube_transcript_api pandas

# Cell 2: Import required libraries
import sys
import os
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from youtube_transcript_api.formatters import TextFormatter

# Cell 3: Define functions
def get_video_id(url):
    """Extract the video ID from a YouTube URL.

    Recognised forms:
      * short links:  https://youtu.be/<id>[?query]
      * watch links:  https://www.youtube.com/watch?v=<id>[&more]
      * path links:   https://www.youtube.com/{shorts,embed,live,v}/<id>

    The scheme is optional ("youtu.be/<id>" is accepted).

    Args:
        url: The YouTube URL as a string.

    Returns:
        The video ID string.

    Raises:
        ValueError: If the host is not a YouTube host or no video ID can be
            located in the URL.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import parse_qs, urlparse

    candidate = url.strip()
    if "://" not in candidate:
        # urlparse only fills in the host when a scheme is present.
        candidate = "https://" + candidate.lstrip("/")

    parsed = urlparse(candidate)
    host = (parsed.hostname or "").lower()
    path_segments = [segment for segment in parsed.path.split("/") if segment]

    # Dispatch on the real hostname, not on a substring of the whole URL:
    # a watch link carrying "&feature=youtu.be" must not be treated as a
    # short link.
    if host == "youtu.be" or host.endswith(".youtu.be"):
        if path_segments:
            # The path holds the ID; query string and fragment are excluded.
            return path_segments[0]
    elif host == "youtube.com" or host.endswith(".youtube.com"):
        # parse_qs drops blank values, so "v=" with no ID falls through.
        ids = parse_qs(parsed.query).get("v")
        if ids:
            return ids[0]
        if (
            len(path_segments) >= 2
            and path_segments[0] in ("shorts", "embed", "live", "v")
        ):
            return path_segments[1]

    raise ValueError("Invalid YouTube URL")

def fetch_transcript(video_id):
    """Fetch the transcript for a given video ID.

    Calls ``YouTubeTranscriptApi.get_transcript`` and hands its result back
    unchanged. Every failure is reported on stdout rather than raised.

    Args:
        video_id: The YouTube video ID to look up.

    Returns:
        The raw value returned by ``get_transcript`` on success, or ``None``
        if transcripts are disabled, no transcript exists, or any other
        exception occurs.
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id)
    except TranscriptsDisabled:
        print("Transcripts are disabled for this video.")
    except NoTranscriptFound:
        print("No transcript found for this video.")
    except Exception as err:
        # Best-effort boundary: report the problem and let the caller move on.
        print(f"An error occurred: {str(err)}")
    else:
        return segments  # Raw transcript data, untouched

    # Reached only after one of the handlers above has printed its message.
    return None

# Cell 4: Main execution
# Downloads the transcript of every video listed below, flattens all
# transcript lines into one table (one row per line) and writes it to a CSV.

# List of YouTube URLs with their corresponding source names
video_data = [
    {"url": "https://www.youtube.com/watch?v=oi6E3tsvelA", "source": "SABC"},
    {"url": "https://www.youtube.com/watch?v=uTzHc8l3BOM", "source": "SABC"},
    {"url": "https://www.youtube.com/watch?v=yNOyu1RMUHY", "source": "Newzroom Afrika"}
]

# NOTE(review): absolute, machine-specific path; adjust it when running on
# another machine or a hosted notebook.
export_folder = "/Users/henri-countevans/Documents/YouTube Transcripts Scraper"

# Ensure the export folder exists
os.makedirs(export_folder, exist_ok=True)

# Fixed column order, so the table and CSV header keep the same schema even
# when no transcript could be fetched.
dataset_columns = ["Source Name", "Video ID", "Line Number", "Text"]

# Create a list to store all transcript data
all_transcript_data = []

for video in video_data:
    # Bind url before the try block so the handlers below always report the
    # entry being processed, never an unbound or stale name.
    url = video.get("url", "<missing url>")
    try:
        source = video["source"]
        video_id = get_video_id(video["url"])
        print(f"\nProcessing video: {url}")
        print(f"Source: {source}")
        print(f"Video ID: {video_id}")

        transcript = fetch_transcript(video_id)

        if transcript:
            print("Processing transcript...")

            # One row per transcript line, numbered from 1 within the video
            all_transcript_data.extend([
                {
                    "Source Name": source,
                    "Video ID": video_id,
                    "Line Number": line_number,
                    "Text": entry["text"],
                }
                for line_number, entry in enumerate(transcript, 1)
            ])

            print(f"Processed {len(transcript)} lines from this video.")
        else:
            print("Failed to fetch transcript for this video.")

    except ValueError as e:
        print(f"Error with URL {url}: {str(e)}")
    except Exception as e:
        # Keep going: one bad entry should not abort the remaining videos.
        print(f"An unexpected error occurred with URL {url}: {str(e)}")

print("\nAll videos processed.")

# Create a DataFrame from the transcript data
df = pd.DataFrame(all_transcript_data, columns=dataset_columns)

# Save the DataFrame to a CSV file
csv_filename = "structured_transcript_dataset.csv"
csv_filepath = os.path.join(export_folder, csv_filename)
df.to_csv(csv_filepath, index=False)
print(f"\nStructured dataset saved to {csv_filepath}")

# Display the first few rows of the DataFrame
print("\nFirst few rows of the dataset:")
print(df.head())

# Display summary statistics
print("\nDataset summary:")
if df.empty:
    # Nothing to summarise when every fetch failed.
    print("No transcript lines were collected; skipping summary statistics.")
else:
    print(df.describe())
print(f"\nTotal number of lines across all transcripts: {len(df)}")

Collecting youtube_transcript_api
  Downloading youtube_transcript_api-0.6.2-py3-none-any.whl (24 kB)
Installing collected packages: youtube_transcript_api
Successfully installed youtube_transcript_api-0.6.2

Processing video: https://www.youtube.com/watch?v=oi6E3tsvelA
Source: SABC
Video ID: oi6E3tsvelA
Processing transcript...
Processed 892 lines from this video.

Processing video: https://www.youtube.com/watch?v=uTzHc8l3BOM
Source: SABC
Video ID: uTzHc8l3BOM
Processing transcript...
Processed 381 lines from this video.

Processing video: https://www.youtube.com/watch?v=yNOyu1RMUHY
Source: Newzroom Afrika
Video ID: yNOyu1RMUHY
Processing transcript...
Processed 166 lines from this video.

All videos processed.

Structured dataset saved to /Users/henri-countevans/Documents/YouTube Transcripts Scraper/structured_transcript_dataset.csv

First few rows of the dataset:
  Source Name     Video ID  Line Number  \
0        SABC  oi6E3tsvelA            1   
1        SABC  oi6E3tsvelA         