In [17]:
import html
import json
import os
import re

from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import NoTranscriptFound, TranscriptsDisabled

# The YouTube Data API key is read from the YOUTUBE_API_KEY environment variable
# so the credential is never stored in the notebook or its history.
# NOTE: a key was previously hardcoded on this line; treat it as leaked and
# revoke/rotate it in the Google Cloud console.
# An unset variable yields "" here, so the failure surfaces on the first API
# call rather than when this cell runs.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "")

# ID of the channel whose videos main() downloads.
CHANNEL_ID = "UCh7aLORd63J2_xhIp9uqKHQ"



In [7]:
def get_all_videos(api_key, channel_id):
    """
    Fetch the ID and title of every video the search endpoint lists for a channel.

    Pages through ``search().list`` 50 results at a time, following
    ``nextPageToken`` until the API stops returning one.

    Args:
        api_key (str): YouTube Data API key.
        channel_id (str): ID of the channel to enumerate.

    Returns:
        list[dict]: One ``{"id": <video id>, "title": <title>}`` dict per
        video, in the order the API returned them. Titles are HTML-unescaped.
    """
    youtube = build("youtube", "v3", developerKey=api_key)
    videos = []
    next_page_token = None

    while True:
        response = youtube.search().list(
            part="id,snippet",
            channelId=channel_id,
            maxResults=50,
            pageToken=next_page_token,
            type="video",
        ).execute()

        for item in response.get("items", []):
            # Skip anything that is not a video result (no videoId in its id).
            if "videoId" not in item["id"]:
                continue
            # Search results carry titles with HTML entities (&amp;, &#39;, ...).
            # Decode them so the real title is stored and later used for the
            # transcript file name.
            title = html.unescape(item["snippet"]["title"])
            videos.append({"id": item["id"]["videoId"], "title": title})

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break

    return videos

In [9]:
def save_transcript(video_id, title):
    """
    Fetch the transcript for one video and write it to a JSON file.

    The file is created in the current working directory and named after the
    video title, keeping only alphanumerics, spaces, hyphens and underscores.
    If nothing of the title survives that filter, the video ID is used as the
    file name instead.

    Failures are reported with ``print`` and never raised, so one bad video
    does not stop a batch run.

    Args:
        video_id (str): YouTube video ID to fetch the transcript for.
        title (str): Video title, used to build the output file name.

    Returns:
        None
    """
    # Drop every character that is not safe in a file name.
    sanitized_title = "".join(c for c in title if c.isalnum() or c in " -_").strip()
    # A title made only of stripped characters would otherwise produce the
    # hidden file ".json", overwritten by every such video.
    filename = f"{sanitized_title or video_id}.json"

    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        # Write only after a successful fetch so failures leave no empty file.
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(transcript, f, indent=4)
        print(f"Transcript saved: {filename}")
    except NoTranscriptFound:
        print(f"No transcript found for video: {title} (ID: {video_id})")
    except TranscriptsDisabled:
        print(f"Transcripts disabled for video: {title} (ID: {video_id})")
    except Exception as e:
        # Deliberate best-effort: log anything else (network, I/O) and move on.
        print(f"Error fetching transcript for video: {title} (ID: {video_id}) - {e}")

In [13]:
def get_channel_id(api_key, custom_handle):
    """
    Look up a channel ID by searching YouTube for the channel's handle.

    Issues a single channel-type search for ``custom_handle`` and takes the
    first (and only requested) result.

    Args:
        api_key (str): YouTube API key.
        custom_handle (str): Handle of the channel (e.g., @channelname).

    Returns:
        str: The channel ID of the top search result, stripped of surrounding
        whitespace.

    Raises:
        ValueError: If the search returns no channels.
    """
    client = build("youtube", "v3", developerKey=api_key)
    search_params = {
        "part": "snippet",
        "q": custom_handle,
        "type": "channel",
        "maxResults": 1,
    }
    payload = client.search().list(**search_params).execute()

    matches = payload["items"]
    if not matches:
        raise ValueError(f"Channel with handle {custom_handle} not found.")

    # Only the top hit is used; strip guards against stray whitespace.
    return matches[0]["snippet"]["channelId"].strip()


In [18]:
def main():
    """
    Download transcripts for every video on CHANNEL_ID into MISS_PSC_transcripts.

    Creates the output folder if needed, lists the channel's videos with
    get_all_videos, then calls save_transcript for each one. The working
    directory is switched to the output folder only for the duration of the
    run and is always restored, so re-running this cell does not nest folders
    or change where later cells resolve relative paths.

    Returns:
        None
    """
    output_dir = "MISS_PSC_transcripts"
    os.makedirs(output_dir, exist_ok=True)

    # save_transcript writes relative to the current directory, so step into
    # the output folder for the run and step back out afterwards, even on error.
    original_cwd = os.getcwd()
    os.chdir(output_dir)
    try:
        # Step 1: Fetch all videos from the channel
        videos = get_all_videos(API_KEY, CHANNEL_ID)
        print(f"Found {len(videos)} videos on the channel.")

        # Step 2: Fetch and save transcripts
        for video in videos:
            save_transcript(video["id"], video["title"])
    finally:
        os.chdir(original_cwd)

In [19]:
# Run the pipeline: list the channel's videos and save each transcript as JSON.
main()

Found 104 videos on the channel.
Transcript saved: ENTERGY Public Workshop - August 9th 2023.json
Transcript saved: Mississippi PSC August 3rd 2021 Public Meeting.json
Transcript saved: Mississippi PSC August 2022 Public Meeting.json
Transcript saved: Mississippi PSC September 2022 Public Meeting.json
Transcript saved: Mississippi PSC February 2023 Public Meeting.json
Transcript saved: Mississippi PSC July 8th 2021 Public Meeting.json
Transcript saved: Mississippi PSC June 2021 Special Meeting.json
Transcript saved: Mississippi PSC March 2021 Public Meeting.json
Transcript saved: September 2022 Working Session 2.json
Transcript saved: March 15 2020 Special Meeting.json
Transcript saved: Mississippi PSC September 2023 Public Meeting.json
Transcript saved: Mississippi PSC April 2021 Public Meeting.json
Transcript saved: Mississippi PSC December 7th 2021 Public Meeting.json
Transcript saved: Mississippi PSC June 8th 2021 Public Meeting.json
Transcript saved: Mississippi PSC Live Stream - 

In [2]:
import os
import json
import re

def clean_transcript(input_file, output_file):
    """
    Copy a transcript JSON file, dropping entries that contain a bracketed tag.

    Any entry whose ``"text"`` contains a ``[...]`` span (for example
    ``[music]``) is removed in full; all other entries are written unchanged
    and in their original order.

    Args:
        input_file (str): Path to the transcript JSON file to read.
        output_file (str): Path the filtered JSON is written to.
    """
    # Matches the shortest "[...]" span anywhere in the text.
    bracket_tag = re.compile(r"\[.*?\]", re.IGNORECASE)

    with open(input_file, "r") as source:
        entries = json.load(source)

    kept_entries = []
    for segment in entries:
        if bracket_tag.search(segment["text"]):
            continue  # tagged entry: drop the whole thing
        kept_entries.append(segment)

    with open(output_file, "w") as sink:
        json.dump(kept_entries, sink, indent=4)

    print(f"Cleaned file saved: {output_file}")

def clean_all_transcripts(input_directory, output_directory):
    """
    Run clean_transcript over every ``.json`` file in a directory.

    Each cleaned file is written to ``output_directory`` under the same file
    name; the output directory is created first if it does not exist. Files
    without a ``.json`` suffix are ignored.

    Args:
        input_directory (str): Directory holding the transcript JSON files.
        output_directory (str): Directory that receives the cleaned files.
    """
    output_missing = not os.path.exists(output_directory)
    if output_missing:
        os.makedirs(output_directory)

    json_names = (
        name for name in os.listdir(input_directory) if name.endswith(".json")
    )
    for name in json_names:
        # Same file name on both sides; only the parent directory differs.
        clean_transcript(
            os.path.join(input_directory, name),
            os.path.join(output_directory, name),
        )

    print(f"All transcripts cleaned and saved to {output_directory}.")

# Example usage: clean every transcript JSON in input_dir into output_dir.
# NOTE(review): main() saves its transcripts into "MISS_PSC_transcripts", while
# this block reads from "transcripts" — confirm input_dir points at the folder
# you intend to clean.
if __name__ == "__main__":
    input_dir = "transcripts"  # Directory containing original JSON files
    output_dir = "cleaned_transcripts"  # Directory to save cleaned JSON files
    clean_all_transcripts(input_dir, output_dir)


Cleaned file saved: cleaned_transcripts/Louisiana Public Service Commission Live Stream - September 2023.json
Cleaned file saved: cleaned_transcripts/Louisiana Public Service Commission Live Stream - May 2022.json
Cleaned file saved: cleaned_transcripts/Louisiana Public Service Commission Live Stream - October 2021.json
Cleaned file saved: cleaned_transcripts/Louisiana Public Service Commission Live Stream - December 2022.json
Cleaned file saved: cleaned_transcripts/Louisiana Public Service Commission Live Stream - June 2022.json
Cleaned file saved: cleaned_transcripts/Louisiana Public Service Commission Live Stream - July 2021.json
Cleaned file saved: cleaned_transcripts/Louisiana Public Service Commission Live Stream - June 2023.json
Cleaned file saved: cleaned_transcripts/Louisiana Public Service Commission Live Stream - December 2023.json
Cleaned file saved: cleaned_transcripts/Louisiana Public Service Commission Live Stream - November 2024.json
Cleaned file saved: cleaned_transcri

In [5]:
# Ad-hoc lookup: resolve a channel handle to its channel ID.
# NOTE(review): the ID this call returned differs from CHANNEL_ID in the config
# cell — confirm which channel the notebook is meant to target.
get_channel_id(API_KEY, '@arkansaspublicservicecommi7913')

'UCjCtMIrCuWBm-lXEY5fTYuw'