In [1]:
!pip install google-api-python-client



In [3]:
import json
import os
import re
import traceback
from datetime import datetime
from getpass import getpass

from google.colab import files
from googleapiclient.discovery import build

In [11]:
# Never hard-code credentials in a notebook: they leak through sharing and
# version control. The key that used to be committed on this line should be
# treated as compromised and revoked.
# The key is read from the YOUTUBE_API_KEY environment variable; if that is
# unset or empty, fall back to an interactive prompt that does not echo input.
API_KEY = os.environ.get("YOUTUBE_API_KEY") or getpass("YouTube Data API key: ")


In [4]:
# YouTube channel IDs to scan, keyed by the short state code that is used as a
# prefix when building mapping keys.
# NOTE(review): the recorded run of this notebook reported 0 videos for "ARK" —
# confirm that this channel ID is correct before relying on the mapping.
CHANNEL_IDS = {
    "LA": "UCzQCvs2iY46TRlQkkn7jeng",  # Louisiana PSC
    "MISS": "UCh7aLORd63J2_xhIp9uqKHQ",  # Mississippi PSC
    "ARK": "UC2OFk5LlXZwwMJEgMIQJDqA"   # Arkansas PSC
}

In [5]:
# Step 4: Define helper functions
def clean_title(title):
    """Normalize a video title into a lowercase, underscore-separated slug.

    Every character that is neither a word character nor whitespace becomes a
    space; the result is trimmed and each whitespace run is collapsed into a
    single underscore.
    """
    lowered = title.lower()
    # Punctuation and symbols are turned into spaces so they act as separators
    spaced = re.sub(r'[^\w\s]', ' ', lowered)
    trimmed = spaced.strip()
    # Any run of whitespace becomes exactly one underscore
    return re.sub(r'\s+', '_', trimmed)

def sanitize_filename(title):
    """Strip a video title down to the characters used in transcript filenames.

    Keeps alphanumeric characters plus space, hyphen and underscore, drops
    everything else, and trims leading/trailing whitespace from the result.
    """
    # This should match how your transcript filenames were created
    allowed_extra = " -_"
    kept_chars = []
    for ch in title:
        if ch.isalnum() or ch in allowed_extra:
            kept_chars.append(ch)
    return "".join(kept_chars).strip()

def extract_date_from_title(title):
    """Return the first date-like substring found in a video title.

    Patterns are tried in priority order (numeric month/day/year, then
    "Month DD, YYYY", then year-first numeric); the text matched by the first
    pattern that hits is returned unchanged.

    Args:
        title: The video title to scan. Empty or None yields None.

    Returns:
        The matched date text exactly as it appears in the title, or None
        when no pattern matches.
    """
    if not title:
        return None

    # The numeric patterns are fenced with digit look-arounds so they cannot
    # match *inside* a longer digit run. Without the fence, "2024-01-15" is
    # picked up by the first pattern as "24-01-15" and the year-first pattern
    # is never reached.
    date_patterns = [
        # MM/DD/YYYY or MM-DD-YYYY
        r'(?<!\d)\d{1,2}[/-]\d{1,2}[/-]\d{2,4}(?!\d)',
        # Month name DD, YYYY
        r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}',
        # YYYY-MM-DD
        r'(?<!\d)\d{4}[/-]\d{1,2}[/-]\d{1,2}(?!\d)',
    ]

    for pattern in date_patterns:
        match = re.search(pattern, title)
        if match:
            return match.group(0)

    return None

In [6]:
def get_all_videos(api_key, channel_id):
    """Collect every video returned by a channel search on the YouTube Data API.

    Pages through `search().list` (50 results per page, restricted to
    type="video") until no `nextPageToken` is returned.

    Args:
        api_key: Developer key passed to the API client.
        channel_id: ID of the channel whose videos are listed.

    Returns:
        A list of dicts with the keys "id", "title", "published_at",
        "sanitized_title" and "clean_title". If any exception is raised while
        fetching or processing a page, the error is printed and the videos
        collected so far are returned.
    """
    client = build("youtube", "v3", developerKey=api_key)
    collected = []
    page_token = None
    keep_fetching = True

    while keep_fetching:
        try:
            page = client.search().list(
                part="id,snippet",
                channelId=channel_id,
                maxResults=50,
                pageToken=page_token,
                type="video"
            ).execute()

            for entry in page.get("items", []):
                # Skip results that do not carry a video ID
                if "videoId" not in entry["id"]:
                    continue

                vid = entry["id"]["videoId"]
                raw_title = entry["snippet"]["title"]
                published = entry["snippet"]["publishedAt"]

                record = {
                    "id": vid,
                    "title": raw_title,
                    "published_at": published,
                    "sanitized_title": sanitize_filename(raw_title),
                    "clean_title": clean_title(raw_title)
                }
                collected.append(record)

            # A missing/empty token means the last page has been consumed
            page_token = page.get("nextPageToken")
            keep_fetching = bool(page_token)
        except Exception as e:
            print(f"Error fetching videos: {e}")
            keep_fetching = False

    return collected

In [9]:
# Normalized keys shorter than this are discarded (avoids empty or very short keys).
_MIN_KEY_LENGTH = 4


def _candidate_keys(state, video):
    """Build the un-normalized fuzzy lookup keys for one video.

    The raw video ID is deliberately NOT included here: it must be stored
    verbatim, whereas every key returned by this helper is normalized by the
    caller.
    """
    keys = []

    # Try to extract date from title
    date_str = extract_date_from_title(video["title"])

    try:
        published_date = datetime.strptime(video["published_at"], "%Y-%m-%dT%H:%M:%SZ")
        date_key = published_date.strftime("%Y_%m_%d")

        # Standard format keys based on the publication date
        keys.extend([
            f"{state}_PSC_{date_key}",
            f"{state}_PSC_Meeting_{date_key}",
            f"{date_key}_{state}_PSC"
        ])
    except Exception as e:
        # Best effort: a timestamp in an unexpected format only costs the date keys
        print(f"Error processing published date: {e}")

    # If a date was found in the title, add keys built from it
    if date_str:
        keys.append(f"{state}_PSC_{date_str}")
        keys.append(f"{state}_{date_str}")

    keys.extend([
        # Sanitized title (most likely to match the transcript files), bare and state-prefixed
        video["sanitized_title"],
        f"{state}_{video['sanitized_title']}",
        # Clean title, bare and state-prefixed
        video["clean_title"],
        f"{state}_{video['clean_title']}",
        # Original title
        video["title"],
    ])
    return keys


def create_video_mapping():
    """Create a mapping file that connects video titles to YouTube IDs.

    For every channel in CHANNEL_IDS, fetches its videos with
    get_all_videos(API_KEY, ...) and registers several lookup keys per video
    (date-based, title-based and the video ID itself). When two videos produce
    the same key, the one processed later wins.

    Side effects:
        Writes the mapping to "video_mapping.json", triggers a browser
        download of that file through `files.download`, and prints progress.

    Returns:
        dict mapping each key to {"youtube_id", "title", "state"}.
    """
    video_mapping = {}

    # Process videos from each channel
    for state, channel_id in CHANNEL_IDS.items():
        print(f"Fetching videos for {state} PSC (Channel ID: {channel_id})...")

        videos = get_all_videos(API_KEY, channel_id)
        print(f"Found {len(videos)} videos for {state} PSC")

        for video in videos:
            entry = {
                "youtube_id": video["id"],
                "title": video["title"],
                "state": state
            }

            for key in _candidate_keys(state, video):
                # Remove any duplicate underscores and trailing/leading underscores
                clean_key = re.sub(r'_+', '_', key).strip('_')

                if len(clean_key) >= _MIN_KEY_LENGTH:
                    # Each key gets its own dict so entries can be edited independently
                    video_mapping[clean_key] = dict(entry)

            # The video ID is stored exactly as returned by the API. Running it
            # through the underscore normalization above would corrupt IDs that
            # contain leading, trailing or doubled underscores.
            if video["id"]:
                video_mapping[video["id"]] = dict(entry)

    # Save the mapping to a JSON file
    with open("video_mapping.json", "w") as f:
        json.dump(video_mapping, f, indent=2)

    print(f"Created video mapping with {len(video_mapping)} entries")

    # Download the mapping file
    files.download("video_mapping.json")

    return video_mapping

In [12]:
# Step 6: Execute the mapping creation
# The returned dict is kept in `mapping` because the transcript-matching cell
# (Step 7) iterates over its keys.
print("Starting video mapping creation...")
mapping = create_video_mapping()  # Store the result in 'mapping' variable
print("Done! The mapping file has been downloaded to your computer.")


Starting video mapping creation...
Fetching videos for LA PSC (Channel ID: UCzQCvs2iY46TRlQkkn7jeng)...
Found 45 videos for LA PSC
Fetching videos for MISS PSC (Channel ID: UCh7aLORd63J2_xhIp9uqKHQ)...
Found 110 videos for MISS PSC
Fetching videos for ARK PSC (Channel ID: UC2OFk5LlXZwwMJEgMIQJDqA)...
Found 0 videos for ARK PSC
Created video mapping with 1224 entries


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Done! The mapping file has been downloaded to your computer.


In [13]:
# Step 7: Check transcript matches
# For each uploaded text file (one transcript filename per line), report how
# many names can be linked to a key of `mapping` (built in Step 6) and export
# the names that cannot.
print("\nWould you like to check how many of your transcript files can be mapped?")
print("Upload your transcript_filenames.txt file")

# Wait for transcript filenames to be uploaded
try:
    uploaded = files.upload()
    if uploaded:
        for filename in uploaded.keys():
            print(f"Processing {filename}...")

            # Read the uploaded file, ignoring blank lines
            with open(filename, "r", encoding="utf-8") as f:
                transcript_filenames = [line.strip() for line in f if line.strip()]

            # Without this guard an empty file makes the match-rate division
            # below raise ZeroDivisionError.
            if not transcript_filenames:
                print(f"No transcript filenames found in {filename}; skipping")
                continue

            # A name is matched when it contains, or is contained in, any
            # mapping key (plain equality is already covered by containment).
            unmatched = [
                transcript_name
                for transcript_name in transcript_filenames
                if not any(
                    transcript_name in key or key in transcript_name
                    for key in mapping  # Use the 'mapping' variable here
                )
            ]
            matches = len(transcript_filenames) - len(unmatched)

            print(f"Found matches for {matches} out of {len(transcript_filenames)} transcript files")
            print(f"Match rate: {matches/len(transcript_filenames)*100:.1f}%")

            if unmatched:
                print(f"First 10 unmatched files: {unmatched[:10]}")

                # Save unmatched files to a text file
                with open("unmatched_transcripts.txt", "w", encoding="utf-8") as f:
                    f.write("\n".join(unmatched))

                # Download the unmatched file
                files.download("unmatched_transcripts.txt")
except Exception as e:
    print(f"Error processing uploaded file: {e}")
    traceback.print_exc()  # Print the full error traceback


Would you like to check how many of your transcript files can be mapped?
Upload your transcript_filenames.txt file


Saving transcript_filenames.txt to transcript_filenames (1).txt
Processing transcript_filenames (1).txt...
Found matches for 123 out of 127 transcript files
Match rate: 96.9%
First 10 unmatched files: ['2024 - Propane Summit - All Sessions', '2024 Solar Summit - Morning and Afternoon Sessions', '2024 Water Summit - Morning and Afternoon Sessions', '2024 Nuclear Summit - Morning and Afternoon Sessions']


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>