# 🎯 YouTube Search Scraper for NotebookLM

Automatically find YouTube videos and format them for bulk import into Google NotebookLM.

## Quick Start

Run all cells in order to:
1. Install dependencies
2. Configure the scraper
3. Run the scrape
4. Download results

---

## 1. Install Dependencies

Install the required Python packages and the Chromium browser. This may take a few minutes.

In [None]:
# Install the Python packages for this notebook.
# The leading `!` is IPython syntax: the rest of the line runs as a shell command.
!pip install selenium webdriver-manager pandas requests colorama ipywidgets

# Install the Chromium browser (required for Colab); the scraper class below
# points Selenium at /usr/bin/chromium-browser.
!apt-get update
!apt-get install -y chromium-browser

# NOTE(review): this message is printed unconditionally; a failing `!` command
# above presumably does not abort the cell — check the command output.
print("✅ Dependencies installed successfully!")

## 2. Configure Search

Enter your search term and configure search options.

In [None]:
# === INTERACTIVE SEARCH WIDGET IMPLEMENTATION ===
# This widget allows you to specify any search term
# The search term you enter will be used EXACTLY as typed

import ipywidgets as widgets
from IPython.display import display, HTML

# Shared search preferences. The button callback in this cell fills the
# values in; the cells further down read them.
SEARCH_CONFIG = dict(
    term='',             # search text; stays empty until the user supplies one
    title_filter=True,   # keep only videos whose title contains the term
    configured=False,    # becomes True once a term has been confirmed
)

# Free-text box holding the search term, pre-filled from SEARCH_CONFIG
search_text = widgets.Text(
    description='Search Term:',
    placeholder='Enter search term (e.g., "football highlights", "cooking recipes", "stock market")',
    value=SEARCH_CONFIG['term'],
    style=dict(description_width='initial'),
    disabled=False,
)

# Toggle for the "term must appear in the title" filter
title_filter_checkbox = widgets.Checkbox(
    description='Only include videos with search term in title',
    value=SEARCH_CONFIG['title_filter'],
    indent=False,
    disabled=False,
)

# Button that confirms the settings entered above
search_button = widgets.Button(
    description='Set Search Term',
    tooltip='Click to update search term',
    icon='search',
    button_style='primary',
    disabled=False,
)

# Callback that runs when the "Set Search Term" button is clicked
def on_button_click(b):
    """Store the widget values in ``SEARCH_CONFIG`` and confirm them to the user.

    The term is saved with leading/trailing whitespace removed. The emptiness
    check below already ignores that whitespace, and keeping it would make the
    case-insensitive title filter look for e.g. ``"cats "`` (with the space)
    and leak stray characters into the output file names.

    Args:
        b: The button instance passed by ipywidgets; unused.
    """
    term = search_text.value.strip()
    if not term:
        print("⚠️ Please enter a search term before proceeding.")
        return

    SEARCH_CONFIG['term'] = term
    SEARCH_CONFIG['title_filter'] = title_filter_checkbox.value
    SEARCH_CONFIG['configured'] = True  # Mark as configured
    print(f"✅ Search term set to: '{SEARCH_CONFIG['term']}'")
    print(f"✅ Title filtering: {'Enabled' if SEARCH_CONFIG['title_filter'] else 'Disabled'}")
    print("✅ Ready to run scraper!")

# Register the click handler on the confirm button
search_button.on_click(on_button_click)

# Render the heading, the instructions and the three controls, top to bottom
display(
    HTML("<h3>📌 Configure YouTube Search</h3>"),
    HTML("<p>Enter what you want to search for on YouTube:</p>"),
    search_text,
    title_filter_checkbox,
    search_button,
    HTML("<p><em>Important: Click the button after entering your search term!</em></p>"),
)

print("✅ Search widget initialized. Enter your search term and click 'Set Search Term'.")

## 3. Import Libraries and Setup

Import all necessary libraries and set up the scraper.

In [None]:
import time
import logging
import pandas as pd
from datetime import datetime, timedelta
from typing import List, Dict, Optional, Any
from urllib.parse import urlencode, urlparse, parse_qs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException

print("✅ Libraries imported successfully!")

## 4. Configuration

Configure the scraper settings. You can modify these values:

In [None]:
# General scraper behaviour
CONFIG = dict(
    debug=False,
    max_videos=100,              # overall cap on collected videos
    headless=True,
    scroll_pause_time=2.0,       # seconds to sleep after each scroll
    max_videos_per_search=50,    # cap applied to each individual search
    page_load_timeout=30,
    implicit_wait=10,
)

# YouTube URL parts and CSS selectors
YOUTUBE_CONFIG = dict(
    base_url='https://www.youtube.com',
    search_path='/results',
    # NOTE(review): opaque, already percent-encoded value for the `sp` query
    # parameter; confirm it still selects the intended upload-date filter.
    today_filter='CAISBAgBEAE%253D',
    video_selector='ytd-video-renderer,ytd-grid-video-renderer',
    title_selector='#video-title',
    channel_selector='#channel-name,#text',
    duration_selector='#text.ytd-thumbnail-overlay-time-status-renderer',
    views_selector='#metadata-line span:nth-child(1)',
    upload_time_selector='#metadata-line span:nth-child(2)',
)

# strftime patterns used to recognise today's date inside a search term
DATE_FORMATS = list((
    '%m/%d/%Y',   # 10/19/2023
    '%m-%d-%Y',   # 10-19-2023
    '%B %d, %Y',  # October 19, 2023
    '%b %d, %Y',  # Oct 19, 2023
    '%Y-%m-%d',   # 2023-10-19
))

print("✅ Configuration loaded!")
# Report whether a usable search term has already been confirmed in the widget
if not SEARCH_CONFIG.get('configured', False) or not SEARCH_CONFIG['term'].strip():
    print("📊 Search term not configured yet. Set it in Cell 2 before running the scraper.")
else:
    print(f"📊 Will search for up to {CONFIG['max_videos']} videos matching: '{SEARCH_CONFIG['term']}'")

## 5. Scraper Class

Define the main YouTube search scraper class.

In [None]:
class YouTubeSearchScraper:
    """Scrape YouTube search results for a user-supplied term with Selenium.

    The scraper drives a Chrome/Chromium browser, opens a YouTube results
    page, scrolls to load more entries and converts each result element into
    a plain dict. It can be used as a context manager so the browser is
    closed on exit.

    Relies on the notebook-level ``CONFIG``, ``YOUTUBE_CONFIG``,
    ``DATE_FORMATS`` and (as a fallback) ``SEARCH_CONFIG`` dictionaries.
    """

    def __init__(self, headless: bool = True, debug: bool = False, search_config: Optional[Dict] = None):
        """Store the options and configure logging; the browser is started lazily.

        Args:
            headless: Run the browser without a visible window.
            debug: Log at DEBUG level instead of INFO.
            search_config: Dict with at least ``'term'`` and ``'title_filter'``;
                falls back to the global ``SEARCH_CONFIG`` when omitted or empty.
        """
        self.headless = headless
        self.debug = debug
        self.driver = None
        self.logger = logging.getLogger(__name__)
        self.search_config = search_config or SEARCH_CONFIG

        # Configure logging
        level = logging.DEBUG if debug else logging.INFO
        logging.basicConfig(level=level, format='%(asctime)s - %(levelname)s - %(message)s')
        # basicConfig() does nothing when the root logger already has handlers,
        # so also set the level on our own logger to make `debug` take effect.
        self.logger.setLevel(level)

    def setup_driver(self) -> None:
        """Configure and initialize Chrome WebDriver for Colab.

        Raises:
            Exception: Whatever Selenium raised while starting or configuring
                the browser; a half-initialised browser is closed first.
        """
        try:
            chrome_options = Options()

            # Colab-specific options
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument('--disable-extensions')
            chrome_options.add_argument('--disable-background-timer-throttling')
            chrome_options.add_argument('--disable-backgrounding-occluded-windows')
            chrome_options.add_argument('--disable-renderer-backgrounding')
            chrome_options.add_argument('--disable-features=VizDisplayCompositor')

            if self.headless:
                chrome_options.add_argument('--headless')

            # Set binary location for Colab
            chrome_options.binary_location = '/usr/bin/chromium-browser'

            # Anti-detection measures
            chrome_options.add_argument('--disable-blink-features=AutomationControlled')
            chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
            chrome_options.add_experimental_option('useAutomationExtension', False)

            # User agent
            chrome_options.add_argument('--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

            # Window size
            chrome_options.add_argument('--window-size=1920,1080')

            # Initialize driver
            self.driver = webdriver.Chrome(options=chrome_options)

            # Set timeouts
            self.driver.set_page_load_timeout(CONFIG['page_load_timeout'])
            self.driver.implicitly_wait(CONFIG['implicit_wait'])

            # Execute script to remove webdriver property
            self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

            self.logger.info("Chrome driver initialized successfully for Colab")

        except Exception as e:
            self.logger.error(f"Failed to setup Chrome driver: {e}")
            # If the browser started but a later configuration step failed,
            # close it so the process is not left running.
            self.cleanup()
            raise

    def cleanup(self) -> None:
        """Close the browser (if any) and reset ``self.driver`` to ``None``."""
        if self.driver:
            try:
                self.driver.quit()
                self.logger.info("Browser closed successfully")
            except Exception as e:
                self.logger.error(f"Error closing browser: {e}")
            finally:
                self.driver = None

    def get_default_search_terms(self) -> List[str]:
        """Return a one-element list holding the configured search term."""
        return [self.search_config['term']]

    @staticmethod
    def _compose_search_url(results_url: str, search_term: str, encoded_filter: Optional[str] = None) -> str:
        """Join a results URL, a URL-encoded query and an optional ``sp`` token.

        Args:
            results_url: Address of the results page without a query string.
            search_term: Raw search text; it is percent-encoded here.
            encoded_filter: Value for the ``sp`` parameter that is ALREADY
                percent-encoded; it is appended verbatim so it is not encoded
                a second time.
        """
        url = f"{results_url}?{urlencode({'search_query': search_term})}"
        if encoded_filter:
            url = f"{url}&sp={encoded_filter}"
        return url

    def build_search_url(self, search_term: str) -> str:
        """Build the YouTube search URL including the configured 'Today' filter.

        ``YOUTUBE_CONFIG['today_filter']`` is stored percent-encoded, so it is
        appended as-is rather than being passed through ``urlencode`` again.
        """
        results_url = f"{YOUTUBE_CONFIG['base_url']}{YOUTUBE_CONFIG['search_path']}"
        return self._compose_search_url(results_url, search_term, YOUTUBE_CONFIG['today_filter'])

    def scroll_for_videos(self, target_count: int, max_scrolls: int = 10) -> bool:
        """Scroll down to load more videos.

        Args:
            target_count: Number of result elements to aim for.
            max_scrolls: Upper bound on scroll attempts.

        Returns:
            True if at least ``target_count`` result elements were found;
            False if there is no driver, the page stopped growing or the
            scroll limit was reached first.
        """
        if not self.driver:
            return False

        last_height = self.driver.execute_script("return document.documentElement.scrollHeight")
        scrolls = 0

        while scrolls < max_scrolls:
            # Scroll to bottom
            self.driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")

            # Wait for new content to load
            time.sleep(CONFIG['scroll_pause_time'])

            # Check if we've reached target count
            video_elements = self.driver.find_elements(By.CSS_SELECTOR, YOUTUBE_CONFIG['video_selector'])
            if len(video_elements) >= target_count:
                self.logger.info(f"Reached target video count: {len(video_elements)}")
                return True

            # An unchanged page height means nothing new was loaded
            new_height = self.driver.execute_script("return document.documentElement.scrollHeight")
            if new_height == last_height:
                self.logger.info("No more content to load")
                break

            last_height = new_height
            scrolls += 1
            self.logger.debug(f"Scroll {scrolls}, videos found: {len(video_elements)}")

        return False

    def extract_video_data(self, video_element) -> Optional[Dict[str, Any]]:
        """Extract video data from a single result element.

        Returns:
            A dict describing the video, or ``None`` when the element has no
            watch URL or video id, when the title filter rejects it, or when
            any unexpected error occurs (logged at DEBUG level).
        """
        try:
            # Extract URL
            title_link = video_element.find_element(By.CSS_SELECTOR, YOUTUBE_CONFIG['title_selector'])
            url = title_link.get_attribute('href')

            if not url or 'watch?v=' not in url:
                return None

            # Extract title
            title = title_link.get_attribute('title') or title_link.text.strip()

            # Case-insensitive title check, only when the filter is enabled
            if self.search_config['title_filter']:
                title_lower = title.lower()
                search_term_lower = self.search_config['term'].lower()
                if search_term_lower not in title_lower:
                    return None  # Skip videos that don't mention the search term in title

            # Extract channel
            try:
                channel_element = video_element.find_element(By.CSS_SELECTOR, YOUTUBE_CONFIG['channel_selector'])
                channel = channel_element.text.strip()
            except NoSuchElementException:
                channel = "Unknown"

            # Extract duration
            try:
                duration_element = video_element.find_element(By.CSS_SELECTOR, YOUTUBE_CONFIG['duration_selector'])
                duration = duration_element.text.strip()
            except NoSuchElementException:
                duration = "Unknown"

            # Extract views and upload time (first and second metadata span)
            try:
                metadata_elements = video_element.find_elements(By.CSS_SELECTOR, '#metadata-line span')
                views = metadata_elements[0].text.strip() if len(metadata_elements) > 0 else "Unknown"
                upload_time = metadata_elements[1].text.strip() if len(metadata_elements) > 1 else "Unknown"
            except (NoSuchElementException, IndexError):
                views = "Unknown"
                upload_time = "Unknown"

            # Extract video ID from URL
            parsed_url = urlparse(url)
            video_id = parse_qs(parsed_url.query).get('v', [None])[0]

            if not video_id:
                return None

            return {
                'video_id': video_id,
                # Rebuilt from the id so extra query parameters are dropped
                'url': f"https://www.youtube.com/watch?v={video_id}",
                'title': title,
                'channel': channel,
                'duration': duration,
                'views': views,
                'upload_time': upload_time,
                'scraped_at': datetime.now().isoformat(),
                'search_term': getattr(self, '_current_search_term', 'Unknown'),
                'filter_term': self.search_config['term']
            }

        except Exception as e:
            # Best effort: one malformed result must not abort the whole scrape
            self.logger.debug(f"Error extracting video data: {e}")
            return None

    def scrape_search_term(self, search_term: str, max_videos: int) -> List[Dict[str, Any]]:
        """Scrape videos for a specific search term.

        Timeouts and other errors are logged, and whatever was collected so
        far (possibly an empty list) is returned.

        Args:
            search_term: Text to search for.
            max_videos: Target number of result elements to load and inspect.
        """
        self._current_search_term = search_term
        videos = []

        try:
            # A term that contains today's date (in any of DATE_FORMATS) is
            # searched without the "Today" filter.
            todays_dates = [datetime.now().strftime(fmt).lower() for fmt in DATE_FORMATS]
            if any(date_str in search_term.lower() for date_str in todays_dates):
                results_url = f"{YOUTUBE_CONFIG['base_url']}{YOUTUBE_CONFIG['search_path']}"
                # The term is percent-encoded here; dates contain '/', ',' and spaces
                search_url = self._compose_search_url(results_url, search_term)
            else:
                # Use "Today" filter for general searches
                search_url = self.build_search_url(search_term)

            self.logger.info(f"Searching: {search_term}")

            self.driver.get(search_url)

            # Wait for initial results to load
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, YOUTUBE_CONFIG['video_selector']))
            )

            # Scroll to load more videos
            self.scroll_for_videos(max_videos)

            # Extract video data
            video_elements = self.driver.find_elements(By.CSS_SELECTOR, YOUTUBE_CONFIG['video_selector'])

            for element in video_elements[:max_videos]:
                video_data = self.extract_video_data(element)
                if video_data:
                    videos.append(video_data)

            self.logger.info(f"Found {len(videos)} videos matching '{search_term}'")

        except TimeoutException:
            self.logger.warning(f"Timeout loading search results for '{search_term}'")
        except Exception as e:
            self.logger.error(f"Error scraping '{search_term}': {e}")

        return videos

    def search_youtube(self, search_terms: Optional[List[str]] = None, max_videos: int = 50) -> List[Dict[str, Any]]:
        """Search YouTube for each term and return de-duplicated results.

        Starts the browser on first use.

        Args:
            search_terms: Terms to search; defaults to the configured term.
            max_videos: Overall cap on the number of videos returned.

        Returns:
            Unique video dicts (by URL), at most ``max_videos`` of them.
        """
        if not self.driver:
            self.setup_driver()

        if search_terms is None:
            search_terms = self.get_default_search_terms()

        all_videos = []
        seen_urls = set()

        self.logger.info(f"Starting scrape with search term: '{self.search_config['term']}', max {max_videos} videos total")

        for term in search_terms:
            if len(all_videos) >= max_videos:
                break

            # Each search inspects up to the per-search cap; the overall cap
            # is enforced below, after duplicates have been dropped.
            term_videos = self.scrape_search_term(term, CONFIG['max_videos_per_search'])

            # Filter out duplicates
            for video in term_videos:
                if video['url'] not in seen_urls and len(all_videos) < max_videos:
                    all_videos.append(video)
                    seen_urls.add(video['url'])

            # Small delay between searches to be respectful
            time.sleep(1)

        self.logger.info(f"Scraping complete. Found {len(all_videos)} unique videos")
        return all_videos

    def __enter__(self):
        """Context manager entry; returns the scraper itself."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit; closes the browser."""
        self.cleanup()

print("✅ YouTubeSearchScraper class defined!")

## 6. Output Formatter Class

Define the formatter class for generating different output formats.

In [None]:
class URLFormatter:
    """Handles formatting and saving of scraped video data"""

    def __init__(self, search_config=None):
        self.search_config = search_config or SEARCH_CONFIG

    def format_for_bulk_import(self, videos: List[Dict[str, Any]]) -> str:
        """Format URLs for bulk import (one URL per line)"""
        urls = [video['url'] for video in videos if video.get('url')]
        return '\n'.join(urls)

    def format_with_metadata(self, videos: List[Dict[str, Any]]) -> str:
        """Format videos with detailed metadata in readable text format"""
        lines = []
        search_term = self.search_config['term'].title()  # Capitalized for display
        lines.append(f"{search_term} YouTube Videos")
        lines.append("=" * 50)
        lines.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        lines.append(f"Total videos: {len(videos)}")
        lines.append("")

        for i, video in enumerate(videos, 1):
            lines.append(f"{i}. {video.get('title', 'Unknown Title')}")
            lines.append(f"   URL: {video.get('url', 'N/A')}")
            lines.append(f"   Channel: {video.get('channel', 'Unknown')}")
            lines.append(f"   Duration: {video.get('duration', 'Unknown')}")
            lines.append(f"   Views: {video.get('views', 'Unknown')}")
            lines.append(f"   Uploaded: {video.get('upload_time', 'Unknown')}")
            lines.append(f"   Scraped from: {video.get('search_term', 'Unknown')}")
            lines.append("")

        return '\n'.join(lines)

    def create_dataframe(self, videos: List[Dict[str, Any]]) -> pd.DataFrame:
        """Create pandas DataFrame from video data"""
        if not videos:
            return pd.DataFrame()

        # Select columns to include
        columns = ['video_id', 'url', 'title', 'channel', 'duration', 'views', 'upload_time', 'search_term', 'filter_term']

        data = []
        for video in videos:
            row = {col: video.get(col, '') for col in columns}
            data.append(row)

        return pd.DataFrame(data)

print("✅ URLFormatter class defined!")

## 7. Run the Scraper

Execute the YouTube search scraper. This will take 1-2 minutes.

In [None]:
# Refuse to start until a search term has been confirmed in the widget cell
if not SEARCH_CONFIG.get('configured', False):
    print("❌ ERROR: No search term configured!")
    print("   Go back to Cell 2, enter a search term, and click 'Set Search Term'.")
    print("   Then run this cell again.")
    raise SystemExit("Execution stopped: Search term required")

if not SEARCH_CONFIG['term'].strip():
    print("❌ ERROR: Search term is empty!")
    print("   Go back to Cell 2 and enter a valid search term.")
    raise SystemExit("Execution stopped: Empty search term")

# Initialize scraper and formatter
print(f"🚀 Initializing YouTube Search Scraper for '{SEARCH_CONFIG['term']}'...")
scraper = YouTubeSearchScraper(headless=CONFIG['headless'], debug=CONFIG['debug'], search_config=SEARCH_CONFIG)
formatter = URLFormatter(search_config=SEARCH_CONFIG)

try:
    # Run the scrape
    print(f"🔍 Starting YouTube search for '{SEARCH_CONFIG['term']}' content...")
    videos = scraper.search_youtube(max_videos=CONFIG['max_videos'])

    # Display results
    print(f"\n✅ Found {len(videos)} videos matching '{SEARCH_CONFIG['term']}'!")

    if videos:
        print("\n📊 Sample Results:")
        for i, video in enumerate(videos[:3], 1):
            print(f"{i}. {video['title'][:60]}...")
            print(f"   Channel: {video['channel']}")
            print(f"   Views: {video['views']}")
            print(f"   URL: {video['url']}")
            print()
finally:
    # Always close the browser, even when the scrape raised, so no
    # Chromium process is left running in the runtime.
    scraper.cleanup()
    print("🧹 Browser cleanup completed")

## 8. Generate Output Files

Create different output formats from the scraped data.

In [None]:
import json
import re

# Generate timestamp for filenames
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# Build a filename-safe slug: every character that is not a word character,
# dot or hyphen becomes '_'. Replacing only spaces would leave e.g. the '/'
# of a date such as 10/19/2023 in the name and make open() fail.
search_term_slug = re.sub(r'[^\w.\-]', '_', SEARCH_CONFIG['term'].lower())

if videos:
    print("📁 Generating output files...")

    # 1. Bulk import format (for NotebookLM): one URL per line
    bulk_content = formatter.format_for_bulk_import(videos)
    bulk_filename = f'{search_term_slug}_bulk_import_{timestamp}.txt'
    with open(bulk_filename, 'w', encoding='utf-8') as f:
        f.write(bulk_content)
    print(f"✓ Created {bulk_filename} ({len(videos)} URLs)")

    # 2. Detailed text format
    detailed_content = formatter.format_with_metadata(videos)
    detailed_filename = f'{search_term_slug}_detailed_{timestamp}.txt'
    with open(detailed_filename, 'w', encoding='utf-8') as f:
        f.write(detailed_content)
    print(f"✓ Created {detailed_filename}")

    # 3. CSV format
    df = formatter.create_dataframe(videos)
    csv_filename = f'{search_term_slug}_data_{timestamp}.csv'
    df.to_csv(csv_filename, index=False)
    print(f"✓ Created {csv_filename}")

    # 4. JSON format: the raw video dicts plus a small metadata header
    json_filename = f'{search_term_slug}_data_{timestamp}.json'
    with open(json_filename, 'w', encoding='utf-8') as f:
        json.dump({
            'metadata': {
                'generated_at': datetime.now().isoformat(),
                'total_videos': len(videos),
                'search_term': SEARCH_CONFIG['term'],
                'source': 'YouTube Search Scraper'
            },
            'videos': videos
        }, f, indent=2, ensure_ascii=False)
    print(f"✓ Created {json_filename}")

    print("\n✅ All output files generated successfully!")

else:
    print("❌ No videos found to process")

## 9. Display Results Summary

Show a summary of the scraping results.

In [None]:
# Summarise the scrape: totals, per-channel counts and the files written
if videos:
    search_term = SEARCH_CONFIG['term']
    print(f"📊 '{search_term}' Search Results Summary")
    print("=" * 40)

    # Basic stats
    print(f"Total videos found: {len(videos)}")

    # Count videos per channel
    channels = {}
    for video in videos:
        channel = video.get('channel', 'Unknown')
        channels[channel] = channels.get(channel, 0) + 1

    print(f"Unique channels: {len(channels)}")
    if channels:
        top_channel = max(channels.items(), key=lambda x: x[1])
        print(f"Top channel: {top_channel[0]} ({top_channel[1]} videos)")

    # Show top 5 channels
    print("\nTop 5 Channels:")
    sorted_channels = sorted(channels.items(), key=lambda x: x[1], reverse=True)
    for i, (channel, count) in enumerate(sorted_channels[:5], 1):
        print(f"{i}. {channel}: {count} videos")

    # List the names under which the output cell actually saved the files,
    # instead of re-deriving them here, so the listing always matches the
    # files on disk.
    print("\n📁 Output Files:")
    print(f"• {bulk_filename} - URLs for NotebookLM bulk import")
    print(f"• {detailed_filename} - Detailed video information")
    print(f"• {csv_filename} - Spreadsheet format")
    print(f"• {json_filename} - Complete structured data")

    print("\n🎯 Next Steps:")
    print("1. Download the bulk_import_*.txt file")
    print("2. Go to NotebookLM (notebooklm.google.com)")
    print("3. Use 'Bulk import' feature with the URLs")
    print(f"4. Enjoy your '{search_term}' video collection!")

else:
    print("❌ No videos were found. Try running the scraper again or check your internet connection.")

## 10. Display Bulk Import URLs

Here are your YouTube URLs ready for NotebookLM bulk import:

In [None]:
# Print the collected URLs as a plain, copy-friendly list
if not videos:
    print("❌ No videos found to display")

else:
    search_term = SEARCH_CONFIG['term'].upper()

    # Banner: title, rule, generation time, count, rule, blank line
    print(
        f"📋 {search_term} YOUTUBE URLs - Copy these for NotebookLM Bulk Import",
        "=" * 70,
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"Total videos: {len(videos)}",
        "=" * 70,
        "",
        sep='\n',
    )

    # One bare URL per line so the block can be pasted as-is
    for video in videos:
        print(video['url'])

    # Footer with the next step for the user
    print(
        "",
        "=" * 70,
        "💡 Copy these URLs and paste them into NotebookLM's bulk import feature!",
        "   Go to notebooklm.google.com → Create/Open notebook → Bulk import",
        sep='\n',
    )

## 🎉 Complete!

Your YouTube Search Scraper has successfully found and formatted videos for NotebookLM bulk import!

**Copy the URLs from Cell 10 and paste them directly into NotebookLM.** 🚀

### Important Notes:

- To perform a different search, change the search term in the widget, click 'Set Search Term', and re-run the cells from step 7 onward
- The title filter option lets you control whether videos must contain your search term in the title
- Output files are named based on your search term for easy organization