In [None]:
import os
import time
import urllib.parse
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
import re

# Configure and launch the Chrome session used for all scraping below
options = webdriver.ChromeOptions()
for chrome_flag in (
    "--lang=en-US",             # Force browser to load English
    "--headless",               # Run in headless mode (no GUI)
    "--disable-gpu",
    "--window-size=1920,1080",
    "--log-level=3",            # Suppress logs
):
    options.add_argument(chrome_flag)

# ChromeDriverManager().install() supplies the driver path handed to Service
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# --------------------------------------------------------------------------------------------------------------------------
# Search skills: which skills to collect course data for
# --------------------------------------------------------------------------------------------------------------------------
# Path to the file containing the extracted skills (one skill per line)
file_path = './all_skills.txt'
data_folder = "youtube_data"
os.makedirs(data_folder, exist_ok=True)  # Ensure folder exists


# Read every line of the file as an individual skill
with open(file_path, 'r', encoding='utf-8') as file:
    skills = file.readlines()
tech_skills = [skill.strip() for skill in skills]

# Index into tech_skills where this run starts; earlier skills are skipped
latest_skill_index = 4603

# One search URL per remaining skill. quote_plus turns spaces into "+" and
# percent-encodes reserved characters, so names such as "C++", "C#" or "R&D"
# reach the search as written instead of being read as spaces, a URL fragment
# or an extra query parameter. Plain names produce the same URL as before.
url_list = ["https://www.youtube.com/results?search_query=" + urllib.parse.quote_plus(skill + " Course")
            for skill in tech_skills[latest_skill_index:]]

# Function to scroll down to load dynamic content
def scroll_page(n,target_tag, target_class):
    """Scroll the current page until enough matching elements are present.

    Elements are counted with the CSS selector built from ``target_tag`` and
    the space-separated class names in ``target_class``. Scrolling stops once
    at least ``n`` elements match, or after 10 consecutive scrolls that did
    not increase the count. Progress is printed on a single line.

    Uses the module-level ``driver``; returns nothing.
    """
    # "a b" -> "tag.a.b": every class name becomes a ".name" selector part
    selector = target_tag + "." + ".".join(target_class.split(" "))

    seen = len(driver.find_elements(By.CSS_SELECTOR, selector))
    stalled_rounds = 0
    print(f"Scrolling and get num_course: {seen}",end="")
    while True:
        if seen >= n or stalled_rounds >= 10:
            break
        driver.execute_script("window.scrollBy(0, 1500);")
        time.sleep(1.5)  # Wait for content to load
        current = len(driver.find_elements(By.CSS_SELECTOR, selector))
        print(f"-{current}",end="")
        if current > seen:
            # New elements appeared: remember the count and reset the stall counter
            seen, stalled_rounds = current, 0
        else:
            stalled_rounds += 1
    print('')
    
data = []  # kept from the original script; nothing in this section reads it

# Element name and class attribute of one video card on the results page
target_tag = 'ytd-video-renderer'
target_class = "style-scope ytd-item-section-renderer"

# Leading "12." style numbering in front of a series topic
_TOPIC_NUMBER_PREFIX = re.compile(r"^\d+\.")
# Characters that Windows does not allow in file names (includes "/" and "*")
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def _stripped_text(tag):
    """Return the stripped text of a BeautifulSoup tag, or None if the tag is missing."""
    return tag.text.strip() if tag is not None else None


def _split_views_and_age(metadata_items):
    """Return ``(num_views, age)`` from the texts of the metadata-line spans.

    With two or more items the first two are used and swapped when the second
    one does not end with "ago". A single item is treated as the age when it
    contains "ago", otherwise as the view count. No items gives (None, None).
    """
    if metadata_items and len(metadata_items) >= 2:
        num_views, age = metadata_items[0], metadata_items[1]
        if not age.endswith("ago"):
            num_views, age = age, num_views  # Swap values if needed
        return num_views, age
    if metadata_items and len(metadata_items) == 1:
        only_item = metadata_items[0]
        return (None, only_item) if "ago" in only_item else (only_item, None)
    return None, None


def _parse_video(div, skill_name):
    """Extract one result row (a dict) from a parsed ``ytd-video-renderer`` element.

    Every field is None when the corresponding markup is absent.
    """
    # Video link and thumbnail
    video_tag = div.find("a", attrs={"class": "yt-simple-endpoint inline-block style-scope ytd-thumbnail", "id": "thumbnail"})
    href = video_tag.get("href") if video_tag is not None else None
    # An anchor without href used to raise TypeError on the concatenation
    video_link = "https://youtube.com" + href if href else None
    thumbnail_tag = video_tag.find("img") if video_tag is not None else None
    thumbnail_link = thumbnail_tag.get("src") if thumbnail_tag is not None else None

    # Video title
    title_div = div.find("a", attrs={"class": "yt-simple-endpoint style-scope ytd-video-renderer", "id": "video-title"})
    title_tag = title_div.find("yt-formatted-string", class_="style-scope ytd-video-renderer") if title_div is not None else None
    title = _stripped_text(title_tag)

    # View count and video age
    num_views_div = div.find("div", attrs={"class": "style-scope ytd-video-meta-block", "id": "metadata-line"})
    metadata_items = [span.text for span in num_views_div.find_all("span")] if num_views_div is not None else None
    num_views, age = _split_views_and_age(metadata_items)

    # Channel name
    channel = _stripped_text(div.find("yt-formatted-string", attrs={"class": "style-scope ytd-channel-name", "id": "text"}))

    # Video snippet: the texts of all spans, joined by single spaces
    snippet_div = div.find("yt-formatted-string", class_="metadata-snippet-text style-scope ytd-video-renderer")
    snippet_spans = snippet_div.find_all("span") if snippet_div is not None else None
    snippet = " ".join(span.text.strip() for span in snippet_spans) if snippet_spans else None

    # Video duration
    duration = _stripped_text(div.find("span", attrs={"class": "style-scope ytd-thumbnail-overlay-time-status-renderer", "id": "text"}))

    # Course series: chapter label plus the " | "-separated topic list
    series_div = div.find("div", attrs={"class": "style-scope ytd-expandable-metadata-renderer", "id": "header"})
    chapter = None
    series = None
    if series_div is not None:
        chapter_div = series_div.find("yt-formatted-string",
                                      attrs={"class": "style-scope ytd-expandable-metadata-renderer", "id": "prominent-label-text"})
        chapter = _stripped_text(chapter_div.find("span") if chapter_div is not None else None)
        collapsed_title = series_div.find("div", attrs={"class": "style-scope ytd-expandable-metadata-renderer", "id": "collapsed-title"})
        text_tag = collapsed_title.find("yt-formatted-string") if collapsed_title is not None else None
        text = text_tag.text if text_tag is not None else None
        if text:
            series = [_TOPIC_NUMBER_PREFIX.sub("", topic).strip() for topic in text.split(" | ")]

    return {
        'skill': skill_name,
        'thumbnail_link': thumbnail_link,
        'video_link': video_link,
        'title': title,
        'num_views': num_views,
        'age': age,
        'channel': channel,
        'snippet': snippet,
        'duration': duration,
        'chapter': chapter,
        'series': series,
    }


def thumbnails_loaded(driver):
    """WebDriverWait predicate: have the first ``num_videos`` cards loaded a thumbnail?

    Each call scrolls the page further down, then checks that every one of the
    first ``num_videos`` video cards contains an image carrying the
    ``yt-core-image--loaded`` class. ``num_videos`` is the module-level count
    set by the scraping loop below for the page currently being processed.
    """
    driver.execute_script("window.scrollBy(0, 1500);")
    # Re-query on every poll so newly rendered cards are taken into account
    cards = driver.find_elements(By.CSS_SELECTOR, "ytd-video-renderer.style-scope.ytd-item-section-renderer")
    loaded = sum(
        1
        for card in cards[:num_videos]
        if card.find_elements(By.CSS_SELECTOR, "a#thumbnail.yt-simple-endpoint.inline-block.style-scope.ytd-thumbnail img.yt-core-image--loaded")
    )
    return loaded >= num_videos


try:
    for idx, url in enumerate(url_list):
        # url_list was built from tech_skills[latest_skill_index:], so idx maps back like this
        skill_name = tech_skills[latest_skill_index + idx]

        try:
            print(f"Scraping URL: {url}")
            driver.get(url)
            while not driver.page_source:
                time.sleep(3)  # Wait for initial load
            # --------------------------------------------------------------------------------------------------------------
            # Scroll page; change the number of courses you want to scroll and scrape for each skill
            # --------------------------------------------------------------------------------------------------------------
            scroll_page(10, target_tag, target_class)
        except Exception as e:
            print(f"Failed to scrape {url}: {e}")
            # Skip this skill: the browser may still show the previous search,
            # and parsing it would store those results under the wrong skill.
            continue

        # First parse only fixes how many video cards this page should yield
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        num_videos = len(soup.find_all(target_tag, class_=target_class))
        print(f"Video divs found {num_videos}")

        try:
            WebDriverWait(driver, 5).until(thumbnails_loaded)
        except Exception:
            # Best effort: scrape with whatever thumbnails are present after the
            # wait. Unlike a bare except, this lets KeyboardInterrupt through.
            pass

        # Parse again now that thumbnails had a chance to load; keep the original count
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        video_divs = soup.find_all(target_tag, class_=target_class)[:num_videos]

        video = [_parse_video(div, skill_name) for div in video_divs]
        df = pd.DataFrame(video)

        # Replace characters that are not valid in file names before saving
        skillname = _UNSAFE_FILENAME_CHARS.sub("_", skill_name)
        csv_path = os.path.join(data_folder, f"{skillname}.csv")
        df.to_csv(csv_path, index=False)
        print(f"{skillname}.csv saved at: {csv_path}")
        print(f"-----------------------------------------------Scraping Complete ({latest_skill_index + idx+1}/{len(tech_skills)})-----------------------------------------------")
finally:
    # Close the driver even when the run is interrupted or fails part-way
    driver.quit()

Scraping URL: https://www.youtube.com/results?search_query=Python+Course
Scrolling and get num_course: 15
Video divs found 15
Python.csv saved at: youtube_data\Python.csv
-----------------------------------------------Scraping Complete (1/9481)-----------------------------------------------
Scraping URL: https://www.youtube.com/results?search_query=Java+Course
Scrolling and get num_course: 13
Video divs found 13
Java.csv saved at: youtube_data\Java.csv
-----------------------------------------------Scraping Complete (2/9481)-----------------------------------------------
Scraping URL: https://www.youtube.com/results?search_query=Sql+Course
Scrolling and get num_course: 14
Video divs found 14
Sql.csv saved at: youtube_data\Sql.csv
-----------------------------------------------Scraping Complete (3/9481)-----------------------------------------------
Scraping URL: https://www.youtube.com/results?search_query=Aws+Course
Scrolling and get num_course: 18
Video divs found 18
Aws.csv saved at

KeyboardInterrupt: 