## Coursera

In [None]:
import os
import time
import urllib.parse
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager


✅ Exported top 8000 skills to top_skills.txt


In [17]:
# Configure and start the Chrome browser that Selenium drives.
chrome_flags = (
    "--lang=en-US",  # Force browser to load English
    "--headless",  # Run in headless mode (no GUI)
    "--disable-gpu",
    "--window-size=1920,1080",
    "--log-level=3",  # Suppress logs
)
options = webdriver.ChromeOptions()
for chrome_flag in chrome_flags:
    options.add_argument(chrome_flag)

# ChromeDriverManager().install() supplies the chromedriver path handed to Service.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(
    service=service,
    options=options,
)

# --------------------------------------------------------------------------------------------------------------------------
# Search skills: the skills for which course data is collected
# --------------------------------------------------------------------------------------------------------------------------
# Text file holding the extracted skills, one skill per line
file_path = './all_skills.txt'

# Load every line of the file, then trim surrounding whitespace from each
with open(file_path, 'r', encoding='utf-8') as file:
    skills = file.readlines()
tech_skills = list(map(str.strip, skills))

# Position in tech_skills of the first skill to build a search URL for
latest_skill_index = 0

# Make sure the output folder is present before anything is written to it
data_folder = "coursera_data"
os.makedirs(data_folder, exist_ok=True)

# One search URL per skill, starting at latest_skill_index
search_prefix = "https://www.coursera.org/search?query="
url_list = [
    search_prefix + urllib.parse.quote(skill_text)
    for skill_text in tech_skills[latest_skill_index:]
]

# Function to scroll down to load dynamic content
def scroll_page(n, target_class, *, max_idle_scrolls=10, scroll_px=800, pause=0.5):
    """Scroll the page until ``n`` course cards exist or nothing new loads.

    Uses the module-level ``driver``. After every scroll step the elements
    carrying ``target_class`` are recounted; scrolling stops once at least
    ``n`` are present, or after ``max_idle_scrolls`` consecutive steps that
    did not increase the count.

    Args:
        n: Number of matching elements to aim for.
        target_class: Class name identifying one course card.
        max_idle_scrolls: Consecutive scroll steps without new elements
            that are tolerated before giving up.
        scroll_px: Vertical distance, in pixels, of one scroll step.
        pause: Seconds to sleep after each step before recounting.

    Returns:
        The number of matching elements counted when scrolling stopped.
    """
    num_courses = len(driver.find_elements(By.CLASS_NAME, target_class))
    no_new_scroll = 0
    print(f"Scrolling and get num_course: {num_courses}", end="")
    while num_courses < n and no_new_scroll < max_idle_scrolls:
        driver.execute_script(f"window.scrollBy(0, {scroll_px});")
        time.sleep(pause)  # Wait for content to load
        new_num_courses = len(driver.find_elements(By.CLASS_NAME, target_class))
        print(f"-{new_num_courses}", end="")
        if new_num_courses > num_courses:
            # Progress was made: restart the idle counter.
            no_new_scroll = 0
            num_courses = new_num_courses
        else:
            no_new_scroll += 1
    print('')
    return num_courses

data = []

# Class name carried by each course card on the search results page
# (site-specific; re-check against the live page if no cards are found).
target_class = "cds-ProductCard-gridCard"
# Prefix used to turn relative course links into absolute URLs
head = "https://www.coursera.org"
# Allowed difficulty levels
difficulty_levels = {"Beginner", "Intermediate", "Advanced", "Mixed"}
# Duration keywords
duration_units = {"Weeks", "Months", "Hours"}
# Characters replaced by "_" in output file names: "/" and the other
# characters that are not allowed in Windows file names.
invalid_filename_chars = '\\/:*?"<>|'
# Maximum number of 3-second waits for a page source before giving up on a URL
max_load_waits = 10


def convert_reviews(text):
    """Convert a review-count label such as "2.1K reviews" to an int.

    Accepts thousands separators, surrounding parentheses, the singular
    "review" and the "K"/"M" suffixes. Returns None when no number can be
    parsed, so one odd label does not abort the whole run.
    """
    cleaned = text.lower().replace("reviews", "").replace("review", "")
    cleaned = cleaned.replace(",", "").strip().strip("()").strip()
    multiplier = 1
    if cleaned.endswith("k"):
        multiplier, cleaned = 1_000, cleaned[:-1]  # "2.1K" -> 2100
    elif cleaned.endswith("m"):
        multiplier, cleaned = 1_000_000, cleaned[:-1]
    try:
        # round() guards against float artefacts such as 2099.9999999
        return int(round(float(cleaned) * multiplier))
    except (ValueError, OverflowError):
        return None


def extract_info(text):
    """Split a "·"-separated metadata line into (difficulty, type, duration).

    A part that equals one of ``difficulty_levels`` is the difficulty, a part
    containing one of ``duration_units`` is the duration, and any other part
    is taken as the course type (the last such part wins).
    """
    parts = [part.strip() for part in text.split("·")]
    difficulty, course_type, duration = None, None, None
    for part in parts:
        if part in difficulty_levels:
            difficulty = part
        elif any(unit in part for unit in duration_units):
            duration = part
        else:
            course_type = part
    return difficulty, course_type, duration


def _parse_rating(text):
    """Return the rating text as a float, or None when it is not numeric."""
    try:
        return float(text.strip())
    except ValueError:
        return None


def _safe_filename(name):
    """Replace characters that are unsafe in file names with "_"."""
    return "".join("_" if ch in invalid_filename_chars else ch for ch in name)


try:
    for idx, url in enumerate(url_list):
        # Get skill name
        skill_name = tech_skills[latest_skill_index + idx]

        try:
            print(f"Scraping URL: {url}")
            driver.get(url)
            # Wait for initial load, but never wait forever
            waits = 0
            while not driver.page_source and waits < max_load_waits:
                time.sleep(3)
                waits += 1

            # ------------------------------------------------------------------------------------------------------------------
            # Scroll page; change number of courses you want to scroll and scrape for each skill
            # ------------------------------------------------------------------------------------------------------------------
            # scroll_page(10,target_class)

            page_html = driver.page_source
        except Exception as e:
            # Skip this skill: the browser may still hold the previous page,
            # and parsing it would store another skill's courses under this one.
            print(f"Failed to scrape {url}: {e}")
            continue

        if not page_html:
            print(f"Failed to scrape {url}: empty page source")
            continue

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(page_html, 'html.parser')

        # Find all product containers
        course_divs = soup.find_all('div', class_=target_class)
        print(f"Course divs found {len(course_divs)}")

        # Extract information
        course = []
        for div in course_divs:
            # Extract course link; urljoin leaves already-absolute hrefs intact
            link_div = div.find("a", class_="cds-119 cds-113 cds-115 cds-CommonCard-titleLink css-vflzcf cds-142")
            href = link_div.get('href') if link_div else None
            link = urllib.parse.urljoin(head, href) if href else None

            # Extract course image link
            img_div = div.find("div", class_="cds-CommonCard-previewImage")
            img_tag = img_div.find("img") if img_div else None
            img_link = img_tag.get('src') if img_tag else None

            # Extract partner name
            partner_tag = div.find('p', class_='cds-ProductCard-partnerNames css-vac8rf')
            partner = partner_tag.text.strip() if partner_tag else None

            # Extract course title
            title_tag = div.find('h3', class_='cds-CommonCard-title css-6ecy9b')
            title = title_tag.text.strip() if title_tag else None

            # Extract skills: only the tag's own text nodes, split on commas
            skill_div = div.find("div", class_="cds-ProductCard-body")
            skill_tag = skill_div.find('p', class_="css-vac8rf") if skill_div else None
            text = skill_tag.find_all(string=True, recursive=False) if skill_tag else None
            skill = [word.strip() for word in "".join(text).strip().split(",")] if text else None

            # Extract rating
            rating_tag = div.find('span', class_='css-6ecy9b')
            rating = _parse_rating(rating_tag.text) if rating_tag else None

            # Extract number of reviews
            num_review_div = div.find("div", class_="cds-CommonCard-ratings")
            num_review_tag = num_review_div.find('div', class_='css-vac8rf') if num_review_div else None
            num_review = convert_reviews(num_review_tag.text.strip()) if num_review_tag else None

            # Extract course info
            info_div = div.find("div", class_="cds-CommonCard-metadata")
            info_tag = info_div.find('p', class_='css-vac8rf') if info_div else None
            info = info_tag.text.strip() if info_tag else None
            difficulty, course_type, duration = extract_info(info) if info else (None, None, None)

            # Append extracted data
            course.append({
                'skill': skill_name,
                'link': link,
                'image_link': img_link,
                'partner': partner,
                'title': title,
                'course_skill': skill,
                'rating': rating,
                'num_review': num_review,
                'difficulty': difficulty,
                'course_type': course_type,
                'duration': duration
            })

        # One CSV per skill, named after the sanitised skill
        df = pd.DataFrame(course)
        skillname = _safe_filename(skill_name)
        csv_path = os.path.join(data_folder, f"{skillname}.csv")
        df.to_csv(csv_path, index=False)
        print(f"{skillname}.csv saved at: {csv_path}")
        print(f"-----------------------------------------------Scraping Complete ({latest_skill_index+idx+1}/{len(tech_skills)})-----------------------------------------------")
finally:
    # Close the driver after scraping, even if the loop was interrupted
    driver.quit()

Scraping URL: https://www.coursera.org/search?query=Python
Course divs found 12
Python.csv saved at: coursera_data\Python.csv
-----------------------------------------------Scraping Complete (1/9481)-----------------------------------------------
Scraping URL: https://www.coursera.org/search?query=Java
Course divs found 12
Java.csv saved at: coursera_data\Java.csv
-----------------------------------------------Scraping Complete (2/9481)-----------------------------------------------
Scraping URL: https://www.coursera.org/search?query=Sql
Course divs found 12
Sql.csv saved at: coursera_data\Sql.csv
-----------------------------------------------Scraping Complete (3/9481)-----------------------------------------------
Scraping URL: https://www.coursera.org/search?query=Aws
Course divs found 12
Aws.csv saved at: coursera_data\Aws.csv
-----------------------------------------------Scraping Complete (4/9481)-----------------------------------------------
Scraping URL: https://www.coursera.o