## Udemy

In [2]:
import os
import time
import urllib.parse
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import html
import re

In [None]:

# Setup driver
options = uc.ChromeOptions()
options.add_argument(r"user-data-dir=C:/Users/jackc/AppData/Local/Google/Chrome/User Data/Profile 1")
# Pass the configured options to the driver; without `options=` the
# user-data-dir argument above is built but never applied.
driver = uc.Chrome(options=options, version_main=134, headless=False)

# --------------------------------------------------------------------------------------------------------------------------
# Search skills: what skills do you want to get course data on
# --------------------------------------------------------------------------------------------------------------------------
# Path to the file containing the extracted skills, one skill per line
file_path = './all_skills.txt'

# Read every line as an individual skill and strip surrounding whitespace
with open(file_path, 'r', encoding='utf-8') as file:
    skills = file.readlines()
tech_skills = [skill.strip() for skill in skills]

# Index into tech_skills at which scraping starts (earlier skills are skipped)
latest_skill_index = 8616

# Number of search-result pages requested per skill
num_page = 1
data_folder = "udemy_data"
os.makedirs(data_folder, exist_ok=True)  # Ensure folder exists


# List of URLs to scrape, ordered skill by skill with num_page consecutive
# pages per skill. The skill is URL-encoded with quote_plus so characters such
# as '#', '+', '&' or '/' stay part of the query instead of corrupting the URL
# (spaces still become '+').
url_list = [
    f"https://www.udemy.com/courses/search/?locale=en_US&p={page}&q={urllib.parse.quote_plus(skill)}&src=ukw"
    for skill in tech_skills[latest_skill_index:]
    for page in range(1, num_page + 1)
]

# Labels that identify the difficulty entry in a course card's details row
difficulty_levels = {"Beginner", "All Levels", "Intermediate", "Advanced"}
# Substrings that identify the duration entry in a course card's details row
duration_units = {"total hours"}


def extract_info(info):
    """Split a course card's detail strings into (difficulty, num_lecture, duration).

    Each entry of ``info`` is classified in order: an exact match against
    ``difficulty_levels`` is the difficulty, an entry containing any of
    ``duration_units`` is the duration, and anything else is stored as the
    lecture count. If several entries fall into the same category the last
    one wins; categories with no entry stay ``None``.
    """
    difficulty, num_lecture, duration = None, None, None
    for part in info:
        if part in difficulty_levels:
            difficulty = part
        elif any(unit in part for unit in duration_units):
            duration = part
        else:
            num_lecture = part
    return difficulty, num_lecture, duration


def _parse_number(text, cast):
    """Return ``cast(text)``, or ``None`` when ``text`` is not a valid number."""
    try:
        return cast(text)
    except ValueError:
        # One oddly formatted card should not abort the whole scraping run
        return None


def _parse_course_card(div, skill_name):
    """Extract one course record (a dict) from a course-card ``div``.

    Every field is looked up independently; a field whose element is missing
    from the card is stored as ``None``.
    """
    # Course link (href is site-relative, so prefix the host)
    link_div = div.find("h3", {"data-purpose": "course-title-url"})
    link_tag = link_div.find("a") if link_div else None
    head = "https://www.udemy.com"
    link = (head + link_tag['href']) if link_tag else None

    # Course image link
    img_div = div.find("div", class_="course-card-module--image-container--o-meJ")
    img_tag = img_div.find("img") if img_div else None
    img_link = img_tag['src'] if img_tag else None

    # Partner (instructor list) name
    partner_tag = div.find('div', class_='course-card-instructors-module--instructor-list--cJTfw')
    partner = partner_tag.text.strip() if partner_tag else None

    # Course title: only the first child node of the anchor is used
    title_div = div.find('h3', class_='ud-heading-md course-card-title-module--course-title--wmFXN')
    title_tag = title_div.find('a') if title_div else None
    title = title_tag.contents[0].strip() if title_tag else None

    # Course headline / description
    desc_tag = div.find("span", {"data-testid": "seo-headline"})
    desc = desc_tag.text.replace("<strong>", "").replace("</strong>", "").replace('"', "").strip() if desc_tag else None

    # Rating
    rating_tag = div.find('span', class_='ud-heading-sm star-rating-module--rating-number--2-qA2')
    rating = _parse_number(rating_tag.text.strip(), float) if rating_tag else None

    # Number of reviews: strip the surrounding parentheses and thousands separators
    num_review_tag = div.find("span", class_="ud-text-xs course-card-ratings-module--reviews-text--1z0l4")
    num_review = (
        _parse_number(num_review_tag.text.replace("(", "").replace(")", "").replace(",", ""), int)
        if num_review_tag else None
    )

    # Details row: difficulty, lecture count and duration
    info = [info_tag.text.strip()
            for info_tag in div.find_all("span", class_="course-card-details-module--row--jw-lD")]
    difficulty, num_lecture, duration = extract_info(info)

    # Price
    price_div = div.find("span", {"data-testid": "seo-current-price"})
    price = price_div.text.strip() if price_div else None

    return {
        'skill': skill_name,
        'link': link,
        'image_link': img_link,
        'partner': partner,
        'title': title,
        'course_description': desc,
        'rating': rating,
        'num_review': num_review,
        'difficulty': difficulty,
        'num_lecture': num_lecture,
        'duration': duration,
        'price': price
    }


data = []           # every scraped row across all skills
skill_courses = []  # rows gathered so far for the skill currently being scraped
try:
    for idx, url in enumerate(url_list):
        # url_list holds num_page consecutive URLs per skill, starting at
        # tech_skills[latest_skill_index], so the skill offset is idx // num_page.
        skill_name = tech_skills[latest_skill_index + idx // num_page]
        if idx % num_page == 0:
            # First page of a new skill: start a fresh per-skill row list
            skill_courses = []

        print(f"Scraping URL: {url}")
        try:
            driver.get(url)
        except Exception as e:
            # Navigation failed, so the browser may still show the previous
            # page; skip rather than save those rows under this skill.
            print(f"Failed to scrape {url}: {e}")
            continue

        try:
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[data-purpose="course-price-text"]'))  # Change locator as needed
            )
        except Exception as e:
            # Best effort: report the failed wait, then still parse whatever
            # the page contains (possibly zero course cards).
            print(f"Failed to scrape {url}: {e}")

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find all course card containers
        course_divs = soup.find_all('div', class_="course-card-module--container--3oS-F course-card-module--large--AL3kI")
        print(f"Course divs found {len(course_divs)}")

        # Extract information from every card on this page
        course = [_parse_course_card(div, skill_name) for div in course_divs]
        skill_courses.extend(course)
        data.extend(course)

        # Save all rows collected so far for this skill, so later pages add to
        # the skill's CSV instead of overwriting earlier pages.
        df = pd.DataFrame(skill_courses)
        # Replace characters that are not allowed in file names
        skillname = re.sub(r'[\\/:*?"<>|]', "_", skill_name)
        csv_path = os.path.join(data_folder, f"{skillname}.csv")
        df.to_csv(csv_path, index=False)
        print(f"{skillname}.csv saved at: {csv_path}")
        print(f"Scraping Complete ({latest_skill_index * num_page + idx + 1}/{len(tech_skills) * num_page})-----------------------------------------------------------")
finally:
    # Always close the browser, including on errors and KeyboardInterrupt
    driver.quit()

Scraping URL: https://www.udemy.com/courses/search/?locale=en_US&p=1&q=Python&src=ukw
Course divs found 20
Python.csv saved at: udemy_data\Python.csv
Scraping Complete (1/9481)-----------------------------------------------------------
Scraping URL: https://www.udemy.com/courses/search/?locale=en_US&p=1&q=Java&src=ukw
Course divs found 20
Java.csv saved at: udemy_data\Java.csv
Scraping Complete (2/9481)-----------------------------------------------------------
Scraping URL: https://www.udemy.com/courses/search/?locale=en_US&p=1&q=Sql&src=ukw
Course divs found 20
Sql.csv saved at: udemy_data\Sql.csv
Scraping Complete (3/9481)-----------------------------------------------------------
Scraping URL: https://www.udemy.com/courses/search/?locale=en_US&p=1&q=Aws&src=ukw
Course divs found 20
Aws.csv saved at: udemy_data\Aws.csv
Scraping Complete (4/9481)-----------------------------------------------------------
Scraping URL: https://www.udemy.com/courses/search/?locale=en_US&p=1&q=Machine+L

KeyboardInterrupt: 

In [13]:
# Convert into pandas dataframe and save files

# Output folder for saving the CSV files
data_folder = "udemy_data"
os.makedirs(data_folder, exist_ok=True)  # Ensure folder exists

# `data` is the list of course rows defined in the scraping cell. Warn when it
# is empty so an empty CSV is not mistaken for a successful export.
if not data:
    print("Warning: `data` is empty; udemy_raw.csv will contain no rows")

df = pd.DataFrame(data)
csv_path = os.path.join(data_folder, "udemy_raw.csv")
df.to_csv(csv_path, index=False)
print(f"csv saved at: {csv_path}")

csv saved at: udemy_data\udemy_raw.csv
