In [1]:
from datetime import date
from pathlib import Path
from time import sleep
from urllib.parse import quote_plus

import pandas as pd
import undetected_chromedriver as uc
from bs4 import BeautifulSoup

In [2]:
# Start a headless Chrome via undetected_chromedriver; the intent is to get past
# Cloudflare and read the page source after dynamic content has been rendered.
options = uc.ChromeOptions()
options.headless = True
driver = uc.Chrome(options=options, use_subprocess=True)

In [3]:
# Output column names, in the same order as the values of each scraped row,
# and an empty DataFrame that carries those columns.
columns = [
    'Course Title',
    'Course Headline',
    'Rating',
    'Reviews',
    'Course Length',
    'Lectures',
    'Level',
    'Instructor',
]
df = pd.DataFrame(columns=columns)

In [4]:
# Data extractor

# (tag name, data-purpose value) of the element holding each column that follows
# the course title, listed in output-column order.
_DETAIL_SELECTORS = (
    ("span", "seo-headline"),
    ("span", "seo-rating"),
    ("span", "seo-num-reviews"),
    ("span", "seo-content-info"),
    ("span", "seo-num-lectures"),
    ("span", "seo-instructional-level"),
    ("div", "safely-set-inner-html:course-card:visible-instructors"),
)


def _text_or_none(node):
    """Return ``node.text``, or None when ``node`` is None or has no ``text``."""
    return getattr(node, "text", None)


def scrapeCourses(courses):
    """Extract one row of display fields from each course card.

    Args:
        courses: iterable of parsed course-card elements (objects exposing a
            BeautifulSoup-style ``find``).

    Returns:
        A list of 8-item lists ordered as: title, headline, rating, reviews,
        content length, lecture count, level, instructor(s). A field whose
        element is not found is None. Cards that lack the main-content
        container are skipped entirely.
    """
    rows = []
    for course in courses:
        # find() returns None (it does not raise) when nothing matches, so the
        # "skip this card" decision has to be an explicit None check.
        course_info = course.find('div', class_="course-card--main-content--2XqiY")
        if course_info is None:
            continue

        # The visible title is the node immediately before the screen-reader-only div.
        title_marker = course_info.find("div", class_="ud-sr-only")
        title_node = None if title_marker is None else title_marker.previous_sibling

        row = [_text_or_none(title_node)]
        for tag_name, purpose in _DETAIL_SELECTORS:
            row.append(_text_or_none(course_info.find(tag_name, {'data-purpose': purpose})))
        rows.append(row)

    return rows
    

In [5]:
# Params: the search term placed in the results URL, and how many result pages to fetch
query = 'python'
page_limit = 5

In [6]:
# Driver
# Walk the result pages, collect every row in a plain list, and build the
# DataFrame once at the end so re-running the cell does not append duplicates.
scraped_rows = []
encoded_query = quote_plus(query)  # keep spaces / '&' in the query from breaking the URL
try:
    for page in range(1, page_limit + 1):
        driver.get(f'https://www.udemy.com/courses/search/?p={page}&q={encoded_query}&src=ukw')
        sleep(5)  # fixed pause to let the dynamic content render before reading the source
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        courses_list = soup.find('div', {'class': 'course-list--container--3zXPS'})
        if courses_list is None:
            # No results container on this page (e.g. blocked request, changed
            # markup, or no more results): report it and move on instead of crashing.
            print(f'No course list found on page {page}; skipping')
            continue
        courses = courses_list.find_all('div', {'class': 'course-card--container--1QM2W'})
        scraped_rows.extend(scrapeCourses(courses))
finally:
    # quit() ends the whole browser session, and runs even if a page load fails.
    driver.quit()

df = pd.DataFrame(scraped_rows, columns=columns)

In [7]:
# Show the scraped courses as the cell's output
df

Unnamed: 0,Course Title,Course Headline,Rating,Reviews,Course Length,Lectures,Level,Instructor
0,100 Days of Code: The Complete Python Pro Boot...,Master <strong>Python</strong> by building 100...,Rating: 4.7 out of 5,167538 reviews,64 total hours,676 lectures,All Levels,Dr. Angela Yu
1,The Complete Python Bootcamp From Zero to Hero...,Learn <strong>Python</strong> like a Professio...,Rating: 4.6 out of 5,448503 reviews,22 total hours,155 lectures,All Levels,Jose Portilla
2,Complete Python Developer in 2023: Zero to Mas...,How to become a <strong>Python</strong> 3 Deve...,Rating: 4.7 out of 5,44317 reviews,31 total hours,338 lectures,All Levels,"Andrei Neagoie, Zero To Mastery"
3,Web Developer Bootcamp with Flask and Python i...,"Become a Full Stack Web Developer using Flask,...",Rating: 4.6 out of 5,5930 reviews,20 total hours,186 lectures,Intermediate,"Jose Salvatierra, Teclado by Jose Salvatierra"
4,The Complete Python & PostgreSQL Developer Course,Build 9 projects—master two essential and mode...,Rating: 4.6 out of 5,4883 reviews,22 total hours,186 lectures,Beginner,"Rob Percival, Jose Salvatierra, Codestars • ov..."
...,...,...,...,...,...,...,...,...
150,Python Developer | Complete course 2021,"Learn programming by starting from zero, and c...",Rating: 4.7 out of 5,826 reviews,27.5 total hours,389 lectures,All Levels,Jonathan Roux | CodeWithJonathan | Python - Dj...
151,Python Complete Course For Python Beginners,<strong>Python</strong> Complete Course For <s...,Rating: 4.2 out of 5,3023 reviews,7.5 total hours,51 lectures,Beginner,Horizon Tech
152,Python In Practice : 15 Projects to Master Python,Learn <strong>python</strong> programming and ...,Rating: 4.4 out of 5,1434 reviews,21 total hours,206 lectures,All Levels,Rahul Mula
153,Python for Computer Vision with OpenCV and Dee...,Learn the latest techniques in computer vision...,Rating: 4.6 out of 5,8891 reviews,14 total hours,92 lectures,Intermediate,Jose Portilla


In [8]:
# Output
# Make sure the target folder exists, and compute the dated file stem once so
# the CSV and JSON files always share the same name.
output_dir = Path('output')
output_dir.mkdir(parents=True, exist_ok=True)
output_stem = output_dir / f'udemy_courses_{query}_{date.today()}'
df.to_csv(f'{output_stem}.csv')
df.to_json(f'{output_stem}.json')