In [1]:
# Udemy Course Scraper - Jupyter Notebook Version
# Cell 1: Install and Import Dependencies
# Run this cell first to install required packages

import subprocess
import sys

def install_packages(packages=None):
    """Install any required package whose module cannot be imported.

    Args:
        packages: Optional override of what to check. Either a dict mapping
            the pip distribution name to the module name used in ``import``,
            or an iterable of distribution names (the module name is then
            derived by replacing ``-`` with ``_``). Defaults to the packages
            this notebook needs.

    Raises:
        subprocess.CalledProcessError: if ``pip install`` exits with an error.
    """
    import importlib

    if packages is None:
        # The pip name and the import name are not always the same:
        # 'beautifulsoup4' is imported as 'bs4', so checking the pip name
        # would report it missing (and reinstall it) on every call.
        packages = {
            'requests': 'requests',
            'beautifulsoup4': 'bs4',
            'pandas': 'pandas',
            'openpyxl': 'openpyxl',
            'lxml': 'lxml',
            'ipywidgets': 'ipywidgets',
            'tqdm': 'tqdm',
        }
    elif not isinstance(packages, dict):
        packages = {name: name.replace('-', '_') for name in packages}

    for package, module_name in packages.items():
        try:
            importlib.import_module(module_name)
        except ImportError:
            print(f"Installing {package}...")
            # sys.executable targets the interpreter running this kernel.
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Uncomment the line below if you need to install packages
# install_packages()

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import json
from urllib.parse import urljoin
import re
from IPython.display import display, HTML, clear_output
from tqdm.notebook import tqdm
import getpass
import warnings
warnings.filterwarnings('ignore')

print("✅ All packages imported successfully!")

✅ All packages imported successfully!


In [2]:
# Cell 2: Define the UdemyCoursesScraper class
class UdemyCoursesScraper:
    """Scrape the course list of a Udemy account into a pandas DataFrame.

    Typical flow: ``login()`` -> ``scrape_all_courses()`` -> ``export_to_excel()``.
    All HTTP traffic goes through one ``requests.Session`` so that cookies set
    during login are reused by later requests.
    """

    BASE_URL = "https://www.udemy.com"
    LOGIN_URL = "https://www.udemy.com/join/login-popup/"
    MY_COURSES_URL = "https://www.udemy.com/home/my-courses/learning/"
    # Seconds to wait for a single HTTP response. requests applies no timeout
    # by default, so without this a stalled connection would hang the cell.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        # Course dicts collected by the most recent scrape_all_courses() call.
        self.courses_data = []
        self.progress_bar = None

    def login(self, email, password):
        """Log in to Udemy with the given credentials.

        Args:
            email: Account email address.
            password: Account password.

        Returns:
            True when the "my courses" page is reachable after posting the
            login form, False otherwise (including on any exception, which is
            printed rather than raised).
        """
        print("🔐 Logging into Udemy...")

        try:
            # Get login page to extract CSRF token
            login_url = self.LOGIN_URL
            response = self.session.get(login_url, timeout=self.REQUEST_TIMEOUT)
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find CSRF token
            csrf_token = None
            csrf_input = soup.find('input', {'name': 'csrfmiddlewaretoken'})
            if csrf_input:
                csrf_token = csrf_input.get('value')

            login_data = {
                'email': email,
                'password': password,
                'csrfmiddlewaretoken': csrf_token,
            }

            login_response = self.session.post(
                login_url,
                data=login_data,
                headers={'Referer': login_url},
                timeout=self.REQUEST_TIMEOUT,
            )

            if login_response.status_code == 200:
                # Verify by checking if we can access the dashboard
                dashboard_response = self.session.get(
                    self.MY_COURSES_URL, timeout=self.REQUEST_TIMEOUT
                )
                # Look only at the path of the final URL: a redirect that
                # carries the original target in its query string would
                # otherwise still contain "my-courses".
                final_path = dashboard_response.url.split('?', 1)[0]
                if dashboard_response.status_code == 200 and "my-courses" in final_path:
                    print("✅ Login successful!")
                    return True

            print("❌ Login failed. Please check your credentials.")
            return False

        except Exception as e:
            print(f"❌ Login error: {str(e)}")
            return False

    def get_my_courses(self):
        """Collect the URLs of the courses listed on the "my courses" page.

        Returns:
            A sorted list of unique absolute URLs containing ``/course/``;
            an empty list when the page cannot be fetched or parsing fails.
        """
        print("📚 Fetching your courses...")

        try:
            response = self.session.get(self.MY_COURSES_URL, timeout=self.REQUEST_TIMEOUT)

            if response.status_code != 200:
                print("❌ Failed to access courses page")
                return []

            soup = BeautifulSoup(response.content, 'html.parser')

            # Candidate selectors, tried in order; the first one that matches
            # anything wins.
            selectors = [
                'a[href*="/course/"]',
                '.course-card-title a',
                '.course-title a',
                '[data-purpose="course-title-url"]',
                'a[data-purpose="course-card-title"]'
            ]

            course_links = []
            for selector in selectors:
                links = soup.select(selector)
                if links:
                    course_links.extend(link.get('href') for link in links if link.get('href'))
                    break

            # urljoin leaves absolute links untouched and resolves relative
            # ones against the site root; the set removes duplicates.
            course_urls = set()
            for link in course_links:
                full_url = urljoin(self.BASE_URL, link)
                if '/course/' in full_url:
                    course_urls.add(full_url)

            # Sort so the scraping order (and the exported sheet) is the same
            # on every run instead of following set iteration order.
            course_urls = sorted(course_urls)
            print(f"✅ Found {len(course_urls)} courses")
            return course_urls

        except Exception as e:
            print(f"❌ Error fetching courses: {str(e)}")
            return []

    def scrape_course_details(self, course_url):
        """Scrape title, description, instructor, lecture count and duration.

        Args:
            course_url: Absolute URL of the course page.

        Returns:
            A dict with the keys 'Title', 'Description', 'Instructor',
            'Number of Lectures', 'Total Duration' and 'Course URL', or None
            when the page does not return HTTP 200 or an exception occurs.
        """
        try:
            response = self.session.get(course_url, timeout=self.REQUEST_TIMEOUT)

            if response.status_code != 200:
                return None

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract course title
            title_selectors = [
                '[data-purpose="course-header-title"]',
                'h1[class*="course-title"]',
                'h1.clp-lead__title',
                'h1'
            ]
            title = self.find_text_by_selectors(soup, title_selectors, "Title not found")

            # Extract description
            desc_selectors = [
                '[data-purpose="course-description"]',
                '.course-description',
                '[class*="description"]',
                '.clp-lead__headline'
            ]
            description = self.find_text_by_selectors(soup, desc_selectors, "Description not found")

            # Extract instructor
            instructor_selectors = [
                '[data-purpose="instructor-name"]',
                '.instructor-name',
                '[class*="instructor"] a',
                '.clp-lead__instructor-name a',
                'a[href*="/user/"]'
            ]
            instructor = self.find_text_by_selectors(soup, instructor_selectors, "Instructor not found")

            # Extract number of lectures
            lectures_selectors = [
                '[data-purpose="curriculum-stats-lectures"]',
                '[class*="lectures"]',
                'span:contains("lectures")',
                'span:contains("lecture")'
            ]
            lectures_text = self.find_text_by_selectors(soup, lectures_selectors, "0")
            lectures = self.extract_number(lectures_text)

            # Extract total time
            time_selectors = [
                '[data-purpose="curriculum-stats-duration"]',
                '[class*="duration"]',
                'span:contains("total")',
                'span:contains("hours")',
                'span:contains("hour")'
            ]
            duration_text = self.find_text_by_selectors(soup, time_selectors, "Duration not found")

            # Clean up the course URL to remove query parameters
            clean_url = course_url.split('?')[0]

            # Keep long descriptions readable in the spreadsheet.
            if len(description) > 500:
                description = description[:500] + "..."

            return {
                'Title': title,
                'Description': description,
                'Instructor': instructor,
                'Number of Lectures': lectures,
                'Total Duration': duration_text,
                'Course URL': clean_url
            }

        except Exception as e:
            print(f"⚠️ Error scraping course {course_url}: {str(e)}")
            return None

    def find_text_by_selectors(self, soup, selectors, default=""):
        """Return the text of the first element matched by any selector.

        Selectors are tried in order. A selector containing ``:contains(...)``
        is handled here rather than passed to ``soup.select_one``: the quoted
        text is searched case-insensitively among the document's strings and
        the text of the first matching string's parent is returned. The part
        of such a selector before ``:contains(`` is not used.

        Args:
            soup: Parsed document exposing ``select_one`` and ``find_all``.
            selectors: Iterable of selector strings.
            default: Value returned when nothing matches.
        """
        for selector in selectors:
            if ':contains(' in selector:
                needle = selector.split(':contains(', 1)[1].rsplit(')', 1)[0]
                # Drop the quotes around the text; left in place they become
                # part of the pattern and the search only matches text that
                # itself contains literal quote characters.
                needle = needle.strip().strip('"\'')
                if not needle:
                    continue
                # re.escape: the needle is plain text, not a regex.
                pattern = re.compile(re.escape(needle), re.I)
                matches = soup.find_all(string=pattern)
                if matches:
                    parent = matches[0].parent
                    return parent.get_text(strip=True) if parent else default
            else:
                element = soup.select_one(selector)
                if element:
                    return element.get_text(strip=True)
        return default

    def extract_number(self, text):
        """Return the first integer found in ``text``, or 0 if there is none.

        Thousands separators are accepted, so "1,234 lectures" yields 1234.
        ``None`` and other falsy inputs yield 0.
        """
        match = re.search(r'\d[\d,]*', str(text or ""))
        if not match:
            return 0
        return int(match.group(0).replace(',', ''))

    def scrape_all_courses(self, delay=2):
        """Scrape every course returned by ``get_my_courses``.

        Args:
            delay: Seconds to pause between consecutive course requests.

        Returns:
            A DataFrame with one row per successfully scraped course; an
            empty DataFrame when no course URLs were found.
        """
        course_urls = self.get_my_courses()

        if not course_urls:
            print("❌ No courses found. Make sure you're logged in correctly.")
            return pd.DataFrame()

        self.courses_data = []
        last_index = len(course_urls) - 1

        # Use tqdm for progress bar
        for index, url in enumerate(tqdm(course_urls, desc="Scraping courses")):
            course_data = self.scrape_course_details(url)

            if course_data:
                self.courses_data.append(course_data)

            # Be respectful - pause between requests, but there is nothing to
            # wait for once the last course has been fetched.
            if index < last_index:
                time.sleep(delay)

        df = pd.DataFrame(self.courses_data)
        print(f"✅ Completed! Scraped {len(self.courses_data)} courses successfully.")

        return df

    def export_to_excel(self, df, filename="udemy_courses.xlsx"):
        """Write ``df`` to an Excel file and size the columns to their content.

        Args:
            df: DataFrame to export; nothing is written when it is empty.
            filename: Path of the workbook to create.

        Returns:
            None. Export errors are printed rather than raised.
        """
        if df.empty:
            print("❌ No course data to export.")
            return

        sheet_name = 'My Udemy Courses'

        try:
            with pd.ExcelWriter(filename, engine='openpyxl') as writer:
                df.to_excel(writer, sheet_name=sheet_name, index=False)
                worksheet = writer.sheets[sheet_name]

                # Adjust column widths: longest cell plus padding, capped at 50.
                for column in worksheet.columns:
                    column_letter = column[0].column_letter
                    max_length = max(
                        (len(str(cell.value)) for cell in column if cell.value is not None),
                        default=0,
                    )
                    worksheet.column_dimensions[column_letter].width = min(max_length + 2, 50)

            print(f"✅ Data exported to {filename}")
            print(f"📊 Total courses exported: {len(df)}")

        except Exception as e:
            print(f"❌ Error exporting to Excel: {str(e)}")

print("✅ UdemyCoursesScraper class defined successfully!")

✅ UdemyCoursesScraper class defined successfully!


In [3]:
# Cell 3: Login and Authentication
def login_to_udemy():
    """Prompt for Udemy credentials and attempt a login.

    Returns:
        The logged-in UdemyCoursesScraper when login succeeds, otherwise None.
    """
    session_scraper = UdemyCoursesScraper()

    for banner_line in ("🎓 Udemy Course Scraper", "=" * 30):
        print(banner_line)

    # The email is typed in the clear; the password prompt hides its input.
    user_email = input("Enter your Udemy email: ")
    user_password = getpass.getpass("Enter your Udemy password: ")

    if not session_scraper.login(user_email, user_password):
        print("Please try again with correct credentials.")
        return None

    return session_scraper

In [4]:
# Cell 4: Main Scraping Function
def scrape_and_display_courses(scraper, delay=2, export_excel=True, filename="udemy_courses.xlsx"):
    """Scrape all courses, print a short summary, and optionally export them.

    Args:
        scraper: Logged-in scraper object, or None if login has not happened.
        delay: Passed through to ``scraper.scrape_all_courses``.
        export_excel: When true, write the result via ``scraper.export_to_excel``.
        filename: Target path for the Excel export.

    Returns:
        The DataFrame of scraped courses, or None when there is no scraper or
        nothing was scraped.
    """
    # Guard: nothing can be done without a logged-in scraper.
    if scraper is None:
        print("❌ Please login first using the login_to_udemy() function")
        return None

    print("🚀 Starting course scraping...")
    courses_df = scraper.scrape_all_courses(delay=delay)

    # Guard: an empty frame means every course failed or none were found.
    if courses_df.empty:
        print("❌ No courses were scraped successfully.")
        return None

    # Summary figures.
    print("\n📊 Summary:")
    print(f"Total courses: {len(courses_df)}")
    print(f"Total lectures: {courses_df['Number of Lectures'].sum()}")
    print(f"Unique instructors: {courses_df['Instructor'].nunique()}")

    # Rich preview of the first rows.
    print("\n📋 Preview of your courses:")
    display(courses_df.head())

    if export_excel:
        scraper.export_to_excel(courses_df, filename)

    return courses_df

In [5]:
# Cell 5: Usage Instructions
# Print a quick-reference guide for the notebook. This cell only emits text;
# it defines nothing and calls none of the functions it mentions.
print("""
🎯 How to use this scraper in Jupyter:

1. Run Cell 1 to install dependencies
2. Run Cell 2 to define the scraper class  
3. Run Cell 3 to login:
   scraper = login_to_udemy()

4. Run Cell 4 to scrape your courses:
   df = scrape_and_display_courses(scraper)

5. Optional: Analyze your data further:
   # Group by instructor
   instructor_stats = df.groupby('Instructor').agg({
       'Title': 'count',
       'Number of Lectures': 'sum'
   }).rename(columns={'Title': 'Course Count'})
   
   display(instructor_stats.sort_values('Course Count', ascending=False))

⚙️ Customization options:
- Change delay between requests: scrape_and_display_courses(scraper, delay=3)
- Custom filename: scrape_and_display_courses(scraper, filename="my_courses.xlsx")
- Skip Excel export: scrape_and_display_courses(scraper, export_excel=False)
""")


🎯 How to use this scraper in Jupyter:

1. Run Cell 1 to install dependencies
2. Run Cell 2 to define the scraper class  
3. Run Cell 3 to login:
   scraper = login_to_udemy()

4. Run Cell 4 to scrape your courses:
   df = scrape_and_display_courses(scraper)

5. Optional: Analyze your data further:
   # Group by instructor
   instructor_stats = df.groupby('Instructor').agg({
       'Title': 'count',
       'Number of Lectures': 'sum'
   }).rename(columns={'Title': 'Course Count'})
   
   display(instructor_stats.sort_values('Course Count', ascending=False))

⚙️ Customization options:
- Change delay between requests: scrape_and_display_courses(scraper, delay=3)
- Custom filename: scrape_and_display_courses(scraper, filename="my_courses.xlsx")
- Skip Excel export: scrape_and_display_courses(scraper, export_excel=False)



In [6]:
# Prompt for credentials and log in. `scraper` is None when the login fails,
# so check it before passing it on to the scraping functions.
scraper = login_to_udemy()

🎓 Udemy Course Scraper


Enter your Udemy email:  <redacted>
Enter your Udemy password:  ········


🔐 Logging into Udemy...
❌ Login failed. Please check your credentials.
Please try again with correct credentials.
