In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import random
import re
import sqlite3
import os
from datetime import datetime

class ImprovedCourseraScraper:
    def __init__(self, headless=False):  # Set to False for debugging
        """Build the Chrome options and start a WebDriver session.

        Args:
            headless: When true, ``--headless`` is appended to the Chrome
                arguments; otherwise a visible browser window is used.

        Sets ``self.options``, ``self.driver``, an empty ``self.courses_data``
        list and ``self.db_connection = None``.
        """
        chrome_arguments = [
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--window-size=1920,1080',
            '--disable-blink-features=AutomationControlled',
            '--disable-gpu',
            '--disable-extensions',
            'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        ]
        if headless:
            chrome_arguments.append('--headless')

        self.options = Options()
        for argument in chrome_arguments:
            self.options.add_argument(argument)

        # The driver binary path comes from ChromeDriverManager().install().
        driver_service = Service(ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=driver_service, options=self.options)

        # Results accumulate here across pages; the DB is attached later.
        self.courses_data = []
        self.db_connection = None
        
    def setup_sqlite_database(self, db_path="coursera_courses.sqlite"):
        try:
            self.db_connection = sqlite3.connect(db_path)
            self._create_courses_table()
            print(f"SQLite database connected: {db_path}")
            return True
            
        except Exception as e:
            print(f"Error setting up SQLite database: {e}")
            return False

    def _create_courses_table(self):
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS courses (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT NOT NULL,
            provider TEXT,
            instructors TEXT,
            rating REAL,
            rating_count INTEGER,
            duration TEXT,
            difficulty_level TEXT,
            course_type TEXT,
            price TEXT,
            description TEXT,
            skills_covered TEXT,
            link TEXT,
            language TEXT,
            scraped_date TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            UNIQUE(title, provider)
        )
        """
        
        cursor = self.db_connection.cursor()
        cursor.execute(create_table_sql)
        self.db_connection.commit()
        cursor.close()
        print("Courses table created/verified")

    def scrape_courses(self, search_query="", max_pages=2):
        try:
            for page in range(max_pages):
                # Build URL with search query
                base_url = "https://www.coursera.org/search"
                if search_query:
                    url = f"{base_url}?query={search_query.replace(' ', '%20')}"
                    if page > 0:
                        url += f"&page={page + 1}"
                else:
                    url = f"{base_url}"
                    if page > 0:
                        url += f"?page={page + 1}"
                
                print(f"Scraping page {page + 1}...")
                print(f"URL: {url}")
                
                self.driver.get(url)
                time.sleep(random.uniform(4, 6))
                
                # Debug: Save page source for inspection
                with open(f"coursera_page_{page + 1}.html", "w", encoding="utf-8") as f:
                    f.write(self.driver.page_source)
                print(f"Page source saved to coursera_page_{page + 1}.html")
                
                # Wait for page to load
                WebDriverWait(self.driver, 15).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )
                
                # Handle popups and consent forms
                self._handle_popups()
                
                # Scroll to load content
                self._scroll_page()
                
                courses_count_before = len(self.courses_data)
                self._extract_page_courses_improved()
                courses_added = len(self.courses_data) - courses_count_before
                
                print(f"Added {courses_added} courses from page {page + 1}")
                
                # Save to database after each page
                if courses_added > 0 and self.db_connection:
                    saved_count = self.save_to_database()
                    print(f"Saved {saved_count} courses to database")
                
                if page < max_pages - 1:
                    time.sleep(random.uniform(3, 5))
                    
        except Exception as e:
            print(f"Error during scraping: {e}")
            import traceback
            traceback.print_exc()
        
        finally:
            self.driver.quit()
            if self.db_connection:
                self.db_connection.close()
                print("Database connection closed")
        
        return self.courses_data

    def _handle_popups(self):
        """Try to close modal popups by clicking known close buttons.

        Each selector is given up to 3 seconds to become clickable; selectors
        that never match are skipped. This is best effort: a failure for one
        selector does not stop the others from being tried.
        """
        popup_selectors = [
            'button[aria-label="Close"]',
            'button[data-e2e="close-button"]',
            '.cds-modal-close',
            '.rc-ModalCloseButton',
            'button[class*="close"]',
            '.phoenix-close',
            '[data-testid="modal-close-button"]'
        ]

        for selector in popup_selectors:
            try:
                close_btn = WebDriverWait(self.driver, 3).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
                )
                close_btn.click()
                print(f"Closed popup with selector: {selector}")
                time.sleep(1)
            except Exception:
                # Timeout / click failure just means "no such popup".
                # Catching Exception (not a bare except) keeps Ctrl-C and
                # SystemExit working during the multi-second waits.
                continue

    def _scroll_page(self):
        """Scroll page to load all content"""
        try:
            last_height = self.driver.execute_script("return document.body.scrollHeight")
            for _ in range(3):
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)
                new_height = self.driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height
        except:
            pass

    def _extract_page_courses_improved(self):
        """Find course cards on the current page and append their data.

        Every selector in ``course_selectors`` is queried and the matches are
        merged. Because one DOM element can match several selectors, the
        merged list is de-duplicated before extraction. Each successfully
        parsed card is appended to ``self.courses_data``.

        Errors are printed with a traceback and do not propagate.
        """
        try:
            # More comprehensive course card selectors
            course_selectors = [
                'li[class*="ais-InfiniteHits-item"]',
                '[data-testid*="search-result"]',
                '[class*="cds-ProductCard"]',
                '.rc-SearchHit',
                '[class*="course-card"]',
                '.bt3-col-xs-12',  # Bootstrap-based selectors
                '.card-info',
                'div[data-e2e*="course-card"]'
            ]

            all_cards = []
            for selector in course_selectors:
                try:
                    cards = self.driver.find_elements(By.CSS_SELECTOR, selector)
                    if cards:
                        print(f"Found {len(cards)} elements with selector: {selector}")
                        all_cards.extend(cards)
                except Exception:
                    # A selector that fails is skipped; the others still run.
                    continue

            # Remove duplicates by the WebDriver element reference.
            # Each find_elements() call returns fresh Python wrapper objects,
            # so the builtin id() of a wrapper never repeats across selectors
            # and cannot detect that two wrappers point at the same element.
            # WebElement.id is the driver-side reference and does repeat.
            # NOTE(review): nested matches (a card inside a matched <li>) are
            # different elements and are still both kept.
            unique_cards = []
            seen_elements = set()
            for card in all_cards:
                element_ref = getattr(card, "id", None)
                key = element_ref if element_ref is not None else id(card)
                if key not in seen_elements:
                    seen_elements.add(key)
                    unique_cards.append(card)

            print(f"Total unique course cards found: {len(unique_cards)}")

            for i, card in enumerate(unique_cards):
                # Short random pause before every second card.
                if i > 0 and i % 2 == 0:
                    time.sleep(random.uniform(1, 2))

                course_info = self._extract_course_info_improved(card)
                if course_info:
                    self.courses_data.append(course_info)
                    print(f"✓ Extracted: {course_info['title'][:50]}...")

        except Exception as e:
            print(f"Error extracting courses: {e}")
            import traceback
            traceback.print_exc()

    def _extract_course_info_improved(self, card):
        try:
            # Get the entire card text for debugging
            card_text = card.text
            if not card_text or len(card_text.strip()) < 10:
                return None
            
            print(f"Card text preview: {card_text[:100]}...")
            
            # Extract title - try multiple selectors
            title = None
            title_selectors = [
                'h2',
                'h3',
                '[class*="title"]',
                '[data-testid*="title"]',
                '.card-title',
                '.course-name',
                'a[href*="/learn/"]',
                'a[href*="/specializations/"]'
            ]
            
            for selector in title_selectors:
                title = self._safe_extract(card, selector, 'text')
                if title and len(title.strip()) > 5:
                    title = title.strip()
                    break
            
            if not title:
                # Try to extract title from card text
                lines = [line.strip() for line in card_text.split('\n') if line.strip()]
                if lines:
                    title = lines[0]  # First line is often the title
            
            if not title:
                return None
            
            # Extract provider
            provider = self._extract_provider(card, card_text)
            
            # Extract rating
            rating = self._extract_rating(card, card_text)
            
            # Extract rating count
            rating_count = self._extract_rating_count(card, card_text)
            
            # Extract other information
            duration = self._extract_duration(card_text)
            difficulty = self._extract_difficulty(card_text)
            course_type = self._extract_course_type(card_text)
            price = self._extract_price_info(card_text)
            description = self._extract_description(card, card_text)
            skills = self._extract_skills(card_text)
            
            # Extract link
            link = self._safe_extract(card, 'a', 'href')
            if link and not link.startswith('http'):
                link = 'https://www.coursera.org' + link
            
            return {
                'title': title,
                'provider': provider or "Provider not specified",
                'instructors': "Instructors not specified",  # Hard to extract from list view
                'rating': rating,
                'rating_count': rating_count,
                'duration': duration or "Duration not specified",
                'difficulty_level': difficulty or "Not specified",
                'course_type': course_type or "Course",
                'price': price or "Price not specified",
                'description': description or "Description not available",
                'skills_covered': skills or "Skills not specified",
                'link': link or "Link not available",
                'language': "English",  # Default assumption
                'scraped_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }
                
        except Exception as e:
            print(f"Error extracting course info: {e}")
            return None

    def _extract_provider(self, card, card_text):
        provider_selectors = [
            '[data-testid*="partner"]',
            '[class*="partner-name"]',
            '[class*="provider"]',
            '.partner-name',
            '.institution'
        ]
        
        for selector in provider_selectors:
            provider = self._safe_extract(card, selector, 'text')
            if provider:
                return provider.strip()
        
        # Extract from text patterns
        patterns = [
            r'Offered by\s+(.+)',
            r'from\s+(.+)',
        ]
        
        for pattern in patterns:
            match = re.search(pattern, card_text, re.IGNORECASE)
            if match:
                return match.group(1).strip()
        
        return "Provider not specified"

    def _extract_rating(self, card, card_text):
        rating_selectors = [
            '[data-testid*="rating"]',
            '[class*="rating-number"]',
            '.product-ratings',
            '.ratings-text'
        ]
        
        for selector in rating_selectors:
            rating_text = self._safe_extract(card, selector, 'text')
            if rating_text:
                match = re.search(r'(\d+\.\d+)', rating_text)
                if match:
                    return float(match.group(1))
        
        # Extract from text
        match = re.search(r'(\d+\.\d+)\s*stars?', card_text, re.IGNORECASE)
        if match:
            return float(match.group(1))
        
        return 0.0

    def _extract_rating_count(self, card, card_text):
        count_selectors = [
            '[data-testid*="ratings-count"]',
            '[class*="rating-count"]',
            '[class*="enrollment-number"]'
        ]
        
        for selector in count_selectors:
            count_text = self._safe_extract(card, selector, 'text')
            if count_text:
                return self._parse_count(count_text)
        
        # Extract from text
        match = re.search(r'\(([\d,]+)\s*(ratings|reviews)\)', card_text, re.IGNORECASE)
        if match:
            return self._parse_count(match.group(1))
        
        return 0

    def _parse_count(self, count_text):
        try:
            count_text = count_text.lower().replace(',', '')
            if 'k' in count_text:
                return int(float(count_text.replace('k', '')) * 1000)
            elif 'm' in count_text:
                return int(float(count_text.replace('m', '')) * 1000000)
            else:
                numbers = re.findall(r'\d+', count_text)
                if numbers:
                    return int(numbers[0])
        except:
            pass
        return 0

    def _extract_duration(self, card_text):
        patterns = [
            r'(\d+\s*-\s*\d+\s*(months?|weeks?|hours?))',
            r'(\d+\s*(months?|weeks?|hours?))',
            r'Approx\.\s*(\d+\s*hours?)'
        ]
        
        for pattern in patterns:
            match = re.search(pattern, card_text, re.IGNORECASE)
            if match:
                return match.group(1).strip()
        
        return None

    def _extract_difficulty(self, card_text):
        levels = ['beginner', 'intermediate', 'advanced', 'mixed']
        for level in levels:
            if level in card_text.lower():
                return level.capitalize()
        return "Not specified"

    def _extract_course_type(self, card_text):
        types = {
            'specialization': 'Specialization',
            'professional certificate': 'Professional Certificate',
            'guided project': 'Guided Project',
            'degree': 'Degree',
            'mastertrack': 'MasterTrack'
        }
        
        for key, value in types.items():
            if key in card_text.lower():
                return value
        return "Course"

    def _extract_price_info(self, card_text):
        if 'free' in card_text.lower():
            return "Free"
        elif 'subscription' in card_text.lower():
            return "Subscription"
        elif 'audit' in card_text.lower():
            return "Free Audit"
        else:
            # Look for price patterns
            match = re.search(r'\$\d+\.?\d*', card_text)
            if match:
                return match.group(0)
            return "Price not specified"

    def _extract_description(self, card, card_text):
        desc_selectors = [
            '[class*="description"]',
            '[class*="snippet"]',
            '.course-description'
        ]
        
        for selector in desc_selectors:
            desc = self._safe_extract(card, selector, 'text')
            if desc and len(desc) > 20:
                return desc[:500].strip()
        
        # Use first few meaningful lines from card text
        lines = [line.strip() for line in card_text.split('\n') if line.strip()]
        if len(lines) > 1:
            # Skip title and provider lines
            description_lines = []
            for line in lines[2:6]:  # Take lines 2-5 as description
                if len(line) > 10 and not any(keyword in line.lower() for keyword in ['rating', 'review', 'enrollment', 'star']):
                    description_lines.append(line)
            if description_lines:
                return ' '.join(description_lines)[:400]
        
        return "Description not available"

    def _extract_skills(self, card_text):
        # Look for skills mentioned in the text
        skills_keywords = ['python', 'sql', 'machine learning', 'data analysis', 'statistics', 
                          'r programming', 'tableau', 'excel', 'data visualization']
        found_skills = []
        
        for skill in skills_keywords:
            if skill in card_text.lower():
                found_skills.append(skill)
        
        if found_skills:
            return ', '.join(found_skills)
        return "Skills not specified"

    def _safe_extract(self, parent, selector, attribute='text'):
        try:
            element = parent.find_element(By.CSS_SELECTOR, selector)
            if attribute == 'text':
                text = element.text.strip()
                return text if text else None
            else:
                attr = element.get_attribute(attribute)
                return attr if attr else None
        except:
            return None

    def save_to_database(self):
        if not self.courses_data or not self.db_connection:
            print("No data to save or database not connected")
            return 0
        
        try:
            cursor = self.db_connection.cursor()
            saved_count = 0
            
            insert_sql = """
            INSERT OR IGNORE INTO courses (title, provider, instructors, rating, rating_count, duration, 
                                         difficulty_level, course_type, price, description, skills_covered, 
                                         link, language, scraped_date)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """
            
            for course in self.courses_data:
                try:
                    cursor.execute(insert_sql, (
                        course['title'],
                        course['provider'],
                        course['instructors'],
                        course['rating'],
                        course['rating_count'],
                        course['duration'],
                        course['difficulty_level'],
                        course['course_type'],
                        course['price'],
                        course['description'],
                        course['skills_covered'],
                        course['link'],
                        course['language'],
                        course['scraped_date']
                    ))
                    if cursor.rowcount > 0:
                        saved_count += 1
                except Exception as e:
                    print(f"Error inserting course: {e}")
                    continue
            
            self.db_connection.commit()
            cursor.close()
            return saved_count
            
        except Exception as e:
            print(f"Error saving to database: {e}")
            return 0

    def debug_page_structure(self):
        """Debug method to understand page structure"""
        print("\n=== PAGE STRUCTURE DEBUG ===")
        print(f"Current URL: {self.driver.current_url}")
        print(f"Page title: {self.driver.title}")
        
        # Find all containers that might hold courses
        container_selectors = ['main', '#main', '.results', '.search-results', '.ais-InfiniteHits', '.cds-grid']
        
        for selector in container_selectors:
            try:
                containers = self.driver.find_elements(By.CSS_SELECTOR, selector)
                print(f"Found {len(containers)} containers with selector: {selector}")
                for i, container in enumerate(containers):
                    print(f"Container {i+1} text preview: {container.text[:200]}...")
            except:
                continue

def main():
    """Scrape Coursera "data science" results into SQLite and a CSV backup.

    Starts a visible (non-headless) browser, attaches the SQLite database,
    scrapes two result pages, prints a short sample and writes the collected
    records to a CSV file.

    The CSV is written here with pandas because the scraper class has no
    ``save_to_csv`` method; calling one raised AttributeError whenever
    courses were found. The browser is also shut down when database setup
    fails, and the CSV path is only reported when the file was written.
    """
    db_path = "coursera_courses.sqlite"
    csv_path = "coursera_courses_backup.csv"

    # Initialize scraper with headless=False for debugging
    scraper = ImprovedCourseraScraper(headless=False)

    # Setup SQLite database
    if not scraper.setup_sqlite_database(db_path):
        print("Failed to setup database. Exiting.")
        # The browser was already started by the constructor; without this
        # it would be left running.
        scraper.driver.quit()
        return

    search_query = "data science"
    pages = 2

    print(f"\nScraping Coursera courses for: '{search_query}'")

    # Scrape and save to database
    start_time = time.time()
    courses = scraper.scrape_courses(
        search_query=search_query,
        max_pages=pages
    )
    end_time = time.time()

    csv_written = False

    # Display results
    if courses:
        print(f"\nSuccessfully processed {len(courses)} courses in {end_time - start_time:.1f} seconds")

        # Display sample courses
        print("\nSample courses found:")
        for i, course in enumerate(courses[:5]):
            print(f"{i+1}. {course['title']}")
            print(f"   Provider: {course['provider']}")
            print(f"   Rating: {course['rating']} ({course['rating_count']} reviews)")
            print(f"   Type: {course['course_type']}")
            print(f"   Price: {course['price']}")
            print()

        # Save to CSV (one row per course dict, dict keys as the header)
        try:
            pd.DataFrame(courses).to_csv(csv_path, index=False, encoding="utf-8")
            csv_written = True
        except OSError as e:
            print(f"Error saving CSV backup: {e}")

    else:
        print("No courses were scraped")
        print("Check the saved HTML files for debugging")

    print(f"\nDatabase file: {db_path}")
    if csv_written:
        print(f"CSV backup: {csv_path}")

# Run the scraper only when this file is executed as a script (or as a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Courses table created/verified
SQLite database connected: coursera_courses.sqlite

Scraping Coursera courses for: 'data science'
Scraping page 1...
URL: https://www.coursera.org/search?query=data%20science
Page source saved to coursera_page_1.html
Total unique course cards found: 0
Added 0 courses from page 1
Scraping page 2...
URL: https://www.coursera.org/search?query=data%20science&page=2
Error during scraping: Message: invalid session id; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#invalidsessionidexception
Stacktrace:
	GetHandleVerifier [0x0xcdfe43+66515]
	GetHandleVerifier [0x0xcdfe84+66580]
	(No symbol) [0x0xacda6b]
	(No symbol) [0x0xb0b5ab]
	(No symbol) [0x0xb3b086]
	(No symbol) [0x0xb3667c]
	(No symbol) [0x0xb35bf3]
	(No symbol) [0x0xa9e72d]
	(No symbol) [0x0xa9ecae]
	(No symbol) [0x0xa9f14d]
	GetHandleVerifier [0x0xf37353+2521315]
	GetHandleVerifier [0x0xf322d3+2500707]
	GetHandleVerifier [0x0xd07c94+2

Traceback (most recent call last):
  File "C:\Users\lesha\AppData\Local\Temp\ipykernel_15204\4058517320.py", line 97, in scrape_courses
    self.driver.get(url)
  File "C:\Users\lesha\anaconda3\Lib\site-packages\selenium\webdriver\remote\webdriver.py", line 483, in get
    self.execute(Command.GET, {"url": url})
  File "C:\Users\lesha\anaconda3\Lib\site-packages\selenium\webdriver\remote\webdriver.py", line 458, in execute
    self.error_handler.check_response(response)
  File "C:\Users\lesha\anaconda3\Lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 232, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.InvalidSessionIdException: Message: invalid session id; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#invalidsessionidexception
Stacktrace:
	GetHandleVerifier [0x0xcdfe43+66515]
	GetHandleVerifier [0x0xcdfe84+66580]
	(No symbol) [0x0xacda6b]
	(No 

Database connection closed
No courses were scraped
Check the saved HTML files for debugging

Database file: coursera_courses.sqlite
CSV backup: coursera_courses_backup.csv
