In [None]:
import time  # For handling time-related functions
import re  # For regular expressions
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation and analysis
from bs4 import BeautifulSoup  # For web scraping
from selenium import webdriver  # For browser automation
from selenium.webdriver.chrome.service import Service  # For configuring the ChromeDriver service

In [None]:
def get_driver(driver_path='D:/chromedriver-win64/chromedriver.exe'):
    """
    Creates and configures a Chrome WebDriver instance for web scraping.

    Args:
        driver_path (str, optional): Path to the ChromeDriver executable.
            Defaults to the location this notebook was originally written
            against; pass a different path when running on another machine.

    Returns:
        WebDriver: Configured instance of the Chrome WebDriver. The caller is
        responsible for calling ``quit()`` on it when done.
    """
    # Point Selenium at the ChromeDriver executable
    service = Service(driver_path)

    # Browser flags: ignore certificate errors and start maximized
    options = webdriver.ChromeOptions()
    for flag in ('--ignore-certificate-errors', '--start-maximized'):
        options.add_argument(flag)

    # Launch Chrome with the configured service and options
    return webdriver.Chrome(service=service, options=options)


In [None]:
def get_page_source(url, delay=10):
    """
    Retrieves the page source of a specified URL using Selenium and BeautifulSoup.

    A fresh browser is started for the request and is always shut down again,
    even when loading or parsing the page fails.

    NOTE: a later cell in this notebook defines another ``get_page_source``
    that takes ``(driver, url, delay)``. Once that cell has run, it replaces
    this definition in the kernel.

    Args:
        url (str): The URL to scrape.
        delay (int, optional): Delay in seconds to wait for the page to load. Default is 10 seconds.

    Returns:
        BeautifulSoup: The BeautifulSoup object representing the page source.
    """
    # Get a Chrome WebDriver instance
    driver = get_driver()

    try:
        # Open the specified URL in the browser
        driver.get(url)

        # Allow time for the page to load (adjust delay as needed)
        time.sleep(delay)

        # Parse the rendered HTML
        page_source = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Release the browser even if navigation or parsing raised
        driver.quit()

    # Return the parsed page source
    return page_source

In [None]:
def get_titles(page_source, first_page=False):
    """
    Collects the text of every problem-title anchor on a listing page.

    Args:
        page_source (BeautifulSoup): Parsed listing page.
        first_page (bool, optional): When truthy, the first matched anchor is
            dropped from the result. Default is False.

    Returns:
        list: Title texts in the order the anchors were matched.
    """
    # The two class strings the title anchors are matched against
    wanted_classes = [
        'h-5 hover:text-blue-s dark:hover:text-dark-blue-s',
        'h-5 hover:text-blue-s dark:hover:text-dark-blue-s opacity-60'
    ]
    anchors = page_source.find_all('a', class_=wanted_classes)

    # On the first page the leading match is skipped
    if first_page:
        anchors = anchors[1:]

    titles = []
    for anchor in anchors:
        titles.append(anchor.text)
    return titles

In [None]:
def get_problems_URL(page_source, first_page=False):
    """
    Collects the ``href`` of every problem anchor on a listing page.

    Args:
        page_source (BeautifulSoup): Parsed listing page.
        first_page (bool, optional): When truthy, the first matched anchor is
            dropped from the result. Default is False.

    Returns:
        list: ``href`` values in the order the anchors were matched.
    """
    link_classes = [
        'h-5 hover:text-blue-s dark:hover:text-dark-blue-s',
        'h-5 hover:text-blue-s dark:hover:text-dark-blue-s opacity-60'
    ]

    # Only anchors that actually carry an href are requested
    matches = page_source.find_all('a', href=True, class_=link_classes)

    # Skip one leading match on the first page, none otherwise
    offset = int(bool(first_page))
    return [anchor['href'] for anchor in matches[offset:]]

In [None]:
def get_acceptances_difficulties(page_source, first_page=False):
    """
    Splits the span texts of the listing cells into acceptances and difficulties.

    Every matched ``div`` contributes the stripped text of its first ``span``
    (cells without a span, or with blank text, are ignored). Texts ending in
    ``%`` are treated as acceptances; everything else as difficulties.

    Args:
        page_source (BeautifulSoup): Parsed listing page.
        first_page (bool, optional): When truthy, the first entry of each
            result list is dropped. Default is False.

    Returns:
        tuple: Two lists - acceptances and difficulties.
    """
    cells = page_source.find_all('div', class_='mx-2 flex items-center py-[11px]')

    acceptances = []
    difficulties = []
    for cell in cells:
        span = cell.find('span')
        if not span:
            continue

        label = span.text.strip()
        if not label:
            continue

        # A trailing percent sign marks an acceptance rate
        if label.endswith('%'):
            acceptances.append(label)
        else:
            difficulties.append(label)

    # Drop the leading entry of both lists on the first page
    if first_page:
        return acceptances[1:], difficulties[1:]
    return acceptances, difficulties

In [None]:
def get_single_page_df(url, first_page=False):
    """
    Scrapes one listing page and returns its problems as a DataFrame.

    NOTE: this calls ``get_page_source(url)``, i.e. the one-argument-plus-delay
    variant. A later cell redefines ``get_page_source`` with a leading
    ``driver`` parameter, so this function must be used before that cell runs.

    Args:
        url (str): The URL of the page to scrape.
        first_page (bool, optional): Forwarded to the extractor functions.
            Default is False.

    Returns:
        DataFrame: Columns ``title``, ``problem_URL``, ``acceptance`` and
        ``difficulty``, one row per extracted problem.
    """
    soup = get_page_source(url)

    # Pull each column out of the same parsed page
    title_column = get_titles(soup, first_page)
    url_column = get_problems_URL(soup, first_page)
    acceptance_column, difficulty_column = get_acceptances_difficulties(soup, first_page)

    return pd.DataFrame({
        'title': title_column,
        'problem_URL': url_column,
        'acceptance': acceptance_column,
        'difficulty': difficulty_column
    })

In [None]:
def get_multiple_page_df(start=1, end=60):
    """
    Scrapes listing pages ``start`` through ``end`` (inclusive) into one DataFrame.

    Args:
        start (int, optional): The starting page. Default is 1.
        end (int, optional): The ending page. Default is 60.

    Returns:
        DataFrame: The per-page frames from get_single_page_df, concatenated
        with a fresh 0..n-1 index.
    """
    base_url = 'https://leetcode.com/problemset/all/?page='
    frames = []

    for offset, page_number in enumerate(range(start, end + 1)):
        # Only the very first request is flagged, and only when it is page 1
        is_first_page = start == 1 and offset == 0
        frames.append(get_single_page_df(base_url + str(page_number), is_first_page))

    return pd.concat(frames, ignore_index=True)

In [None]:
def scrape(start=1, end=60, file_name='part1.csv'):
    """
    Scrapes a range of listing pages and writes the result to a CSV file.

    NOTE: a later cell in this notebook defines another ``scrape`` that takes a
    DataFrame as its first argument; once that cell has run, it replaces this
    definition in the kernel.

    Args:
        start (int, optional): The starting page for scraping. Default is 1.
        end (int, optional): The ending page for scraping. Default is 60.
        file_name (str, optional): The name of the CSV file to save the scraped data. Default is 'part1.csv'.
    """
    # Scrape the pages, then write the frame without its row index
    get_multiple_page_df(start, end).to_csv(path_or_buf=file_name, index=False)

`Running Web-Scraping Process`:

In [None]:
# Scrape listing pages 1-60 into 'x1.csv'. This relies on the
# scrape(start, end, file_name) defined above; a later cell redefines scrape
# with a different signature, so run the notebook top to bottom.
scrape(start=1, end=60, file_name='x1.csv')

In [None]:
# Load the problem-list table from 'x1.csv'
df1 = pd.read_csv('x1.csv')

In [None]:
# Turn the scraped hrefs into absolute URLs. The host is only prepended when
# it is not already there, so re-running this cell does not stack the prefix.
df1['problem_URL'] = df1['problem_URL'].apply(
    lambda href: href if str(href).startswith('https://leetcode.com') else f'https://leetcode.com{href}'
)

In [None]:
# Derive the solution-page URL by appending '/solution' to each problem URL
df1['solution_URL'] = df1['problem_URL'].map(lambda problem_url: f'{problem_url}/solution')

In [None]:
def get_title(page_source):
    """
    Reads the problem title from a parsed problem page.

    Args:
        page_source (BeautifulSoup): The BeautifulSoup object representing the page source.

    Returns:
        str: Text of the first anchor carrying the title class string.
    """
    title_class = 'mr-2 text-label-1 dark:text-dark-label-1 hover:text-label-1 dark:hover:text-dark-label-1 text-lg font-medium'

    # First matching anchor; its text is the title
    return page_source.find('a', class_=title_class).text

In [None]:
def get_problem_description(page_source):
    """
    Reads the problem statement text from a parsed problem page.

    Args:
        page_source (BeautifulSoup): The BeautifulSoup object representing the page source.

    Returns:
        str: Text content of the first ``div`` with class ``xFUwe``.
    """
    # The description lives in the first div with this class
    container = page_source.find('div', class_='xFUwe')
    return container.text

In [None]:
def get_topic_tags(page_source):
    """
    Collects the topic tags shown on a parsed problem page.

    Args:
        page_source (BeautifulSoup): The BeautifulSoup object representing the page source.

    Returns:
        str: The tag texts, each wrapped in single quotes and joined with
        ``', '`` (an empty string when no tag anchors are found).
    """
    tag_class = 'mr-4 rounded-xl px-2 py-1 text-xs transition-colors text-label-2 dark:text-dark-label-2 hover:text-label-2 dark:hover:text-dark-label-2 bg-fill-3 dark:bg-dark-fill-3 hover:bg-fill-2 dark:hover:bg-dark-fill-2'
    tag_links = page_source.find_all('a', class_=tag_class)

    # Quote every tag, then join them into one comma-separated string
    quoted_tags = ["'{}'".format(link.text) for link in tag_links]
    return ', '.join(quoted_tags)

In [None]:
def get_accepted(page_source):
    """
    Returns the element holding the accepted-submissions figure.

    NOTE(review): the Tag itself is returned, not its text - confirm that
    callers storing the value in a DataFrame get what they expect.

    Args:
        page_source (BeautifulSoup): The BeautifulSoup object representing the page source.

    Returns:
        Tag: The first ``div`` matching the statistics class string.
    """
    # The first of the matching divs is taken as the accepted count
    return page_source.find_all('div', class_='text-label-1 dark:text-dark-label-1 text-sm font-medium')[0]

In [None]:
def get_submission(page_source):
    """
    Returns the element holding the total-submissions figure.

    NOTE(review): the Tag itself is returned, not its text - confirm that
    callers storing the value in a DataFrame get what they expect.

    Args:
        page_source (BeautifulSoup): The BeautifulSoup object representing the page source.

    Returns:
        Tag: The second ``div`` matching the statistics class string.
    """
    # The second of the matching divs is taken as the submission count
    return page_source.find_all('div', class_='text-label-1 dark:text-dark-label-1 text-sm font-medium')[1]

In [None]:
def get_solution(page_source):
    """
    Extracts the parenthesised value shown on the page's solutions link.

    Args:
        page_source (BeautifulSoup): The BeautifulSoup object representing the page source.

    Returns:
        str: The text inside the first pair of parentheses in the text of the
        first element whose ``href`` contains ``solutions``.
    """
    # First element linking to something containing 'solutions'
    solutions_link = page_source.find(href=re.compile("solutions"))

    # Everything found between parentheses, in order of appearance
    parenthesised = re.findall(r"\((.*?)\)", solutions_link.text)
    return parenthesised[0]

In [None]:
def get_discussion_count(page_source):
    """
    Extracts the parenthesised discussion count from a parsed problem page.

    Args:
        page_source (BeautifulSoup): The BeautifulSoup object representing the page source.

    Returns:
        str: The text inside the first pair of parentheses in the first
        matching ``div``.
    """
    discussion_divs = page_source.find_all('div', class_='flex-1 text-sm leading-[22px]')
    heading_text = discussion_divs[0].text

    # The count is the first parenthesised group in the heading text
    return re.findall(r"\((.*?)\)", heading_text)[0]

In [None]:
def get_likes(page_source):
    """
    Reads the like counter from a parsed problem page.

    Args:
        page_source (BeautifulSoup): The BeautifulSoup object representing the page source.

    Returns:
        str: Text of the element that follows the first matching ``div``.
    """
    markers = page_source.find_all('div', class_='text-lg text-gray-6 dark:text-dark-gray-6')

    # The figure is in the sibling right after the first marker
    counter = markers[0].find_next_sibling()
    return counter.text

In [None]:
def get_dislikes(page_source):
    """
    Reads the dislike counter from a parsed problem page.

    Args:
        page_source (BeautifulSoup): The BeautifulSoup object representing the page source.

    Returns:
        str: Text of the element that follows the second matching ``div``.
    """
    markers = page_source.find_all('div', class_='text-lg text-gray-6 dark:text-dark-gray-6')

    # The figure is in the sibling right after the second marker
    counter = markers[1].find_next_sibling()
    return counter.text

In [None]:
def get_similar_questions(page_source):
    """
    Collects the similar-question titles shown on a parsed problem page.

    Args:
        page_source (BeautifulSoup): The BeautifulSoup object representing the page source.

    Returns:
        str: The question texts, each wrapped in single quotes and joined
        with ``', '`` (an empty string when no anchors are found).
    """
    question_class = 'text-sm font-medium transition-none text-label-1 dark:text-dark-label-1 hover:text-blue-s dark:hover:text-dark-blue-s'
    question_links = page_source.find_all('a', class_=question_class)

    # Quote every title, then join them into one comma-separated string
    quoted_titles = ["'{}'".format(link.text) for link in question_links]
    return ', '.join(quoted_titles)

In [None]:
def get_page_source(driver, url, delay=10):
    """
    Loads a URL in an existing browser session and returns the parsed page.

    NOTE: this redefines the earlier ``get_page_source(url, delay)`` from this
    notebook; after this cell runs, only this signature is available. The
    driver is left open for the caller to reuse and close.

    Args:
        driver (webdriver): The Selenium WebDriver object.
        url (str): The URL to scrape.
        delay (int, optional): Delay in seconds to wait for the page to load. Default is 10 seconds.

    Returns:
        BeautifulSoup: The BeautifulSoup object representing the page source.
    """
    # Navigate, then give the page a fixed amount of time to render
    driver.get(url)
    time.sleep(delay)

    # Parse whatever HTML the browser currently holds
    rendered_html = driver.page_source
    return BeautifulSoup(rendered_html, 'html.parser')

In [None]:
def get_is_premium(page_source):
    """
    Reports whether a parsed problem page shows the premium marker element.

    Args:
        page_source (BeautifulSoup): The BeautifulSoup object representing the page source.

    Returns:
        str: The string 'True' if the marker ``div`` is found, the string
        'False' otherwise (strings, not booleans).
    """
    marker = page_source.find('div', class_='text-md mb-6 text-center text-label-2 dark:text-dark-label-2')

    # Presence of the marker div is the only signal used
    if marker:
        return 'True'
    return 'False'

In [None]:
def scrape(df1, start=1, end=3000, file_name='x2.csv'):
    """
    Scrapes data for a range of links from the provided DataFrame and saves it to a CSV file.

    One browser session is reused for all links and is always closed at the
    end, even if a page fails. Premium problems are skipped: no row is written
    for them. If nothing was scraped, a header-only CSV is written.

    NOTE: this redefines the earlier ``scrape(start, end, file_name)`` from
    this notebook; after this cell runs, only this signature is available.

    Args:
        df1 (DataFrame): The DataFrame containing problem URLs.
        start (int, optional): The starting index for scraping. Default is 1.
        end (int, optional): The ending index for scraping. Default is 3000.
        file_name (str, optional): The name of the CSV file to save the scraped data. Default is 'x2.csv'.
    """
    # Extract links for the specified range from the DataFrame
    links = df1['problem_URL'][start - 1:end]

    # Column layout of the output, used when no row could be scraped
    columns = [
        'is_premium', 'title', 'problem_description', 'topic_tags', 'accepted',
        'submission', 'solution', 'discussion_count', 'likes', 'dislikes',
        'similar_questions'
    ]

    # One single-row DataFrame per successfully scraped link
    dfs = []

    # Reuse the notebook's shared driver factory instead of duplicating its setup
    driver = get_driver()

    try:
        for link in links:
            # Get the page source for the current link
            page_source = get_page_source(driver, link, delay=10)

            # Create a dictionary to store scraped data
            data = {'is_premium': get_is_premium(page_source)}

            # Only non-premium problems expose the remaining fields
            if data['is_premium'] == 'False':
                data.update({
                    'title': get_title(page_source),
                    'problem_description': get_problem_description(page_source),
                    'topic_tags': get_topic_tags(page_source),
                    'accepted': get_accepted(page_source),
                    'submission': get_submission(page_source),
                    'solution': get_solution(page_source),
                    'discussion_count': get_discussion_count(page_source),
                    'likes': get_likes(page_source),
                    'dislikes': get_dislikes(page_source),
                    'similar_questions': get_similar_questions(page_source)
                })

                # NOTE(review): get_accepted/get_submission return Tag objects
                # rather than text. The per-link one-row DataFrame is kept so
                # the values written to the CSV do not change - confirm before
                # switching to a list of records.
                dfs.append(pd.DataFrame(data, index=[0]))
    finally:
        # Always release the browser, including when a page raised
        driver.quit()

    # pd.concat rejects an empty list, so fall back to an empty frame with headers
    if dfs:
        df = pd.concat(dfs, ignore_index=True)
    else:
        df = pd.DataFrame(columns=columns)

    # Save the final DataFrame to a CSV file
    df.to_csv(path_or_buf=file_name, index=None)

In [None]:
# Visit the problem pages for rows 1-3000 of df1 and write the scraped details
# to 'x2.csv'. This uses the scrape(df1, ...) defined in the cell above.
scrape(df1, start=1, end=3000, file_name='x2.csv')

In [None]:
# Load the per-problem details from 'x2.csv'
df2 = pd.read_csv('x2.csv')

In [None]:
# Left-join the per-problem details (df2) onto the problem list (df1) by title,
# keeping every row of df1
df = pd.merge(df1, df2, how='left', on='title')

In [None]:
# Write the merged table to 'leetcode_scraped_data.csv'
df.to_csv('leetcode_scraped_data.csv', index=None)