In [1]:
# Imports
import requests
import pandas as pd
import numpy as np
import re
import random
import time
from bs4 import BeautifulSoup

## Helper Functions

In [2]:
def get_question_links(tag, num_pages=10):
    '''
    Gets links to questions that belong to a certain tag.

    Pages are requested in order of votes, 50 questions per page, with a
    random pause of up to two seconds after each page.

    Parameters:
    tag (str): A tag for which to get questions.
    num_pages (int): Number of pages of the tag to scrape. Defaults to 10.

    Returns:
    question_links (list[str]): A list of the `href` values of the question links
        found on the tag pages (get_question_content() prefixes them with the site URL).

    Raises:
    requests.exceptions.RequestException: If a page cannot be fetched within the timeout.
    '''
    question_links = []
    for page in range(1, num_pages + 1):
        url = f'https://math.stackexchange.com/questions/tagged/{tag}?tab=votes&page={page}&pagesize=50'
        # Without a timeout a single stalled connection would hang the whole crawl.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        for question in soup.select('.s-post-summary'):
            anchor = question.select_one('.s-link')
            # Skip summaries that carry no usable link instead of failing on None['href'].
            if anchor is None or not anchor.get('href'):
                continue
            question_links.append(anchor['href'])
        time.sleep(random.uniform(0, 2)) # Random to mimic human behavior and hopefully not get banned...

    return question_links

In [3]:
def parse_title(title_str):
    '''
    Splits a question title into its plain-text part and its LaTeX part.

    Only LaTeX written between single "$" delimiters is recognised.

    Parameters:
    title_str (str): The raw title.

    Returns:
    text_content (str): The title with every "$...$" span removed, stripped of
        leading and trailing whitespace.
    latex_content (str): The contents of every "$...$" span (delimiters dropped),
        joined by single spaces.
    '''
    # Collect what sits between each pair of dollar signs.
    formulas = [found.group(1) for found in re.finditer(r'\$(.*?)\$', title_str)]
    latex_content = ' '.join(formulas)

    # Whatever remains once those spans are cut out is the plain text.
    without_math = re.sub(r'\$.*?\$', '', title_str)
    text_content = without_math.strip()

    return text_content, latex_content

In [4]:
def parse_body(content_soup):
    '''
    Splits a question body into its plain-text part and its LaTeX part.

    LaTeX is located through the "math-container" spans of the page rather than
    through "$" delimiters, so it also covers more sophisticated LaTeX.

    Note: the math spans are extracted from content_soup, so the object passed
    in no longer contains them afterwards.

    Parameters:
    content_soup (soup): The soup object obtained from the contents of the question.

    Returns:
    text_content (str): The text of the body without the math spans, newlines replaced by spaces.
    latex_content (str): The text of all math spans joined by spaces, with every "$" removed.
    '''
    math_spans = content_soup.find_all('span', class_='math-container')

    # Detach every math span first, so the remaining tree holds prose only.
    for span in math_spans:
        span.extract()

    prose = content_soup.get_text(separator=" ", strip=True)
    formulas = [span.get_text(separator=" ", strip=True) for span in math_spans]

    text_content = prose.replace('\n', ' ')
    latex_content = ' '.join(formulas).replace('$', '')
    return text_content, latex_content

In [5]:
def remove_newlines_outside_dollar(text):
    '''
    Replaces every newline outside of LaTeX with a space.

    LaTeX means anything between "$...$" or "$$...$$" delimiters. LaTeX spans are
    kept untouched even when they run over several lines; a "$" without a closing
    partner is treated as ordinary text.

    Parameters:
    text (str): Raw text.

    Returns:
    cleaned (str): Cleaned text.
    '''
    # Split text by LaTeX parts. "$$...$$" is tried before "$...$" so display math is
    # not broken into empty "$$" matches, and DOTALL lets a LaTeX span cross newlines.
    parts = re.split(r'(\$\$.*?\$\$|\$.*?\$)', text, flags=re.DOTALL)

    cleaned_parts = []
    for index, part in enumerate(parts):
        # With one capturing group, re.split() puts the matched LaTeX spans at the odd
        # positions; this is more reliable than inspecting the first/last character.
        if index % 2 == 1:
            cleaned_parts.append(part)
        else:
            cleaned_parts.append(part.replace('\n', ' '))  # Remove newlines from non-LaTeX parts
    cleaned = ''.join(cleaned_parts)
    return cleaned

In [6]:
def get_question_content(link):
    '''
    Gets title, body, and tags for a question. Also separates all text into text portions and LaTeX portions.

    Parameters:
    link (str): Site-relative URL of the question (appended to https://math.stackexchange.com).

    Returns:
    data_dict (dict): A dictionary containing all the data for the question, with keys
        'title_raw', 'title_text', 'title_latex', 'body_raw', 'body_text', 'body_latex' and 'tags'.

    Raises:
    requests.exceptions.RequestException: If the page cannot be fetched (timeout, HTTP error status, ...).
    ValueError: If the fetched page has no question title or no question body.
    '''
    url = f'https://math.stackexchange.com{link}'
    # A timeout keeps one stalled connection from hanging the crawl; an HTTP error status
    # is reported here rather than surfacing later as a failure on a missing element.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    title_node = soup.select_one('.question-hyperlink')
    body = soup.select_one('.js-post-body')
    if title_node is None or body is None:
        raise ValueError(f'Could not find the question title or body at {url}')

    title = title_node.get_text(separator=" ", strip=True)
    title_text, title_latex = parse_title(title)

    # body_raw has to be taken before parse_body(), which removes the math spans from `body`.
    body_raw = remove_newlines_outside_dollar(body.get_text(separator=" ", strip=True))
    body_text, body_latex = parse_body(body)

    # Tags get repeated at bottom of every page. Keep the first occurrence of each tag
    # instead of cutting the list in half, which loses tags whenever the list is not
    # an exact duplicate of itself.
    unique_tags = {}
    for tag in soup.find_all('li', class_='d-inline mr4 js-post-tag-list-item'):
        tag_text = tag.get_text()
        unique_tags.setdefault(tag_text.strip(), tag_text)
    tags = list(unique_tags.values())

    data_dict = {'title_raw':title, 'title_text': title_text, 'title_latex': title_latex,
            'body_raw': body_raw, 'body_text': body_text, 'body_latex': body_latex,
            'tags':tags}
    return data_dict

## Main Code

In [7]:
# Top 30 tags on Math Stack Exchange (three per line, order preserved)
tags = [
    'real-analysis', 'calculus', 'linear-algebra',
    'probability', 'abstract-algebra', 'integration',
    'sequences-and-series', 'combinatorics', 'general-topology',
    'matrices', 'functional-analysis', 'complex-analysis',
    'geometry', 'group-theory', 'algebra-precalculus',
    'probability-theory', 'ordinary-differential-equations', 'limits',
    'analysis', 'number-theory', 'measure-theory',
    'statistics', 'multivariable-calculus', 'functions',
    'derivatives', 'differential-geometry', 'discrete-mathematics',
    'trigonometry', 'algebraic-geometry', 'elementary-set-theory',
]

In [None]:
# Collect up to 10,000 question links per tag (200 pages of 50 questions each).
# Expect roughly two hours in total, given the 1 second average time.sleep() between page requests.
all_links = []
for tag in tags:
    tag_links = get_question_links(tag, num_pages=200)
    all_links += tag_links
    # Rewrite the complete file after every tag, so links.txt always holds everything gathered so far.
    with open('links.txt', 'w') as links_file:
        links_file.writelines(f'{link}\n' for link in all_links)
    print(f'Gotten {len(tag_links)} questions for tag: {tag}')

In [8]:
# Reload the saved links from file, then drop repeated links while keeping
# the order in which they first appear.
with open('links.txt') as links_file:
    saved_links = links_file.read().splitlines()
all_links = list(dict.fromkeys(saved_links))

In [10]:
# Get question data for each link
# Position in all_links to resume from. Has been reset each time I've restarted the program,
# hence why it isn't set to 0. Kept in one place so the counter and the slice cannot drift apart.
START_INDEX = 234200
counter = START_INDEX
curr_data = []
for link in all_links[START_INDEX:]:
    next_data = get_question_content(link)
    time.sleep(random.uniform(0,1)) # Once again, used to hopefully not get banned.
    curr_data.append(next_data)
    counter += 1
    if counter % 100 == 0: # Save data incrementally due to running this process on my local machine (sometimes have to restart).
        print(f'Gotten {counter} questions.')
        question_data = pd.DataFrame(curr_data)
        question_data.to_csv(f'question_data/{counter-99}-{counter}.csv')
        curr_data = []

# Save whatever is left once the links run out; otherwise the final batch of fewer
# than 100 questions would never be written to disk.
if curr_data:
    print(f'Gotten {counter} questions.')
    question_data = pd.DataFrame(curr_data)
    question_data.to_csv(f'question_data/{counter - len(curr_data) + 1}-{counter}.csv')
    curr_data = []

Gotten 234300 questions.
Gotten 234400 questions.
Gotten 234500 questions.
Gotten 234600 questions.
Gotten 234700 questions.
Gotten 234800 questions.
Gotten 234900 questions.
Gotten 235000 questions.
Gotten 235100 questions.
Gotten 235200 questions.
Gotten 235300 questions.
Gotten 235400 questions.
Gotten 235500 questions.
Gotten 235600 questions.
Gotten 235700 questions.
