In [7]:
# Imports
import pandas as pd
import re
import os
from pylatexenc.latexwalker import LatexWalker, LatexEnvironmentNode, LatexMacroNode, LatexCharsNode, LatexGroupNode, LatexMathNode

## Helper Functions

In [8]:
def parse_title(title_str):
    '''
    Separates title into text portions and LaTeX portions.

    Both inline math ("$...$") and display math ("$$...$$") are recognised. Every
    math span is replaced by one "<LATEX>" placeholder in the text, and the contents
    of the spans (without their dollar delimiters) are joined with single spaces.

    Parameters:
    title_str (str): The raw title. A non-string value (e.g. the NaN that pandas
        produces for a missing title) is treated as an empty title.

    Returns:
    text_content (str): The text portion of the title, with leading/trailing whitespace removed.
    latex_content (str): The LaTeX portion of the title.
    '''
    # Missing titles arrive as float NaN from pandas; re would raise TypeError on them.
    if not isinstance(title_str, str):
        return '', ''

    # "$$...$$" has to be tried before "$...$": with the single-dollar pattern alone,
    # "$$x$$" is read as two empty inline spans and the math itself stays in the text.
    math_pattern = re.compile(r'\$\$(.+?)\$\$|\$(.*?)\$')

    latex_parts = []

    def _swap_for_placeholder(match):
        # Exactly one of the two groups participates in any given match.
        display_math, inline_math = match.groups()
        latex_parts.append(inline_math if display_math is None else display_math)
        return '<LATEX>'

    text_content = math_pattern.sub(_swap_for_placeholder, title_str).strip()
    latex_content = ' '.join(latex_parts)
    return text_content, latex_content

In [9]:
def parse_body(body_str):
    '''
    Separates body into text portions and LaTeX portions. Takes into account more
    sophisticated LaTeX than just "$" delimiters: "$$...$$" display math and
    "\\begin{env}...\\end{env}" environments are recognised as well, and all three
    forms may span several lines.

    Parameters:
    body_str (str): The body text of a question. A non-string value (e.g. the NaN
        that pandas produces for a missing body) is treated as an empty body.

    Returns:
    text_content (str): The body with every LaTeX span replaced by "<LATEX>".
    latex_content (str): The LaTeX spans joined by single spaces, with all "$" removed.

    Known limitation: an environment nested inside another environment of the
    *same* name still ends at the first matching "\\end{...}".
    '''
    # Missing bodies arrive as float NaN from pandas; re would raise TypeError on them.
    if not isinstance(body_str, str):
        return '', ''

    # Define regex patterns
    double_dollar_pattern = r'\$\$.*?\$\$'
    # The backreference ties each \begin{env} to the \end{env} of the same environment
    # name, so the \end of an inner environment no longer closes the outer one.
    begin_end_pattern = r'\\begin\{(?P<env>[^{}]*)\}.*?\\end\{(?P=env)\}'
    dollar_pattern = r'\$.*?\$'

    # Alternation order matters: "$$" must be tried before "$" so display math is
    # not split into empty inline spans. DOTALL lets a span cross newlines.
    combined_pattern = re.compile(
        '|'.join((double_dollar_pattern, begin_end_pattern, dollar_pattern)),
        re.DOTALL,
    )

    # Collect whole matches; finditer/group(0) is used because the pattern now
    # contains a capturing group, which would change what findall returns.
    matches = [match.group(0) for match in combined_pattern.finditer(body_str)]

    # Remove LaTeX from body
    body_text = combined_pattern.sub('<LATEX>', body_str)

    # Combine LaTeX matches into a single string and remove dollar signs
    # (removing every "$" also removes every "$$", so one pass is enough)
    body_latex = ' '.join(matches).replace('$', '')

    return body_text, body_latex

In [10]:
def remove_newlines(latex_str):
    '''
    Replaces newlines in a LaTeX string with spaces.

    Each newline that is not directly preceded by a backslash becomes a single
    space; a newline that immediately follows a backslash is left untouched.
    Leading and trailing whitespace is trimmed from the result.

    Parameters:
    latex_str (str): A string containing LaTeX.

    Returns:
    cleaned_str (str): The LaTeX with those newlines replaced and the ends trimmed.
    '''
    # Negative lookbehind: only match a newline whose previous character is not "\"
    unescaped_newline = re.compile(r'(?<!\\)\n')
    spaced_out = unescaped_newline.sub(' ', latex_str)
    cleaned_str = spaced_out.strip()
    return cleaned_str

## Main Code

The purpose of this notebook is to separate the plain text from the LaTeX in each question's title and body, so that later stages can treat the two separately (e.g. when training word embeddings or performing vectorization).

In [11]:
# Read in all data and combine into a single dataframe
# Files are visited in sorted order so the combined row order is the same on every
# run (os.listdir makes no ordering guarantee). Hidden entries and sub-directories
# (.DS_Store, .ipynb_checkpoints, ...) are skipped because they are not data files.
all_files = sorted(
    file_name for file_name in os.listdir('raw_data')
    if not file_name.startswith('.')
    and os.path.isfile(os.path.join('raw_data', file_name))
)
# Read every file first and concatenate once; calling pd.concat inside the loop
# re-copies all previously loaded rows on each iteration.
data_frames = [pd.read_csv(os.path.join('raw_data', file_name)) for file_name in all_files]
# pd.concat raises on an empty list, so fall back to an empty frame for an empty folder
raw_data_df = pd.concat(data_frames, ignore_index=True) if data_frames else pd.DataFrame()

In [12]:
# Split each title and each body into a (text, LaTeX) pair, then spread the pairs
# over two named columns that share the raw dataframe's index
title_separated_df = (
    pd.DataFrame(
        raw_data_df['title'].apply(parse_title).tolist(),
        index=raw_data_df.index,
    )
    .rename(columns={0: 'title_text', 1: 'title_latex'})
)
body_separated_df = (
    pd.DataFrame(
        raw_data_df['body'].apply(parse_body).tolist(),
        index=raw_data_df.index,
    )
    .rename(columns={0: 'body_text', 1: 'body_latex'})
)

In [13]:
# Attach the separated title and body columns to the raw data (aligned on the
# index), then drop rows that are duplicated across all columns
parsed_data_df = (
    raw_data_df
    .join(title_separated_df)
    .join(body_separated_df)
    .drop_duplicates()
)

In [14]:
# Write the parsed data to disk, without the index column, for later exploratory analysis
parsed_data_df.to_csv(path_or_buf='parsed_data.csv', index=False)