In [105]:
# Imports
import pandas as pd
import re
import os
import scipy.sparse as sp
from pylatexenc.latexwalker import LatexWalker, LatexEnvironmentNode, LatexMacroNode, LatexCharsNode, LatexGroupNode, LatexMathNode
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

## Helper Functions

In [146]:
def remove_newlines(latex_str):
    '''
    Flattens a LaTeX string onto a single line.

    Every newline character that is not immediately preceded by a backslash is
    replaced with a single space; a newline that directly follows a backslash is
    left untouched. Leading and trailing whitespace is then stripped.

    Parameters:
    latex_str (str): A string containing LaTeX.

    Returns:
    cleaned_str (str): The LaTeX with unescaped newlines replaced by spaces.
    '''
    # Negative lookbehind: skip newlines that directly follow a backslash.
    unescaped_newline = re.compile(r'(?<!\\)\n')
    flattened = unescaped_newline.sub(' ', latex_str)
    return flattened.strip()

In [147]:
def fine_tokenize_expression(expr):
    '''
    Splits an expression into its smallest word/symbol pieces.

    A run of consecutive word characters (letters, digits, underscore) forms one
    token; every other non-whitespace character is a token of its own. Whitespace
    is discarded.

    Parameters:
    expr (str): An expression to tokenize.

    Returns:
    tokens (list[str]): A list of tokens, in order of appearance.
    '''
    return [match.group(0) for match in re.finditer(r'\w+|[^\s\w]', expr)]

In [150]:
def fine_tokenize_latex(latex_str):
    '''
    Tokenizes a LaTeX string finely. This means every symbol and every word are separate.

    The string is parsed into a node tree with pylatexenc and walked depth-first:
    - macro nodes contribute their name, followed by the tokens of their arguments;
    - environment nodes contribute their name, followed by the tokens of their body;
    - group and math-mode nodes contribute only the tokens of their children;
    - character nodes are split with fine_tokenize_expression.
    Any other node type (and a missing argument) contributes nothing.

    Parameters:
    latex_str (str): A LaTeX string to tokenize.

    Returns:
    tokens (list[str]): A list of tokens.
    '''
    # Create a LatexWalker to get the tree of nodes from the LaTeX code.
    walker = LatexWalker(latex_str)
    nodes, _, __ = walker.get_latex_nodes()
    tokens = []

    # Inner function to parse each individual node
    def extract_tokens(node):

        # For macro nodes, use the macro name as a token, then recursively
        # parse the arguments (if the parser recorded any).
        if isinstance(node, LatexMacroNode):
            tokens.append(node.macroname)
            if node.nodeargd is None:
                return
            for arg in node.nodeargd.argnlist:
                extract_tokens(arg)

        # For environment nodes, use the environment name as a token, then
        # recursively parse the body.
        elif isinstance(node, LatexEnvironmentNode):
            tokens.append(node.environmentname)
            for n in node.nodelist:
                extract_tokens(n)

        # Group and math-mode nodes have no name of their own: just descend.
        # Math nodes must be handled here, otherwise everything written between
        # math delimiters would be dropped (coarse_tokenize_latex descends into
        # them as well).
        elif isinstance(node, (LatexGroupNode, LatexMathNode)):
            for child_node in node.nodelist:
                extract_tokens(child_node)

        # For char nodes, tokenize the text using the above regex.
        elif isinstance(node, LatexCharsNode):
            tokens.extend(fine_tokenize_expression(node.chars))

    for node in nodes:
        extract_tokens(node)

    return tokens

In [90]:
def coarse_tokenize_expression(expr):
    '''
    Splits an expression into coarse tokens.

    Scanning left to right, the first alternative that matches at the current
    position wins:
    1. a backslash command (backslash followed by ASCII letters);
    2. a non-empty, non-nested brace group such as {abc};
    3. a single bracket character: ( ) [ ];
    4. otherwise, the longest run of non-whitespace characters.
    Because rule 4 is greedy, a run that starts with an ordinary character keeps
    any brackets or braces that follow it (e.g. "x+y)" stays one token).

    Parameters:
    expr (str): An expression to tokenize.

    Returns:
    tokens (list[str]): A list of tokens, in order of appearance.
    '''
    matches = re.finditer(r'\\[a-zA-Z]+|{[^{}]+}|[()\[\]]|\S+', expr)
    return [match.group(0) for match in matches]

In [155]:
def latex_to_string(node, include_braces=False):
    '''
    Converts LaTeX nodes to strings for use by the coarse tokenizer.

    Rendering rules:
    - chars node: its characters, unchanged;
    - macro node: backslash + macro name, followed by each argument rendered with
      braces. If the parser recorded no argument data (nodeargd is None) the bare
      macro is returned rather than being dropped;
    - group node: the concatenated children, wrapped in { } only when
      include_braces is True;
    - math node: the concatenated children, without the math delimiters (this
      matches coarse_tokenize_latex, which also discards them);
    - environment node: \\begin{name} + children + \\end{name};
    - anything else (including a missing argument): the empty string.
    Children are always rendered with include_braces=True so nested groups keep
    their braces.

    Parameters:
    node (LatexNode): A node to extract text from.
    include_braces (bool): Whether or not to include the braces in the tokens. Defaults to False.

    Returns:
    return_str (str): A string with the LaTeX text properly parsed.
    '''
    def render_children(children):
        # Nested content always keeps its braces.
        return ''.join(latex_to_string(child, include_braces=True) for child in children)

    # If chars node, simply get the characters
    if isinstance(node, LatexCharsNode):
        return node.chars

    # Macro: name plus whatever arguments were parsed. The name is kept even
    # when there is no argument data, so the macro never vanishes from the output.
    if isinstance(node, LatexMacroNode):
        macro_name = f"\\{node.macroname}"
        if node.nodeargd is None:
            return macro_name
        return macro_name + render_children(node.nodeargd.argnlist)

    # Group: children, optionally wrapped in braces.
    if isinstance(node, LatexGroupNode):
        content = render_children(node.nodelist)
        return f"{{{content}}}" if include_braces else content

    # Math: keep the contents so math nested inside a group, argument, or
    # environment is not lost.
    if isinstance(node, LatexMathNode):
        return render_children(node.nodelist)

    # Environment: full \begin{...} ... \end{...} text.
    if isinstance(node, LatexEnvironmentNode):
        return (
            f"\\begin{{{node.environmentname}}}"
            + render_children(node.nodelist)
            + f"\\end{{{node.environmentname}}}"
        )

    # Unknown node types and None (absent argument) render as nothing.
    return ''

In [152]:
def coarse_tokenize_latex(latex_str):
    '''
    Tokenizes a LaTeX string coarsely. This means that arguments and parenthetical expressions
    are kept together as one token.

    The string is parsed into a node tree with pylatexenc. For each top-level node
    (and each node inside a math-mode node):
    - a macro becomes one token: backslash + name + its rendered arguments;
    - an environment becomes one token: \\begin{name} + body + \\end{name};
    - a brace group becomes one token, braces included;
    - plain characters are split with coarse_tokenize_expression;
    - a math-mode node is descended into (its delimiters are discarded).
    Other node types contribute nothing.

    Parameters:
    latex_str (str): A LaTeX string to tokenize.

    Returns:
    tokens (list[str]): A list of tokens.
    '''
    # Create a LatexWalker to get the tree of nodes from the LaTeX code.
    walker = LatexWalker(latex_str)
    nodes, _, __ = walker.get_latex_nodes()
    tokens = []

    # Inner function to parse each individual node
    def extract_tokens(node):

        # For macro nodes, get arguments and insert the macro name and arguments together as one token
        if isinstance(node, LatexMacroNode):
            macro_name = f"\\{node.macroname}"
            if node.nodeargd is None:
                # No parsed argument data: still emit the bare macro instead of
                # silently dropping it (the fine tokenizer keeps it too).
                tokens.append(macro_name)
                return
            args = ''.join(latex_to_string(arg, include_braces=True) for arg in node.nodeargd.argnlist)
            tokens.append(macro_name + args)

        # For environment nodes, place the entire environment as one token.
        # Direct children are rendered with the default include_braces=False,
        # so a bare group sitting directly in the body loses its outer braces.
        elif isinstance(node, LatexEnvironmentNode):
            env_name = f"\\begin{{{node.environmentname}}}" + ''.join(latex_to_string(n) for n in node.nodelist) + f"\\end{{{node.environmentname}}}"
            tokens.append(env_name)

        # For char nodes, just coarsely tokenize the expression
        elif isinstance(node, LatexCharsNode):
            tokens.extend(coarse_tokenize_expression(node.chars))

        # For group nodes, just get the content and join it all together
        elif isinstance(node, LatexGroupNode):
            group_content = ''.join(latex_to_string(child_node, include_braces=True) for child_node in node.nodelist)
            tokens.append(f"{{{group_content}}}")

        # For math nodes, apply recursion and get contents
        elif isinstance(node, LatexMathNode):
            for n in node.nodelist:
                extract_tokens(n)

    for node in nodes:
        extract_tokens(node)

    return tokens

In [93]:
def merge_tokens(tokens):
    '''
    Merges tokens that are related to each other by a subscript or superscript.

    Tokens are scanned left to right and each one falls into the first matching case:
    1. The token is exactly '^' or '_': it is glued, together with the following
       token (if any), onto the previous output token (if any). A lone marker
       with neither neighbour is kept as its own token.
    2. The token ends with '^' or '_': it is joined with the following token (if
       any) and emitted as a new output token.
    3. The token starts with '^' or '_': it is glued onto the previous output
       token, or emitted on its own if it is the first token.
    4. Anything else is emitted unchanged.
    A token consumed as the "following token" in cases 1 and 2 is not examined
    again. The input list is not modified.

    Parameters:
    tokens (list[str]): A list of tokens.

    Returns:
    merged_tokens (list[str]): A list of merged tokens.
    '''
    markers = ('^', '_')
    merged_tokens = []
    n_tokens = len(tokens)
    i = 0

    while i < n_tokens:
        token = tokens[i]
        has_next = i + 1 < n_tokens

        if token in markers:
            # Bare marker: take the next token along with it when there is one.
            piece = token + tokens[i + 1] if has_next else token
            if merged_tokens:
                merged_tokens[-1] += piece
            else:
                # Nothing to attach to on the left. This also covers a lone
                # marker, which must not be silently dropped.
                merged_tokens.append(piece)
            i += 2 if has_next else 1
        elif token.endswith(markers):
            # Dangling marker on the right: absorb the next token.
            if has_next:
                merged_tokens.append(token + tokens[i + 1])
                i += 2
            else:
                merged_tokens.append(token)
                i += 1
        elif token.startswith(markers) and merged_tokens:
            # Leading marker: attach to whatever came before.
            merged_tokens[-1] += token
            i += 1
        else:
            merged_tokens.append(token)
            i += 1

    return merged_tokens

## Main Code

I'll be vectorizing the body text using 4 different methods:
- Combined raw vectorization: This will take the entirety of the body text and vectorize it using TF-IDF vectorization. I don't expect this to work very well since this will have both text and LaTeX in it.
- Separate raw vectorization: Separates the text from the LaTeX in the body of the question, but still uses regular TF-IDF vectorization on both portions. I expect this to work much better than with the LaTeX and the text combined.
- Separate vectorization with fine tokenization: Treats the text with regular TF-IDF but uses custom tokenization to tokenize and vectorize the LaTeX. This will treat each symbol and word in the LaTeX as its own token to apply vectorization to.
- Separate vectorization with coarse tokenization: Treats the text with regular TF-IDF but uses custom tokenization to tokenize and vectorize the LaTeX. This time, though, tokens are much larger, being composed of individual macros along with their arguments.

The thinking behind the last two is that a lot of relevant features will lie within the types of environments, macros, and even characters used within LaTeX code. For instance, if we see a lot of $\int$ symbols, the question is probably related to calculus in some way. However, if we keep seeing $\zeta$ or $\textnormal{mod}$, the question might be more number-theoretic. Even the presence of certain mathematical symbols like $e$ can be used to narrow down the field of math a question is in. For this reason, it makes sense to tokenize expressions in a way that preserves the structure of the underlying problem. See the docstrings of the helper functions above for exactly how this is done.

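For each of these 4 methods, I'll be using 5 different dimensionalities (the `max_features` cap passed to each vectorizer): 1,000, 2,000, 5,000, 10,000, and 15,000. Note that in the three "separate" methods the text and the LaTeX are each capped at this value and then concatenated, so those datasets can have up to twice as many columns. It's entirely possible that the larger of these values will be computationally infeasible for me to train a neural network on, but that's a problem for later. I might as well generate the data here anyway.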

Also, the vectorized datasets saved below are going to be quite large. Because of that, if you're looking at this repo and want to recreate this analysis, start from the raw data that is in the repo and regenerate the vectorized files by running this notebook.

In [None]:
# Read in all data and combine into a single dataframe.
# - The listing is sorted because os.listdir returns entries in arbitrary order;
#   a fixed order keeps the row order reproducible from run to run.
# - Hidden entries and directories (e.g. .ipynb_checkpoints, .DS_Store) are
#   skipped so they are not handed to read_csv.
# - The frames are concatenated once at the end instead of growing a DataFrame
#   inside the loop, which re-copies all previous rows on every iteration.
all_files = sorted(
    file_name for file_name in os.listdir('question_data')
    if not file_name.startswith('.')
    and os.path.isfile(os.path.join('question_data', file_name))
)
question_frames = [
    # 'Unnamed: 0' is dropped from every file; errors='ignore' tolerates files
    # that do not have the column.
    pd.read_csv(f'question_data/{file_name}').drop(columns='Unnamed: 0', errors='ignore')
    for file_name in all_files
]
# pd.concat raises on an empty list, so fall back to an empty frame.
raw_data_df = pd.concat(question_frames, ignore_index=True) if question_frames else pd.DataFrame()

In [102]:
# Replace every missing value with an empty string so the text columns can be
# passed straight to the vectorizers.
raw_data_df = raw_data_df.fillna(value='')

In [162]:
# Feature-count caps to generate datasets for. Each vectorizer keeps at most
# `dim` features, so the combined text+LaTeX matrices have up to 2 * dim columns.
dim_list = [1000, 2000, 5000, 10000, 15000]

# Make sure the output directory exists before doing any expensive work.
os.makedirs('vectorized_data', exist_ok=True)


def _pretokenized(tokens):
    '''Analyzer for documents that are already lists of tokens: use them as-is.'''
    return tokens


# Tokenizing the LaTeX does not depend on `dim`, so do it once up front instead
# of re-parsing every document on each pass through the loop.
latex_clean = raw_data_df['body_latex'].apply(remove_newlines)
latex_fine_tokens = latex_clean.apply(fine_tokenize_latex)
latex_coarse_tokens = latex_clean.apply(lambda x: merge_tokens(coarse_tokenize_latex(x)))

# Keep the space-joined versions on the dataframe, as before.
raw_data_df['body_latex_fine'] = latex_fine_tokens.apply(' '.join)
raw_data_df['body_latex_coarse'] = latex_coarse_tokens.apply(' '.join)

for dim in dim_list:

    # Vectorize just raw body text
    vectorizer_raw = TfidfVectorizer(max_features=dim)
    X_raw = vectorizer_raw.fit_transform(raw_data_df['body_raw'])

    # Vectorize text and LaTeX code separately, treating both as raw text
    vectorizer_text = TfidfVectorizer(max_features=dim)
    vectorizer_latex_raw = TfidfVectorizer(max_features=dim)
    X_text = vectorizer_text.fit_transform(raw_data_df['body_text'])
    X_latex_raw = vectorizer_latex_raw.fit_transform(raw_data_df['body_latex'])

    # Vectorize LaTeX with the fine tokenizer. The token lists are fed to the
    # vectorizer directly through a callable analyzer: the default analyzer
    # would lowercase the text and re-split it with a pattern that only keeps
    # runs of 2+ word characters, throwing away every single-character token
    # and every symbol the custom tokenizer produced.
    vectorizer_latex_fine = TfidfVectorizer(analyzer=_pretokenized, max_features=dim)
    X_latex_fine = vectorizer_latex_fine.fit_transform(latex_fine_tokens)

    # Vectorize LaTeX with the coarse tokenizer (same reasoning; this also keeps
    # tokens that contain spaces or braces intact).
    vectorizer_latex_coarse = TfidfVectorizer(analyzer=_pretokenized, max_features=dim)
    X_latex_coarse = vectorizer_latex_coarse.fit_transform(latex_coarse_tokens)

    # Combine text and LaTeX features
    X_text_latex_raw = sp.hstack([X_text, X_latex_raw])
    X_text_latex_fine = sp.hstack([X_text, X_latex_fine])
    X_text_latex_coarse = sp.hstack([X_text, X_latex_coarse])

    # Store the final datasets as NPZs
    sp.save_npz(f'vectorized_data/raw_{dim}.npz', X_raw)
    sp.save_npz(f'vectorized_data/text_latex_raw_{dim}.npz', X_text_latex_raw)
    sp.save_npz(f'vectorized_data/text_latex_fine_{dim}.npz', X_text_latex_fine)
    sp.save_npz(f'vectorized_data/text_latex_coarse_{dim}.npz', X_text_latex_coarse)

# Save the tags separately as a CSV file (rows are in the same order as the
# rows of the matrices saved above).
raw_data_df[['tags']].to_csv('tags.csv', index=False)