In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import nltk
from nltk.corpus import stopwords

from io import StringIO
from html.parser import HTMLParser
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import json
 
# Download the NLTK stop-word corpus and keep the English list as a set so that
# membership tests in remove_stopwords() are cheap.
nltk.download('stopwords')
nltk_stopwords = set(stopwords.words('english'))

# Tokenizer matching the DistilBERT encoder; when truncation happens, tokens are
# dropped from the left-hand side of the text.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',truncation_side='left',truncation=True)

# Register the LaTeX symbols as additional tokens. The context manager makes
# sure the vocabulary file is closed again (it used to be left open).
with open('latex-vocabulary/latex_symbols.txt','r') as latex_symbols_file:
    # NOTE(review): iterating the file keeps each line's trailing newline, so
    # the added tokens end in "\n" exactly as before. That looks unintended,
    # but stripping it would change tokenization relative to whatever the saved
    # checkpoint was trained with -- confirm before changing.
    latex_symbols = list(latex_symbols_file)
tokenizer.add_tokens(latex_symbols)

In [None]:
# Run on the Metal (MPS) backend when PyTorch reports it as available,
# otherwise fall back to the CPU.
mps_available = torch.backends.mps.is_available()
device = torch.device("mps" if mps_available else "cpu")

if not mps_available:
    print("MPS device not found.")
else:
    # Allocate a tiny tensor on the device as a smoke test and show it.
    x = torch.ones(1, device=device)
    print(x)

In [None]:
class MLStripper(HTMLParser):
    """HTML parser that throws the markup away and collects only text data."""

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.reset()
        # Character references are converted before reaching handle_data().
        self.convert_charrefs = True
        self.strict = False
        # Buffer that accumulates every text chunk seen so far.
        self.text = StringIO()

    def handle_data(self, d):
        """Append one chunk of text data reported by the parser to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        collected = self.text.getvalue()
        return collected

def get_latex_from_alt(context):
    """Replace each <img> in an HTML fragment with the text of its ``alt`` attribute.

    Dollar signs and the backslash-bracket display-math delimiters are removed
    from the alt text before it is substituted for the image.

    Args:
        context: HTML markup to process.

    Returns:
        The processed document serialized back to a string. Images that carry
        no ``alt`` attribute are left in place unchanged.
    """
    def strip_delimiters(latex):
        # Drop the math-mode delimiters, keeping only the LaTeX body.
        return latex.replace('$','').replace('\\[','').replace('\\]','')

    # NOTE(review): no parser is named here, so Beautiful Soup chooses one of
    # the installed parsers itself; pin one if output must be reproducible
    # across machines.
    context_soup = BeautifulSoup(context)
    for image in context_soup.find_all('img'):
        if 'alt' not in image.attrs:
            # An <img> without alt text used to raise KeyError and abort the
            # whole conversion; there is nothing to substitute, so skip it.
            continue
        image.replace_with(strip_delimiters(image['alt']))
    return str(context_soup)

def strip_tags(html):
    """Return the text content of ``html`` with all markup removed.

    Args:
        html: HTML markup as a string.

    Returns:
        The concatenated text data found in the markup.
    """
    s = MLStripper()
    s.feed(html)
    # feed() can hold back trailing text (for example text ending in an
    # unterminated "&...") while waiting for more input; close() forces the
    # parser to process everything that is still buffered.
    s.close()
    return s.get_data()

def remove_stopwords(text):
    """Drop every whitespace-separated word whose lowercase form is a stop word.

    The surviving words keep their original casing and order and are joined
    back together with single spaces.
    """
    kept_words = []
    for word in text.split():
        if word.lower() in nltk_stopwords:
            continue
        kept_words.append(word)
    return " ".join(kept_words)

Opening the JSON file where all the problems are stored:

In [None]:
# Read the problem set from disk; the context manager closes the file handle
# as soon as the JSON has been parsed (it used to be left open).
with open('amc_10_problems_with_sol.json') as problems_file:
    problems = json.load(problems_file)

# Show one entry as a quick sanity check of the loaded data.
problems["2015 AMC 10A #1"]["problem"]

We must keep only the problems from before 2019 and extract each one's problem statement, solutions, and answer choices.

The following functions need to be applied to each piece of text to simplify it:

In [None]:
# NOTE(review): `problem`, `solutions_list` and `choices` are not defined in
# the cells visible here -- confirm where they are bound before running this.

# Problem statement: swap images for their LaTeX alt text, then drop the markup.
problem_text = strip_tags(get_latex_from_alt(problem))

# Solutions: decode the JSON list, skip any entry containing 'http', merge the
# rest into one string and clean it the same way as the problem statement.
solutions_text = strip_tags(
    get_latex_from_alt(
        " ".join(filter(lambda solution: 'http' not in solution, json.loads(solutions_list)))
    )
)

# Choices: decode the JSON list and join the entries with spaces.
choices_text = " ".join(json.loads(choices))

# Final model input: the three parts separated by single spaces.
training_text = " ".join((problem_text, solutions_text, choices_text))

This section shows how to apply the model to a given string of text.

In [None]:
class DistilBERTClass(torch.nn.Module):
    """DistilBERT encoder followed by a small feed-forward classification head.

    Args:
        num_classes: Number of values the final linear layer emits per input.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Attribute names double as parameter-name prefixes in saved models,
        # so they are kept exactly as they were.
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        # 768 is presumably the encoder's hidden size -- verify if the
        # checkpoint name ever changes.
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classifier output for a tokenized batch."""
        encoder_output = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, position 0 of every sequence
        # (presumably the [CLS] token's hidden state).
        first_position = encoder_output[0][:, 0]
        activated = torch.relu(self.pre_classifier(first_position))
        return self.classifier(self.dropout(activated))

The model must be downloaded: https://huggingface.co/iuruoy-shao/top-level-with-solutions-distilbert-amc10-2019-2022/tree/main

In [None]:
# Restore the fine-tuned classifier from disk.
# NOTE(review): torch.load unpickles Python objects, so only load checkpoints
# from a source you trust. Newer PyTorch releases default to weights_only=True,
# which rejects a fully pickled module -- pass weights_only=False there if the
# load fails.
# map_location='cpu' lets the file load regardless of the device it was saved
# from; the model is moved to the selected device right afterwards.
checkpoint = torch.load(
    'top-level-with-solutions-distilbert-amc10-2019-2022.pt',
    map_location='cpu',
)
if isinstance(checkpoint, torch.nn.Module):
    # The file holds the whole pickled model; use it directly instead of first
    # building (and then discarding) a fresh DistilBERTClass.
    model = checkpoint
else:
    # Otherwise treat the file as a state dict for a freshly built model.
    model = DistilBERTClass(num_classes=5)
    model.load_state_dict(checkpoint)
model.to(device)
# Inference only: evaluation mode disables dropout so that repeated calls on
# the same text give the same prediction.
model.eval()

sample_string = 'value x satisfies x- \\frac{3}{4} = \\frac{5}{12} - \\frac{1}{3}? Adding \\frac{3}{4} sides, x= \\frac{5}{12} - \\frac{1}{3} + \\frac{3}{4} = \\frac{5}{12} - \\frac{4}{12} + \\frac{9}{12}=\\boxed{\\textbf{(E) }\\frac{5}{6}}. Multiplying 12 sides gets us 12x-9=1 \\Rightarrow 12x=10, therefore \\boxed{x=\\textbf{(E)}~\\frac{5}{6}}. \\ {-}\\frac{2}{3} \\ \\frac{7}{36} \\ \\frac{7}{12} \\ \\frac{2}{3} \\ \\frac{5}{6}'

def outputs(input_string):
    """Classify ``input_string`` and return one 0/1 flag per model output.

    The text is tokenized, run through the global ``model`` without gradient
    tracking, squashed with a sigmoid and thresholded at 0.5.

    Args:
        input_string: The text to classify.

    Returns:
        A list of ints (each 0 or 1), one per value the model emits for the
        single input sequence.
    """
    # Tokenize the argument itself -- this used to read the global
    # `sample_string`, so every call classified the same text.
    # truncation=True cuts over-long inputs down to the tokenizer's maximum
    # length instead of handing the model a sequence it cannot process.
    inputs = tokenizer(input_string, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        logits = model(**inputs)
    # Only one sequence was passed in, so take the first row of the batch.
    probabilities = torch.sigmoid(logits)[0].cpu().tolist()
    return [1 if probability >= 0.5 else 0 for probability in probabilities]

print(outputs(sample_string))

Store each problem's outputs alongside its problem number and related information.