# Final Project

* [Preparing the data](#first)

#### Modules

In [1]:
import math
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TrainingArguments, Trainer, AutoModelForQuestionAnswering
from trl import SFTTrainer
import torch
import os
import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset
import evaluate

#### Constants

In [2]:
TOKEN = r'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'
#TEST_MODEL = r"tiiuae/falcon-rw-1b"
TEST_MODEL = r"bert-base-uncased"
MATHEMATICS_DATASET = r'D:\jupyter_notebooks\datasets\mathematics_dataset\mathematics_dataset'

## Preparing the data <a class="anchor" id="first"></a>

Can get the dataset from https://console.cloud.google.com/storage/browser/mathematics-dataset;tab=objects?prefix=&forceOnObjectsSortingFiltering=false. I use just the arithmetic type problems

In [3]:
def prepare_mathematics(path=MATHEMATICS_DATASET, subject_list=['arithmetic']):
    df = pd.DataFrame()
    levels = []
    subjects = []
    problem_types = []
    problems = []
    answers = []

    for subdir, dirs, files in os.walk(path):
        for file in files:
            if 'train' in subdir:
                level = subdir[subdir.rindex('-')+1:] # the problem level is indicated after a - character in the directory name
                subject, problem_type = file.split('__') # splitting file name into subject/type
                problem_type = problem_type[:-4] # Removing .txt extension

                # Only looking at some subjects
                if subject in subject_list:
                    # Reading the data from the file
                    with open(os.path.join(subdir,file), 'r') as f:
                        prob = ans = True
                        while prob and ans:
                            prob = f.readline().strip()
                            ans = f.readline().strip()

                            # Creating row
                            levels.append(level)
                            subjects.append(subject)
                            problem_types.append(problem_type)
                            problems.append(prob)
                            answers.append(ans)          
                            
    # Creating the dataframe
    df['level'] = levels
    df['subject'] = subjects
    df['problem_type'] = problem_types
    df['problem'] = problems
    df['answer'] = answers
    
    # Cleaning up the notebook environment
    del levels
    del subjects
    del problem_types
    del problems
    del answers
    
    return df

In [4]:
df = prepare_mathematics()

#### creating splits (I think its like 400000, 50000, 50000)

In [5]:
temp = df[df.level=='easy']
temp = temp.drop(columns=['level', 'subject'])
temp = temp.rename(columns={'problem':'text', 'answer':'label'})
length = len(temp)

train = temp.sample(int(length*.04))
temp=temp.drop(train.index)
train = Dataset.from_pandas(train, preserve_index=False)

dev = temp.sample(int(length*.005))
temp=temp.drop(dev.index)
dev = Dataset.from_pandas(dev, preserve_index=False)

test = temp.sample(int(length*.005))
test = Dataset.from_pandas(test, preserve_index=False)
del temp

#### tokenization

In [22]:
tokenizer = AutoTokenizer.from_pretrained(TEST_MODEL)
def tokenization(items):
    tokenized = tokenizer(items['text'], max_length = 64, padding = 'max_length', truncation=True)
    return tokenized

def tokenization_labels(items):
    tokenized = tokenizer(items['text'], max_length = 64, padding = 'max_length', truncation=True)
    tokenized['label'] = tokenizer(items['label'], max_length = 64, padding = 'max_length', truncation=True)['input_ids']
    return tokenized
# train = train.map(tokenization, batched=True)
# dev = dev.map(tokenization, batched=True)
test = test.map(tokenization_labels, batched=True)
# test_labels = test.map(tokenization_labels, batched=True)
# test['label'] = test['text']

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [21]:
test_labels

Dataset({
    features: ['problem_type', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 30000
})

#### Training

In [7]:
metric = evaluate.load('f1')
def compute_metrics(pred):
    output, labels = pred
    return metric.compute(predictions=np.argmax(output, axis=-1), references=labels)

tokenizer = AutoTokenizer.from_pretrained(TEST_MODEL)
#tokenizer.pad_token = tokenizer.eos_token

args = TrainingArguments(
    output_dir='./temp',
    evaluation_strategy="epoch", # Reports the metric after each epoch
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
)

trainer = SFTTrainer(
    model = TEST_MODEL,
    args = args,
    train_dataset = train,
    eval_dataset = dev,
    #compute_metrics = compute_metrics,
    tokenizer=tokenizer,
    max_seq_length=64,
    #packing=True,
    dataset_text_field='text',
)



If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


Map:   0%|          | 0/240000 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

0 validation loss so theoretically worked perfectly for the basic arithmetic problems, but didnt look at the model output

In [8]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.0004,2.6e-05
2,0.0,1e-06
3,0.0,0.0


TrainOutput(global_step=45000, training_loss=0.0034019357781870188, metrics={'train_runtime': 5845.2726, 'train_samples_per_second': 123.176, 'train_steps_per_second': 7.699, 'total_flos': 1.47913866661248e+16, 'train_loss': 0.0034019357781870188, 'epoch': 3.0})

In [9]:
trainer.save_model('bert_test')

In [112]:
print(train.groupby('problem_type').size())

problem_type
add_or_sub              53680
add_or_sub_in_base      53637
add_sub_multiple        52947
div                     53488
mixed                   53404
mul                     53307
mul_div_multiple        53400
nearest_integer_root    53425
simplify_surd           52712
dtype: int64


Random code I was messing with, I was using the commented model in the constants for this part

In [3]:
def separate_amps_problem(path):
    problem_ended = False
    problem = ''
    answer = ''
    with open(path, 'r') as f:
        for line in f:
            if line == 'Answer:\n':
                problem_ended = True
            elif problem_ended:
                answer += line
            elif line != 'Problem:\n':
                problem += line
    return problem, answer

In [4]:
print(separate_amps_problem(r"D:\jupyter_notebooks\datasets\amps\amps\mathematica\algebra\system_of_equations\30.txt"))

('Solve the following system of two equations: \n$-\\frac{17 x}{\\sqrt{2}}-\\frac{y}{\\sqrt{2}}-\\frac{5}{\\sqrt{2}}=0$, $\\frac{13 x}{\\sqrt{2}}+\\frac{21 y}{\\sqrt{2}}+\\frac{27}{\\sqrt{2}}=0$\n', '$x=-\\frac{39}{172}$, $y=-\\frac{197}{172}$')


In [5]:
tokenizer = AutoTokenizer.from_pretrained(TEST_MODEL)
pipeline = pipeline(
    "text-generation",
    model=TEST_MODEL,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [12]:
sequences = pipeline(
    r"Compute the mean of ${9, -4}$.",
    max_length=50,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Result: Compute the mean of ${9, -4}$.
${9,-4} = {0, 2, 3}
${9,-2} = {0,-3}
${9,-1} = {-1,-3
