In [70]:
with open("hf-token.txt", "r") as f:
    token = f.read()

LF = "\n"

In [71]:
%%capture
!pip install flash-attn --no-build-isolation

from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import torch
from torch import nn
from transformers import pipeline
import pandas as pd
import re
from tqdm import tqdm
import numpy as np

print(torch.mps.is_available())
torch.mps.empty_cache()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [72]:
model_name = "lschlessinger/bert-finetuned-math-prob-classification"
model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load in float16 to fit memory
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    torch_dtype=torch.float16,   # Important for Mac memory fitting
    device_map="mps:0",
    trust_remote_code=True,
    offload_folder = "offload",
    token=token
)

# customnize model for this task
model.classifier = nn.Linear(768, 8, bias = True, device = "mps:0", dtype = torch.float16) # device = mps

# # freeze all but the final layer of weights
# for param in model.parameters():
#     param.requires_grad = False
# for param in model.classifier.parameters():
#     param.requires_grad = True

In [73]:
def reformat(outs):

    cls_ref = {
        0 : "Algebra",
        1 : "Counting & Probability",
        2 : "Geometry",
        3 : "Intermediate Algebra",
        4 : "Number Theory",
        5 : "Prealgebra",
        6 : "Precalculus"
    }

    outs = res["logits"].squeeze(0)
    cls_idx = outs.argmax(axis = 0)
    cls_conf = outs.max()
    cls = cls_ref[int(cls_idx)]

    print(f"cls: {cls}, conf: {cls_conf}")

In [74]:
## data load, train-test split

df = pd.read_csv("train-data/train.csv")
df_te = pd.read_csv("train-data/test.csv").drop("id", axis = 1)
train_size = 2073
df_tr, df_val = df.iloc[:train_size], df.iloc[train_size:]

In [75]:
def test_model(model, df_val, batch_size = 30, subset = 1000):
    ''' tests the model on a given test dataset '''

    print("Validating...")

    torch.mps.empty_cache()
    
    with torch.no_grad():

        y = df_val.label.to_numpy()
        avg_list = []

        pbar = tqdm(range(0, len(df_val), batch_size))

        for start_idx in pbar: 

            if start_idx >= subset: break

            end_idx = min((start_idx + batch_size, len(df_val)))
            df = df_val.iloc[start_idx:end_idx]
            questions = df.Question.tolist()

            tokens = tokenizer(questions, padding = True, truncation = True, return_tensors = "pt") # .to("mps:0")
            res = np.array(model(**tokens).logits.argmax(axis = 1).cpu())

            sub_avg = (res == y[start_idx:end_idx]).mean()
            avg_list.append(sub_avg)

            pbar.set_description(f"score: {round(np.mean(avg_list),3)}")
        
# test_model(model, df_val)

In [78]:
def y_to_dummies(y):
    ''' converts answers in the form [1,8,3,2,3...] to 
    [[0,1,0,...0],[0,0,...,0,1]] (one-hot encoded) '''

    out = torch.zeros((len(y),8), device = "mps:0", dtype = torch.float16)

    # for i,yi in enumerate(y):
    #     out[i,yi] = 1

    return torch.tensor(y, device = "mps:0", dtype = torch.float16) # device = "mps:0"

softmax = nn.Softmax(dim=1)
def train_model(model, df_tr, optimizer, loss_fn):
    ''' performs one training step on model with the specified input data '''
    global y_pred, y

    # torch.mps.empty_cache()

    ## reformat input data
    questions, answers = df_tr.Question.tolist(), df_tr.label.to_numpy()
    y = y_to_dummies(answers)

    ## render predictions
    tokens = tokenizer(questions, padding = True, truncation = True, return_tensors = "pt").to("mps:0")
    tokens["input_ids"]
    optimizer.zero_grad()
    y_pred = model(**tokens).logits
    y_pred = softmax(y_pred)

    y, y_pred = y, y_pred

    ## loss & backprop & optim
    loss = loss_fn(y_pred, y)
    print(loss)
    loss.backward()
    optimizer.step()

    # generate training metrics
    pred_labels = np.array(y_pred.argmax(axis = 1).cpu())
    avg = (pred_labels == answers).mean()

    return avg

In [79]:
## training loop
batch_size = 10

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)

n_epochs = 10
for ep in range(n_epochs):
    print(f"\n== epoch {ep} ==\n")

    pbar = tqdm(range(0, len(df_tr), batch_size))
    accs = []
    for start_idx in pbar:

        # subset training data into minibatches
        end_idx = min((start_idx + batch_size), len(df_tr))
        df_tr_sub = df_tr.iloc[start_idx:end_idx]

        acc = train_model(model, df_tr_sub, optimizer, loss_fn)
        accs.append(acc)
        pbar.set_description(f"train acc: {round(np.mean(accs),3)}")

    # test_model(model, df_val)
    print([p for p in model.classifier.parameters()][0][0][:5])


== epoch 0 ==



  pred_labels = np.array(y_pred.argmax(axis = 1).cpu())
train acc: 0.1:   0%|          | 1/208 [00:00<01:28,  2.34it/s]

tensor(nan, device='mps:0', dtype=torch.float16, grad_fn=<NllLossBackward0>)


train acc: 0.2:   1%|          | 2/208 [00:00<00:56,  3.62it/s]

tensor(nan, device='mps:0', dtype=torch.float16, grad_fn=<NllLossBackward0>)
tensor(nan, device='mps:0', dtype=torch.float16, grad_fn=<NllLossBackward0>)


train acc: 0.225:   2%|▏         | 4/208 [00:01<00:52,  3.90it/s]

tensor(nan, device='mps:0', dtype=torch.float16, grad_fn=<NllLossBackward0>)
tensor(nan, device='mps:0', dtype=torch.float16, grad_fn=<NllLossBackward0>)


train acc: 0.217:   3%|▎         | 6/208 [00:01<00:43,  4.59it/s]

tensor(nan, device='mps:0', dtype=torch.float16, grad_fn=<NllLossBackward0>)
tensor(nan, device='mps:0', dtype=torch.float16, grad_fn=<NllLossBackward0>)


train acc: 0.238:   4%|▍         | 8/208 [00:01<00:40,  4.91it/s]

tensor(nan, device='mps:0', dtype=torch.float16, grad_fn=<NllLossBackward0>)
tensor(nan, device='mps:0', dtype=torch.float16, grad_fn=<NllLossBackward0>)


train acc: 0.23:   5%|▍         | 10/208 [00:02<00:41,  4.72it/s]

tensor(nan, device='mps:0', dtype=torch.float16, grad_fn=<NllLossBackward0>)
tensor(nan, device='mps:0', dtype=torch.float16, grad_fn=<NllLossBackward0>)


train acc: 0.217:   6%|▌         | 12/208 [00:03<00:48,  4.06it/s]

tensor(nan, device='mps:0', dtype=torch.float16, grad_fn=<NllLossBackward0>)
tensor(nan, device='mps:0', dtype=torch.float16, grad_fn=<NllLossBackward0>)


train acc: 0.208:   6%|▋         | 13/208 [00:03<00:53,  3.62it/s]


KeyboardInterrupt: 

In [None]:
loss_fn(y_pred,y).backward()

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [None]:
[p for p in model.classifier.parameters()][0][0][:5]

tensor([-0.0106, -0.0171, -0.0295,  0.0323,  0.0349], device='mps:0',
       dtype=torch.float16, grad_fn=<SliceBackward0>)