In [10]:
import sys
import plotly.io as pio
import plotly.graph_objects as go
sys.path.append("../..")

from evals.spelling_by_grade import create_full_spelling_prompt, prepare_grade_spelling_eval
from evals.plot_utils import basic_bar_graph, create_table, nested_bar_graph
from evals.eval_utils import EvalResponse, ModelType
from evals.eval_list import FULL_SPELLING_EVAL, GET_FIRST_LETTER_EVAL, GET_POSITION_OF_LETTER_EVAL, EVAL_LIST

# Send every Plotly figure in this notebook through the "notebook" renderer.
pio.renderers.default = "notebook"

# Load the graded word list. '-' is forwarded as the second argument; the
# displayed entries pair each word with its letters joined by '-'.
grade_spelling_path = "../data/GradeSpellingEval.txt"
words_by_grade = prepare_grade_spelling_eval(grade_spelling_path, '-')

# Peek at the first ten entries stored under key 3.
words_by_grade[3][:10]

[('able', 'A-B-L-E'),
 ('above', 'A-B-O-V-E'),
 ('afraid', 'A-F-R-A-I-D'),
 ('afternoon', 'A-F-T-E-R-N-O-O-N'),
 ('again', 'A-G-A-I-N'),
 ('age', 'A-G-E'),
 ('air', 'A-I-R'),
 ('airplane', 'A-I-R-P-L-A-N-E'),
 ('almost', 'A-L-M-O-S-T'),
 ('alone', 'A-L-O-N-E')]

In [None]:
# Preview the prompt built for 'able' from the entries stored under key 3.
# The trailing 2 is presumably the number of few-shot examples -- TODO confirm
# against create_full_spelling_prompt.
grade_three_words = words_by_grade[3]
create_full_spelling_prompt('able', grade_three_words, 2)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformer_lens import HookedTransformer

# Backend used by every eval below; switch this to change how the model is loaded.
MODEL_TYPE = ModelType.HUGGINGFACE

if MODEL_TYPE == ModelType.HUGGINGFACE:
    # Single source of truth for the checkpoint so model and tokenizer cannot drift apart.
    hf_checkpoint = "EleutherAI/gpt-j-6B"
    model = AutoModelForCausalLM.from_pretrained(hf_checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(hf_checkpoint)
    model.config.pad_token_id = tokenizer.eos_token_id # Prevent lots of info messages telling us it's doing this every prompt.
    # Reuse EOS as the padding token and pad on the left for batched prompts.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'
elif MODEL_TYPE == ModelType.TRANSFORMER_LENS:
    model = HookedTransformer.from_pretrained("gpt-j-6b")
    tokenizer = model.tokenizer
else:
    # Fail here with a clear message instead of a NameError on `model` in a later cell.
    raise ValueError(f"Unsupported MODEL_TYPE: {MODEL_TYPE!r}")

In [None]:
import torch
import transformer_lens.utils as utils

# Pick the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
model.to(device)
# Gradients are switched off globally for the rest of the session.
torch.set_grad_enabled(False)
print(device, MODEL_TYPE)

# Quick generation check on two prompts of different length.
prompts = ["Once upon a time", "Once upon a time two"]
if MODEL_TYPE == ModelType.HUGGINGFACE:
    # Tokenize as one padded batch and move it to wherever the model lives.
    encoded = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt")
    encoded = encoded.to(model.device)
    outputs = model.generate(
        **encoded,
        max_length=20,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
    )
    response = []
    for sequence in outputs:
        response.append(tokenizer.decode(sequence, skip_special_tokens=True))
elif MODEL_TYPE == ModelType.TRANSFORMER_LENS:
    max_length = 20
    pad_device = utils.get_device()
    eos_token = model.to_tokens(model.tokenizer.eos_token, prepend_bos=False).item()
    # Left-pad every prompt with EOS ids up to max_length so the rows can be stacked.
    padded_rows = []
    for prompt in prompts:
        prompt_tokens = model.to_tokens(prompt)
        pad_width = max_length - prompt_tokens.shape[-1]
        left_pad = torch.full((pad_width,), eos_token, dtype=torch.long).to(pad_device)
        prompt_row = torch.as_tensor(prompt_tokens).to(pad_device).squeeze(0)
        padded_rows.append(torch.cat((left_pad, prompt_row)))
    tokens = torch.stack(padded_rows)
    outputs = model.generate(tokens, max_new_tokens=10, temperature=0.7)
    response = model.to_string(outputs)
print(response)

In [None]:
# Run the full-spelling eval once per entry of the list below (presumably
# few-shot example counts -- TODO confirm against run_eval_with_multiple_shots).
shot_counts = [0, 1, 2, 5, 10]
full_spelling = FULL_SPELLING_EVAL.run_eval_with_multiple_shots(
    model, MODEL_TYPE, tokenizer, words_by_grade, shot_counts
)

# Collect the 'accuracy' entry of every result, in key order, and print them together.
accuracy_per_run = []
for run_key in full_spelling.keys():
    accuracy_per_run.append(full_spelling[run_key]['accuracy'])
print(accuracy_per_run)

In [None]:
# First-letter eval over the graded word list; the trailing 2 is presumably
# the few-shot count -- TODO confirm against run_eval.
first_letter = GET_FIRST_LETTER_EVAL.run_eval(
    model, MODEL_TYPE, tokenizer, words_by_grade, 2
)
first_letter_accuracy = first_letter['accuracy']
print(first_letter_accuracy)

In [None]:
# Letter-position eval over the graded word list; the trailing 2 is presumably
# the few-shot count -- TODO confirm against run_eval.
position = GET_POSITION_OF_LETTER_EVAL.run_eval(
    model, MODEL_TYPE, tokenizer, words_by_grade, 2
)
position_accuracy = position['accuracy']
print(position_accuracy)

In [None]:
# Optional sweep over every eval in EVAL_LIST. It sits behind a flag that is off
# by default, so the cell is parsed as real code but does not launch the runs
# (and no longer echoes a string literal as its output). Set the flag to True to
# run the sweep.
RUN_ALL_EVALS = False

if RUN_ALL_EVALS:
    # Map each eval's name to its result; names avoid shadowing the builtin `eval`.
    evals = {
        eval_spec.name: eval_spec.run_eval(model, MODEL_TYPE, tokenizer, words_by_grade, 2)
        for eval_spec in EVAL_LIST
    }
    for eval_name, eval_result in evals.items():
        print(eval_name, eval_result['accuracy'])

In [None]:
# Bar chart of the 'accuracy' entry from the letter-position eval result.
position_accuracy = position['accuracy']
basic_bar_graph(position_accuracy)

In [None]:
import numpy as np

# Five green hex colours, ordered from light to dark, handed to the chart helper.
green_shades = [
    '#a1d99b',
    '#74c476',
    '#31a354',
    '#006d2c',
    '#024736',
]

# NOTE(review): `shot_successes` is not assigned in any cell visible in this
# notebook; it must already exist in the kernel for this cell to run -- confirm
# where it comes from.
nested_bar_graph(shot_successes, green_shades)

In [None]:
# Check to see how often the model gets the first letter right.
def first_letter_success_rate(words):
    """Return the share of ``words`` whose response starts with the answer's first character.

    Each item is read through ``word['response']`` and ``word['answer']``. The
    response is stripped and upper-cased before the comparison. Items where either
    string is empty never count as a hit but still count towards the denominator.
    The result is rounded to three decimals. An empty collection yields 0.0
    rather than raising ZeroDivisionError.
    """
    total = len(words)
    if total == 0:
        return 0.0
    hits = sum(
        1
        for word in words
        if word['response'] and word['answer']
        and word['response'].strip().upper().startswith(word['answer'][0])
    )
    return round(hits / total, 3)


# NOTE(review): `shots` and `shot_data` are not assigned in any cell visible in
# this notebook; they must already exist in the kernel -- confirm their origin.
first_successes = {}
for shot in shots:
    # Start every key of words_by_grade at 0, then fill in grades 1 through 5.
    successes = {grade: 0 for grade in words_by_grade.keys()}
    for grade in range(1, 6):
        successes[grade] = first_letter_success_rate(shot_data[shot][grade])
    first_successes[shot] = successes
print(first_successes)
nested_bar_graph(first_successes)

In [None]:
# Build a table from the `shot_data` entry stored under key 2 and display it.
shot_two_data = shot_data[2]
table = create_table(shot_two_data)
table