In [1]:
def get_spelling(word):
    """Return ``word`` spelled out: every character upper-cased, joined by hyphens.

    Example: ``'able'`` becomes ``'A-B-L-E'``. An empty string yields an empty string.
    """
    # Upper-case character by character (not the whole word at once) so each
    # original character maps to exactly one hyphen-separated segment.
    letters = map(str.upper, word)
    return '-'.join(letters)

# Load the evaluation words, grouped by grade (1-5), each paired with its hyphenated spelling.
with open("GradeSpellingEval.txt", "r") as file:
    # Drop newline characters and '*' markers from every line.
    lines = [raw.replace('\n', '').replace('*', '') for raw in file.readlines()]

# Positions of the "GRADE" header lines (the very last line is never considered a header),
# followed by len(lines) acting as an end-of-file sentinel.
grade_indices = [pos for pos, text in enumerate(lines[:-1]) if text.startswith("GRADE")]
grade_indices.append(len(lines))

# Grades are numbered 1, 2, ... in the order their headers appear. For each grade the
# header line and the line right after it are skipped, and reading stops one line short
# of the next header (or of the sentinel).
# NOTE(review): this presumably relies on blank lines around each header and at the end
# of the file; if the file ends directly on a word, that last word is not read - confirm.
words_by_grade = {}
for grade, (start, stop) in enumerate(zip(grade_indices, grade_indices[1:]), start=1):
    cleaned = [lines[pos].strip() for pos in range(start + 2, stop - 1)]
    words_by_grade[grade] = [(entry, get_spelling(entry)) for entry in cleaned]

words_by_grade[3][:10]

[('able', 'A-B-L-E'),
 ('above', 'A-B-O-V-E'),
 ('afraid', 'A-F-R-A-I-D'),
 ('afternoon', 'A-F-T-E-R-N-O-O-N'),
 ('again', 'A-G-A-I-N'),
 ('age', 'A-G-E'),
 ('air', 'A-I-R'),
 ('airplane', 'A-I-R-P-L-A-N-E'),
 ('almost', 'A-L-M-O-S-T'),
 ('alone', 'A-L-O-N-E')]

In [None]:
import random

def create_few_shot_prompt(word, word_list, num_shots):
    """Build a few-shot spelling prompt ending with an unanswered question for ``word``.

    Args:
        word: The word the model should spell (a plain string).
        word_list: Sequence of ``(word, spelling)`` pairs to draw solved examples
            from. It is not modified.
        num_shots: Number of solved examples placed before the final question.

    Returns:
        The prompt string: ``num_shots`` lines of the form
        ``Q: How do you spell '<w>'? A: <spelling>`` separated by blank lines,
        followed by the open question for ``word``.

    Raises:
        ValueError: Raised by ``random.sample`` when fewer than ``num_shots``
            entries remain after ``word`` itself has been excluded.
    """
    # The entries are (word, spelling) pairs, so the target has to be matched on
    # the word field; comparing the bare string against whole entries never
    # matches and would let the answer leak into the examples. Building a new
    # list (instead of calling .remove) also leaves the caller's list intact.
    candidates = [entry for entry in word_list if entry[0] != word]
    samples = random.sample(candidates, num_shots)
    shots = [f"Q: How do you spell '{sample[0]}'? A: {sample[1]}\n\n" for sample in samples]
    return ''.join(shots) + f"Q: How do you spell '{word}'? A: "

# Example: preview a 2-shot prompt for 'able', with examples drawn from the grade 3 word list.
create_few_shot_prompt('able', words_by_grade[3], 2)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the causal language model from the "EleutherAI/gpt-j-6B" checkpoint.
# Both names are used as globals by the cells below.
# NOTE(review): presumably a multi-gigabyte download and memory footprint - confirm
# the available hardware before running this cell.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")

In [None]:
# Smoke test: generate a continuation of a fixed prompt to check that the
# tokenizer and model loaded above work together.
prompt = "Once upon a time"
input_ids = tokenizer.encode(prompt, return_tensors='pt')  # token ids returned as a PyTorch tensor
output = model.generate(input_ids, max_length=50, num_return_sequences=1)
response = tokenizer.decode(output[0], skip_special_tokens=True)  # decode the single returned sequence
print(response)

In [None]:
import tqdm.notebook as tqdm

def assess_model_on_words(model, word_list, num_shots, verbose=True):
    """Measure few-shot spelling accuracy of ``model`` for every grade.

    For each ``(word, spelling)`` entry a few-shot prompt is built from the
    other entries of the same grade, the model completes it, and the first
    whitespace-delimited token of the completion is compared with the expected
    spelling.

    Uses the module-level ``tokenizer`` and ``create_few_shot_prompt``.

    Args:
        model: Model exposing ``generate`` (and ``device``), matching ``tokenizer``.
        word_list: Mapping ``grade -> list of (word, spelling)`` pairs.
        num_shots: Number of solved examples in each prompt.
        verbose: When true (default), print each word with the model's answer.

    Returns:
        Dict mapping each grade to the fraction of its words spelled correctly
        (``0.0`` for a grade with no words).
    """
    successes = {grade: 0 for grade in word_list}
    for grade, entries in word_list.items():
        print(f"Assessing Grade {grade}")
        for entry in tqdm.tqdm(entries):
            word, spelling = entry[0], entry[1]
            # Strip the prompt's trailing space: GPT-style BPE vocabularies attach the
            # space to the *following* token, so a prompt ending in ' ' leaves a lone
            # space token that tends to derail the completion.
            prompt = create_few_shot_prompt(word, entries, num_shots).rstrip()
            inputs = tokenizer(prompt, return_tensors='pt').to(model.device)  # input ids + attention mask
            prompt_length = inputs['input_ids'].shape[1]
            # Spelled-out words take roughly two tokens per letter, so a fixed budget of
            # 10 truncates longer answers; size the budget from the expected answer.
            token_budget = max(10, len(tokenizer.encode(' ' + spelling)) + 2)
            output = model.generate(
                **inputs,
                max_new_tokens=token_budget,
                num_return_sequences=1,
                pad_token_id=tokenizer.eos_token_id,  # no pad token is configured; avoids the per-call warning
            )
            # generate() returns prompt + continuation; score only the continuation,
            # otherwise the decoded text always starts with the prompt itself.
            completion = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
            pieces = completion.split()
            answer = pieces[0] if pieces else ''
            if verbose:
                print(f"{word}: expected {spelling}, got {answer!r}")
            # Exact match on the first token, so a longer wrong answer that merely
            # starts with the right letters does not count.
            if answer == spelling:
                successes[grade] += 1
    success_percentage = {
        grade: (successes[grade] / len(entries) if entries else 0.0)
        for grade, entries in word_list.items()
    }
    return success_percentage

# Run the 10-shot spelling evaluation over every grade in words_by_grade.
# Author's note: this eval fails - the model gives "????" or "____" as answers.
assess_model_on_words(model, words_by_grade, 10)