In [1]:
def get_spelling(word):
    """Return the letters of ``word`` upper-cased and separated by hyphens, e.g. 'able' -> 'A-B-L-E'."""
    # Upper-case one character at a time, then hyphenate.
    return '-'.join(map(str.upper, word))

# Get the words and their correct spelling by grade from 1-5.
# Result: words_by_grade maps a 1-based grade number (order of the "GRADE" headers
# in the file) to a list of (word, hyphenated spelling) tuples.
with open("GradeSpellingEval.txt", "r") as file:
    # Drop newlines and any '*' markers from every line.
    lines = [raw.replace('\n', '').replace('*', '') for raw in file.readlines()]

# Indices of the "GRADE" header lines (the very last line is never considered a
# header), followed by len(lines) as an end sentinel for the final section.
grade_indices = [pos for pos, text in enumerate(lines[:-1]) if text.startswith("GRADE")]
grade_indices.append(len(lines))

words_by_grade = {}
for grade, (header_idx, next_header_idx) in enumerate(zip(grade_indices, grade_indices[1:]), start=1):
    # Skip the header and the line after it, and stop one line short of the next
    # header. NOTE(review): presumably those skipped lines are blank separators --
    # confirm against the data file.
    section = (text.strip() for text in lines[header_idx + 2:next_header_idx - 1])
    words_by_grade[grade] = [(entry, get_spelling(entry)) for entry in section]

words_by_grade[3][:10]

[('able', 'A-B-L-E'),
 ('above', 'A-B-O-V-E'),
 ('afraid', 'A-F-R-A-I-D'),
 ('afternoon', 'A-F-T-E-R-N-O-O-N'),
 ('again', 'A-G-A-I-N'),
 ('age', 'A-G-E'),
 ('air', 'A-I-R'),
 ('airplane', 'A-I-R-P-L-A-N-E'),
 ('almost', 'A-L-M-O-S-T'),
 ('alone', 'A-L-O-N-E')]

In [None]:
import random

def create_few_shot_prompt(word, word_list, num_shots):
    """Build a few-shot spelling prompt that ends with the question for ``word``.

    Args:
        word: The word the model is asked to spell (a plain string).
        word_list: Sequence of ``(word, spelling)`` tuples to draw examples from.
            It is never modified.
        num_shots: Number of solved examples to place before the final question.

    Returns:
        The prompt string; its last line is ``Q: How do you spell '<word>'? A:``.

    Raises:
        ValueError: If ``num_shots`` exceeds the number of candidate examples
            (raised by ``random.sample``).
    """
    prompt = ''
    if num_shots > 0:
        # Ensure we don't sample the same word we want to spell, giving the model the answer.
        # Entries are (word, spelling) tuples, so compare on the word field, and build a
        # filtered copy instead of removing from the caller's list.
        candidates = [entry for entry in word_list if entry[0] != word]
        for sample in random.sample(candidates, num_shots):
            prompt += f"Q: How do you spell '{sample[0]}'? A: {sample[1]}\n\n"
    prompt += f"Q: How do you spell '{word}'? A:"
    return prompt

# Preview a 2-shot prompt for 'able' with examples drawn from the grade-3 list.
create_few_shot_prompt('able', words_by_grade[3], 2)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Tokenizer and model weights are loaded from the same pretrained checkpoint name.
MODEL_NAME = "EleutherAI/gpt-j-6B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

In [None]:
import torch

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
model.to(device)

# Reuse the EOS token for padding and pad on the left, so each prompt's text sits
# at the end of its padded row when prompts are batched.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'
model.config.pad_token_id = tokenizer.eos_token_id # Prevent lots of info messages telling us it's doing this every prompt.

# Smoke test: generate a short continuation to confirm the model runs on `device`.
prompt = "Once upon a time"
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
output = model.generate(input_ids, max_length=50, num_return_sequences=1)
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)

In [None]:
import tqdm.notebook as tqdm

# Give the model a few-shot spelling problem for every word and record its answer.
# The last line of each prompt is "Q: How do you spell 'example'? A:".
# NOTE(review): an earlier version of this comment said the first letter of the answer
# is appended to the prompt ("A: E-") because the model otherwise answers ???? or ____.
# The prompt built by create_few_shot_prompt does not do that -- confirm which is intended.
def assess_model_on_words(model, word_list, num_shots, batch_size=10):
    """Ask ``model`` to spell every word in ``word_list`` and collect its answers.

    Uses the notebook-level ``tokenizer`` and ``device`` objects.

    Args:
        model: Causal language model exposing ``generate``.
        word_list: Dict mapping grade -> list of ``(word, spelling)`` tuples.
        num_shots: Number of solved examples placed before each question.
        batch_size: Number of prompts generated per ``model.generate`` call.

    Returns:
        Dict mapping grade -> list of ``((word, spelling), response)`` tuples, where
        ``response`` is the text the model produced after the final ``A:``.
    """
    data = {grade: [] for grade in word_list.keys()}

    for grade in word_list:
        print(f"Assessing Grade {grade}")
        grade_words = word_list[grade]
        num_batches = (len(grade_words) + batch_size - 1) // batch_size  # ceiling division

        for batch_idx in tqdm.tqdm(range(num_batches)):
            start_index = batch_idx * batch_size
            # Slicing clamps at the end of the list, so the last batch may be shorter.
            words = grade_words[start_index:start_index + batch_size]

            prompts = [create_few_shot_prompt(word[0], grade_words, num_shots) for word in words]
            inputs = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt").to(device)

            # Token budget: twice the longest word in the batch (one letter plus one hyphen each).
            max_new_tokens = 2 * max(len(word[0]) for word in words)
            outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, num_return_sequences=1)
            responses = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

            # Pair each word with its own response; no index variable, so the batch
            # counter above is not shadowed.
            for entry, response in zip(words, responses):
                # The decoded text starts with the prompt: blank-line-separated chunk
                # number `num_shots` is the final question plus the model's answer.
                answer = response.split('\n\n')[num_shots]
                answer = answer.split('A:')[1].strip()
                data[grade].append((entry, answer))

    return data

# Evaluate every grade once per few-shot setting; shot_data[shot][grade] holds the
# ((word, spelling), response) tuples returned by assess_model_on_words.
shots = [0, 1, 2, 5, 10]
shot_data = {shot: {} for shot in shots}
for shot in shots:
    noun = 'shot' if shot == 1 else 'shots'
    print(f"Checking {shot} {noun}") # Was the grammar necessary? No, but it bugged me.
    shot_data[shot] = assess_model_on_words(model, words_by_grade, shot, batch_size=10)

In [None]:
# Check how closely model's spelling ability matches grade.
# shot_successes[shot][grade] is the fraction of words (rounded to 3 places) whose
# upper-cased response starts with the correct hyphenated spelling.
shot_successes = {shot: {} for shot in shots}
for shot in shots:
    successes = {grade: 0 for grade in words_by_grade.keys()}
    for grade, results in shot_data[shot].items():
        if not results:
            continue  # No words evaluated for this grade: keep 0 rather than dividing by zero.
        # Each result is ((word, spelling), response).
        correct = sum(response.upper().startswith(entry[1]) for entry, response in results)
        # Divide by this grade's own result count (the old `data` name only exists
        # inside assess_model_on_words and raised NameError here).
        successes[grade] = round(correct / len(results), 3)
    shot_successes[shot] = successes
print(shot_successes)

In [None]:
import plotly.graph_objects as go
import plotly.io as pio

# Render Plotly figures with the "notebook" renderer.
pio.renderers.default = "notebook"

def basic_bar_graph(data):
    """Show a bar chart of ``data`` (x = keys, y = values) with the y-axis fixed to [0, 1]."""
    bars = go.Bar(x=list(data), y=list(data.values()))
    fig = go.Figure(bars)
    fig.update_layout(yaxis={'range': [0, 1]})
    fig.show()

# Per-grade accuracy for the 2-shot run.
basic_bar_graph(shot_successes[2])

In [None]:
# Check how closely model's spelling ability matches length of the word.
# lengths[shot][n] counts, for words of n letters, how many were asked ('total') and
# how many responses equal the correct spelling exactly ('success').
# NOTE(review): this uses exact equality, whereas the per-grade accuracy cell uses
# response.upper().startswith(spelling) -- confirm which criterion is intended.
lengths = {shot: {i: {'success': 0, 'total': 0} for i in range(1, 13)} for shot in shots}

for shot in lengths:
    for results in shot_data[shot].values():
        # Each result is ((word, spelling), response).
        for (word, spelling), response in results:
            # Lengths 1-12 are pre-seeded so they always appear on the chart; any other
            # length gets a bucket on first use instead of raising KeyError.
            bucket = lengths[shot].setdefault(len(word), {'success': 0, 'total': 0})
            bucket['total'] += 1
            if response == spelling:
                bucket['success'] += 1

def success_rate_bar_graph(data):
    """Show success rate per key as a bar chart, labelling each bar with its sample size.

    ``data`` maps a key to a dict with 'success' and 'total' counts; keys with a
    total of 0 are drawn with a rate of 0.
    """
    keys = list(data.keys())
    rates = [val['success'] / val['total'] if val['total'] > 0 else 0 for val in data.values()]
    labels = [f"n={val['total']}" for val in data.values()]  # Sample sizes as text

    bars = go.Bar(
        x=keys,
        y=rates,
        text=labels,
        textposition='auto',  # Let Plotly choose where the label goes
    )
    fig = go.Figure(bars)
    fig.update_layout(yaxis={'range': [0, 1]})
    fig.show()

# Success rate by word length for the 2-shot run.
success_rate_bar_graph(lengths[2])

In [None]:
import numpy as np

def nested_bar_graph(data, title='Accuracy By Grade And Few-Shots (No Letters Given)'):
    """Show one group of bars per outer key of ``data``, one bar per inner key.

    Args:
        data: Dict of dicts, ``{shot: {grade: value}}``. The bar count per group is
            taken from the first inner dict.
        title: Figure title; defaults to the title this notebook has always used.
    """
    fig = go.Figure()
    green_shades = ['#a1d99b', '#74c476', '#31a354', '#006d2c', '#024736']
    # The number of keys in the inner dictionaries determines the number of bars in each group.
    # An empty `data` yields an empty figure instead of raising StopIteration.
    num_inner_keys = len(next(iter(data.values()), {}))
    bar_width = 0.15
    gap = 0.02  # Gap between neighbouring bars inside a group

    # One x-axis position (0, 1, 2, ...) per group.
    group_positions = np.arange(len(data), dtype=float)

    # Add one single-bar trace per (outer key, inner key) pair.
    for i, (key, values) in enumerate(data.items()):
        for j, (sub_key, value) in enumerate(values.items()):
            # Spread the bars symmetrically around the group's tick: the middle bar
            # (or the middle gap, for an even count) sits exactly on the tick.
            offset = (j - (num_inner_keys - 1) / 2) * (bar_width + gap)
            fig.add_trace(go.Bar(
                x=[group_positions[i] + offset],
                y=[value],
                width=bar_width,
                name=f'{key}-shot Grade {sub_key}',
                marker_color=green_shades[j % len(green_shades)]
            ))

    # Label each group with its outer key.
    fig.update_layout(
        barmode='group',
        title=title,
        xaxis=dict(
            title='Number of Few Shots',
            tickmode='array',
            tickvals=group_positions,
            ticktext=list(data.keys())
        ),
        yaxis=dict(title='Values'),
        legend=dict(title='Legend'),
        showlegend=True
    )

    # Show the figure
    fig.show()

nested_bar_graph(shot_successes)

In [None]:
# Check to see how often the model gets the first letter right.
# first_successes[shot][grade] is the fraction of words (rounded to 3 places) whose
# upper-cased response starts with the first letter of the correct spelling.
first_successes = {shot: {} for shot in shots}
for shot in shots:
    successes = {grade: 0 for grade in words_by_grade.keys()}
    # Iterate over the grades that were actually evaluated, not a fixed 1-5 range.
    for grade, results in shot_data[shot].items():
        # Each result is ((word, spelling), response); spelling[:1] is the expected first letter.
        success_list = [response.upper().startswith(entry[1][:1]) for entry, response in results]
        if shot > 1: # When does the model get the first letter wrong with 2-shot or higher?
            for result, matched in zip(results, success_list):
                if not matched:
                    print(result)
        if success_list:  # Keep 0 for an empty grade rather than dividing by zero.
            successes[grade] = round(success_list.count(True) / len(success_list), 3)
    first_successes[shot] = successes
print(first_successes)