In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
# make deterministic: fix the random seed through mingpt's helper before anything else runs
# NOTE(review): which RNGs set_seed covers (python / numpy / torch) is not visible here — confirm in mingpt.utils
from mingpt.utils import set_seed
set_seed(44)

In [3]:
import os
import math
import time
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
from data.othello import Othello, OthelloBoardState, permit_reverse
from mingpt.dataset import CharDataset
from mingpt.utils import sample
from mingpt.model import GPT, GPTConfig
from mingpt.trainer import Trainer, TrainerConfig

## Training

In [4]:
# Load the games and wrap them as a character-level dataset.
# n_games=-1 means use as many simulated games as possible (from "data/othello_synthetic/")
othello = Othello(n_games=-1)
train_dataset = CharDataset(othello)

# Model size follows the original OthelloGPT params: n_layer=8, n_head=8, n_embd=512
mconf = GPTConfig(
    train_dataset.vocab_size,
    train_dataset.block_size,
    n_layer=8,
    n_head=8,
    n_embd=512,
)
model = GPT(mconf)

  0%|          | 0/50 [00:00<?, ?it/s]

Mem Used: 4.15 GB: 100%|██████████| 50/50 [00:15<00:00,  3.28it/s] 


Loaded 4999952 from 50 files. Now deduplicating...
Deduplicating finished with 4999389 games left
Using 3999512 for training, 999877 for validation
Dataset created has 3999512 sequences, 61 unique words.


In [6]:
# Put the model on the current CUDA device when one exists; otherwise just report it.
if not torch.cuda.is_available():
    print("NO GPU FOUND")
else:
    device = torch.cuda.current_device()
    model = model.to(device)

In [7]:
# Training setup: run length, checkpoint destination and trainer configuration.
max_epochs = 40
experiment_name = "bias80"
t_start = time.strftime("_%Y%m%d_%H%M%S")  # timestamp tag for this run, echoed at the end of the cell
ckpt_path = f"./ckpts/{experiment_name}_{t_start}.ckpt"

# Number of sequences times block size; warmup and decay horizons are multiples of this.
tokens_per_epoch = len(train_dataset) * train_dataset.block_size

tconf = TrainerConfig(
    max_epochs=max_epochs,
    batch_size=512 * 4,  # using 4 gpus
    learning_rate=5e-4,
    lr_decay=True,
    warmup_tokens=tokens_per_epoch * 5,
    final_tokens=tokens_per_epoch * max_epochs,
    num_workers=0,
    ckpt_path=ckpt_path,
    # saved_epochs=[0, 1, 2, 3, 5, 10, 15, 20],
)
# No test dataset is handed to the trainer (third argument is None).
trainer = Trainer(model, train_dataset, None, tconf)
device = trainer.device
print(t_start)

_20230704_154558


In [None]:
# Start training with the Trainer built in the setup cell.
trainer.train()

In [15]:
# loading model from ckpt
# ckpt = "grok/synth"
ckpt = "bias80_e40"
# map_location="cpu": a checkpoint written on a GPU machine would otherwise fail to
# deserialize when CUDA is unavailable. load_state_dict copies the values into the
# model's existing parameters, so this is also fine when the model already lives on GPU.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
load_res = model.load_state_dict(torch.load(f"./ckpts/{ckpt}.ckpt", map_location="cpu"))
if torch.cuda.is_available():
    device = torch.cuda.current_device()
    model = model.to(device)
else:
    # Keep `device` defined on CPU-only machines; later cells read it.
    device = "cpu"
    print("NO GPU FOUND")

## Validation

In [16]:
# checks if model prediction is legal for each node in given game
# expects the notebook-level `train_dataset` (for stoi / itos) to have been loaded already
def check_legal(model, game):
    """Count how many of the model's next-move predictions along `game` are legal.

    For every non-empty proper prefix of `game`, the model is asked for one
    additional token via `sample`, and the prefix plus that token is replayed
    on a fresh `OthelloBoardState`. A replay that raises counts as a failure.

    Args:
        model: network passed to `sample`; inputs are placed on the device
            that holds its parameters.
        game: sequence of moves, each of which must be a key of
            `train_dataset.stoi`.

    Returns:
        Tuple `(total_nodes, success_nodes)`: the number of prefixes tried
        and the number whose predicted continuation replayed without error.
    """
    total_nodes = 0
    success_nodes = 0

    # Follow the model instead of relying on a notebook-global `device`,
    # which may be stale or undefined depending on which cells were run.
    model_device = next(model.parameters()).device

    # Encode once and slice per prefix; the last move never appears in a
    # context, so it is deliberately left out of the encoding.
    encoded = [train_dataset.stoi[s] for s in game[:-1]]

    for len_partial_game in range(1, len(game)):
        total_nodes += 1
        x = torch.tensor(encoded[:len_partial_game], dtype=torch.long)[None, ...].to(model_device)
        y = sample(model, x, 1, temperature=1.0)
        # original note: taking top-1 prediction (relies on `sample`'s default arguments)
        completion = [train_dataset.itos[int(i)] for i in y[0] if i != -1]
        try:
            OthelloBoardState().update(completion)
        except Exception:
            # Any error raised while replaying is treated as an illegal prediction.
            pass
        else:
            success_nodes += 1

    return total_nodes, success_nodes

In [17]:
# default data root is othello_synthetic
def validate_with_dataset(model, data_root=None, n_games=1000):
    """Report the legal-move rate of `model` over games loaded from `data_root`.

    Loads games with `Othello(..., test_split=1)`, runs `check_legal` on each
    game taken from `v_games.val`, shows a running pass rate on the progress
    bar and prints the final figure.

    Args:
        model: model forwarded to `check_legal`.
        data_root: forwarded unchanged to `Othello`.
        n_games: number of games to load and check. A negative value (or
            None) checks every loaded validation game.

    Returns:
        None. The summary is printed.
    """
    # It is fine to load just the first n games, because the first ~1 million
    # othello_synthetic games are the test set for the unbiased model.
    v_games = Othello(data_root=data_root, n_games=n_games, test_split=1)

    total_nodes = 0
    success_nodes = 0

    def progress_report():
        # With zero nodes counted (no games, or only games shorter than two moves)
        # the ratio is undefined; report 0.00% instead of raising ZeroDivisionError.
        pass_rate = success_nodes / total_nodes * 100 if total_nodes else 0.0
        return f"{pass_rate:.2f}% pass rate: {success_nodes}/{total_nodes} among all searched nodes"

    # A negative n_games means "all games"; slicing with it would silently drop
    # the last game (val[:-1]), so only slice for a real non-negative count.
    if n_games is None or n_games < 0:
        games = v_games.val
    else:
        games = v_games.val[:n_games]

    bar = tqdm(games)
    for game in bar:
        tn, sn = check_legal(model, game)
        total_nodes += tn
        success_nodes += sn
        bar.set_description(progress_report())
    print(progress_report())

In [22]:
# Check move legality on 50 games loaded from the "othello_synthetic" data root.
validate_with_dataset(model, data_root="othello_synthetic", n_games=50)

Mem Used: 7.166 GB:   0%|          | 0/50 [00:00<?, ?it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Loaded 99999 from 1 files. Now deduplicating...
Deduplicating finished with 50 games left
Using 0 for training, 50 for validation


87.08% pass rate: 2569/2950 among all searched nodes: 100%|██████████| 50/50 [00:20<00:00,  2.41it/s]

87.08% pass rate: 2569/2950 among all searched nodes





In [None]:
# Replay a hard-coded partial game on a fresh board.
# NOTE(review): prt=True presumably prints the board while moves are applied — confirm in data.othello
partial_game = [19, 34, 41, 11, 10, 9, 1, 20, 3, 2, 8, 0, 13, 4, 29, 12, 5, 6, 14, 15, 21, 37, 22, 33, 7, 26, 18, 16, 25, 17, 42, 23, 30, 24, 31, 32, 46, 38]
OthelloBoardState().update(partial_game, prt=True)

### Probe analysis

In [1]:
# Summarize probe loss: one error rate (in percent) per layer, printed on a single line.

import json

root = "ckpts/grok/probes/state_tl256_random"
errs = []
for layer in range(1, 9):
    with open(f"{root}/layer{layer}/tensorboard.txt", "r") as fh:
        log = json.load(fh)
    # The error is taken from the last entry of the 'test_acc_cont' series.
    errs.append(100 * (1 - log['test_acc_cont'][-1]))

print(", ".join(map(str, errs)))

25.944670627934276, 25.998496185446008, 26.326511150234744, 26.59829812206572, 26.910798122065728, 26.979478433098592, 27.150491490610328, 27.355670481220663
