In [1]:
import math
import random
import yaml
import argparse
from dotmap import DotMap

import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import Adam
from torch.nn.functional import cosine_similarity

import matplotlib.pyplot as plt
import wandb

In [2]:
import sys
# Project-local modules are imported from ./src. The path is relative, so it
# is resolved against the kernel's current working directory.
sys.path.append("./src")  # make sure Python can find src/
import data
from model_linear import GPTLinear
from model_softmax import GPTSoftmax
# Only one `train_step` can be bound at a time. The single-task variant is
# disabled below, so every later cell uses the multi-task implementation.
# NOTE(review): a later cell calls train_step(data_sampler=...) while another
# calls train_step(data_samplers=...); confirm the active import accepts both.
# from train_step import train_step
from multi_task_train import train_step

In [None]:
# Config
#
# Central experiment configuration shared by every run in this notebook.
# The nested dict is wrapped in a DotMap so entries can be read with attribute
# access (e.g. `config.train.lr`).

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

config = DotMap({
    # Architecture settings, passed to the model constructor as `config.model`.
    'model': {
        'n_layer': 1,
        'n_head': 1,
        'n_embd': 256,
        'linear': True,
    },

    # Settings for the synthetic data samplers.
    'data': {
        'name': 'window',
        'min_num': 1,
        'max_num': 16,
        'k': 2,
        'p': 17,
        'sep': 17,
        'cot': False,
        'num_tokens': 16,
        'n_train': 256,
        'n_test': 64,
        'fixed_len': True,
    },

    # Optimisation and logging settings.
    'train': {
        'lr': 0.0001,
        'grad_clip': -1,  # presumably "no clipping" -- TODO confirm in train_step
        'num_steps': 500,
        'norm_type': "none_rank",
        'wandb': True,
        'save_ckpt': False,
        'ckpt_freq': 20,
        # Was written as `'seed' = 67`, which is a SyntaxError inside a dict
        # literal and prevented this cell from running at all.
        # NOTE(review): nothing in this cell applies the seed to any RNG;
        # confirm that the training code consumes `config.train.seed`.
        'seed': 67,
    },
})

# Derived model sizes.
# vocab_size is one more than the larger of `p` and `max_num`, so integer ids
# 0..max(p, max_num) all fit in the vocabulary.
config.model.vocab_size = max(config.data.p, config.data.max_num) + 1
# block_size is twice the sequence length plus one; presumably input tokens,
# one separator, and output tokens -- TODO confirm against the data sampler.
config.model.block_size = 2 * config.data.num_tokens + 1

### Moving Window Product (MWP) Test

In [4]:
# Single-task run: train a GPTLinear model on the moving-window-product task
# and, when enabled in the config, log the run to Weights & Biases.
data_sampler = data.MovingWindowProduct(
    min_num=config.data.min_num,
    max_num=config.data.max_num,
    k=config.data.k,
    p=config.data.p,
)

model = GPTLinear(config.model, return_att=True).to(device)
optim = Adam(model.parameters(), lr=config.train.lr)

if config.train.wandb:
    wandb_run_name = 'mwp_linear'
    # Do not pass an API key literal here: wandb.login() picks up the
    # WANDB_API_KEY environment variable or previously stored credentials.
    wandb.login()
    wandb.init(project="loss_plateau_tf", name=wandb_run_name, config=config)
    wandb.watch(model)

# NOTE(review): this cell passes `data_sampler=` (singular), while the
# `train_step` imported at the top of the notebook comes from
# `multi_task_train` and is called with `data_samplers=` further down.
# Confirm the imported function accepts this keyword on a fresh kernel.
try:
    for step in range(config.train.num_steps):
        train_step(
            model=model,
            optim=optim,
            data_sampler=data_sampler,
            step=step,
            config=config,
            device=device
        )
finally:
    # Close the wandb run even if training raises, so a failed cell does not
    # leave a dangling run behind for the next cell to inherit.
    if config.train.wandb:
        wandb.finish()

  from .autonotebook import tqdm as notebook_tqdm
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjetyue04[0m ([33mwth_ucsd[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step 0 -- Train loss: 2.8693161010742188, Train Acc: 0.068603515625 Test Acc: 0.060546875
Step 1 -- Train loss: 2.8518381118774414, Train Acc: 0.069091796875 Test Acc: 0.05078125
Step 2 -- Train loss: 2.838197946548462, Train Acc: 0.075439453125 Test Acc: 0.0732421875
Step 3 -- Train loss: 2.8244495391845703, Train Acc: 0.0673828125 Test Acc: 0.0703125
Step 4 -- Train loss: 2.8101773262023926, Train Acc: 0.06640625 Test Acc: 0.072265625
Step 5 -- Train loss: 2.803687572479248, Train Acc: 0.065673828125 Test Acc: 0.0849609375
Step 6 -- Train loss: 2.7909626960754395, Train Acc: 0.07861328125 Test Acc: 0.0751953125
Step 7 -- Train loss: 2.786344528198242, Train Acc: 0.079833984375 Test Acc: 0.076171875
Step 8 -- Train loss: 2.7790911197662354, Train Acc: 0.078857421875 Test Acc: 0.091796875
Step 9 -- Train loss: 2.773648500442505, Train Acc: 0.09521484375 Test Acc: 0.0869140625
Step 10 -- Train loss: 2.767432689666748, Train Acc: 0.093994140625 Test Acc: 0.0888671875
Step 11 -- Train los



0,1
att_prog_measure,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▄▆▇▇████████
data_repeat_frac,▄▃▂▂▆▅▆▅▁▆▁▅▁▂▂▄▂▄▆▃▇▃▅▅▄▅▁▄▅▆▁▆▅█▆▄▄▄█▇
idx0_check,▁███████████████████████████████████████
idx10_check,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃███████████
idx11_check,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃███████████
idx12_check,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▇██████████
idx13_check,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂███████████
idx14_check,▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃███████████
idx15_check,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁███████████
idx1_check,▁▁▁▁▁▁▁▁▁▂▂▂▂▁▂▂▂▂▂▂▂▂▃▃▄▅▅▇████████████

0,1
att_prog_measure,0.83799
data_repeat_frac,0.06563
idx0_check,1.0
idx10_check,1.0
idx11_check,1.0
idx12_check,1.0
idx13_check,1.0
idx14_check,1.0
idx15_check,1.0
idx1_check,1.0


### Mixing Moving Window Product (MWP) and Moving Window Sum (MWS)

In [12]:
# Multi-task run: train one GPTLinear model on both the moving-window-sum and
# moving-window-product tasks, with the embedding tables frozen.
# Each sampler is given its own `sep` value (17 for the sum task, 0 for the
# product task); presumably this is the token that identifies the task --
# TODO confirm in src/data.
data_samplers = {
    'mws': data.MovingWindowSum(
        min_num=config.data.min_num,
        max_num=config.data.max_num,
        k=config.data.k,
        p=config.data.p,
        sep=17,
    ),
    'mwp': data.MovingWindowProduct(
        min_num=config.data.min_num,
        max_num=config.data.max_num,
        k=config.data.k,
        p=config.data.p,
        sep=0,
    ),
}

config.model.n_head = 1
model = GPTLinear(config.model, return_att=True).to(device)

## Freeze embedding layer weights
# Both `transformer.wte` and `transformer.wpe` are excluded from gradient
# updates.
for frozen_module in (model.transformer.wte, model.transformer.wpe):
    for param in frozen_module.parameters():
        param.requires_grad = False

# Hand the optimizer only the parameters that are still trainable.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optim = Adam(trainable_params, lr=config.train.lr)


if config.train.wandb:
    wandb_run_name = 'mws_mwp_linear_frozen_embedding'
    # Do not pass an API key literal here: wandb.login() picks up the
    # WANDB_API_KEY environment variable or previously stored credentials.
    wandb.login()
    wandb.init(project="loss_plateau_tf", name=wandb_run_name, config=config)
    wandb.watch(model)

try:
    for step in range(config.train.num_steps):
        train_step(
            model=model,
            optim=optim,
            data_samplers=data_samplers,
            step=step,
            config=config,
            device=device
        )
finally:
    # Close the wandb run even if training raises, so a failed cell does not
    # leave a dangling run behind for the next cell to inherit.
    if config.train.wandb:
        wandb.finish()

Step 0 -- Train loss: 2.8831145763397217, Train Acc: 0.0693359375 Test Acc: 0.0546875
Step 1 -- Train loss: 2.877596855163574, Train Acc: 0.06494140625 Test Acc: 0.0556640625
Step 2 -- Train loss: 2.8668479919433594, Train Acc: 0.06787109375 Test Acc: 0.0576171875
Step 3 -- Train loss: 2.859556198120117, Train Acc: 0.058349609375 Test Acc: 0.0595703125
Step 4 -- Train loss: 2.8550333976745605, Train Acc: 0.067626953125 Test Acc: 0.0556640625
Step 5 -- Train loss: 2.849236249923706, Train Acc: 0.071533203125 Test Acc: 0.0615234375
Step 6 -- Train loss: 2.8438851833343506, Train Acc: 0.066650390625 Test Acc: 0.0546875
Step 7 -- Train loss: 2.844261884689331, Train Acc: 0.06201171875 Test Acc: 0.072265625
Step 8 -- Train loss: 2.8383939266204834, Train Acc: 0.072021484375 Test Acc: 0.0595703125
Step 9 -- Train loss: 2.8402185440063477, Train Acc: 0.05810546875 Test Acc: 0.0537109375
Step 10 -- Train loss: 2.8312153816223145, Train Acc: 0.066650390625 Test Acc: 0.0625
Step 11 -- Train loss



0,1
att_prog_measure,▁▁▁▁▁▂▂▂▂▃▃▃▄▅▆▇████████████████████████
data_repeat_frac,▇▃▇▁▅▄▃▄▆▇▆▃▇██▆▄▆▅▄▇▄▅▃▂▄▆▇▂▂▃▆▅▄▄▄▅▄▅▄
idx0_check,▁▃██████████████████████████████████████
idx10_check,▁▁▁▁▁▁▁▁▁▁▁▁▁▂▄▅▅▆▆▆▇▇▇▇████████████████
idx11_check,▁▁▁▁▁▁▁▁▁▁▁▂▁▁▄▅▅▅▆▆▆▇▇▇▇███████████████
idx12_check,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▅▅▅▆▆▇▇▇▇▇███████████████
idx13_check,▁▁▁▁▁▁▁▁▁▁▁▁▁▂▃▅▅▆▆▆▆▇▇▇▇███████████████
idx14_check,▁▁▁▁▁▁▁▁▁▁▁▁▁▂▄▅▅▆▆▆▆▇▇▇▇███████████████
idx15_check,▁▁▁▁▁▁▁▁▁▁▁▁▁▂▃▅▅▆▆▆▆▇▇▇▇███████████████
idx1_check,▁▁▁▁▁▁▁▁▁▁▁▁▁▂▄▅▅▅▅▆▆▇▇▇▇███████████████

0,1
att_prog_measure,0.73029
data_repeat_frac,0.05
idx0_check,1.0
idx10_check,1.0
idx11_check,1.0
idx12_check,1.0
idx13_check,1.0
idx14_check,1.0
idx15_check,1.0
idx1_check,1.0
