In [17]:
import torch

from minicons import scorer
from torch import optim

from tqdm import trange, tqdm

from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, get_constant_schedule, set_seed
from experiment import Learner
import utils

In [2]:
# Seed the RNGs via transformers.set_seed so that stochastic steps in this
# notebook (e.g. sampling an initial embedding for a new token) are repeatable.
set_seed(42)

In [3]:
# Build the project-local Learner (from `experiment`) for this checkpoint id.
# NOTE(review): Learner's internals are not visible here — presumably it wraps
# the pretrained LM and its tokenizer; confirm in experiment.py.
learner = Learner("kanishka/smolm-autoreg-bpe-seed_111")

In [4]:
# NOTE(review): defined in experiment.Learner and called with no arguments —
# presumably registers the novel token(s) with the tokenizer/model, much like
# the manual steps further down in this notebook; confirm against the class.
learner.add_tokens()

In [7]:
# A sample batch of token ids containing the id 8192.
indices = [[1, 180, 8192, 51, 353, 70, 141, 38]]

# Show the batch with every occurrence of id 8192 replaced by 8193; all other
# ids pass through untouched and `indices` itself is not modified.
[[8193 if token_id == 8192 else token_id for token_id in row] for row in indices]

[[1, 180, 8193, 51, 353, 70, 141, 38]]

In [15]:
# Score a full sentence with the Learner; the cell output is a one-element list
# holding a single float for the sentence.
# NOTE(review): whether this is a summed or averaged log-probability is decided
# inside experiment.Learner — confirm there.
learner.sequence_score("she gave me the ball .")

[-3.4966742992401123]

In [3]:
# Load the same checkpoint through minicons' IncrementalLMScorer; `lm.model` and
# `lm.tokenizer` are used directly in the cells below.
lm = scorer.IncrementalLMScorer("kanishka/smolm-autoreg-bpe-seed_111")

In [8]:
# Per-token scores for one sentence: the output pairs each token with a float,
# and the leading '<s>' is reported as 0.0.
lm.token_score("she gave the ball to me .")

[[('<s>', 0.0),
  ('she', -4.799075126647949),
  ('gave', -6.241611480712891),
  ('the', -2.643448829650879),
  ('ball', -3.6407971382141113),
  ('to', -0.5346994400024414),
  ('me', -2.1648502349853516),
  ('.', -0.6528644561767578)]]

In [9]:
# Inspect the model's input-embedding matrix (one row per vocabulary id).
# get_input_embeddings() hands back the same nn.Embedding module that a
# no-argument resize_token_embeddings() call returns.
lm.model.get_input_embeddings().weight

Parameter containing:
tensor([[ 0.0782,  0.0603, -0.0287,  ...,  0.0975, -0.0735, -0.0400],
        [-0.0611, -0.0391, -0.1022,  ...,  0.0748, -0.0744,  0.0567],
        [ 0.0663,  0.0578, -0.0290,  ...,  0.1053, -0.0621, -0.0745],
        ...,
        [-0.0048,  0.2357,  0.0625,  ...,  0.1035, -0.0358,  0.0389],
        [-0.0794, -0.0094, -0.0807,  ...,  0.0767,  0.0060, -0.0285],
        [ 0.1411, -0.0778,  0.0027,  ...,  0.1294,  0.0140,  0.0097]],
       requires_grad=True)

In [10]:
# Register the novel token with the tokenizer and grow the embedding table by
# one row per added token.
added_tokens = [" [verb]"]
length = lm.model.get_input_embeddings().weight.size(0)  # rows before resizing
lm.tokenizer.add_tokens(added_tokens)

# Size the table from the model's own row count rather than from the tokenizer:
# this avoids including <|endoftext|>, which is present in the tokenizer.
new_length = len(added_tokens) + length
lm.model.resize_token_embeddings(new_length)

Embedding(8193, 256)

In [11]:
# Initialise the embedding row(s) of the newly added token(s) by sampling from a
# Gaussian fitted to the pre-existing rows: the mean is the average existing
# embedding and the covariance is the existing rows' covariance scaled by 1e-5,
# so each new row starts very close to the "average" token.
index = new_length - len(added_tokens)  # first row that belongs to a new token
embeddings_weight = lm.model.get_input_embeddings().weight

# no_grad() permits the in-place write into a leaf parameter and keeps autograd
# from tracking these statistics. Unlike switching requires_grad off and on by
# hand, it cannot leave the parameter frozen if something in this cell raises.
with torch.no_grad():
    existing = embeddings_weight[:index]
    mu = existing.mean(0)
    n = index  # number of rows the statistics are computed over (== length)
    centered = existing - mu  # computed once and reused for both factors
    sigma = (centered.T @ centered) / n
    dist = torch.distributions.multivariate_normal.MultivariateNormal(
        mu, covariance_matrix=1e-5 * sigma)

    # One independent draw per added token, stacked into (num_added, dim).
    embeddings_weight[index:] = torch.stack(
        [dist.sample() for _ in range(len(added_tokens))], dim=0)

# The embedding matrix must be trainable for the cells that follow.
embeddings_weight.requires_grad = True

In [12]:
# Display the matrix again: the last row shown is the freshly sampled embedding.
embeddings_weight

Parameter containing:
tensor([[ 0.0782,  0.0603, -0.0287,  ...,  0.0975, -0.0735, -0.0400],
        [-0.0611, -0.0391, -0.1022,  ...,  0.0748, -0.0744,  0.0567],
        [ 0.0663,  0.0578, -0.0290,  ...,  0.1053, -0.0621, -0.0745],
        ...,
        [-0.0794, -0.0094, -0.0807,  ...,  0.0767,  0.0060, -0.0285],
        [ 0.1411, -0.0778,  0.0027,  ...,  0.1294,  0.0140,  0.0097],
        [ 0.0350,  0.0372, -0.0193,  ...,  0.0344, -0.0282, -0.0269]],
       requires_grad=True)

In [13]:
# The embedding module as reached through the model's attribute path; its repr
# shows the enlarged table (8193 rows, 256 dimensions).
lm.model.model.decoder.embed_tokens

Embedding(8193, 256)

In [14]:
# Only the token-embedding matrix stays trainable; every other parameter of the
# model is frozen.
target_params = ['model.decoder.embed_tokens.weight']

for name, tensor in lm.model.named_parameters():
    if name in target_params:
        continue  # leave the listed parameters as they are
    tensor.requires_grad = False

In [15]:
# Sanity check: the parameters that still require gradients are exactly the
# ones named in target_params (same names, same order).
trainable = [name for name, tensor in lm.model.named_parameters() if tensor.requires_grad]
assert trainable == target_params

In [16]:
# Tokenise a padded two-sentence batch to see which id " [verb]" receives.
# In the output it is 8193, while the embedding table has 8193 rows (valid
# ids 0..8192), so the raw id is one past the end of the table.
lm.tokenizer(["she [verb] the ball to him", "they [verb] the big beautiful ball to the cat."], padding=True)

{'input_ids': [[1, 180, 8193, 51, 353, 70, 277, 0, 0, 0, 0], [1, 174, 8193, 51, 253, 1428, 353, 70, 51, 390, 7]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [22]:
# Tokenise a padded batch as PyTorch tensors.
encoded = lm.tokenizer(
    ["she [verb] the ball to him", "they [verb] the big beautiful ball to the cat."],
    padding=True,
    return_tensors="pt",
)

# The tokenizer numbers added tokens one higher than their rows in the embedding
# table (the table was sized without the tokenizer's extra <|endoftext|> entry),
# so every id above `length` is shifted down by one. Comparing against `length`
# rather than testing `== new_length` keeps this correct when more than one
# token has been added, and matches the rule used for single sentences below.
# For the single added token here the result is unchanged.
input_ids = encoded['input_ids']
encoded['input_ids'] = torch.where(input_ids > length, input_ids - 1, input_ids)

In [51]:
# Encode one sentence with minicons, then shift every id above `length` down by
# one so that added-token ids land on their rows in the embedding table.
encoded, offset = lm.prepare_text("she [verb] the ball to him .")
raw_ids = torch.as_tensor(encoded.input_ids)
encoded['input_ids'] = torch.where(raw_ids > length, raw_ids - 1, raw_ids)

In [52]:
# Score the remapped encoding; the output is a list with one tensor of
# per-token values for the sentence.
lm.compute_stats((encoded, offset), return_tensors=True)

[tensor([ -4.7991, -14.9654,  -6.6724,  -6.5123,  -4.8335,  -5.2774,  -0.5907])]

In [16]:
from torch.utils.data import DataLoader

In [18]:
# Load the adaptation items from a JSON-lines file using the project helper.
# NOTE(review): the record schema is not visible here; a later cell indexes a
# batch with the key 'pp' — confirm the remaining fields in utils.read_jsonl.
adaptation = utils.read_jsonl("../data/experiments/adaptation.jsonl")

In [20]:
# Batch the items in their original order (no shuffling), at most 32 per batch.
dl = DataLoader(adaptation, batch_size=32, shuffle=False)

In [22]:
# Exhaust the loader without doing any work; afterwards `batch` still refers to
# the final (possibly smaller) batch, which the next cell inspects.
for batch in dl:
    pass

In [28]:
# Size of the last batch's 'pp' field: 15 here, i.e. fewer than batch_size=32.
len(batch['pp'])

15

In [39]:
# Scratch check of how paired results accumulate: zip two equal-length lists
# into tuples and append them all to `results`.
x = [1, 2, 3, 4]
y = [5, 6, 7, 8]

results = []
results.extend(zip(x, y))

In [41]:
# Append the same pairs a second time to confirm that extending accumulates
# rather than overwrites.
results += zip(x, y)

In [42]:
# Display the accumulated pairs: the four (x, y) tuples appear twice.
results

[(1, 5), (2, 6), (3, 7), (4, 8), (1, 5), (2, 6), (3, 7), (4, 8)]

In [43]:
import numpy as np

In [46]:
# Rebind `x` to the integers 0..18 for the argmax check in the next cell.
x = [*range(19)]

In [48]:
# argmax returns the index of the largest element; for the increasing sequence
# 0..18 that is the last index, 18.
np.argmax(x)

18