In [1]:
import csv
import json
import torch
import utils

import numpy as np

from collections import defaultdict, Counter
from minicons import scorer
from torch import optim
from torch.utils.data import DataLoader
from tqdm import tqdm, trange
from transformers import (
    AdamW,
    get_constant_schedule,
    set_seed,
)

from experiment import Learner, Trainer



In [9]:
# Scratch check: order (key, (score, payload)) entries from the highest score
# to the lowest, where the score is the first element of each value tuple.
x = dict(
    [
        ((1, 2), (0.2, {})),
        ((3, 4), (0.7, {})),
    ]
)

sorted(x.items(), key=lambda pair: pair[1][0], reverse=True)

[((3, 4), (0.7, {})), ((1, 2), (0.2, {}))]

In [2]:
# set_seed(1024)

# Load the two evaluation resources that are handed to every Trainer below.
# Both paths are relative, so this cell depends on the working directory.
# NOTE(review): Trainer.validate in this notebook reads "good"/"bad" keys from
# `validation`, and Trainer.generalization_step reads "sentence"/"dative" from
# each `generalization` record — confirm against the data files.
validation = utils.read_json("../../data/experiments/verbhood.json")

generalization = utils.read_jsonl("../../data/experiments/generalization.jsonl")

# Earlier single-run configuration, kept commented out for reference.
# model = Learner(
#     "kanishka/smolm-aochildes-vocab_8192-layers_8-attn_8-hidden_256-inter_1024-lr_1e-3-seed_1709",
#     device="cuda:0",
# )

# trainer = Trainer(
#     model,
#     "do you see lucy and me ?\n<s> lucy [verb] me a red ball .",
#     generalization,
#     validation,
#     learning_rate=0.02,
#     weight_decay=0.01
# )
# # trainer.add_token()
# # trainer.reset()
# trainer.train(num_epochs=20, generalization_batch_size=128), len(trainer.metrics['val_performance'])

In [3]:
# Grid search over learning rate x weight decay.
# For every combination a fresh model is loaded (the Trainer mutates the model
# it is given), trained for NUM_EPOCHS, and the best validation score plus the
# aggregated generalization results are stored under the (lr, wd) key.
MODEL_NAME = "kanishka/smolm-aochildes-vocab_8192-layers_8-attn_8-hidden_256-inter_1024-lr_1e-3-seed_1709"
DEVICE = "cuda:0"
SWEEP_SEED = 1024

# NOTE(review): unlike the other training sentences in this notebook, this one
# has no space before the final period ("me." vs "ball ."). Left unchanged
# because it affects tokenization and therefore the recorded results — confirm
# whether it is intentional.
TRAINING_SENTENCE = "do you see lucy and me ?\n<s> lucy [verb] a red ball to me."

lrs = [0.01, 0.02, 0.009]
decays = [0.0, 0.01, 0.1]
NUM_EPOCHS = 70

# (lr, wd) -> (best validation performance, aggregated generalization results)
results = defaultdict(tuple)

for lr in lrs:
    for wd in decays:
        print(f"LR: {lr}; WD: {wd}")
        # Re-seed before every run so that runs differ only in (lr, wd).
        set_seed(SWEEP_SEED)

        model = Learner(MODEL_NAME, device=DEVICE)

        trainer = Trainer(
            model,
            TRAINING_SENTENCE,
            generalization,
            validation,
            learning_rate=lr,
            weight_decay=wd,
        )
        trainer.train(num_epochs=NUM_EPOCHS, generalization_batch_size=128)

        # best_epoch is 1-based, hence the -1 when indexing the metric list.
        best_val = trainer.metrics["val_performance"][trainer.best_epoch - 1]
        results[(lr, wd)] = (best_val, trainer.agg_gen_results)
        print("\n\n")




LR: 0.01; WD: 0.0







LR: 0.01; WD: 0.01



LR: 0.01; WD: 0.1



LR: 0.02; WD: 0.0



LR: 0.02; WD: 0.01



LR: 0.02; WD: 0.1



LR: 0.009; WD: 0.0



LR: 0.009; WD: 0.01



LR: 0.009; WD: 0.1





In [4]:
results

defaultdict(tuple,
            {(0.01, 0.0): (0.4981249880790708,
              defaultdict(float,
                          {('best', 'do'): -5.557654669790557,
                           ('best', 'pp'): -4.7156833489735925})),
             (0.01, 0.01): (0.5258091306686401,
              defaultdict(float,
                          {('best', 'do'): -5.540888644709732,
                           ('best', 'pp'): -4.682361414697435})),
             (0.01, 0.1): (0.505259478092194,
              defaultdict(float,
                          {('best', 'do'): -5.38945529388659,
                           ('best', 'pp'): -4.593265732129415})),
             (0.02, 0.0): (1.4173242521286014,
              defaultdict(float,
                          {('best', 'do'): -5.382068863782016,
                           ('best', 'pp'): -4.711660928196377})),
             (0.02, 0.01): (1.4451923251152046,
              defaultdict(float,
                          {('best', 'do'): -5.322943450465347,
 

In [3]:
len(trainer.embs)

20

In [4]:
trainer.agg_gen_results,trainer.metrics['val_performance'][trainer.best_epoch-1]

(defaultdict(float,
             {('initial', 'do'): -6.521739979946252,
              ('initial', 'pp'): -5.627667588657803,
              ('best', 'do'): -4.477034922802087,
              ('best', 'pp'): -3.8414887494511074}),
 0.00796610832214295)

In [4]:
# Re-seed, reset the trainer, and train again for 20 epochs.
# NOTE(review): `reset()` is not defined on the Trainer class declared in this
# notebook; presumably it comes from the `experiment.Trainer` import and
# restores the pre-training state — confirm.
set_seed(1024)
trainer.reset()
trainer.train(num_epochs=20, generalization_batch_size=128)

Parameter containing:
tensor([[-0.0281, -0.1176,  0.0406,  ..., -0.0404, -0.0612,  0.0135],
        [ 0.0124, -0.0557,  0.0024,  ...,  0.0176,  0.0403, -0.0465],
        [-0.0286, -0.1178,  0.0405,  ..., -0.0407, -0.0608,  0.0137],
        ...,
        [ 0.0051, -0.0475, -0.0095,  ..., -0.0836,  0.0460, -0.0518],
        [-0.0311, -0.1246,  0.0523,  ..., -0.0421, -0.1130,  0.0493],
        [-0.0153, -0.0356,  0.0149,  ..., -0.0156, -0.0292,  0.0113]],
       device='cuda:0', requires_grad=True)


Epoch: 100%|██████████| 20/20 [00:01<00:00, 14.33it/s]


Best Epoch: 20. Validation: 0.012644103765487635
Val performance recheck: 0.012644103765487635
Parameter containing:
tensor([[-0.0280, -0.1174,  0.0406,  ..., -0.0403, -0.0611,  0.0134],
        [ 0.0124, -0.0556,  0.0024,  ...,  0.0175,  0.0402, -0.0464],
        [-0.0285, -0.1176,  0.0404,  ..., -0.0406, -0.0607,  0.0137],
        ...,
        [ 0.0051, -0.0474, -0.0095,  ..., -0.0834,  0.0459, -0.0517],
        [-0.0311, -0.1244,  0.0522,  ..., -0.0420, -0.1127,  0.0492],
        [ 0.0594, -0.0786,  0.1391,  ..., -0.0770,  0.0290,  0.0645]],
       device='cuda:0')


In [9]:
trainer.reset()

In [13]:
trainer.model.lm.model.get_output_embeddings().weight

Parameter containing:
tensor([[-0.0280, -0.1174,  0.0406,  ..., -0.0403, -0.0611,  0.0134],
        [ 0.0124, -0.0556,  0.0024,  ...,  0.0175,  0.0402, -0.0464],
        [-0.0285, -0.1176,  0.0404,  ..., -0.0406, -0.0607,  0.0137],
        ...,
        [ 0.0051, -0.0474, -0.0095,  ..., -0.0834,  0.0459, -0.0517],
        [-0.0311, -0.1244,  0.0522,  ..., -0.0420, -0.1127,  0.0492],
        [-0.0153, -0.0356,  0.0149,  ..., -0.0156, -0.0292,  0.0113]],
       device='cuda:0', requires_grad=True)

In [8]:
trainer.embs[-1]

tensor([[ 0.0594, -0.0786,  0.1391, -0.0720, -0.0173, -0.0145, -0.0123, -0.0721,
         -0.0778,  0.1443, -0.0553,  0.1230,  0.0583, -0.0161,  0.0431, -0.1057,
          0.0318,  0.0421,  0.1448,  0.0411, -0.1002,  0.0208,  0.0823, -0.0963,
          0.1013, -0.0793, -0.0171,  0.0054,  0.0020,  0.0030, -0.1619, -0.0912,
         -0.1012,  0.1167,  0.0401, -0.0597, -0.1020,  0.0823,  0.0445, -0.0788,
          0.0251,  0.0284, -0.0053, -0.1356, -0.1391,  0.1693, -0.0720, -0.0985,
          0.0259,  0.1205,  0.0014,  0.0597, -0.0721,  0.0562, -0.0828, -0.0367,
         -0.0367,  0.1403, -0.1406,  0.0050, -0.0153,  0.1719,  0.0559, -0.1277,
         -0.0191,  0.0945, -0.0620,  0.0103,  0.1261, -0.0050, -0.0246, -0.0400,
          0.0397, -0.0884, -0.0310, -0.1131, -0.0714, -0.0015, -0.0356,  0.0574,
          0.1411,  0.1152,  0.0754, -0.1191, -0.0694, -0.1232,  0.0884,  0.0698,
          0.0669,  0.0112, -0.0479, -0.0433, -0.0799, -0.0713,  0.0827, -0.0050,
          0.0483,  0.0547, -

In [5]:
trainer.model.freeze_full()

In [10]:
len(trainer.metrics['val_performance'])

20

In [5]:
trainer.model.new_index

8192

In [3]:
class Learner:
    """Wraps a minicons ``IncrementalLMScorer`` so new tokens can be added and scored.

    The class adds ``added_tokens`` to the tokenizer, grows the embedding
    matrix accordingly, optionally initializes the new rows from a Gaussian
    fitted to the existing embeddings, and freezes every parameter whose name
    is not listed in ``target_params``.
    """

    def __init__(
        self,
        model_name,
        device="cpu",
        gaussian=True,
        added_tokens=[" [verb]"],
        target_params=["model.decoder.embed_tokens.weight"],
    ):
        """Load the scorer and record the embedding sizes before/after adding tokens.

        :param model_name: model identifier passed to ``scorer.IncrementalLMScorer``.
        :param device: device string passed to the scorer (e.g. ``"cpu"``, ``"cuda:0"``).
        :param gaussian: if ``True``, ``add_tokens`` initializes the new rows by
            sampling from a Gaussian fitted to the existing embeddings.
        :param added_tokens: token strings to add to the tokenizer.
        :param target_params: names of the parameters that stay trainable.
        """
        self.lm = scorer.IncrementalLMScorer(model_name, device)
        self.device = device
        self.gaussian = gaussian
        # Copy the list arguments so the shared default lists (and any list the
        # caller keeps a handle on) can never be mutated through this instance.
        self.added_tokens = list(added_tokens)
        self.target_params = list(target_params)
        self.model_config = {
            "model_name": model_name,
            "device": device,
            "gaussian": gaussian,
            "added_tokens": list(self.added_tokens),
            "target_params": list(self.target_params),
        }
        # Number of embedding rows before any token is added.
        self.length = self.lm.model.get_input_embeddings().weight.shape[0]
        # Number of embedding rows after the new tokens are added.
        self.new_length = self.length + len(self.added_tokens)
        # Row index of the first added token (equal to ``self.length``).
        self.new_index = self.new_length - len(self.added_tokens)

    def _initialize_gaussian(self):
        """Overwrite the added rows with samples from a Gaussian fitted to the old rows.

        The mean and covariance are estimated from rows ``[:new_index]`` of the
        input embedding matrix; the covariance is scaled by ``1e-5`` before
        sampling, so new rows start very close to the mean embedding.
        """
        embeddings_weight = self.lm.model.get_input_embeddings().weight
        # Gradient tracking is switched off so the in-place row assignment below
        # is allowed, and switched back on afterwards.
        embeddings_weight.requires_grad = False

        mu = embeddings_weight[: self.new_index].mean(0).detach()
        n = self.length
        centered = embeddings_weight[: self.new_index] - mu
        sigma = (centered.T @ centered) / n
        dist = torch.distributions.multivariate_normal.MultivariateNormal(
            mu, covariance_matrix=1e-5 * sigma
        )

        embeddings_weight[self.new_index :] = torch.stack(
            tuple((dist.sample() for _ in range(len(self.added_tokens)))), dim=0
        )
        embeddings_weight.requires_grad = True

    def _freeze(self):
        """Disable gradients for every parameter not named in ``target_params``.

        :raises AssertionError: if the parameters that still require gradients
            are not exactly ``target_params`` (same names, same order).
        """
        for name, param in self.lm.model.named_parameters():
            if name not in self.target_params:
                param.requires_grad = False

        assert [
            name
            for name, param in self.lm.model.named_parameters()
            if param.requires_grad
        ] == self.target_params

    def freeze_full(self):
        """Disable gradients for every parameter of the model."""
        for param in self.lm.model.parameters():
            param.requires_grad = False

    def add_tokens(self):
        """Add the new tokens, resize the embeddings, initialize and freeze.

        After this call only the parameters in ``target_params`` require
        gradients.
        """
        self.lm.tokenizer.add_tokens(self.added_tokens)
        self.lm.model.resize_token_embeddings(self.new_length)
        print(
            f"New token added. New embedding size: {self.lm.model.get_output_embeddings().weight.shape}"
        )

        if self.gaussian:
            self._initialize_gaussian()

        self._freeze()

    def add_token_and_reinitialize(self, target_emb):
        """Add the tokens, freeze the whole model, and set the new row to ``target_emb``.

        :param target_emb: embedding vector written into row ``new_index``.
        """
        self.add_tokens()
        self.freeze_full()
        self.reinitialize(target_emb)

    def reinitialize(self, target_emb):
        """Write ``target_emb`` into row ``new_index`` of the input embeddings.

        The assignment runs under ``torch.no_grad()`` because an in-place write
        into a parameter that requires gradients raises a ``RuntimeError``.

        :param target_emb: embedding vector written into row ``new_index``.
        """
        with torch.no_grad():
            self.lm.model.get_input_embeddings().weight[self.new_index] = target_emb

    def prepare_text(self, text, **kwargs):
        """Tokenize ``text`` with the scorer and remap ids of added tokens.

        Token ids strictly greater than ``self.length`` are shifted down by one.
        NOTE(review): presumably the tokenizer holds one more entry than the
        original embedding matrix, so the added token's tokenizer id is one
        past its embedding row — confirm against the tokenizer.

        :param text: a sentence or batch of sentences.
        :param kwargs: forwarded to ``self.lm.prepare_text``.
        :return: ``(encoded, offset)`` as returned by the scorer, with
            ``encoded["input_ids"]`` replaced by the remapped id tensor.
        """
        encoded, offset = self.lm.prepare_text(text=text, **kwargs)
        encoded["input_ids"] = torch.tensor(
            [
                [t - 1 if t > self.length else t for t in token_ids]
                for token_ids in encoded.input_ids
            ]
        )
        return encoded, offset

    def token_score(
        self,
        batch,
        surprisal=False,
        prob=False,
        base_two=False,
        rank=False,
        decode=True,
        **kwargs,
    ):
        """
        For every input sentence, returns a list of tuples in the following format:
            `(token, score)`,

        where score represents the log-probability (by default) of the token given context. Can also return ranks along with scores.

        When a sentence has more tokens than scores, the scores (and ranks) are
        left-padded with ``0.0`` (and ``0``) so every token gets an entry.

        :param ``Union[str, List[str]]`` batch: a single sentence or a batch of sentences.
        :param ``bool`` surprisal: If `True`, returns per-word surprisals instead of log-probabilities.
        :param ``bool`` prob: If `True`, returns per-word probabilities instead of log-probabilities.
        :param ``bool`` base_two: If `True`, uses log base 2 instead of natural-log (returns bits of values in case of surprisals)
        :param ``bool`` rank: If `True`, also returns the rank of each word in context (based on the log-probability value)
        :param ``bool`` decode: If `True`, tokens are produced with ``self.lm.decode``; otherwise with ``tokenizer.convert_ids_to_tokens``.

        :return: A `List` containing a `Tuple` consisting of the word, its associated score, and optionally, its rank.
        :rtype: ``Union[List[Tuple[str, float]], List[Tuple[str, float, int]]]``
        """

        assert not (
            surprisal and prob
        ), "cannot both evaluate probability and surprisal at the same time!"
        assert not (
            base_two and prob
        ), "cannot both use base (which is for a log), and a probability measure at the same time!"

        tokenized = self.prepare_text(batch, **kwargs)
        ranks = None
        if rank:
            scores, ranks = self.lm.compute_stats(
                tokenized, rank=rank, prob=prob, base_two=base_two, return_tensors=True
            )
        else:
            scores = self.lm.compute_stats(
                tokenized, prob=prob, base_two=base_two, return_tensors=True
            )

        if surprisal:
            scores = [-1.0 * s for s in scores]

        scores = [s.tolist() for s in scores]

        # Keep only the ids at positions where the attention mask is non-zero.
        indices = [
            [i for i, am in zip(instance, attention_mask) if am != 0]
            for instance, attention_mask in zip(
                tokenized[0]["input_ids"].tolist(),
                tokenized[0]["attention_mask"].tolist(),
            )
        ]
        # Undo the id shift applied in ``prepare_text`` before decoding.
        indices = [[ii + 1 if ii >= self.length else ii for ii in i] for i in indices]
        if decode:
            tokens = [self.lm.decode(idx) for idx in indices]
        else:
            tokens = [self.lm.tokenizer.convert_ids_to_tokens(idx) for idx in indices]

        if rank:
            assert len(tokens) == len(scores) == len(ranks)
        else:
            assert len(tokens) == len(scores)

        res = []
        if rank:
            for t, s, r in zip(tokens, scores, ranks):
                diff = len(t) - len(s)
                if diff > 0:
                    # Left-pad so leading tokens without a score still get an entry.
                    s = [0.0] * diff + s
                    r = [0] * diff + list(r)
                res.append(list(zip(t, s, r)))
        else:
            for t, s in zip(tokens, scores):
                diff = len(t) - len(s)
                if diff > 0:
                    s = [0.0] * diff + s
                res.append(list(zip(t, s)))

        return res

    def sequence_score(
        self,
        batch,
        reduction=lambda x: x.mean(0).item(),
        prob=False,
        base_two=False,
        **kw,
    ):
        """
        Pooled estimates of sequence log probabilities (or some modification of it).

        :param batch: a batch of sequences whose score you want to calculate.
        :type batch: ``Union[str, List[str]]``
        :param reduction: Reduction function, is selected to be
            ``lambda x: x.mean(0).item()`` by default, which stands for the avg. log-probability per token for each sequence in the batch.
        :type reduction: Callable
        :param prob: forwarded to ``self.lm.compute_stats``.
        :param base_two: forwarded to ``self.lm.compute_stats``.
        :param kw: model-specific keyword arguments to pass to the `prepare_text` function
        :return: List with one reduced score per sequence in the batch.
        :rtype: ``List[float]``

        TODO: reduction should be a string, if it's a function, specify what kind of function. --> how to ensure it is always that type?
        """
        tokenized = self.prepare_text(batch, **kw)
        scores = self.lm.compute_stats(
            tokenized, rank=False, base_two=base_two, prob=prob, return_tensors=True
        )
        return list(map(reduction, scores))

    def logprob(self, corpus, batch_size=-1, by_instance=False):
        """Average per-token log probability of the sentences in ``corpus``.

        :param corpus: collection of sentences to score.
        :param batch_size: if positive, the corpus is scored in batches of this
            size; otherwise it is scored in one call.
        :param by_instance: if ``True``, return the list of per-sentence scores
            instead of their mean.
        :return: list of per-sentence scores, or their mean.
        """
        if batch_size > 0:
            scores = []
            for batch in DataLoader(corpus, batch_size=batch_size):
                scores.extend(self.sequence_score(batch))
        else:
            scores = self.sequence_score(corpus)
        if by_instance:
            return scores
        return np.mean(scores)

In [4]:
# Loads the same two files as the earlier data-loading cell in this notebook,
# so this cell can be run on its own before building a Trainer.
validation = utils.read_json("../../data/experiments/verbhood.json")

generalization = utils.read_jsonl("../../data/experiments/generalization.jsonl")

In [5]:
class Trainer:
    """Trains the added-token embedding of a ``Learner`` on a single training text.

    After every optimization step the embedding rows of the added tokens are
    snapshotted and the validation metric is recorded; once training finishes
    the snapshot from the best-scoring epoch is written back into the model and
    the generalization set is scored again.
    """

    def __init__(
        self,
        model,
        training_set,
        generalization_set,
        validation_set,
        val_performance_metric="diff",
        learning_rate=1e-3,
        weight_decay=0.0,
    ):
        """Store the configuration and add the new tokens to ``model``.

        :param model: object exposing the ``Learner`` interface; ``add_tokens()``
            is called on it immediately.
        :param training_set: text passed to ``model.prepare_text`` for training.
        :param generalization_set: records with ``"sentence"`` and ``"dative"``
            fields, scored before and after training.
        :param validation_set: either a list of sentences, or a dict with the
            two keys ``"good"`` and ``"bad"``.
        :param val_performance_metric: ``"diff"`` for the difference of mean
            log probabilities (good minus bad); any other value selects
            pairwise accuracy.
        :param learning_rate: optimizer learning rate.
        :param weight_decay: weight decay for parameters whose name contains
            neither ``"bias"`` nor ``"LayerNorm.weight"``.
        """
        self.model = model
        self.model.add_tokens()
        self.val_performance_metric = val_performance_metric
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.metrics = {"train_loss": [], "val_performance": []}
        self.training_set = training_set
        self.validation_set = validation_set
        self.generalization_set = generalization_set
        self.generalization_results = []
        self.agg_gen_results = defaultdict(float)
        # Placeholder value; overwritten with the real 1-based best epoch by train().
        self.best_epoch = 70
        # One snapshot of the added-token embedding rows per trained epoch.
        self.embs = []

    def validate(self, batch_size=-1):
        """Compute the validation metric for the current model state.

        * list validation set: mean log probability of the sentences.
        * dict with two keys, metric ``"diff"``: mean log probability of
          ``"good"`` minus that of ``"bad"``.
        * dict with two keys, any other metric: fraction of aligned
          ``(good, bad)`` pairs in which the good sentence scores higher.

        :param batch_size: forwarded to ``model.logprob``.
        :raises ValueError: if the validation set is neither a list nor a
            two-key dict.
        """
        if isinstance(self.validation_set, list):
            return self.model.logprob(self.validation_set, batch_size=batch_size)

        if isinstance(self.validation_set, dict) and len(self.validation_set) == 2:
            if self.val_performance_metric == "diff":
                return self.model.logprob(
                    self.validation_set["good"], batch_size=batch_size
                ) - self.model.logprob(
                    self.validation_set["bad"], batch_size=batch_size
                )

            goods = self.model.logprob(
                self.validation_set["good"],
                batch_size=batch_size,
                by_instance=True,
            )
            bads = self.model.logprob(
                self.validation_set["bad"],
                batch_size=batch_size,
                by_instance=True,
            )
            num_correct = sum(1 for good, bad in zip(goods, bads) if good > bad)
            return num_correct / len(self.validation_set["good"])

        # Previously unsupported types fell through and returned None silently.
        raise ValueError(
            "Validation set must be a list of sentences or a dictionary with two keys (good and bad)."
        )

    def optimizer_setup(self):
        """Create ``self.optimizer`` (AdamW) and ``self.scheduler`` (constant schedule).

        Parameters whose name contains ``"bias"`` or ``"LayerNorm.weight"`` get
        no weight decay; all other parameters use ``self.weight_decay``.
        """
        no_decay = ["bias", "LayerNorm.weight"]
        named_params = list(self.model.lm.model.named_parameters())
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in named_params if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": self.weight_decay,
            },
            {
                "params": [
                    p for n, p in named_params if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        self.optimizer = AdamW(
            optimizer_grouped_parameters, lr=self.learning_rate, eps=1e-8
        )
        self.scheduler = get_constant_schedule(self.optimizer)

    def generalization_step(self, model_state, batch_size=64):
        """Score every generalization sentence and append the results.

        One record ``{"item", "model_state", "dative", "logprob"}`` is appended
        to ``self.generalization_results`` per sentence; ``item`` is the
        1-based position of the sentence in the generalization set.

        :param model_state: label stored with each record (e.g. ``"initial"``).
        :param batch_size: number of sentences scored per batch.
        """
        dl = DataLoader(self.generalization_set, batch_size=batch_size, shuffle=False)
        results = []
        datives = []
        for batch in dl:
            datives.extend(batch["dative"])
            results.extend(self.model.sequence_score(batch["sentence"]))

        for i, (res, dat) in enumerate(zip(results, datives)):
            self.generalization_results.append(
                {
                    "item": i + 1,
                    "model_state": model_state,
                    "dative": dat,
                    "logprob": res,
                }
            )

    def aggregate_generalization_results(self):
        """Average the stored log probabilities per ``(model_state, dative)`` pair.

        The means are written into ``self.agg_gen_results``.

        :raises AssertionError: if no generalization results were collected.
        """
        assert len(self.generalization_results) != 0

        grouped = defaultdict(lambda: defaultdict(list))
        for entry in self.generalization_results:
            grouped[entry["model_state"]][entry["dative"]].append(entry["logprob"])

        for state, by_dative in grouped.items():
            for dative, scores in by_dative.items():
                self.agg_gen_results[(state, dative)] = np.mean(scores)

    def train(self, num_epochs, generalization_batch_size):
        """Train for ``num_epochs`` steps and restore the best-validating embedding.

        Steps: score the generalization set (``"initial"``), optimize the
        target parameter on the training text while zeroing the gradient of all
        pre-existing embedding rows, record loss / validation / embedding
        snapshot per epoch, write the best snapshot back, freeze the target
        parameters, and score the generalization set again (``"best"``).

        :param num_epochs: number of optimization steps.
        :param generalization_batch_size: batch size for the generalization scoring.
        """
        self.generalization_step("initial", generalization_batch_size)
        self.optimizer_setup()
        encoded, _ = self.model.prepare_text(self.training_set)
        encoded = encoded.to(self.model.device)

        tokenizer = self.model.lm.tokenizer
        labels = encoded.input_ids.clone()
        if tokenizer.pad_token_id is not None:
            labels[labels == tokenizer.pad_token_id] = -100

        if tokenizer.bos_token_id is not None:
            # NOTE(review): masks the hard-coded id 1, presumably the BOS id of
            # this tokenizer — confirm before replacing with bos_token_id.
            labels[labels == 1] = -100

        lm = self.model.lm.model
        new_index = self.model.new_index

        # First parameter listed in target_params: only its gradient is masked.
        trainable = next(
            (p for n, p in lm.named_parameters() if n in self.model.target_params),
            None,
        )

        # Decoupled weight decay scales the whole parameter tensor, including
        # rows whose gradient was zeroed, so the pre-existing rows are saved
        # and restored after every step to keep them unchanged.
        frozen_rows = None
        if trainable is not None and self.weight_decay:
            frozen_rows = trainable[:new_index].detach().clone()

        for _ in trange(num_epochs, desc="Epoch"):
            output = lm(**encoded, labels=labels)
            output.loss.backward()

            if trainable is not None:
                # Only the rows of the added tokens may receive updates.
                trainable.grad[:new_index] = 0.0

            self.optimizer.step()
            self.scheduler.step()
            lm.zero_grad()

            if frozen_rows is not None:
                with torch.no_grad():
                    trainable[:new_index] = frozen_rows

            # Snapshot the added-token rows for this epoch.
            emb = lm.get_input_embeddings().weight[new_index:].detach().clone()
            self.embs.append(emb)

            self.metrics["train_loss"].append(output.loss.item())
            self.metrics["val_performance"].append(self.validate())

        # 1-based index of the epoch with the highest validation score.
        self.best_epoch = int(np.argmax(self.metrics["val_performance"])) + 1
        print(
            f"Best Epoch: {self.best_epoch}. Validation: {self.metrics['val_performance'][self.best_epoch-1]}"
        )

        for name, param in lm.named_parameters():
            if name in self.model.target_params:
                param.requires_grad = False

        # Restore the best snapshot into the matrix it was read from.
        with torch.no_grad():
            lm.get_input_embeddings().weight[new_index:] = self.embs[
                self.best_epoch - 1
            ]

        print(f"Val performance recheck: {self.validate()}")
        print(self.model.lm.model.get_output_embeddings().weight)

        self.generalization_step("best", generalization_batch_size)

        self.aggregate_generalization_results()
        



In [6]:
# Single training run with the Learner/Trainer classes defined in this notebook:
# load the model on the GPU, wrap it in a Trainer (which adds the new token),
# and train for 100 epochs. weight_decay is left at the Trainer default.
model = Learner(
    "kanishka/smolm-aochildes-vocab_8192-layers_8-attn_8-hidden_256-inter_1024-lr_1e-3-seed_1709",
    device="cuda:0",
)

trainer = Trainer(
    model,
    "do you see lucy with the ball ?\n<s> lucy [verb] it to a cute dog .",
    generalization,
    validation,
    learning_rate=0.005,
)
# NOTE(review): no seed is set in this cell, so the Gaussian initialization of
# the new embedding presumably differs between runs — confirm if reproducibility matters.
trainer.train(num_epochs=100, generalization_batch_size=128)



New token added. New embedding size: torch.Size([8193, 256])


Epoch: 100%|██████████| 100/100 [00:06<00:00, 14.76it/s]


Best Epoch: 100. Validation: -0.08604023218154921
Val performance recheck: -0.08604023218154921
Parameter containing:
tensor([[-0.0281, -0.1179,  0.0407,  ..., -0.0405, -0.0613,  0.0135],
        [ 0.0125, -0.0558,  0.0024,  ...,  0.0176,  0.0404, -0.0466],
        [-0.0286, -0.1180,  0.0405,  ..., -0.0408, -0.0610,  0.0137],
        ...,
        [ 0.0051, -0.0476, -0.0095,  ..., -0.0837,  0.0461, -0.0519],
        [-0.0312, -0.1249,  0.0525,  ..., -0.0421, -0.1132,  0.0494],
        [ 0.0353, -0.0599,  0.1119,  ...,  0.2288, -0.1100,  0.0599]],
       device='cuda:0')


In [7]:
# trainer.model.token_score(["lucy [verb] it to a cute dog ."])
trainer.agg_gen_results

defaultdict(float,
            {('initial', 'do'): -6.521739979946252,
             ('initial', 'pp'): -5.627667588657803,
             ('best', 'do'): -5.013841332811298,
             ('best', 'pp'): -4.412945683797201})

In [None]:
# trainer.generalization_step("initial")

In [None]:
# trainer.generalization_results
# trainer.aggregate_generalization_results()

Epoch: 100%|██████████| 100/100 [00:06<00:00, 14.77it/s]


Best Epoch: 91. Validation: -0.42660699367523236
Val performance recheck: -0.42660699367523236
Parameter containing:
tensor([[-0.0281, -0.1179,  0.0407,  ..., -0.0405, -0.0613,  0.0135],
        [ 0.0125, -0.0558,  0.0024,  ...,  0.0176,  0.0404, -0.0466],
        [-0.0286, -0.1180,  0.0405,  ..., -0.0408, -0.0610,  0.0137],
        ...,
        [ 0.0051, -0.0476, -0.0095,  ..., -0.0837,  0.0461, -0.0519],
        [-0.0312, -0.1249,  0.0525,  ..., -0.0421, -0.1132,  0.0494],
        [ 0.0290, -0.0254,  0.0722,  ...,  0.0630, -0.0333,  0.0320]],
       device='cuda:0')


In [15]:
trainer.metrics['val_performance']

[-0.9306175160408019,
 -0.8505882799625395,
 -0.7908285689353942,
 -0.7474884629249576,
 -0.7134220278263097,
 -0.6831685650348662,
 -0.6556312465667729,
 -0.6301717817783361,
 -0.6079445362091063,
 -0.589710837602615,
 -0.5756191515922549,
 -0.5640258622169494,
 -0.554303393363953,
 -0.5464347922801975,
 -0.5404288601875304,
 -0.5351755726337437,
 -0.5309364080429075,
 -0.5272162830829625,
 -0.5239656615257262,
 -0.5209910655021668,
 -0.5180504345893855,
 -0.5148744332790374,
 -0.5116088163852694,
 -0.5080862390995025,
 -0.504292080402374,
 -0.5007421243190766,
 -0.4970718777179721,
 -0.49366640686988816,
 -0.49084588170051546,
 -0.48819660544395393,
 -0.48569181323051414,
 -0.4834382438659661,
 -0.4815681314468385,
 -0.4799581658840184,
 -0.4787719416618348,
 -0.4778393423557281,
 -0.4776478064060212,
 -0.477638304233551,
 -0.4776538145542144,
 -0.47753682017326415,
 -0.4775212001800533,
 -0.47766709327697754,
 -0.47794182300567556,
 -0.4784782385826105,
 -0.47933374881744406,
 -0.48