In [326]:
import os
import functools
import statistics
import itertools
import random
import math
from pathlib import Path
import pdb

import pandas as pd
import swifter
import numpy as np
import hickle as hkl

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset

import spacy
from cached_property import cached_property

In [274]:
if 'nlp' not in vars():
    nlp = spacy.load(
        "en_core_web_lg",
        disable=["tagger", "ner", "textcat"]
    )

In [275]:
if 'articles' not in vars():
    articles = pd.read_parquet("data/articles-processed.parquet.gzip")

In [276]:
class NNModel(nn.Module):
    def __init__(self, *_args, **_kwargs):
        super().__init__()
        
        self._args = _args
        self._kwargs = _kwargs
        
    def save(self, path):
        torch.save(
            {
                'state': self.state_dict(),
                'args': self._args,
                'kwargs': self._kwargs
            },
            path
        )
    
    @classmethod
    def load(cls, path):
        if Path(path).exists():
            data = torch.load(path)
            
            model = cls(*data['args'], **data['kwargs'])
            model.load_state_dict(data['state'])

            return model
        else:
            raise FileNotFoundError

In [277]:
class DiscriminatorNet(NNModel):
    def __init__(self, input_size):
        super(DiscriminatorNet, self).__init__()
        
        self.input_size = input_size
        
        self.linear = nn.Linear(input_size, 1)

    def forward(self, state):
        """
        The forward pass for the network
        
        hidden_state : tensor (batch_num, hidden_size)
        
        returns         : tensor (batch_num, 1)
        """
        
        state = state.transpose(0, 1).reshape(-1, self.input_size)
        state = self.linear(state)
        state = F.sigmoid(state)
        
        return state

In [278]:
class SummarizeNet(NNModel):
    def __init__(self, hidden_size, input_size, num_layers):
        super(SummarizeNet, self).__init__(
            hidden_size=hidden_size,
            input_size=input_size,
            num_layers=num_layers
        )
        
        self.hidden_size = hidden_size
        
        self.encode_gru = nn.GRU(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            bidirectional=True
        )
        
        self.decode_gru = nn.GRU(
            hidden_size,
            input_size,
            num_layers,
            batch_first=True,
            bidirectional=True
        )
        
        self.discriminate = DiscriminatorNet(num_layers * 2 * input_size)
        
    def take_last_pass(self, predicted):
        return predicted.reshape(
            predicted.shape[0],
            predicted.shape[1],
            2,
            int(predicted.shape[2] / 2)
        )[:, :, 1, :]

    def forward(self, word_embeddings, modes):
        """
        The forward pass for the network
        
        word_embeddings : tensor (batch_num, max_seq_len, embedding_length)
        
        returns         : tuple (
                            tensor (batch_num, max_seq_len, vocab_len),
                            tensor (batch_num, hidden_size)
                          )
        
        First tensor in the returning tuple are predicted word embeddings
        The second tensor are probabilities of the output being a headline
        """
        
        predicted, _ = self.encode_gru(word_embeddings)
        predicted = self.take_last_pass(predicted)
        
        predicted, state = self.decode_gru(predicted)
        predicted = self.take_last_pass(predicted)
        
        predicted_modes = self.discriminate(state)
        
        return predicted, predicted_modes

In [279]:
class ArticlesDataset(Dataset):
    def __init__(self, dataframe, mode, transforms=[]):
        if mode not in ['train', 'test', 'val']:
            raise ValueError(f"{mode} not in the set of modes of the dataset (['train', 'test', 'val'])")
            
        self.data = dataframe[dataframe.set == mode]
        self.transforms = transforms
        self.mode = mode
        
    def __len__(self):
        return 2*len(self.data)
    
    def __getitem__(self, idx):
        _idx = []
        
        if torch.is_tensor(idx):
            _idx = idx.tolist()
        
        if isinstance(idx, list):
            _idx = idx
        else:
            _idx = [ idx ]
        
        _ids = [ (i - (i % 2))/2 for i in _idx]

        data = self.data.iloc[_ids, :]
        data['asked_id'] = _idx
        
        data = pd.DataFrame(
            {
                'set': [self.mode for _ in range(0, len(_ids))],
                'mode': np.array([ (0.0 if i % 2 == 0 else 1.0) for i in _idx ]),
                'text': data.apply(lambda row: row['text'] if row['asked_id'] % 2 == 0 else row['headline'], axis=1),
                'title': data['normalized_title']
            }
        )

        for transform in self.transforms:
            data = transform(data)

        return data

In [280]:
class TextToParsedDoc(object):
    def __init__(self, nlp):
        self.nlp = nlp
        
    def __call__(self, sample):
        sample['doc'] = sample.swifter.progress_bar(False).apply(lambda row: self.nlp(row['text']), axis=1)
        return sample

In [281]:
class WordsToVectors(object):
    def __init__(self, nlp):
        self.nlp = nlp
        
    def document_embeddings(self, doc):
        word_embeddings = [
            [ l.vector ] if l.whitespace_ == '' else [ l.vector, np.zeros_like(l.vector) ] for l in doc
        ]

        return np.stack(
            [
                vector for vectors in word_embeddings for vector in vectors
            ]
        )

    def __call__(self, sample):
        
        sample['word_embeddings'] = sample.swifter.progress_bar(False).apply(
            lambda row: self.document_embeddings(row['doc']),
            axis=1
        )
    
        return sample

In [282]:
class AddNoiseToEmbeddings(object):
    def __init__(self, probability_of_mask_for_word):
        self.probability_of_mask_for_word = probability_of_mask_for_word
        self.rng = np.random.default_rng()
        
    def mask_vector(self, vector):
        """
        Masks words with zeros randomly
        """
        seq_len = vector.shape[0]
        vector_len = vector.shape[1]
        
        mask = np.repeat(
            self.rng.choice(
                [0, 1],
                seq_len,
                p=[
                    self.probability_of_mask_for_word,
                    (1 - self.probability_of_mask_for_word)
                ]
            ).reshape((seq_len, 1)),
            vector_len,
            axis=1
        )
        
        return vector * mask
        
    def __call__(self, sample):       
        sample['noisy_word_embeddings'] = sample['word_embeddings'].apply(self.mask_vector)

        return sample

In [283]:
class MergeBatch(object):
    def __init__(self, device):
        self.device = device
        
    def stack_vectors(self, vectors):
        max_seq = max([vector.shape[0] for vector in vectors])
        
        return np.stack(
            [
                np.pad(vector, [(0, max_seq - vector.shape[0]), (0, 0)])
                for vector in vectors
            ]
        )
        
    def __call__(self, sample):
        del sample['doc']
        
        sample = sample.to_dict(orient="list")
        
        sample['word_embeddings'] = torch.from_numpy(
            self.stack_vectors(
                sample['word_embeddings']
            ).astype(np.float32, copy=False)
        ).to(self.device)
        
        if 'noisy_word_embeddings' in sample:
            sample['noisy_word_embeddings'] = torch.from_numpy(
                self.stack_vectors(
                    sample['noisy_word_embeddings']
                ).astype(np.float32, copy=False)
            ).to(self.device)
        
        sample['mode'] = torch.from_numpy(
            np.stack(
                sample['mode']
            ).astype(np.float32, copy=False)
        ).to(self.device)
    
        return sample

In [284]:
class SetAllToSummarizing(object):
    def __call__(self, sample):
        sample['mode'] = np.ones_like(sample['mode']).astype(np.float32, copy=False)
        
        return sample

In [327]:
class Vocabulary(object):
    def __init__(self, nlp, series):
        if Path("vocabulary.hkl").exists():
            data = hkl.load("vocabulary.hkl")
            
            self.words = data['words']
            self.index = data['index']
        else:
            text = ""
            words = []
            index = {}
            
            for serie in series:
                text += " " + " ".join(serie.fillna('').values.tolist())
                
            counts = nlp(text).count_by(spacy.attrs.LOWER)

            for ix, _ in sorted([(ix, counts[ix]) for ix in counts],key=lambda t: t[1],reverse=True):
                words.append(nlp.vocab[ix].text)
                index[ix] = len(words)
                
            self.words = words
            self.index = index
            
            hkl.dump({words: words, index: index}, 'vocabulary.hkl', mode='w')

In [285]:
class DataLoader(object):
    def __init__(self, dataset, batch_size=8):
        self.dataset = dataset
        self.batch_size = batch_size
        
    @property
    def epoch_size(self):
        return math.ceil(len(self.dataset) / self.batch_size) * self.batch_size
    
    def __iter__(self):
        ids = random.choices(range(0, len(self.dataset)), k=self.epoch_size)
        
        for start_ix in range(0, self.epoch_size, self.batch_size):
            yield self.dataset[ids[start_ix:(start_ix + self.batch_size)]]

In [286]:
class ArticlesBatch:
    def __init__(self, data, ix=0):
        self.data = data
        self.ix = ix
    
        
    def __getattr__(self, name):
        if name in self.data:
            return self.data[name]
        else:
            raise AttributeError(f"Attribute missing: {name}")

In [287]:
class Decoder(object):
    def __init__(self, nlp):
        self.nlp = nlp

    def decode_embeddings(self, word_embeddings):
        data = word_embeddings.cpu().data.numpy()
        
        return [
            self.decode_embeddings_1d(data[ix, :, :])
            for ix in range(0, data.shape[0])
        ]
        
    def decode_embeddings_1d(self, word_embeddings):
        """
        Decodes a single document. Word embeddings given are of shape (N, D)
        where N is the number of lexemes and D the dimentionality of the embedding vector
        """
        
        return "".join(
            [
                token.text.lower() if not token.is_oov else " "
                for token in [
                    self.nlp.vocab[ks[0]]
                    for ks in self.nlp.vocab.vectors.most_similar(
                        word_embeddings, n=1
                    )[0]
                ]
            ]
        ).strip()

In [288]:
class Metrics(object):
    def __init__(self, mode, loss=None):
        self.mode = mode
        self.losses = [loss.cpu().item()] if loss is not None else []
    
    @classmethod
    def empty(cls, mode):
        return cls(mode)
    
    @property
    def loss(self):
        if len(self.losses) == 0:
            return 0
        else:
            return statistics.mean(self.losses)
    
    @property
    def last_loss(self):
        return self.losses[len(self.losses) - 1]
    
    def running_mean_loss(self, n=100):
        cumsum = np.cumsum(np.insert(np.array(self.losses), 0, 0)) 
        return (cumsum[n:] - cumsum[:-n]) / float(n)
    
    def __add__(self, other):
        self.losses += other.losses
        
        return self

In [289]:
class UpdateInfo(object):
    def __init__(self, decoder, batch, word_embeddings, loss_sum, mode):
        self.decoder = decoder
        self.batch = batch
        self.word_embeddings = word_embeddings
        self.loss_sum = loss_sum
        self.mode = mode
        
    @property
    def from_train(self):
        return self.mode == "train"
    
    @property
    def from_evaluate(self):
        return self.mode == "val"
        
    @cached_property
    def decoded_inferred_texts(self):
        return self.decoder.decode_embeddings(self.word_embeddings)
    
    @cached_property
    def metrics(self):
        return Metrics(self.mode, self.loss_sum)

    def __str__(self):
        return f"{self.mode} | {self.batch.ix}\t| Loss: {loss_sum}\t"

In [290]:
class BaseTrainer:
    def __init__(self, name, nlp, dataframe,
                 optimizer_class_name,
                 model_args, optimizer_args, 
                 batch_size, update_every,
                 probability_of_mask_for_word,
                 device
                ):
        self.name = name
        
        self.device = device
        
        self.datasets = {
            "train": ArticlesDataset(
                dataframe,
                "train",
                transforms=[
                    TextToParsedDoc(nlp),
                    WordsToVectors(nlp),
                    AddNoiseToEmbeddings(probability_of_mask_for_word),
                    MergeBatch(device)
                ]
            ),
            "test":  ArticlesDataset(
                dataframe,
                "test",
                transforms=[
                    TextToParsedDoc(nlp),
                    WordsToVectors(nlp),
                    AddNoiseToEmbeddings(0),
                    SetAllToSummarizing(),
                    MergeBatch(device)
                ]
            ),
            "val":  ArticlesDataset(
                dataframe,
                "val",
                transforms=[
                    TextToParsedDoc(nlp),
                    WordsToVectors(nlp),
                    AddNoiseToEmbeddings(0),
                    MergeBatch(device)
                ]
            )
        }
        
        self.batch_size = batch_size
        self.update_every = update_every
        
        self.optimizer_class_name = optimizer_class_name
        
        self.model_args = model_args
        self.optimizer_args = optimizer_args
        
        self.current_batch_id = 0
        
        self.decoder = Decoder(nlp)
        
        if self.has_checkpoint:
            self.load_last_checkpoint()
        
    @cached_property
    def model(self):
        try:
            return SummarizeNet.load(f"{self.checkpoint_path}/model.pth").to(self.device)
        except FileNotFoundError:
            return SummarizeNet(**self.model_args).to(self.device)
        
    @cached_property
    def optimizer(self):
        class_ = getattr(torch.optim, self.optimizer_class_name)
        
        return class_(self.model.parameters(), **self.optimizer_args)
    
    @property
    def checkpoint_path(self):
        return f"checkpoints/{self.name}/batch-#{self.current_batch_id}"
    
    def save_checkpoint(self):
        os.makedirs(self.checkpoint_path, exist_ok=True)
        
        self.model.save(f"{self.checkpoint_path}/model.pth")
        
        torch.save(
            {
                'current_batch_id': self.current_batch_id,
                'batch_size': self.batch_size,
                'update_every': self.update_every,
                'optimizer_class_name': self.optimizer_class_name,
                'optimizer_args': self.optimizer_args,
                'optimizer_state_dict': self.optimizer.state_dict()
            },
            f"{self.checkpoint_path}/trainer.pth"
        )
    
    @property
    def checkpoint_directories(self):
        return sorted(Path(".").glob(f"checkpoints/{self.name}/batch-*"), reverse=True)
    
    @property
    def has_checkpoint(self):
        return len(self.checkpoint_directories) > 0
    
    def load_last_checkpoint(self):
        path = self.checkpoint_directories[0]
        
        data = torch.load(f"{path}/trainer.pth")
        
        self.batch_size = data['batch_size']
        self.update_every = data['update_every']
        
        self.optimizer_class_name = data['optimizer_class_name']
        self.optimizer_args = data['optimizer_args']
        
        self.current_batch_id = data['current_batch_id']
        
        if 'model' in self.__dict__:
            del self.__dict__['model']
            
        if 'optimzer' in self.__dict__:
            del self.__dict__['optimizer']
        
        self.optimizer.load_state_dict(data['optimizer_state_dict'])
    
    def batches(self, mode):       
        while True:
            loader = DataLoader(
                self.datasets[mode],
                batch_size=self.batch_size
            )

            for data in loader:
                self.current_batch_id += 1
                
                yield(
                    ArticlesBatch(
                        data,
                        ix=self.current_batch_id
                    )
                )
    
    def work_batch(self, batch):
        raise NotImplementedError
        
    def updates(self, mode="train", update_every=None):
        batches = self.batches(mode)
        loss_sum = 0
        
        if update_every is None:
            update_every = self.update_every
        
        for batch in batches:
            if mode == "train":
                self.model.train()
            else:
                self.model.eval()
            
            loss, word_embeddings = self.work_batch(batch)
            loss /= self.update_every
            
            if mode == "train":
                loss.backward()
                
            loss_sum += loss
            
            # we're doing the accumulated gradients trick to get the gradients variance
            # down while being able to use commodity GPU:
            if batch.ix % update_every == 0:
                if mode == "train":
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                
                yield(UpdateInfo(self.decoder, batch, word_embeddings, loss_sum, mode=mode))
                
                loss_sum = 0
    
    def train_and_evaluate_updates(self, evaluate_every=100):
        train_updates = self.updates(mode="train")
        evaluate_updates = self.updates(mode="val")
        
        for update_info in train_updates:
            yield(update_info)
            
            if update_info.batch.ix != 0 and update_info.batch.ix % evaluate_every == 0:
                yield(next(evaluate_updates))
    
    def test_updates(self):
        return self.updates(mode="test", update_every=1)

In [291]:
class Trainer(BaseTrainer):
    def __init__(self, *args, **kwargs):
        super(Trainer, self).__init__(*args, **kwargs)
        
    def compute_loss(self, word_embeddings, original_word_embeddings, discriminate_probs): 
        embeddings_loss = F.cosine_embedding_loss(
          word_embeddings.reshape((-1, word_embeddings.shape[2])),
          original_word_embeddings.reshape((-1, original_word_embeddings.shape[2])),
          torch.ones(word_embeddings.shape[0] * word_embeddings.shape[1]).to(self.device)
        )
        
        discriminator_loss = F.binary_cross_entropy(
            discriminate_probs,
            torch.zeros_like(discriminate_probs).to(self.device)
        )
        
        return embeddings_loss + discriminator_loss
        

    def work_batch(self, batch):
        word_embeddings, discriminate_probs = self.model(
            batch.noisy_word_embeddings,
            batch.mode
        )

        # we're diverging from the article here by outputting the word embeddings
        # instead of the probabilities for each word in a vocabulary
        # our loss function is using the cosine embedding loss coupled with
        # the discriminator loss:
        return (
            self.compute_loss(word_embeddings, batch.word_embeddings, discriminate_probs),
            word_embeddings
        )

In [292]:
class InNotebookTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super(InNotebookTrainer, self).__init__(*args, **kwargs)
        
        self.writer = SummaryWriter(comment=self.name)

    def train(self, evaluate_every=1000):
        test_updates = self.test_updates()
        
        cumulative_train_metrics = Metrics.empty(mode="train")
        cumulative_evaluate_metrics = Metrics.empty(mode="eval")

        for update_info in self.train_and_evaluate_updates(evaluate_every=evaluate_every):
            if update_info.from_train:
                cumulative_train_metrics += update_info.metrics
                
                print(f"{update_info.batch.ix}")
                
                self.writer.add_scalar(
                    'loss/train',
                    update_info.metrics.loss,
                    update_info.batch.ix
                )

            if update_info.from_evaluate:
                cumulative_evaluate_metrics += update_info.metrics
                
                self.writer.add_scalar(
                    'loss/eval',
                    update_info.metrics.loss,
                    update_info.batch.ix
                )

                print(f"Eval: {update_info.metrics.loss}")
                print(f"Saving checkpoint")
                self.save_checkpoint()

#             if update_info.batch.ix % 1000 == 0 and update_info.batch.ix != 0:
#                 test_update = next(test_updates)
                
#                 self.test_texts_stream.write(
#                     (
#                         update_info.batch.text,
#                         update_info.decoded_inferred_texts
#                     )
#                 )
                
    def test(self):
        cumulative_metrics = Metrics.empty(mode="test")
        
        for update_info in self.test_updates():
            cumulative_metrics += update_info.metrics

        print(cumulative_metrics)

In [265]:
RUN_TESTS = True

### Tests

In [266]:
import unittest
from hypothesis import given, settings, note, assume, reproduce_failure
import hypothesis.strategies as st
import hypothesis.extra.numpy as npst

class TestNotebook(unittest.TestCase):
    def test_trainer_batches_yields_proper_ixs(self):
        for mode in ['train', 'test', 'val']:
            trainer = Trainer(
                'unit-test-run-1',
                nlp,
                articles,
                optimizer_class_name='Adam',
                model_args={
                    'hidden_size': 128,
                    'input_size': 300,
                    'num_layers': 2
                },
                optimizer_args={},
                batch_size=32,
                update_every=1,
                probability_of_mask_for_word=0.3,
                device=torch.device('cpu')
            )
            self.assertGreater(len(trainer.datasets[mode]), 0)
            ixs = [batch.ix for batch in itertools.islice(trainer.batches(mode), 10)]
            self.assertEqual(list(ixs), list(range(1, 11)))

In [267]:
if __name__ == '__main__' and RUN_TESTS:
    import doctest
    
    doctest.testmod()
    unittest.main(
        argv=['first-arg-is-ignored'],
        failfast=True,
        exit=False
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
.
----------------------------------------------------------------------
Ran 1 test in 30.661s

OK


In [268]:
# articles.set.unique()

array(['train', 'val', 'test'], dtype=object)

In [320]:
' '.join(articles.sample(n=2)["text"].fillna('').values.tolist())

' Select a fresh, ripe fruit. Choose a fruit that’s slightly (and evenly) soft, but not mushy. Avoid juicing a fruit that feels hard.The harder the fruit, the more difficult it will be to juice.\nThe heavier the fruit feels, the juicier it will be!The best citrus fruit has a rind free of blemishes, and a bold, sweet scent.;\n, Leave it sitting at room temperature until it is no longer cold to the touch.Warm the fruit in the microwave, on high, for twenty to thirty seconds, if desired. Allow it to cool for one minute.Warm fruit is easier to juice.\nWarming the fruit is especially useful if it was refrigerated.\n\n, Hold the fruit against a countertop or other surface. Apply gentle pressure. Roll the fruit back and forth.Rolling the fruit loosens its inner segments, to prepare the fruit for juicing.\n\n, Make the cut a bit off-center.Cut the fruit lengthwise if it will fit that way into your squeezer.If not, cut the fruit crosswise, then slice off pointy tip of the rind (if applicable).Y

In [328]:
Vocabulary(nlp, [ articles["text"], articles["headline"] ])

ValueError: [E088] Text of length 198298788 exceeds maximum of 1000000. The v2.x parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.

In [325]:
%%time

text = ' '.join(articles.sample(n=200)["text"].fillna('').values.tolist())
counts = nlp(text).count_by(spacy.attrs.LOWER)

[
    nlp.vocab[ix].text
    for ix, _ in sorted(
        [(ix, counts[ix]) for ix in counts],
        key=lambda t: t[1],
        reverse=True
    )
]

CPU times: user 6.46 s, sys: 138 ms, total: 6.6 s
Wall time: 6.62 s


['.',
 ',',
 'the',
 'to',
 'you',
 'a',
 'and',
 'your',
 'of',
 'in',
 'it',
 'or',
 '\n\n',
 'for',
 '\n',
 'if',
 'can',
 'is',
 'on',
 'be',
 'that',
 'with',
 'will',
 'are',
 'as',
 'this',
 'have',
 '-',
 'do',
 'make',
 '\n\n\n',
 'use',
 '"',
 'not',
 'them',
 'they',
 'at',
 '(',
 ')',
 'from',
 'out',
 'may',
 'when',
 'should',
 'want',
 'an',
 'like',
 'more',
 'up',
 'also',
 'one',
 'by',
 'about',
 'so',
 "'s",
 "n't",
 'into',
 'sure',
 '\n\n  ',
 'other',
 'try',
 'time',
 ';',
 'some',
 'all',
 'any',
 'but',
 'get',
 'good',
 'help',
 '“',
 ':',
 '”',
 'add',
 'just',
 'need',
 'then',
 '’s',
 'n’t',
 'way',
 'there',
 'than',
 'water',
 'over',
 'back',
 'their',
 'take',
 'keep',
 'these',
 'before',
 'around',
 'using',
 'what',
 'look',
 'off',
 'too',
 'many',
 'has',
 'through',
 'go',
 'until',
 'place',
 'find',
 'even',
 'down',
 'top',
 'two',
 'might',
 'see',
 'small',
 'once',
 'hair',
 'most',
 'work',
 'well',
 'how',
 'side',
 'while',
 'example',
 

In [298]:
nlp.vocab[1995909169258310477].text

'this'

In [293]:
# trainer = InNotebookTrainer(
#     'test-run-1',
#     nlp,
#     articles,
#     optimizer_class_name='Adam',
#     model_args={
#         'hidden_size': 128,
#         'input_size': 300,
#         'num_layers': 2
#     },
#     optimizer_args={},
#     batch_size=1,
#     update_every=1,
#     probability_of_mask_for_word=0.2,
#     device=torch.device('cuda')
# )

# update_info = next(trainer.updates('train'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [294]:
# %%time

# print(update_info.decoded_inferred_texts)

['jean-clad46364636463646364636464246424642464212dpo12dpo4642464246424642464246424642464246424642464246424642464246424642464246424642464246424642464246424642464046404640464046404642hpthpt464246424642464246424642464246424642464246424642464246424642464212dpo12dpostigmatismstigmatismstigmatismstigmatismstigmatism464246424642464246424642464246364636463646364642463646364636463646364642464246425.3335.3334642464046404640464046404640464046404640464046424642464246424642464246424642464246424642464246424642464246424642464246364636463646364642464046404640464046404640464046404640464246424642464246424642464246424642464246424640464046424642464246404640464012dpo12dpo4642464246424642464246424640464046405.3335.3335.3334642464046404640464246424640464246424642jean-cladjean-clad46424642464246424642464246424642464246424642464246424642stigmatismstigmatismstigmatism46424640464046404640464046404640464046404642464246424642464246424642464246424642464246424642stigmatism12dpo464046404640464046404642464246424642464

### Training

In [269]:
if not RUN_TESTS:
    if 'trainer' in vars():
        print(f"About to delete old trainer")
        del trainer

    trainer = InNotebookTrainer(
        'test-run-1',
        nlp,
        articles,
        optimizer_class_name='Adam',
        model_args={
            'hidden_size': 128,
            'input_size': 300,
            'num_layers': 2
        },
        optimizer_args={},
        batch_size=32,
        update_every=1,
        probability_of_mask_for_word=0.2,
        device=torch.device('cuda')
    )

    trainer.train()

In [270]:
trainer.current_batch_id

NameError: name 'trainer' is not defined