### Electra Training
The author's `ELECTRA` training setup appears to be built around three main classes: `Discriminator`, `Generator`, and `AdversarialTrainer`.

In [1]:
import os
# When the kernel starts inside a `notebooks` folder, move up one level so the
# relative paths and the `src` / `config` imports below resolve.
# NOTE(review): presumably the parent directory is the project root — confirm.
if os.getcwd().endswith('notebooks'):
    os.chdir('..')
    
from random import randint, shuffle
from random import random as rand

import numpy as np
import torch
import torch.nn as nn
import argparse
from tensorboardX import SummaryWriter
import os
import multiprocessing as mp
import src.tokenization
import src.models
import src.optim
import src.train
from src.utils import set_seeds, get_device
from torch.utils.data import Dataset, DataLoader
from src.data import seek_random_offset, SentPairDataset, Pipeline, Preprocess4Pretrain, seq_collate

from config import CONFIG as args

# --- Configuration -----------------------------------------------------------
cfg = src.train.Config.from_json(args.train_cfg)          # training settings
model_cfg = src.models.Config.from_json(args.model_cfg)   # config handed to the Discriminator

# Seed the random number generators before any data loading or weight
# initialisation so that reruns are comparable. `set_seeds` was imported at the
# top of this cell but never called.
# NOTE(review): assumes `set_seeds` takes the seed as its only argument — confirm in src/utils.
set_seeds(cfg.seed)

# --- Tokenizer and preprocessing pipeline ------------------------------------
tokenizer = src.tokenization.FullTokenizer(vocab_file=args.vocab, do_lower_case=True)


def tokenize(text):
    """Run the tokenizer's `convert_to_unicode` and then `tokenize` on `text`."""
    return tokenizer.tokenize(tokenizer.convert_to_unicode(text))


# The pipeline holds a single preprocessing stage.
pipeline = [
    Preprocess4Pretrain(
        args.max_pred,
        args.mask_prob,
        list(tokenizer.vocab.keys()),
        tokenizer.convert_tokens_to_ids,
        model_cfg.max_len,
        args.mask_alpha,
        args.mask_beta,
        args.max_gram,
    )
]

# --- Data loader -------------------------------------------------------------
dataset = SentPairDataset(
    args.data_file,
    cfg.batch_size,
    tokenize,
    model_cfg.max_len,
    pipeline=pipeline,
)
data_iter = DataLoader(
    dataset,
    batch_size=cfg.batch_size,
    collate_fn=seq_collate,
    num_workers=mp.cpu_count(),  # one worker process per reported CPU
)

# --- Models ------------------------------------------------------------------
from src.pretrain import Discriminator, Generator

discriminator = Discriminator(model_cfg)

# The generator is configured from its own JSON file, not from model_cfg.
generator_cfg = src.models.Config.from_json(args.generator_cfg)
generator = Generator(generator_cfg)

# --- Optimizer and trainer ---------------------------------------------------
# One optimizer is built from both networks.
optimizer = src.optim.optim4GPU(cfg, generator, discriminator)
trainer = src.train.AdversarialTrainer(
    cfg,
    discriminator, generator,
    data_iter,
    optimizer, args.ratio, args.save_dir, get_device(),
)

# --- Logging -----------------------------------------------------------------
# Build the log directory path once and reuse it for creation and for the writer.
log_dir = os.path.join(args.log_dir, args.name)
os.makedirs(log_dir, exist_ok=True)
writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX

# --- Train -------------------------------------------------------------------
trainer.train(writer, model_file=None, data_parallel=False)

### Saving the model

In [6]:
# Guard: the manual save below is written for a config whose save_steps is 10000.
assert cfg.save_steps == 10000
# Manual save call, left disabled:
# trainer.save(10000)

In [9]:
# Inspect the generator's parameters by name.
# NOTE(review): this prints every tensor in full; listing only the names and
# shapes would keep the notebook output readable.
trainer.generator.state_dict()

OrderedDict([('decoder_bias',
              tensor([-0.1247, -0.1097, -0.1262,  ..., -0.0903, -0.1164, -0.1771],
                     device='cuda:0')),
             ('transformer.embed.tok_embed1.weight',
              tensor([[ 0.1084, -1.6856,  0.8938,  ..., -1.6080,  0.7902, -0.1269],
                      [ 1.4402,  1.1113,  0.3414,  ...,  0.3611,  0.5536, -1.6411],
                      [-0.6360, -0.2318, -0.5088,  ..., -1.2780,  0.5024, -0.1618],
                      ...,
                      [ 0.4303,  1.7491,  0.3746,  ..., -0.1478,  1.0164, -0.0169],
                      [-1.6252, -0.2167, -0.8579,  ..., -0.2637, -0.4763,  0.1835],
                      [ 1.2073, -0.4374,  0.1862,  ..., -0.1416, -0.1738,  1.2703]],
                     device='cuda:0')),
             ('transformer.embed.tok_embed2.weight',
              tensor([[ 0.2156,  0.1961,  0.0817,  ...,  0.1595, -0.2192, -0.1461],
                      [ 0.0744, -0.0150,  0.1650,  ..., -0.0079,  0.1621, -0.1273],
 

Quite frankly, the existing saving and loading logic is poorly implemented. I propose saving and loading the model's `state_dict` instead, following the pattern from the official PyTorch tutorial:  
https://pytorch.org/tutorials/beginner/saving_loading_models.html  

```
model = TheModelClass(*args, **kwargs)
model.load_state_dict(torch.load(PATH))
model.eval()
```

### The training script

In [1]:
import os
# When the kernel starts inside a `notebooks` folder, move up one level so the
# relative paths and the `src` / `config` imports below resolve.
# NOTE(review): presumably the parent directory is the project root — confirm.
if os.getcwd().endswith('notebooks'):
    os.chdir('..')
    
from random import randint, shuffle
from random import random as rand

import numpy as np
import torch
import torch.nn as nn
import argparse
from tensorboardX import SummaryWriter
import os
import multiprocessing as mp
import src.tokenization
import src.models
import src.optim
import src.train
from src.utils import set_seeds, get_device
from torch.utils.data import Dataset, DataLoader
from src.data import seek_random_offset, SentPairDataset, Pipeline, Preprocess4Pretrain, seq_collate

from config import CONFIG as args

### 0. Tokenizer and pipelines
Two parameters deserve attention here: `max_len` in `electra.json` and `max_pred` in `config.py`.  
`max_len` is the maximum sequence length of the model, whereas `max_pred` belongs to the training setup and is the maximum number of replaced tokens. What is alarming is that the `DataLoader` consistently raises errors when the two values differ: they are meant to be independent numbers, yet the data pipeline effectively forces them to be equal (see the `AssertionError` on `masked_ids` in the Train section below, raised by the `len(v) == self.max_len` check in `src/data.py`). 

In [2]:
# Load the training settings from the JSON file named in the config.
cfg = src.train.Config.from_json(args.train_cfg)

# Seed the random number generators straight away so reruns are comparable.
# `set_seeds` is imported in the first cell but was never called.
# NOTE(review): assumes `set_seeds` takes the seed as its only argument — confirm in src/utils.
set_seeds(cfg.seed)

cfg

Config(seed=128, batch_size=10, lr=0.0005, n_epochs=10, warmup=0.1, save_steps=10000, total_steps=1000000)

In [3]:
# Load the model configuration (later passed to the Discriminator) and display it.
model_cfg = src.models.Config.from_json(args.model_cfg)
model_cfg

Config(vocab_size=30522, hidden=256, hidden_ff=1024, embedding=64, p_drop_hidden=0.1, n_layers=12, n_heads=4, max_len=400, n_segments=2)

In [4]:
# Tokenizer built from the vocabulary file named in the config.
tokenizer = src.tokenization.FullTokenizer(vocab_file=args.vocab, do_lower_case=True)


def tokenize(text):
    """Run the tokenizer's `convert_to_unicode` and then `tokenize` on `text`."""
    return tokenizer.tokenize(tokenizer.convert_to_unicode(text))


# Single preprocessing stage; note that it receives both `args.max_pred`
# and `model_cfg.max_len`.
pretrain_stage = Preprocess4Pretrain(
    args.max_pred,
    args.mask_prob,
    list(tokenizer.vocab.keys()),
    tokenizer.convert_tokens_to_ids,
    model_cfg.max_len,
    args.mask_alpha,
    args.mask_beta,
    args.max_gram,
)
pipeline = [pretrain_stage]

# The dataset and the loader are both given the configured batch size.
sentence_pairs = SentPairDataset(
    args.data_file,
    cfg.batch_size,
    tokenize,
    model_cfg.max_len,
    pipeline=pipeline,
)
data_iter = DataLoader(
    sentence_pairs,
    batch_size=cfg.batch_size,
    collate_fn=seq_collate,
    num_workers=mp.cpu_count(),  # one worker process per reported CPU
)


In [5]:
# Show the path that is handed to SentPairDataset as the training file.
args.data_file

'./data/wiki.train.tokens'

### 1. Discriminator

In [6]:
# Instantiate the discriminator from the model configuration loaded above.
from src.pretrain import Discriminator
discriminator = Discriminator(model_cfg)

### 2. Generator

In [10]:
# Instantiate the generator from its own config file (args.generator_cfg),
# which is loaded separately from the discriminator's model_cfg, then print it.
from src.pretrain import Generator
generator_cfg = src.models.Config.from_json(args.generator_cfg)
generator = Generator(generator_cfg)
print(generator_cfg)

Config(vocab_size=30522, hidden=64, hidden_ff=256, embedding=32, p_drop_hidden=0.1, n_layers=12, n_heads=1, max_len=400, n_segments=2)


### 3. AdversarialTrainer

In [8]:
# One optimizer is built from the generator and the discriminator together.
optimizer = src.optim.optim4GPU(cfg, generator, discriminator)

# Bundle the models, data, optimizer and output location into the trainer.
trainer = src.train.AdversarialTrainer(
    cfg,
    discriminator,
    generator,
    data_iter,
    optimizer,
    args.ratio,
    args.save_dir,
    get_device(),
)

# Make sure the run's log directory exists, then point the writer at it.
tensorboard_dir = os.path.join(args.log_dir, args.name)
os.makedirs(tensorboard_dir, exist_ok=True)
writer = SummaryWriter(log_dir=tensorboard_dir)  # for tensorboardX

cuda (1 GPUs)


### Train

In [9]:
# Start training, passing the tensorboardX writer created above.
# NOTE(review): model_file=None presumably means no checkpoint is loaded, and
# data_parallel=False presumably keeps the run on a single device — confirm in src/train.py.
trainer.train(writer, model_file=None, data_parallel=False)

Iter (loss=X.XXX):   0%|          | 0/3672 [00:00<?, ?it/s]

AssertionError: Caught AssertionError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/vinitrinh/anaconda3/envs/gr3/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/vinitrinh/anaconda3/envs/gr3/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/vinitrinh/anaconda3/envs/gr3/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/vinitrinh/Desktop/trainfromhome-transformer/src/data.py", line 109, in __getitem__
    instance = proc(instance)
  File "/home/vinitrinh/Desktop/trainfromhome-transformer/src/data.py", line 192, in __call__
    assert len(v) == self.max_len, f"unexpected shape in {k}: {v}"
AssertionError: unexpected shape in masked_ids: [101, 12411, 5558, 2053, 11748, 4801, 4360, 1017, 1024, 1026, 4895, 2243, 1028, 11906, 1006, 2887, 1024, 1856, 30332, 30197, 30222, 30218, 30259, 30227, 30255, 30258, 30219, 2509, 1010, 5507, 1012, 11748, 4801, 4360, 1997, 1996, 11686, 1017, 1007, 1010, 4141, 3615, 2000, 2004, 11748, 4801, 4360, 11906, 3523, 2648, 2900, 1010, 2003, 1037, 8608, 2535, 1030, 1011, 1030, 2652, 2678, 2208, 2764, 2011, 16562, 1998, 2865, 1012, 4432, 2005, 1996, 9160, 12109, 1012, 2207, 1999, 2254, 2249, 1999, 2900, 1010, 2009, 2003, 1996, 2353, 2208, 1999, 1996, 11748, 4801, 4360, 2186, 1012, 1026, 4895, 2243, 1028, 1996, 2168, 10077, 1997, 8608, 1998, 2613, 1030, 1011, 1030, 2051, 11247, 2004, 2049, 16372, 1010, 1996, 2466, 3216, 5903, 2000, 1996, 2034, 2208, 1998, 4076, 1996, 1000, 2171, 3238, 1000, 1010, 1037, 18476, 2510, 3131, 3529, 1996, 3842, 1997, 26033, 2401, 2076, 1996, 2117, 12124, 2078, 2162, 2040, 4685, 3595, 2304, 3136, 1998, 2024, 25895, 2114, 1996, 4461, 3131, 1000, 1026, 4895, 2243, 1028, 10000, 1000, 1012, 1996, 2208, 2211, 2458, 1999, 2230, 1010, 4755, 2058, 1037, 2312, 4664, 1997, 1996, 2147, 2589, 2006, 11748, 4801, 4360, 11906, 2462, 1012, 2096, 2009, 6025, 1996, 3115, 2838, 1997, 1996, 2186, 1010, 2009, 2036, 9601, 3674, 24081, 1010, 2107, 2004, 2437, 1996, 2208, 2062, 1026, 4895, 2243, 1028, 2005, 2186, 24159, 1012, 2839, 5859, 1026, 4895, 2243, 1028, 10189, 23099, 1998, 4543, 2718, 24303, 7842, 21138, 11439, 2119, 2513, 2013, 3025, 10445, 1010, 2247, 2007, 11748, 4801, 4360, 11906, 2462, 2472, 3138, 4048, 11472, 10830, 1012, 1037, 2312, 2136, 1997, 4898, 8971, 1996, 5896, 1012, 1996, 2208, 1005, 1055, 3098, 4323, 2001, 7042, 2011, 2089, 1005, 1050, 1012, 102, 2009, 2777, 2007, 3893, 4341, 1999, 2900, 1010, 1998, 2001, 5868, 2011, 2119, 2887, 1998, 2530, 4401, 1012, 2044, 2713, 1010, 2009, 2363, 26720, 4180, 1010, 2247, 2007, 2019, 4423, 3179, 1999, 2281, 1997, 2008, 2095, 1012, 2009, 2001, 2036, 5967, 2046, 8952, 
1998, 2019, 2434, 2678, 7284, 2186, 1012, 2349, 2000, 2659, 4341, 1997, 11748, 4801, 4360, 11906, 2462, 1010, 11748, 4801, 4360, 11906, 3523, 2001, 2025, 22574, 1010, 2021, 1037, 5470, 5449, 11892, 2007, 1996, 2208, 1005, 1055, 4423, 3179, 2001, 2207, 1999, 2297, 1012, 2865, 1012, 4432, 2052, 2709, 2000, 1996, 6329, 2007, 1996, 2458, 1997, 11748, 4801, 4360, 1024, 24296, 4329, 2005, 1996, 9160, 1018, 1012, 102]


### Does it really work? Is it really intelligent?

### The `SentPairDataset` dataset object and the `DataLoader` data loader. 

In [11]:
# Build the dataset directly (without a DataLoader) on the `wiki.test.tokens` file.
# The positional arguments follow the same order used for the training loader:
# data file, batch size, tokenize callable, maximum sequence length.
SPD = SentPairDataset(
    './data/wiki.test.tokens',
    16,
    tokenize,
    400,
    pipeline=pipeline,
)

In [8]:
from tqdm import tqdm

# Stand-alone smoke test of the data pipeline on the test file. Every setting is
# spelled out here rather than read from the config objects; the names mirror
# the config fields passed in the same positions in the training cells.
VOCAB_FILE = './data/vocab.txt'
TEST_FILE = './data/wiki.test.tokens'
BATCH_SIZE = 16
MAX_LEN = 400
MAX_PRED = 75
MASK_PROB = 0.15
MASK_ALPHA = 1
MASK_BETA = 1
MAX_GRAM = 3
NUM_WORKERS = 8

tokenizer = src.tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=True)


def tokenize(text):
    """Run the tokenizer's `convert_to_unicode` and then `tokenize` on `text`."""
    return tokenizer.tokenize(tokenizer.convert_to_unicode(text))


pipeline = [
    Preprocess4Pretrain(
        MAX_PRED,
        MASK_PROB,
        list(tokenizer.vocab.keys()),
        tokenizer.convert_tokens_to_ids,
        MAX_LEN,
        MASK_ALPHA,
        MASK_BETA,
        MAX_GRAM,
    )
]

test_pairs = SentPairDataset(
    TEST_FILE,
    BATCH_SIZE,
    tokenize,
    MAX_LEN,
    pipeline=pipeline,
)
data_iter = DataLoader(
    test_pairs,
    batch_size=BATCH_SIZE,
    collate_fn=seq_collate,
    num_workers=NUM_WORKERS,
)

# Walk the whole loader once; unpacking checks that each batch has eight fields.
for batch in tqdm(data_iter):
    (
        input_ids,
        segment_ids,
        input_mask,
        masked_ids,
        masked_pos,
        masked_weights,
        is_next,
        original_ids,
    ) = batch

  0%|          | 0/273 [00:00<?, ?it/s]

NameError: Caught NameError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/vinitrinh/anaconda3/envs/gr3/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/vinitrinh/anaconda3/envs/gr3/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/vinitrinh/anaconda3/envs/gr3/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/vinitrinh/Desktop/trainfromhome-transformer/src/data.py", line 109, in __getitem__
    instance = proc(instance)
  File "/home/vinitrinh/Desktop/trainfromhome-transformer/src/data.py", line 192, in __call__
    assert len(input_) == max_len, f"unexpected shape in {input_.__name__}: {input_}"
NameError: name 'max_len' is not defined
