In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
sys.path.append("../src/")
import numpy as np
import pandas as pd
import math
from datasets import Dataset, list_metrics, load_metric, load_from_disk
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, HfArgumentParser, TrainingArguments, default_data_collator

import matplotlib as plt
import seaborn as sns
sns.set_theme()

from argument_parsing.model_args import ModelArguments
from argument_parsing.data_args import DataArguments
from argument_parsing.experiment_args import ExperimentArguments
from runners.run_mlm_exp import ExpRunner

Using GPU:0,1,2,3


In [3]:
gpuids = [0, 1, 2, 3]

if gpuids is None or len(gpuids) == 0:
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    print("Using CPU")
else:
    gpuid_str = str(gpuids[0])
    for gpuid in gpuids[1:]:
        gpuid_str += ",{}".format(gpuid)
    os.environ["CUDA_VISIBLE_DEVICES"] = gpuid_str
    print("Using GPU:{}".format(gpuid_str))

Using GPU:0,1,2,3


In [4]:
import torch
torch.cuda.is_available()

True

In [5]:
from torch.utils.data.dataloader import DataLoader

In [6]:
parser = HfArgumentParser((ExperimentArguments, ModelArguments, DataArguments, TrainingArguments))

In [7]:
exp_args, model_args, data_args, training_args = parser.parse_json_file(json_file="../src/argument_configs/temp.json")

In [8]:
exp_runner = ExpRunner(exp_args, model_args, data_args, training_args)

Failed to load pre-existing training dataset. Reloading...
Did not find existing datasets. Reloading..


[INFO|configuration_utils.py:517] 2021-06-21 23:35:03,850 >> loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /data/ddmg/redditlanguagemodeling/cached/temp/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.d423bdf2f58dc8b77d5f5d18028d7ae4a72dcfd8f468e81fe979ada957a8c361
[INFO|configuration_utils.py:553] 2021-06-21 23:35:03,852 >> Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.6.1",
  "vocab_size": 30522
}

[INFO|tokenization_utils_base.py:1717] 2021-06-21 23:35:04,190 >> loading file ht

preparing train data


HBox(children=(FloatProgress(value=0.0, max=22.0), HTML(value='')))


saving train dataset to /data/ddmg/redditlanguagemodeling/data/processed/train_all
preparing eval data


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


saving eval dataset to /data/ddmg/redditlanguagemodeling/data/processed/eval_all
preparing pred data


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


saving pred dataset to /data/ddmg/redditlanguagemodeling/data/processed/pred_all


In [9]:
exp_runner.data_collator = DataCollatorForLanguageModeling(
            tokenizer=exp_runner.tokenizer,
            mlm_probability=exp_runner.data_args.mlm_probability,
            pad_to_multiple_of=512
        )

In [10]:
exp_runner.data_collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), mlm=True, mlm_probability=0.15, pad_to_multiple_of=512)

In [11]:
exp_runner.trainer = exp_runner.load_trainer()

In [12]:
exp_runner.pred_dataset

Dataset({
    features: ['__index_level_0__', 'attention_mask', 'author', 'created_utc', 'data_split', 'id', 'input_ids', 'special_tokens_mask', 'subreddit', 'text'],
    num_rows: 521
})

In [13]:
exp_runner.trainer.get_eval_dataloader(exp_runner.pred_dataset)

[INFO|trainer.py:515] 2021-06-21 23:35:24,324 >> The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: __index_level_0__, data_split, text, author, special_tokens_mask, subreddit, created_utc, id.


<torch.utils.data.dataloader.DataLoader at 0x7f4e9c836b50>

In [15]:
new_dataset = exp_runner.pred_dataset.add_column(name="int_id", column=np.arange(521))
new_dataset

Dataset({
    features: ['__index_level_0__', 'attention_mask', 'author', 'created_utc', 'data_split', 'id', 'input_ids', 'special_tokens_mask', 'subreddit', 'text', 'int_id'],
    num_rows: 521
})

In [16]:
exp_runner.trainer._signature_columns += ["int_id"]
exp_runner.trainer._signature_columns

['input_ids',
 'attention_mask',
 'head_mask',
 'inputs_embeds',
 'labels',
 'output_attentions',
 'output_hidden_states',
 'return_dict',
 'label',
 'label_ids',
 'int_id']

In [17]:
pred_dl = exp_runner.trainer.get_eval_dataloader(new_dataset)

[INFO|trainer.py:515] 2021-06-21 23:35:37,817 >> The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: __index_level_0__, data_split, text, author, special_tokens_mask, subreddit, created_utc, id.


In [18]:
all_batches = []
for item in pred_dl:
    all_batches.append(item)

In [19]:
for item in all_batches:
    print(item['input_ids'].shape)

torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([9, 512])


In [20]:
new_dataset['int_id']

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [21]:
# next step --> collect data and save as new dataset!!!
data_keys = ['attention_mask', 'input_ids', 'token_type_ids', 'int_id', 'labels']
collate_dict = {k: [] for k in data_keys}
for key in data_keys:
    for batch in all_batches:
        for item in batch[key]:
            if key == "int_id":
                item = item.item()
            else:
                item = list(item.numpy())
            collate_dict[key].append(item)

KeyError: 'token_type_ids'

In [22]:
new_dataset2 = new_dataset.add_column(name="labels", column=collate_dict['labels'])

In [23]:
new_dataset2

Dataset({
    features: ['__index_level_0__', 'attention_mask', 'author', 'created_utc', 'data_split', 'id', 'input_ids', 'special_tokens_mask', 'subreddit', 'text', 'token_type_ids', 'int_id', 'labels'],
    num_rows: 7884
})

In [24]:
new_dataset2 = new_dataset2.remove_columns([key for key in collate_dict if key != "labels"])

In [25]:
new_dataset2

Dataset({
    features: ['__index_level_0__', 'author', 'created_utc', 'data_split', 'id', 'special_tokens_mask', 'subreddit', 'text', 'labels'],
    num_rows: 7884
})

In [26]:
for key, val in collate_dict.items():
    if key != "labels":
        new_dataset2 = new_dataset2.add_column(name=key, column=val)

In [27]:
new_dataset2

Dataset({
    features: ['__index_level_0__', 'author', 'created_utc', 'data_split', 'id', 'special_tokens_mask', 'subreddit', 'text', 'labels', 'attention_mask', 'input_ids', 'token_type_ids', 'int_id'],
    num_rows: 7884
})

In [28]:
shapes = [len(x) for x in new_dataset2['input_ids']]

In [29]:
set(shapes)

{512}

In [30]:
save_path = "/data/ddmg/redditlanguagemodeling/data/processed/pred_all_fixed_labels"

In [31]:
new_dataset2.save_to_disk(save_path)

## Load Data and Test in New ExpRunner Instance

In [31]:
dataset = load_from_disk(save_path)

In [32]:
dataset

Dataset({
    features: ['__index_level_0__', 'attention_mask', 'author', 'created_utc', 'data_split', 'id', 'input_ids', 'int_id', 'labels', 'special_tokens_mask', 'subreddit', 'text', 'token_type_ids'],
    num_rows: 7884
})

In [34]:
trainer = exp_runner.trainer

In [35]:
dataset2 = trainer._remove_unused_columns(dataset, description="evaluation")

[INFO|trainer.py:515] 2021-06-21 21:44:46,754 >> The following columns in the evaluation set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: author, text, subreddit, __index_level_0__, created_utc, id, data_split, special_tokens_mask.


In [36]:
dataset2

Dataset({
    features: ['attention_mask', 'input_ids', 'int_id', 'labels', 'token_type_ids'],
    num_rows: 7884
})

In [37]:
sampler = trainer._get_eval_sampler(dataset2)

In [38]:
trainer.args.world_size

1

In [39]:
sampler

<torch.utils.data.sampler.SequentialSampler at 0x7f4fb29b3100>

In [40]:
trainer.args.per_device_eval_batch_size

8

In [41]:
trainer.args.dataloader_pin_memory

True

In [42]:
dl = DataLoader(
    dataset2,
    sampler=sampler,
    batch_size=trainer.args.eval_batch_size,
    collate_fn=default_data_collator,
    drop_last=trainer.args.dataloader_drop_last,
    num_workers=trainer.args.dataloader_num_workers,
    pin_memory=trainer.args.dataloader_pin_memory
)

In [43]:
idx = 0
for batch in dl:
    print(batch['input_ids'].shape)
    if idx % 100 == 0:
        print("batch {}".format(idx))
        print(batch)
    idx += 1

torch.Size([32, 512])
batch 0
{'labels': tensor([[-100, -100, -100,  ..., 3262, -100, -100],
        [-100, 2129, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        ...,
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'input_ids': tensor([[  101,  2026,  3566,  ...,   103,  2138,   102],
        [  101,   103,  2064,  ...,     0,     0,     0],
        [  101,  2026,  9504,  ...,     0,     0,     0],
        ...,
        [  101,  2323, 20643,  ...,     0,     0,     0],
        [  101,  3679,  6040,  ...,     0,     0,     0],
        [  101,  5724, 12882,  ...,     0,     0,     0]]), 'token_type_ids': t

torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 392])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 495])
torch.Size([32, 510])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([12, 423])


## Test as part of full model pipeline

In [6]:
parser = HfArgumentParser((ExperimentArguments, ModelArguments, DataArguments, TrainingArguments))

In [7]:
exp_args, model_args, data_args, training_args = parser.parse_json_file(json_file="../src/argument_configs/temp.json")

In [8]:
data_args

DataArguments(dataset_files=['/data/ddmg/redditlanguagemodeling/data/4_all_data.csv'], fixed_test_data_path='/data/ddmg/redditlanguagemodeling/data/processed/pred_all_fixed_labels', preprocessing_num_workers=None, mlm_probability=0.15, max_train_samples=None, max_eval_samples=None, max_pred_samples=None, train_author=None, eval_author=None, pred_author=None, train_subreddit=None, eval_subreddit=None, pred_subreddit=None, save_dataset=True, load_existing_dataset=True, dataset_dir='/data/ddmg/redditlanguagemodeling/data/processed')

In [9]:
exp_runner = ExpRunner(exp_args, model_args, data_args, training_args)

[INFO|configuration_utils.py:517] 2021-06-21 14:22:47,485 >> loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /data/ddmg/redditlanguagemodeling/cached/temp/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
[INFO|configuration_utils.py:553] 2021-06-21 14:22:47,487 >> Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.6.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



Found existing dataset to load.


[INFO|tokenization_utils_base.py:1717] 2021-06-21 14:22:47,824 >> loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /data/ddmg/redditlanguagemodeling/cached/temp/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
[INFO|tokenization_utils_base.py:1717] 2021-06-21 14:22:47,825 >> loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json from cache at /data/ddmg/redditlanguagemodeling/cached/temp/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
[INFO|tokenization_utils_base.py:1717] 2021-06-21 14:22:47,826 >> loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:1717] 2021-06-21 14:22:47,827 >> loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json fr

loaded existing train data
loaded existing eval data
loaded existing pred data


In [20]:
exp_runner.trainer.data_collator

<function transformers.data.data_collator.default_data_collator(features: List[InputDataClass]) -> Dict[str, torch.Tensor]>

In [10]:
exp_runner.pred_exists

True

In [22]:
exp_runner.trainer._get_eval_sampler(exp_runner.pred_dataset)

<torch.utils.data.sampler.SequentialSampler at 0x7feb7c420f10>

In [23]:
dl = exp_runner.trainer.get_eval_dataloader(exp_runner.pred_dataset)

[INFO|trainer.py:515] 2021-06-21 11:05:00,774 >> The following columns in the evaluation set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: __index_level_0__, author, subreddit, text, special_tokens_mask, id, created_utc, int_id, data_split.


In [31]:
exp_runner.trainer.args.fp16_full_eval

False

In [30]:
model = exp_runner.trainer._wrap_model(exp_runner.trainer.model, training=False)

In [33]:
batch_size = dl.batch_size
batch_size

32

In [34]:
model.eval()

DataParallel(
  (module): BertForMaskedLM(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
           

In [39]:
inputs = exp_runner.trainer._prepare_inputs(my_batch)

In [40]:
inputs

{'labels': tensor([[-100, -100, -100,  ..., 3262, -100, -100],
         [-100, 2129, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, -100],
         ...,
         [-100, -100, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, -100]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'),
 'input_ids': tensor([[  101,  2026,  3566,  ...,   103,  2138,   102],
         [  101,   103,  2064,  ...,     0,     0,     0],
         [  101,  2026,  9504,  ...,     0,     0,     0],
         ...,
         [  101,  2323, 20643,  ...,     0,     0,     0],
         [  101,  3679,  6040,  ...,     0,     0,     0],
         [  101,  5724, 12882,  ...,     0,     0,     0

In [41]:
from transformers.trainer_pt_utils import nested_detach

In [43]:
labels = nested_detach(tuple(inputs.get("labels")))

In [45]:
len(labels)

32

In [52]:
len(inputs['input_ids'])

32

In [49]:
outputs = model(**inputs)

RuntimeError: NCCL Error 2: unhandled system error

In [47]:
with torch.no_grad():
    loss, outputs = exp_runner.trainer.compute_loss(model, inputs, return_outputs=True)

RuntimeError: NCCL Error 2: unhandled system error

In [36]:
for batch in dl:
    my_batch = batch
    break

In [37]:
my_batch

{'labels': tensor([[-100, -100, -100,  ..., 3262, -100, -100],
         [-100, 2129, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, -100],
         ...,
         [-100, -100, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, -100]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'input_ids': tensor([[  101,  2026,  3566,  ...,   103,  2138,   102],
         [  101,   103,  2064,  ...,     0,     0,     0],
         [  101,  2026,  9504,  ...,     0,     0,     0],
         ...,
         [  101,  2323, 20643,  ...,     0,     0,     0],
         [  101,  3679,  6040,  ...,     0,     0,     0],
         [  101,  5724, 12882,  ...,     0,     0,     0]]),
 'token_type_ids': tensor([[0

In [35]:
for batch in dl:
    loss, logits, labels = exp_runner.trainer.prediction_step(model, batch, True)

RuntimeError: NCCL Error 2: unhandled system error

In [28]:
output = exp_runner.trainer.evaluation_loop(
    dl,
    description="Evaluation",
    prediction_loss_only=True,
)
    

[INFO|trainer.py:2115] 2021-06-21 11:07:23,956 >> ***** Running Evaluation *****
[INFO|trainer.py:2117] 2021-06-21 11:07:23,957 >>   Num examples = 64
[INFO|trainer.py:2120] 2021-06-21 11:07:23,959 >>   Batch size = 32


RuntimeError: NCCL Error 2: unhandled system error

In [17]:
batch = exp_runner.pred_dataset['input_ids']
for item in batch:
    print(len(item))

512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
302
302
302
302
302
302
302
302
302
302
302
302
302
302
302
302
302
302
302
302
302
302
302
302
302
302
302
302
302
302
302
302
355
355
355
355
355
355
355
355
355
355
355
355
355
355
355
355
355
355
355
355
355
355
355
355
355
355
355
355
355
355
355
355
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
293
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512


512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
485
485
485
485
485
485
485
485
485
485
485
485
485
485
485
485
485
485
485
485
485
485
485
485
485
485
485
485
485
485
485
485
461
461
461
461
461
461
461
461
461
461
461
461
461
461
461
461
461
461
461
461
461
461
461
461
461
461
461
461
461
461
461
461
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512
512


In [18]:
exp_runner.pred_dataset = exp_runner.pred_dataset.select(range(64))

In [15]:
exp_runner.test()

[INFO|trainer.py:515] 2021-06-21 14:30:48,541 >> The following columns in the evaluation set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: author, data_split, text, int_id, __index_level_0__, created_utc, id, subreddit, special_tokens_mask.
[INFO|trainer.py:2115] 2021-06-21 14:30:48,549 >> ***** Running Evaluation *****
[INFO|trainer.py:2117] 2021-06-21 14:30:48,550 >>   Num examples = 7884
[INFO|trainer.py:2120] 2021-06-21 14:30:48,550 >>   Batch size = 32


In [14]:
exp_runner.training_args.output_dir

'/data/ddmg/redditlanguagemodeling/results/debug'