In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
sys.path.append("../src/")
import numpy as np
import pandas as pd
import math
from datasets import Dataset, list_metrics, load_metric, load_from_disk
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, HfArgumentParser, TrainingArguments, default_data_collator

import matplotlib as plt
import seaborn as sns
sns.set_theme()

from argument_parsing.model_args import ModelArguments
from argument_parsing.data_args import DataArguments
from argument_parsing.experiment_args import ExperimentArguments
from runners.run_mlm_exp import ExpRunner

Using GPU:0,1,2,3


In [3]:
gpuids = [0, 1, 2, 3]

if gpuids is None or len(gpuids) == 0:
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    print("Using CPU")
else:
    gpuid_str = str(gpuids[0])
    for gpuid in gpuids[1:]:
        gpuid_str += ",{}".format(gpuid)
    os.environ["CUDA_VISIBLE_DEVICES"] = gpuid_str
    print("Using GPU:{}".format(gpuid_str))

Using GPU:0,1,2,3


In [4]:
import torch
torch.cuda.is_available()

True

In [5]:
parser = HfArgumentParser((ExperimentArguments, ModelArguments, DataArguments, TrainingArguments))

In [6]:
exp_args, model_args, data_args, training_args = parser.parse_json_file(json_file="../src/argument_configs/temp.json")

In [7]:
exp_runner = ExpRunner(exp_args, model_args, data_args, training_args)

[INFO|configuration_utils.py:517] 2021-06-22 09:39:17,930 >> loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /data/ddmg/redditlanguagemodeling/cached/temp/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.d423bdf2f58dc8b77d5f5d18028d7ae4a72dcfd8f468e81fe979ada957a8c361
[INFO|configuration_utils.py:553] 2021-06-22 09:39:17,932 >> Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.6.1",
  "vocab_size": 30522
}



Using GPU:0,1,2,3


[INFO|tokenization_utils_base.py:1717] 2021-06-22 09:39:18,282 >> loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /data/ddmg/redditlanguagemodeling/cached/temp/0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
[INFO|tokenization_utils_base.py:1717] 2021-06-22 09:39:18,285 >> loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json from cache at /data/ddmg/redditlanguagemodeling/cached/temp/75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
[INFO|tokenization_utils_base.py:1717] 2021-06-22 09:39:18,286 >> loading file https://huggingface.co/distilbert-base-uncased/resolve/main/added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:1717] 2021-06-22 09:39:18,287 >> loading file https://huggingface.co/distilbert-base-uncased/resolve/main/sp

Failed to load pre-existing training dataset. Reloading...
Did not find existing datasets. Reloading..


In [8]:
exp_runner.pred_dataset

Dataset({
    features: ['id', 'author', 'data_split', 'subreddit', 'text', 'created_utc', '__index_level_0__'],
    num_rows: 521
})

In [9]:
pred_data = exp_runner.pred_dataset

In [13]:
exp_runner.tokenizer.model_max_length

512

In [14]:
pred_data = pred_data.map(
    lambda x: exp_runner.tokenizer(x["text"], padding='max_length', truncation=True, return_special_tokens_mask=True),
    batched=True,
    num_proc=exp_runner.data_args.preprocessing_num_workers
)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [15]:
set([len(x) for x in pred_data['input_ids']])

{512}

In [16]:
collator = DataCollatorForLanguageModeling(
    tokenizer=exp_runner.tokenizer,
    mlm_probability=exp_runner.data_args.mlm_probability,
)

In [19]:
inputs = torch.tensor(pred_data['input_ids'])
inputs.shape

torch.Size([521, 512])

In [20]:
special_tokens_mask = torch.tensor(pred_data['special_tokens_mask'])
special_tokens_mask.shape

torch.Size([521, 512])

In [21]:
inputs, labels = collator.mask_tokens(inputs, special_tokens_mask)

In [28]:
pred_data = pred_data.rename_column('input_ids', 'original_input_ids')

In [29]:
inputs

Using GPU:0,1,2,3


tensor([[  101,  2309, 10551,  ...,     0,     0,     0],
        [  101,  2129,  2079,  ...,     0,     0,     0],
        [  101,  2323,  1045,  ...,     0,     0,     0],
        ...,
        [  101,  1045,  1031,  ...,  2202,   103,   102],
        [  101,   103,  1045,  ...,     0,     0,     0],
        [  101,  6040,   103,  ...,     0,     0,     0]])

In [23]:
labels

tensor([[-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        ...,
        [-100, -100, -100,  ..., -100, 1996, -100],
        [-100, 2323, -100,  ..., -100, -100, -100],
        [-100, -100, 3531,  ..., -100, -100, -100]])

In [32]:
list_inputs = inputs.tolist()
list_inputs

[[101,
  2309,
  10551,
  2317,
  2606,
  103,
  103,
  2038,
  103,
  2081,
  2033,
  8025,
  1012,
  1045,
  4025,
  2000,
  103,
  1996,
  8138,
  3962,
  1006,
  2783,
  2028,
  103,
  2006,
  1996,
  2067,
  1997,
  103,
  7223,
  1007,
  2008,
  7502,
  1037,
  10551,
  2317,
  2606,
  1012,
  1996,
  2606,
  3544,
  13338,
  1998,
  4050,
  4832,
  2039,
  2185,
  2013,
  1996,
  3096,
  1010,
  1996,
  14902,
  5683,
  2066,
  1037,
  103,
  3538,
  1997,
  6081,
  1012,
  2009,
  8005,
  2041,
  4089,
  1998,
  2788,
  7502,
  2067,
  1012,
  2673,
  2842,
  1999,
  1996,
  103,
  2003,
  3294,
  3671,
  103,
  2601,
  6910,
  2849,
  2606,
  1012,
  103,
  2801,
  103,
  2023,
  2003,
  1029,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 

In [35]:
pred_data = pred_data.add_column(name="input_ids", column=inputs.tolist())

In [36]:
pred_data = pred_data.add_column(name="labels", column=labels.tolist())

In [38]:
pred_data

Dataset({
    features: ['__index_level_0__', 'attention_mask', 'author', 'created_utc', 'data_split', 'id', 'original_input_ids', 'special_tokens_mask', 'subreddit', 'text', 'input_ids', 'labels'],
    num_rows: 521
})

In [41]:
print(pred_data['original_input_ids'][0:2])
print(pred_data['input_ids'][0:2])

[[101, 2309, 10551, 2317, 2606, 1029, 2023, 2038, 2074, 2081, 2033, 8025, 1012, 1045, 4025, 2000, 2031, 1996, 8138, 3962, 1006, 2783, 2028, 2003, 2006, 1996, 2067, 1997, 2026, 7223, 1007, 2008, 7502, 1037, 10551, 2317, 2606, 1012, 1996, 2606, 3544, 13338, 1998, 4050, 4832, 2039, 2185, 2013, 1996, 3096, 1010, 1996, 14902, 5683, 2066, 1037, 4857, 3538, 1997, 6081, 1012, 2009, 8005, 2041, 4089, 1998, 2788, 7502, 2067, 1012, 2673, 2842, 1999, 1996, 2181, 2003, 3294, 3671, 1010, 2601, 6910, 2849, 2606, 1012, 2151, 2801, 2054, 2023, 2003, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## Test New Prepare Data Code

In [48]:
exp_runner = ExpRunner(exp_args, model_args, data_args, training_args)

[INFO|configuration_utils.py:517] 2021-06-22 10:00:28,329 >> loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /data/ddmg/redditlanguagemodeling/cached/temp/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.d423bdf2f58dc8b77d5f5d18028d7ae4a72dcfd8f468e81fe979ada957a8c361
[INFO|configuration_utils.py:553] 2021-06-22 10:00:28,331 >> Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.6.1",
  "vocab_size": 30522
}



Using GPU:0,1,2,3


[INFO|tokenization_utils_base.py:1717] 2021-06-22 10:00:28,675 >> loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /data/ddmg/redditlanguagemodeling/cached/temp/0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
[INFO|tokenization_utils_base.py:1717] 2021-06-22 10:00:28,677 >> loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json from cache at /data/ddmg/redditlanguagemodeling/cached/temp/75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
[INFO|tokenization_utils_base.py:1717] 2021-06-22 10:00:28,678 >> loading file https://huggingface.co/distilbert-base-uncased/resolve/main/added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:1717] 2021-06-22 10:00:28,679 >> loading file https://huggingface.co/distilbert-base-uncased/resolve/main/sp

Failed to load pre-existing prediction dataset. Reloading...
Did not find existing datasets. Reloading..
Creating prediction dataset with labels


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Saving prediction dataset to /data/ddmg/redditlanguagemodeling/data/processed/pred_all_fixed_labels
preparing pred data
saving pred dataset to /data/ddmg/redditlanguagemodeling/data/processed/pred_all_fixed_labels


In [49]:
exp_runner = ExpRunner(exp_args, model_args, data_args, training_args)

[INFO|configuration_utils.py:517] 2021-06-22 10:00:38,880 >> loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /data/ddmg/redditlanguagemodeling/cached/temp/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.d423bdf2f58dc8b77d5f5d18028d7ae4a72dcfd8f468e81fe979ada957a8c361
[INFO|configuration_utils.py:553] 2021-06-22 10:00:38,882 >> Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.6.1",
  "vocab_size": 30522
}

[INFO|tokenization_utils_base.py:1717] 2021-06-22 10:00:39,229 >> loading file ht

Found existing test data with labels.
loaded existing train data
loaded existing eval data
loaded existing pred data (w/out fixed labels)
Found existing test dataset w/ fixed labels: /data/ddmg/redditlanguagemodeling/data/processed/pred_all_fixed_labels
Loading...


In [50]:
exp_runner.pred_dataset

Dataset({
    features: ['__index_level_0__', 'attention_mask', 'author', 'created_utc', 'data_split', 'id', 'input_ids', 'labels', 'original_input_ids', 'special_tokens_mask', 'subreddit', 'text'],
    num_rows: 521
})