In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import os
import sys
sys.path.append("../src/")
import numpy as np
import pandas as pd
import math
from datasets import Dataset, list_metrics, load_metric
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, HfArgumentParser, TrainingArguments

import matplotlib as plt
import seaborn as sns
sns.set_theme()

from argument_parsing.model_args import ModelArguments
from argument_parsing.data_args import DataArguments
from argument_parsing.experiment_args import ExperimentArguments
from runners.run_mlm_exp import ExpRunner
from models.naive_baseline_models import ModeBaseline

In [5]:
# GPU indices this notebook is allowed to use; an empty list (or None)
# selects CPU-only execution.
gpuids = [0, 1, 2, 3]

# CUDA_VISIBLE_DEVICES is set here, ahead of the first `import torch`
# further down in the notebook.
if gpuids:
    # Comma-separated device list, e.g. "0,1,2,3".
    gpuid_str = ",".join(str(gpuid) for gpuid in gpuids)
    os.environ["CUDA_VISIBLE_DEVICES"] = gpuid_str
    print("Using GPU:{}".format(gpuid_str))
else:
    # An empty value hides every CUDA device from this process.
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    print("Using CPU")

Using GPU:0,1,2,3


In [6]:
# torch is imported only after CUDA_VISIBLE_DEVICES has been set above.
# Sanity check: does torch see a usable CUDA device?
import torch
torch.cuda.is_available()

True

In [7]:
# Build one parser over the four argument dataclasses; the order given here is
# the order in which the parsed groups are unpacked in the next cell.
parser = HfArgumentParser((ExperimentArguments, ModelArguments, DataArguments, TrainingArguments))

In [8]:
# Populate the four argument groups from a JSON config file instead of the
# command line. The path is relative to the notebook's working directory.
exp_args, model_args, data_args, training_args = parser.parse_json_file(json_file="../src/argument_configs/temp.json")

In [9]:
# Construct the experiment runner from the parsed argument groups.
# Judging by the log output below, construction loads the model config,
# the tokenizer and the cached train/eval/pred datasets.
exp_runner = ExpRunner(exp_args, model_args, data_args, training_args)

[INFO|configuration_utils.py:517] 2021-06-15 18:57:12,933 >> loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /data/ddmg/personalizedmentalhealth/reddit_project/cached/temp/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
[INFO|configuration_utils.py:553] 2021-06-15 18:57:12,936 >> Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.6.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size

Found existing dataset to load.


[INFO|tokenization_utils_base.py:1717] 2021-06-15 18:57:13,275 >> loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /data/ddmg/personalizedmentalhealth/reddit_project/cached/temp/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
[INFO|tokenization_utils_base.py:1717] 2021-06-15 18:57:13,276 >> loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json from cache at /data/ddmg/personalizedmentalhealth/reddit_project/cached/temp/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
[INFO|tokenization_utils_base.py:1717] 2021-06-15 18:57:13,277 >> loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:1717] 2021-06-15 18:57:13,278 >> loading file https://huggingface.co/bert-base-uncased/resol

loaded existing train data
loaded existing eval data
loaded existing pred data


In [10]:
# Show the training split held by the runner (its columns and row count).
exp_runner.train_dataset

Dataset({
    features: ['__index_level_0__', 'attention_mask', 'author', 'created_utc', 'data_split', 'id', 'input_ids', 'special_tokens_mask', 'subreddit', 'text', 'token_type_ids'],
    num_rows: 1364349
})

In [11]:
# Show the tokenizer the runner uses (name, vocab size, special tokens).
exp_runner.tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [12]:
# Vocabulary size reported by the tokenizer; used below as the length of the
# per-token count array.
exp_runner.tokenizer.vocab_size

30522

In [14]:
# Scratch array for trying out index-based accumulation: five zero-initialised
# float slots.
myarr = np.zeros(5)
myarr

array([0., 0., 0., 0., 0.])

In [15]:
# Scratch check of the update used for counting: add counts[i] to myarr[toks[i]].
# Fancy-indexed `+=` applies each distinct index only once, so this relies on
# `toks` containing no repeated values.
# Note: this mutates myarr in place, so re-running the cell adds the counts again.
toks = np.array([4, 1, 2])
counts = np.array([1, 2, 3])
myarr[toks] += counts

In [16]:
# Inspect the result: positions 4, 1 and 2 should now hold 1, 2 and 3.
myarr

array([0., 2., 3., 0., 1.])

In [18]:
def get_vocab_counts(train_dataset, vocab_size):
    """Count how often each token id occurs in a tokenized dataset.

    Positions whose ``special_tokens_mask`` entry is non-zero are left out
    of the tally.

    Parameters
    ----------
    train_dataset : mapping-like
        Must support ``train_dataset['input_ids']`` and
        ``train_dataset['special_tokens_mask']``, each yielding one sequence
        per example. The two sequences of an example must have equal length.
    vocab_size : int
        Length of the returned array. Every counted token id must be
        smaller than this value.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(vocab_size,)``; entry ``i`` is the number of
        unmasked occurrences of token id ``i``.
    """
    tok2count = np.zeros(vocab_size)

    # Fetch each column a single time. Looking the column up by name inside
    # the loop repeats that lookup for every example, which can be very
    # costly when the container materialises the whole column on access.
    all_input_ids = train_dataset['input_ids']
    all_special_masks = train_dataset['special_tokens_mask']

    for input_ids, special_mask in zip(all_input_ids, all_special_masks):
        # Force an integer dtype: an empty example would otherwise turn into
        # a float array, which NumPy refuses to use as an index below.
        input_ids = np.asarray(input_ids, dtype=np.int64)
        special_mask = np.asarray(special_mask)

        # Keep only the positions that are not flagged as special tokens.
        kept = input_ids[special_mask == 0]

        # np.unique returns duplicate-free ids, so the fancy-indexed `+=`
        # adds every count exactly once.
        toks, counts = np.unique(kept, return_counts=True)
        tok2count[toks] += counts
    return tok2count

In [19]:
# Tally token occurrences over the whole training split.
# The traceback saved under this cell complains about a missing `vocab_size`
# argument, so it predates this two-argument call; re-run to refresh it.
# NOTE(review): the array is sized by tokenizer.vocab_size; confirm no tokens
# were added to the tokenizer, otherwise some ids could exceed that length.
tok2count = get_vocab_counts(exp_runner.train_dataset, exp_runner.tokenizer.vocab_size)

TypeError: get_vocab_counts() missing 1 required positional argument: 'vocab_size'

In [63]:
# Number of distinct token ids that occur at least once in the training data.
# get_vocab_counts returns a NumPy array indexed by token id, which has no
# .keys(); counting its non-zero entries gives the same quantity.
int(np.count_nonzero(tok2count))

2432

In [64]:
# Inspect the per-token counts.
# NOTE(review): the saved output below is a dict keyed by token id, while
# get_vocab_counts as defined in this notebook returns an array; the output
# looks like it came from an earlier version. Re-run to refresh.
tok2count

{1005: 266,
 1010: 450,
 1012: 784,
 1029: 42,
 1037: 284,
 1045: 580,
 1049: 39,
 1055: 83,
 1056: 125,
 1998: 438,
 2000: 451,
 2003: 151,
 2004: 50,
 2005: 159,
 2007: 119,
 2009: 190,
 2014: 134,
 2016: 186,
 2021: 90,
 2023: 110,
 2024: 43,
 2025: 68,
 2026: 351,
 2031: 102,
 2033: 141,
 2036: 21,
 2038: 45,
 2042: 40,
 2043: 92,
 2054: 50,
 2057: 89,
 2061: 65,
 2065: 36,
 2069: 18,
 2070: 23,
 2071: 18,
 2074: 42,
 2079: 51,
 2085: 28,
 2096: 18,
 2145: 15,
 2160: 17,
 2200: 17,
 2360: 33,
 2393: 24,
 2403: 4,
 2428: 29,
 2477: 21,
 2515: 18,
 2574: 5,
 2693: 7,
 2769: 15,
 2812: 5,
 2868: 3,
 2987: 21,
 3670: 1,
 3772: 3,
 4326: 1,
 4902: 9,
 4952: 2,
 4999: 1,
 5358: 1,
 5870: 2,
 6057: 3,
 6200: 1,
 6881: 2,
 8744: 1,
 9015: 2,
 9164: 2,
 9202: 5,
 10740: 2,
 11158: 1,
 12954: 6,
 13233: 2,
 13268: 1,
 13325: 1,
 14099: 1,
 15366: 2,
 16021: 2,
 999: 29,
 1000: 107,
 1003: 5,
 1006: 61,
 1007: 63,
 1013: 27,
 1024: 27,
 1059: 3,
 1996: 333,
 1997: 218,
 1999: 150,
 2001: 175,