In [2]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# reloaded before each cell runs (edits to local source files are picked up live).
%load_ext autoreload
%autoreload 2

In [20]:
import os
import sys
sys.path.append("../src/")
import numpy as np
import pandas as pd
import torch
import math
from datasets import Dataset, list_metrics, load_metric
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer

# `plt` must alias the pyplot submodule: the top-level `matplotlib` package does
# not provide the plotting functions (`plt.subplots`, `plt.show`, ...) that the
# conventional `plt` alias implies.
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme to figures created from here on.
sns.set_theme()

from trainers.my_trainer import MyTrainer

In [4]:
# Device indices this process is allowed to see; leave empty (or None) to hide
# every GPU, in which case the cell reports "Using CPU".
gpuids = [0, 1, 2, 3]

if gpuids:
    # CUDA_VISIBLE_DEVICES takes the indices as one comma-separated string.
    gpuid_str = ",".join(str(gpuid) for gpuid in gpuids)
    os.environ["CUDA_VISIBLE_DEVICES"] = gpuid_str
    print("Using GPU:{}".format(gpuid_str))
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    print("Using CPU")

Using GPU:0,1,2,3


In [5]:
# Sanity check: display whether PyTorch reports a usable CUDA device.
torch.cuda.is_available()

True

In [6]:
# Values that appear in several settings below, spelled out once.
PROJECT_DIR = "/data/ddmg/personalizedmentalhealth/reddit_project"
PRETRAINED_NAME = "bert-base-uncased"

# Which checkpoint / tokenizer / config to load and where downloads are cached.
model_args = dict(
    model_name_or_path=PRETRAINED_NAME,
    tokenizer_name=PRETRAINED_NAME,
    cache_dir=PROJECT_DIR + "/cached_models",
    use_fast_tokenizer=True,
    config_name_or_path=PRETRAINED_NAME,
)

# Evaluation data settings.
data_args = dict(
    data_files=[PROJECT_DIR + "/data/4_all_data.csv"],
    # Number of processes to use for the preprocessing.
    preprocessing_num_workers=None,
    # Ratio of tokens to mask for the MLM loss.
    mlm_probability=0.15,
    # For debugging purposes, truncate the number of evaluation samples to this value if set.
    max_eval_samples=None,
    # Evaluate just on this split of the data (or on all data if None).
    data_split="val",
)

In [7]:
# Read every configured CSV. The "Unnamed: 0" column (presumably an index that was
# written out with the CSV -- confirm against the data export) is discarded when
# present; `errors="ignore"` keeps files without that column loadable instead of
# raising KeyError, and the non-inplace `drop` keeps the cell re-runnable.
df_list = [
    pd.read_csv(data_path).drop(columns="Unnamed: 0", errors="ignore")
    for data_path in data_args["data_files"]
]

# `ignore_index=True` gives the combined frame unique row labels even when more
# than one file is loaded (with a single file the labels are unchanged).
data_df = pd.concat(df_list, ignore_index=True)

# Optionally keep only the rows belonging to the requested split.
split_name = data_args["data_split"]
if split_name is not None:
    data_df = data_df[data_df["data_split"] == split_name]

In [8]:
# Wrap the (optionally split-filtered) DataFrame in a `datasets.Dataset` so it can
# be tokenized with `.map` and handed to the trainer.
dataset = Dataset.from_pandas(data_df)

In [9]:
# Tokenizer loading options, taken directly from `model_args`.
tokenizer_kwargs = dict(
    use_fast=model_args["use_fast_tokenizer"],
    cache_dir=model_args["cache_dir"],
)

tokenizer = AutoTokenizer.from_pretrained(
    model_args["tokenizer_name"],
    **tokenizer_kwargs,
)

In [10]:
# Options forwarded to the config loader; `cache_dir` keeps the config download in
# the same cache directory used for the tokenizer and the model weights.
config_kwargs = {
    "cache_dir": model_args["cache_dir"]
}

# The kwargs were previously built but never passed, so the config ignored the
# configured cache directory.
config = AutoConfig.from_pretrained(model_args["config_name_or_path"], **config_kwargs)

In [11]:
# Load the pretrained checkpoint with a masked-language-modelling head, using the
# config loaded above and the shared cache directory.
# The cell output reports that the checkpoint's `cls.seq_relationship.*` weights are
# not used by BertForMaskedLM; the message itself states this is expected when the
# checkpoint was trained with a different head/architecture.
model = AutoModelForMaskedLM.from_pretrained(
    model_args["model_name_or_path"],
    config=config,
    cache_dir=model_args["cache_dir"]
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
def tokenize_func(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: Mapping with a "text" entry holding the raw strings to encode.

    Returns:
        The result of calling the module-level ``tokenizer`` on those strings,
        with truncation enabled, no padding, and the special-tokens mask included.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        # No padding here: batches are padded dynamically to their longest sequence later.
        padding=False,
        truncation=True,
        # DataCollatorForLanguageModeling is more efficient when it receives
        # the `special_tokens_mask`.
        return_special_tokens_mask=True,
    )
    return encoded

In [13]:
# Tokenize the whole dataset. `batched=True` makes `tokenize_func` receive batches
# of examples rather than single rows; `num_proc` is the number of preprocessing
# processes taken from `data_args` (None in this configuration).
tokenized_dataset = dataset.map(
    tokenize_func,
    batched=True,
    num_proc=data_args["preprocessing_num_workers"]
)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




In [14]:
# Collator that builds masked-language-modelling batches from the tokenized examples;
# `mlm_probability` is the ratio of tokens to mask (see `data_args`).
# NOTE(review): masking is presumably re-sampled randomly on every pass, so metrics
# may vary between runs unless a seed is fixed -- confirm.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=data_args["mlm_probability"],
)

In [29]:
# Accuracy metric object; used by `compute_accuracy` via `metric.compute(...)`.
metric = load_metric("accuracy")

In [30]:
def compute_accuracy(eval_pred):
    """Compute token-level accuracy over the scored positions of an MLM evaluation.

    The previous version handed raw model outputs straight to the metric. The
    debugging transcript at the end of this notebook shows logits of shape
    (32, 512, 30522), i.e. a score per vocabulary entry rather than a predicted
    id, and the labels still contained the ignore marker for unscored positions.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` is either raw
            logits with a trailing vocabulary axis (one more dimension than
            ``labels``) or already-decoded token ids shaped like ``labels``.
            Numpy arrays, nested lists and torch tensors are accepted.

    Returns:
        The result of ``metric.compute`` on the flattened predicted ids and
        reference ids at positions whose label is not -100.
    """

    def _to_numpy(values):
        # Torch tensors (possibly on a GPU / attached to a graph) need an explicit hop to numpy.
        if hasattr(values, "detach"):
            values = values.detach().cpu().numpy()
        return np.asarray(values)

    predictions, labels = eval_pred
    predictions = _to_numpy(predictions)
    labels = _to_numpy(labels)

    # Logits carry an extra vocabulary axis: reduce it to the most likely token id.
    if predictions.ndim == labels.ndim + 1:
        predictions = predictions.argmax(axis=-1)

    # -100 is the label value the MLM collator assigns to tokens that were not masked;
    # those positions must not count towards accuracy. Boolean indexing also flattens.
    scored = labels != -100
    return metric.compute(predictions=predictions[scored], references=labels[scored])

In [33]:
# Build the project's trainer for evaluation only: no training dataset and no
# explicit training arguments are supplied here. The tokenized dataset is the
# evaluation set, `data_collator` builds the MLM batches, and `compute_accuracy`
# is registered as the metrics callback.
trainer = MyTrainer(
    model=model,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_accuracy,
    data_collator=data_collator
)

In [38]:
# Release GPU memory that PyTorch is caching but no longer using before running evaluation.
torch.cuda.empty_cache()

In [39]:
# Work out how many samples to evaluate. `max_eval_samples` (a debugging aid, None
# means "no limit") was previously only written into the reported metrics; the
# evaluation itself always ran on the full dataset, so the reported count could
# disagree with what was actually evaluated.
num_available = len(dataset)
max_eval_samples = data_args["max_eval_samples"] if data_args["max_eval_samples"] is not None else num_available
num_eval_samples = min(max_eval_samples, num_available)

if num_eval_samples < num_available:
    # Evaluate only on the first `num_eval_samples` tokenized examples.
    metrics = trainer.evaluate(eval_dataset=tokenized_dataset.select(range(num_eval_samples)))
else:
    # No truncation requested: use the evaluation dataset the trainer was built with.
    metrics = trainer.evaluate()
metrics["eval_samples"] = num_eval_samples

# Perplexity is exp(loss); a very large loss overflows a float, so report infinity instead.
try:
    perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
    perplexity = float("inf")
metrics["perplexity"] = perplexity

# Print the metrics and persist them via the trainer's helpers.
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)



Python 3.8.10 (default, May 19 2021, 18:05:58) 
Type 'copyright', 'credits' or 'license' for more information
IPython 7.22.0 -- An enhanced Interactive Python. Type '?' for help.

In [1]: type(logits)
Out[1]: torch.Tensor

In [2]: logits.shape
Out[2]: torch.Size([32, 512, 30522])

In [3]: %exit_raise



KillEmbedded: Embedded IPython raising error, as user requested.