In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
import math
from datasets import Dataset
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer

## 1) Select GPUs to use

In [3]:
gpuids = [0, 1, 2, 3]

if gpuids is None or len(gpuids) == 0:
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    print("Using CPU")
else:
    gpuid_str = str(gpuids[0])
    for gpuid in gpuids[1:]:
        gpuid_str += ",{}".format(gpuid)
    os.environ["CUDA_VISIBLE_DEVICES"] = gpuid_str
    print("Using GPU:{}".format(gpuid_str))

Using GPU:0,1,2,3


In [4]:
os.environ["CUDA_VISIBLE_DEVICES"]

'0,1,2,3'

In [5]:
import torch
torch.cuda.is_available()

True

## 2) Set Experiment Args

In [6]:
model_args = {
    "model_name_or_path": "bert-base-uncased",
    "tokenizer_name": "bert-base-uncased",
    "cache_dir": "/data/ddmg/personalizedmentalhealth/reddit_project/cached_models",
    "use_fast_tokenizer": True,
    "config_name_or_path": "bert-base-uncased",
}

data_args = {
    "data_files": ["/data/ddmg/personalizedmentalhealth/reddit_project/data/4_all_data.csv"],
    "preprocessing_num_workers": None,  # number of processes to use for the preprocessing
    "mlm_probability": 0.15,  # ratio of tokens to mask for MLM loss
    "max_eval_samples": None,  # for debugging purposes, truncate # of evaluation samples to this value if set,
    "data_split": "val",  # evaluate just on this split of data (or all data if None)
}

## 3) Load dataset

In [7]:
df_list = []
for data_path in data_args["data_files"]:
    df = pd.read_csv(data_path)
    df.drop(columns="Unnamed: 0", inplace=True)
    df_list.append(df)
data_df = pd.concat(df_list)
if data_args["data_split"] is not None:
    data_df = data_df[data_df["data_split"] == data_args["data_split"]]

In [8]:
dataset = Dataset.from_pandas(data_df)

## 4) Load Tokenizer

In [9]:
tokenizer_kwargs = {
    "use_fast": model_args["use_fast_tokenizer"],
    "cache_dir": model_args["cache_dir"]
}

tokenizer = AutoTokenizer.from_pretrained(model_args["tokenizer_name"], **tokenizer_kwargs)

## 5) Load Config

In [10]:
config_kwargs = {
    "cache_dir": model_args["cache_dir"]
}

config = AutoConfig.from_pretrained(model_args["config_name_or_path"])

## 6) Load Model

In [11]:
model = AutoModelForMaskedLM.from_pretrained(
    model_args["model_name_or_path"],
    config=config,
    cache_dir=model_args["cache_dir"]
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 7) Preprocess data

In [12]:
dataset = dataset.select(range(100))

TypeError: 'int' object is not iterable

In [21]:
def tokenize_func(examples):
    return tokenizer(
        examples["text"],
        padding=False,  # do dynamic padding to longest sequence in batch later
        truncation=True,
        # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
        # receives the `special_tokens_mask`.
        return_special_tokens_mask=True
    )

In [22]:
tokenized_dataset = dataset.map(
    tokenize_func,
    batched=True,
    num_proc=data_args["preprocessing_num_workers"]
)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




In [31]:
tokenized_dataset

Dataset({
    features: ['__index_level_0__', 'attention_mask', 'author', 'created_utc', 'data_split', 'id', 'input_ids', 'special_tokens_mask', 'subreddit', 'text', 'token_type_ids'],
    num_rows: 7226
})

In [34]:
input_ids = tokenized_dataset['input_ids']

In [23]:
if data_args["max_eval_samples"] is not None:
    tokenized_dataset = tokenized_dataset.select(range(data_args["max_eval_samples"]))

In [26]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=data_args["mlm_probability"],
)

## 8) Initialize Trainer

In [27]:
trainer = Trainer(
    model=model,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

## 9) Evaluate Model

In [40]:
metrics = trainer.evaluate()
max_eval_samples = data_args["max_eval_samples"] if data_args["max_eval_samples"] is not None else len(dataset)
metrics["eval_samples"] = min(max_eval_samples, len(dataset))
try:
    perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
    perplexity = float("inf")
metrics["perplexity"] = perplexity
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

In [41]:
metrics

{'eval_loss': 2.6866581439971924,
 'eval_runtime': 95.2486,
 'eval_samples_per_second': 75.865,
 'eval_samples': 7226,
 'perplexity': 14.682526950967066}

## 10) Fine-Tune Model

In [44]:
data_args = {
    "data_files": ["/data/ddmg/personalizedmentalhealth/reddit_project/data/4_all_data.csv"],
    "preprocessing_num_workers": None,  # number of processes to use for the preprocessing
    "mlm_probability": 0.15,  # ratio of tokens to mask for MLM loss
    "max_train_samples": None, # for debugging purposes, truncate # of train samples to this value if set
    "max_eval_samples": None,  # for debugging purposes, truncate # of evaluation samples to this value if set
}

In [45]:
df_list = []
for data_path in data_args["data_files"]:
    df = pd.read_csv(data_path)
    df.drop(columns="Unnamed: 0", inplace=True)
    df_list.append(df)
data_df = pd.concat(df_list)
train_df = data_df[data_df["data_split"] == "train"]
eval_df = data_df[data_df["data_split"] == "val"]
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

In [46]:
tokenized_train = train_dataset.map(
    tokenize_func,
    batched=True,
    num_proc=data_args["preprocessing_num_workers"]
)
if data_args["max_train_samples"] is not None:
    tokenized_dataset = tokenized_train.select(range(data_args["max_train_samples"]))

HBox(children=(FloatProgress(value=0.0, max=1365.0), HTML(value='')))




In [47]:
tokenized_eval = eval_dataset.map(
    tokenize_func,
    batched=True,
    num_proc=data_args["preprocessing_num_workers"]
)
if data_args["max_eval_samples"] is not None:
    tokenized_dataset = tokenized_eval.select(range(data_args["max_eval_samples"]))

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




In [49]:
trainer = Trainer(
    model=model,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [None]:
train_result = trainer.train()
trainer.save_model()
metrics = train_result.metrics
max_train_samples = (
    data_args["max_train_samples"] if data_args["max_train_samples"] is not None else len(train_dataset)
)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

Step,Training Loss
500,1.9304
1000,1.8001
1500,1.749
2000,1.7172
2500,1.6951
3000,1.6768
3500,1.6633
4000,1.6508
4500,1.6383
5000,1.6159


