In [1]:
import os
os.environ['TRANSFORMERS_CACHE'] = '/scratch/nhj4247/python_cache/'

In [2]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="roberta-base",
    tokenizer="roberta-base"
)
fill_mask("Send these <mask> back!")

[{'sequence': 'Send these pictures back!',
  'score': 0.16661524772644043,
  'token': 3493,
  'token_str': ' pictures'},
 {'sequence': 'Send these photos back!',
  'score': 0.10792797058820724,
  'token': 2356,
  'token_str': ' photos'},
 {'sequence': 'Send these emails back!',
  'score': 0.0767090767621994,
  'token': 5575,
  'token_str': ' emails'},
 {'sequence': 'Send these images back!',
  'score': 0.0486077181994915,
  'token': 3156,
  'token_str': ' images'},
 {'sequence': 'Send these letters back!',
  'score': 0.04841756820678711,
  'token': 5430,
  'token_str': ' letters'}]

In [None]:
## Out-of-the box model training

from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMaskedLM.from_pretrained('roberta-base')

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="/scratch/nhj4247/data/tweeteval/datasets/hate/train_text.txt",
    block_size=512,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    output_dir="./roberta-retrained",
    overwrite_output_dir=True,
    num_train_epochs=25,
    per_device_train_batch_size=48,
    save_steps=500,
    save_total_limit=2,
    seed=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

trainer.train()

trainer.save_model("./roberta-retrained")



***** Running training *****
  Num examples = 8993
  Num Epochs = 25
  Instantaneous batch size per device = 48
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 1
  Total optimization steps = 4700


Step,Training Loss
500,2.3401
1000,2.0873
1500,1.9403
2000,1.8361


Saving model checkpoint to ./roberta-retrained/checkpoint-500
Configuration saved in ./roberta-retrained/checkpoint-500/config.json
Model weights saved in ./roberta-retrained/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./roberta-retrained/checkpoint-1000
Configuration saved in ./roberta-retrained/checkpoint-1000/config.json
Model weights saved in ./roberta-retrained/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./roberta-retrained/checkpoint-1500
Configuration saved in ./roberta-retrained/checkpoint-1500/config.json
Model weights saved in ./roberta-retrained/checkpoint-1500/pytorch_model.bin
Deleting older checkpoint [roberta-retrained/checkpoint-500] due to args.save_total_limit
Saving model checkpoint to ./roberta-retrained/checkpoint-2000
Configuration saved in ./roberta-retrained/checkpoint-2000/config.json
Model weights saved in ./roberta-retrained/checkpoint-2000/pytorch_model.bin
Deleting older checkpoint [roberta-retrained/checkpoint-1000] due to 

In [20]:
## Custom model training

from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import trange, tqdm
from torch.utils.data import RandomSampler, DataLoader, SequentialSampler
import torch
import os

device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMaskedLM.from_pretrained('roberta-base')

train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="/scratch/nhj4247/data/tweeteval/datasets/hate/train_text.txt",
    block_size=512,
)

eval_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="/scratch/nhj4247/data/tweeteval/datasets/hate/val_text.txt",
    block_size=512,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

args = TrainingArguments(
    output_dir="./mlm_roberta_tweeteval",
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=48,
    save_steps=500,
    save_total_limit=2,
    seed=1,
    eval_steps=500
)

# trainer = Trainer(
#     model=model,
#     args=args,
#     data_collator=data_collator,
#     train_dataset=dataset
# )


def evaluate(args, model, eval_dataset):
    
    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=args.per_device_eval_batch_size,
                                 sampler=SequentialSampler(eval_dataset),
                                 collate_fn=data_collator)
    eval_iterator = tqdm(eval_dataloader, desc="Evaluating", position=0, leave=True)
    eval_loss = 0
    for _, batch in enumerate(eval_iterator):
        batch.to(device)
        model.eval()
        outputs = model(**batch)
        eval_loss += outputs['loss'].item()
        
    return eval_loss


## Training loop

train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset,
                              batch_size=args.per_device_train_batch_size,
                              sampler=train_sampler,
                              collate_fn=data_collator)
t_total = len(train_dataloader) * args.num_train_epochs

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)


print("***** Running training *****")
print("  Num examples = %d" %len(train_dataset))
print("  Num Epochs = %d" %args.num_train_epochs)
print("  Instantaneous batch size per GPU = %d" %args.per_device_train_batch_size)
print("  Total optimization steps = %d" %t_total)

train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
global_step = 0
model.to(device)

for epoch in train_iterator:
    epoch_iterator = tqdm(train_dataloader, desc="Iteration", position=0, leave=True)
    for step, batch in enumerate(epoch_iterator):
        batch.to(device)
        model.train()
        outputs = model(**batch)
        loss = outputs['loss']
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()
        scheduler.step()
        model.zero_grad()
        global_step += 1
        
        if global_step % args.save_steps == 0:
            output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
                model.save_pretrained(output_dir)
                torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                
        if global_step % args.eval_steps == 0:
            eval_loss = evaluate(args, model, eval_dataset)
            print("Eval loss at %d is %.2f" %(global_step, eval_loss))
            
            

                                 



***** Running training *****
  Num examples = 8993
  Num Epochs = 20
  Instantaneous batch size per GPU = 48
  Total optimization steps = 3760


Iteration: 100%|██████████| 188/188 [01:02<00:00,  3.03it/s]
Iteration: 100%|██████████| 188/188 [01:02<00:00,  3.00it/s]
Evaluating: 100%|██████████| 125/125 [00:02<00:00, 49.07it/s]
Iteration:  66%|██████▌   | 124/188 [00:45<01:27,  1.37s/it]

Eval loss at 500 is 271.78


Iteration: 100%|██████████| 188/188 [01:06<00:00,  2.82it/s]
Iteration: 100%|██████████| 188/188 [01:03<00:00,  2.97it/s]
Iteration: 100%|██████████| 188/188 [01:02<00:00,  2.99it/s]
Evaluating: 100%|██████████| 125/125 [00:02<00:00, 49.15it/s]
Iteration:  32%|███▏      | 60/188 [00:23<02:55,  1.37s/it]

Eval loss at 1000 is 257.74


Iteration: 100%|██████████| 188/188 [01:06<00:00,  2.82it/s]
Iteration: 100%|██████████| 188/188 [01:03<00:00,  2.97it/s]
Evaluating: 100%|██████████| 125/125 [00:02<00:00, 49.00it/s]
Iteration:  98%|█████████▊| 184/188 [01:05<00:05,  1.39s/it]

Eval loss at 1500 is 274.02


Iteration: 100%|██████████| 188/188 [01:06<00:00,  2.83it/s]
Iteration: 100%|██████████| 188/188 [01:03<00:00,  2.98it/s]
Iteration: 100%|██████████| 188/188 [01:03<00:00,  2.98it/s]
Evaluating: 100%|██████████| 125/125 [00:02<00:00, 48.70it/s]
Iteration:  64%|██████▍   | 120/188 [00:43<01:34,  1.39s/it]

Eval loss at 2000 is 263.11


Iteration: 100%|██████████| 188/188 [01:06<00:00,  2.83it/s]
Iteration: 100%|██████████| 188/188 [01:02<00:00,  2.99it/s]
Iteration: 100%|██████████| 188/188 [01:03<00:00,  2.98it/s]
Evaluating: 100%|██████████| 125/125 [00:02<00:00, 48.68it/s]
Iteration:  30%|██▉       | 56/188 [00:22<03:00,  1.37s/it]

Eval loss at 2500 is 263.85


Iteration: 100%|██████████| 188/188 [01:06<00:00,  2.82it/s]
Iteration: 100%|██████████| 188/188 [01:02<00:00,  2.99it/s]
Evaluating: 100%|██████████| 125/125 [00:02<00:00, 48.98it/s]
Iteration:  96%|█████████▌| 180/188 [01:03<00:11,  1.40s/it]

Eval loss at 3000 is 255.08


Iteration: 100%|██████████| 188/188 [01:05<00:00,  2.86it/s]
Iteration: 100%|██████████| 188/188 [01:03<00:00,  2.98it/s]
Iteration: 100%|██████████| 188/188 [01:03<00:00,  2.97it/s]
Evaluating: 100%|██████████| 125/125 [00:02<00:00, 49.06it/s]
Iteration:  62%|██████▏   | 116/188 [00:42<01:41,  1.41s/it]

Eval loss at 3500 is 261.31


Iteration: 100%|██████████| 188/188 [01:06<00:00,  2.82it/s]
Iteration: 100%|██████████| 188/188 [01:03<00:00,  2.98it/s]
Epoch: 100%|██████████| 20/20 [21:24<00:00, 64.20s/it]
