In [1]:
import os
import wandb
import numpy as np 
import pandas as pd
from tqdm import tqdm
from transformers import RobertaForMaskedLM, RobertaTokenizerFast, DataCollatorForLanguageModeling, TrainingArguments, Trainer

from torch.utils.data import DataLoader, Dataset

Dataset code from Big News paper: 
* https://github.com/launchnlp/POLITICS/blob/main/src/files/dataset.py

<h2>Load in Pre-Trained RoBERTa</h2>

In [2]:
model = RobertaForMaskedLM.from_pretrained("roberta-base")

In [3]:
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case = True)

<h2>Load in Data</h2>

In [4]:
class MediaMLMDataset(Dataset):
    def __init__(self, data:list[str], tokenizer, max_len:int = 512):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.data = data
        
    
    def __getitem__(self, index):
        text = self.data[index]
        text = text.strip()
        
        return self.tokenizer(
                    text = text, 
                    padding = True, 
                    truncation = True, 
                    max_length = self.max_len, 
                    return_attention_mask = True, 
                    add_special_tokens = True,
                    return_special_tokens_mask = True,
                    return_token_type_ids = False,
                    return_offsets_mapping = False)
        
    
    def __len__(self):
        return len(self.data)
        

In [16]:
texts = []
files = [f for f in os.listdir("../data/partisan_media") if f.endswith('.txt')]
for file in files:
    with open(os.path.join('../data/partisan_media', file)) as f:
        text = f.readlines()
        texts.extend(text)

In [17]:
mlm_dataset = MediaMLMDataset(texts, tokenizer)

In [18]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer, 
    mlm = True, 
    mlm_probability = 0.15,
)

In [19]:
BATCH_SIZE = 8
dataloader = DataLoader(mlm_dataset, batch_size = BATCH_SIZE, collate_fn = data_collator)

<h2>Set Up Training</h2>

In [20]:
from transformers import AutoModelForMaskedLM
model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [21]:
training_args = TrainingArguments(
    output_dir = 'roberta_pretraining',
    num_train_epochs = 5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    evaluation_strategy = "epoch",
    logging_dir = f"roberta_pretraining/logs",
    logging_strategy = "steps",
    logging_steps = 10,
    learning_rate = 5e-5,
    weight_decay = 0.01,
    warmup_steps = 500,
    save_strategy = "epoch",
    load_best_model_at_end = True,
    save_total_limit = 2
)

In [22]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = mlm_dataset,
    data_collator = data_collator,
    tokenizer = tokenizer,
)

In [23]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mhaleyej[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x3b8364c80>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x3b83c14a0>
    label = <none> 
    device = <AGXG

Epoch,Training Loss,Validation Loss


Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x3bfe6e140>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x4031d3d70>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReference

Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x31ffbf150>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x371fd3de0>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReference

Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x4cce98400>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x58267e130>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReference

Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x5837e7dc0>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x5f1540740>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReference

Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x6bab726e0>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x6ba9f7040>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReference

Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x4cc4dd5c0>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x5a6d747f0>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReference

Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x6b9e4bfe0>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x6ba1bb690>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReference

Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x71ee168c0>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x71ee168c0>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReference

Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x602b102d0>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x600958890>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReference

Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x71bac41a0>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000010a:Internal Error)
	<AGXG14GFamilyCommandBuffer: 0x7d334bbc0>
    label = <none> 
    device = <AGXG14GDevice: 0x12f59dc00>
        name = Apple M2 
    commandQueue = <AGXG14GFamilyCommandQueue: 0x12e7fee00>
        label = <none> 
        device = <AGXG14GDevice: 0x12f59dc00>
            name = Apple M2 
    retainedReference

KeyboardInterrupt: 