In [48]:
import os
import json
import wandb
import numpy as np 
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import RobertaForMaskedLM, RobertaTokenizerFast, DataCollatorForLanguageModeling, TrainingArguments, Trainer, AutoModelForMaskedLM

from torch.utils.data import DataLoader, Dataset

Dataset code from Big News paper: 
* https://github.com/launchnlp/POLITICS/blob/main/src/files/dataset.py

TO DO: 
* keep reddit / big news separate
* convert to a script

<h2>Partisan Media Dataset</h2>

In [3]:
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case = True)

In [4]:
class MediaMLMDataset(Dataset):
    def __init__(self, data:list[str], tokenizer, max_len:int = 512):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.data = data
        
    
    def __getitem__(self, index):
        text = self.data[index]
        text = text.strip()
        
        return self.tokenizer(
                    text = text, 
                    padding = True, 
                    truncation = True, 
                    max_length = self.max_len, 
                    return_attention_mask = True, 
                    add_special_tokens = True,
                    return_special_tokens_mask = True,
                    return_token_type_ids = False,
                    return_offsets_mapping = False)
        
    
    def __len__(self):
        return len(self.data)
        

In [13]:
LABEL = 'right'
texts = []
files = [f for f in os.listdir("../data/partisan_media") if f.endswith('.txt')]
for file in files:
    if label in file:  
        with open(os.path.join('../data/partisan_media', file)) as f:
            text = f.readlines()
            texts.extend(text)

In [25]:
train_texts, eval_texts =  train_test_split(texts[:1000])

In [26]:
mlm_dataset = MediaMLMDataset(train_texts, tokenizer)
eval_dataset = MediaMLMDataset(eval_texts, tokenizer)

In [27]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer, 
    mlm = True, 
    mlm_probability = 0.15,
)

In [28]:
BATCH_SIZE = 8
dataloader = DataLoader(mlm_dataset, batch_size = BATCH_SIZE, collate_fn = data_collator)
eval_dataloader = DataLoader(eval_dataset, batch_size = BATCH_SIZE, collate_fn = data_collator)

<h3>Set Up Training</h3>

In [60]:
model_checkpoint = "roberta-base"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [72]:
training_args = TrainingArguments(
    output_dir = 'roberta_pretraining',
    num_train_epochs = 5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    evaluation_strategy = "epoch",
    logging_dir = f"roberta_pretraining/logs",
    logging_strategy = "steps",
    logging_steps = 10,
    learning_rate = 5e-5,
    weight_decay = 0.01,
    warmup_steps = 500,
    save_strategy = "epoch",
    load_best_model_at_end = True,
    save_total_limit = 2
)

In [73]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = mlm_dataset,
    eval_dataset = eval_dataset,
    data_collator = data_collator,
    tokenizer = tokenizer,
)

In [74]:
trainer.train()

RuntimeError: MPS backend out of memory (MPS allocated: 7.98 GB, other allocations: 18.53 GB, max allowed: 27.20 GB). Tried to allocate 785.39 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).