In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification, AdamW
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW
from torch.cuda.amp import GradScaler, autocast
import torch.nn as nn
from tqdm.auto import tqdm

In [7]:
# Reading in the preprocessed augmented data
data_preprocessed = pd.read_csv('data_2_preprocessed_balanced.csv')

# Work on a fixed-size random subset of the rows. The requested size is capped
# at the number of available rows because DataFrame.sample raises a ValueError
# when n exceeds the frame length (sampling without replacement).
SAMPLE_SIZE = 500
RANDOM_STATE = 42  # fixed seed so the same subset is drawn on every run
reduced_data_preprocessed = data_preprocessed.sample(
    n=min(SAMPLE_SIZE, len(data_preprocessed)),
    random_state=RANDOM_STATE,
)


In [8]:
### Splitting the data into training/test sets
X = reduced_data_preprocessed['processed_text']
y = reduced_data_preprocessed['binary_label']

# Split the data - 60% training, 40% testing (test_size=0.4), seeded for reproducibility.
# NOTE(review): this comment previously claimed an 80/20 split, which the code
# does not do; if 80/20 is what is wanted, test_size must be 0.2 — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)


In [9]:
# Select the compute device: CUDA when torch reports it as available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained 'roberta-base' tokenizer and encoder.
# The encoder is not moved to `device` in this cell.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaModel.from_pretrained('roberta-base')

# Define your custom dataset
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Iterable of raw texts. Each item is passed through ``str()``
            before tokenization.
        labels: Iterable of integer class labels, aligned one-to-one with ``texts``.
        tokenizer: Callable tokenizer that accepts the keyword arguments used in
            ``__getitem__`` and returns a mapping with 'input_ids' and
            'attention_mask' tensors.
        max_len: Length every encoded text is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        # Materialize as lists so __getitem__ always indexes by position.
        # A pandas Series (e.g. straight out of train_test_split) keeps its
        # original, shuffled index, so series[0] would look up a label rather
        # than the first row and either fail or return the wrong sample.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize the text at position ``item``.

        Returns:
            dict: 'input_ids' and 'attention_mask' as flattened 1-D tensors,
            and 'labels' as a scalar ``torch.long`` tensor.
        """
        text = str(self.texts[item])
        label = self.labels[item]
        # Call the tokenizer directly; encode_plus is deprecated in favour of __call__.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it so
            # the DataLoader can stack samples into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Build the train/test datasets from the split Series (converted to plain lists).
train_dataset = TextDataset(X_train.tolist(), y_train.tolist(), tokenizer)
test_dataset = TextDataset(X_test.tolist(), y_test.tolist(), tokenizer)

# Both loaders share the batch size and worker count; only the training loader shuffles.
# NOTE(review): with num_workers > 0 the worker processes must be able to load
# TextDataset; on platforms that start workers via spawn this can fail for
# classes defined inside a notebook — confirm on the target platform.
loader_options = dict(batch_size=4, num_workers=4)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(dataset=test_dataset, **loader_options)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:

# Define the RoBERTa-LSTM model
class RobertaLSTM(nn.Module):
    """Frozen RoBERTa encoder followed by an LSTM over its token states and a linear classifier.

    Args:
        roberta_model: Encoder module exposing ``config.hidden_size``. Called with
            ``return_dict=False`` it must return a tuple whose first element is the
            per-token hidden states.
        lstm_hidden_size: Hidden size of the LSTM.
        num_classes: Number of output classes (size of the logits' last axis).
    """

    def __init__(self, roberta_model, lstm_hidden_size, num_classes):
        super().__init__()
        self.roberta = roberta_model
        self.lstm = nn.LSTM(roberta_model.config.hidden_size, lstm_hidden_size, batch_first=True)
        self.classifier = nn.Linear(lstm_hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Compute class logits for a batch.

        Args:
            input_ids: Token ids, one row per sample.
            attention_mask: 1 for real tokens and 0 for padding, same layout as
                ``input_ids``. Padding is assumed to be on the right — TODO
                confirm against the tokenizer's padding side.

        Returns:
            Tensor of logits with one row per sample and ``num_classes`` columns.
        """
        # The encoder is used as a frozen feature extractor: no gradients flow into it.
        with torch.no_grad():
            # Take the per-token hidden states, not the pooled output. The pooled
            # output comes from the pooler layer, whose weights are reported as
            # newly initialized when loading 'roberta-base' and would never be
            # trained under no_grad. Feeding a single pooled vector to the LSTM
            # would also reduce it to a one-step sequence.
            sequence_output = self.roberta(
                input_ids=input_ids, attention_mask=attention_mask, return_dict=False
            )[0]

        # Number of real tokens per sample. pack_padded_sequence needs CPU int64
        # lengths >= 1, hence the clamp for an (unexpected) all-padding row.
        lengths = attention_mask.sum(dim=1).clamp(min=1).to(torch.int64).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            sequence_output, lengths, batch_first=True, enforce_sorted=False
        )
        # h_n holds, for each sample, the hidden state after its last real token,
        # so padding positions never influence the prediction.
        _, (h_n, _) = self.lstm(packed)
        logits = self.classifier(h_n[-1])
        return logits

# Build the RoBERTa-LSTM classifier (256 LSTM hidden units, 2 classes) and place it on `device`.
model = RobertaLSTM(roberta_model, lstm_hidden_size=256, num_classes=2).to(device)

# Gradient scaler for mixed-precision training, and AdamW over all model parameters.
scaler = GradScaler()
optimizer = AdamW(model.parameters(), lr=1e-5)




In [None]:

# Training function with mixed precision and tqdm progress bar
def train_epoch(model, data_loader, optimizer, device, scaler, loss_fn=None):
    """Run one optimization pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``;
            its output is passed to ``loss_fn`` together with the labels.
        data_loader: Iterable of batches, each a dict with 'input_ids',
            'attention_mask' and 'labels' tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scaler: Gradient scaler used to scale the loss and step the optimizer.
        loss_fn: Callable ``(outputs, labels) -> scalar loss tensor``.
            Defaults to ``nn.CrossEntropyLoss()``.

    Returns:
        float: Mean of the per-batch loss values, or NaN if ``data_loader``
        yields no batches.
    """
    if loss_fn is None:
        # No loss was defined anywhere for this loop; cross-entropy matches a
        # model that emits one logit per class with integer class labels.
        loss_fn = nn.CrossEntropyLoss()

    # CUDA autocast only applies on a CUDA device; elsewhere run in full precision.
    use_amp = torch.device(device).type == "cuda"

    model.train()
    losses = []
    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        with autocast(enabled=use_amp):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs, labels)

        # Scale the loss before backward, then let the scaler drive the optimizer step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        losses.append(loss.item())

    # Plain-Python mean: avoids depending on numpy, which this notebook never imports.
    return sum(losses) / len(losses) if losses else float("nan")

# Training loop with tqdm progress bar
# Runs train_epoch once per epoch over train_loader and prints the value it
# returns (the mean of the per-batch losses) to four decimal places.
num_epochs = 3  # Adjust as needed
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device, scaler)
    # epoch is zero-based; print it one-based for readability.
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.4f}')


In [None]:
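# ## Training the model (earlier draft, kept commented out for reference)
# NOTE: this draft targets a model whose forward pass accepts `labels` and
# returns an object with a `.loss` attribute, and which provides
# `save_pretrained`. The RobertaLSTM class defined earlier in this notebook
# takes no `labels` argument and returns raw logits, so this code cannot be
# re-enabled as-is. It also never adds to `total_loss` inside train_epoch (the
# returned value would always be 0) and calls
# `get_linear_schedule_with_warmup`, which the import cell does not import.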
# # Initialize mixed precision
# scaler = GradScaler()
#
# # Training function with mixed precision and gradient accumulation
# def train_epoch(model, data_loader, optimizer, device, scheduler, grad_accumulation_steps=2):
#     model.train()
#     total_loss = 0
#     for step, batch in enumerate(data_loader):
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device)
#
#         with autocast():  # Enable mixed precision
#             outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#             loss = outputs.loss / grad_accumulation_steps  # Scale loss
#
#         scaler.scale(loss).backward()  # Scale loss for mixed precision
#
#         # Perform optimization step every `grad_accumulation_steps` steps
#         if (step + 1) % grad_accumulation_steps == 0:
#             scaler.step(optimizer)
#             scaler.update()
#             optimizer.zero_grad()
#             scheduler.step()  # Update learning rate
#
#     return total_loss / len(data_loader)
#
# # Define the number of training epochs
# epochs = 3
#
# # Initialize the optimizer and scheduler
# optimizer = AdamW(model.parameters(), lr=5e-5)
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs)
#
# # Evaluation function
# def eval_model(model, data_loader, device):
#     model.eval()
#     total_loss = 0
#     with torch.no_grad():
#         for batch in tqdm(data_loader):
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)
#             labels = batch['labels'].to(device)
#             outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#             loss = outputs.loss
#             total_loss += loss.item()
#     return total_loss / len(data_loader)
#
# # Training loop
# for epoch in range(epochs):
#     train_loss = train_epoch(model, train_loader, optimizer, device, scheduler)
#     print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}")
#     val_loss = eval_model(model, test_loader, device)
#     print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss:.4f}")
#
# # Save the model
# model.save_pretrained('roberta_model_1')
# tokenizer.save_pretrained('roberta_model_1')

