In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from tqdm import tqdm
import matplotlib.pyplot as plt
from IPython.display import display, clear_output

In [10]:
# Data Preparation
df = pd.read_csv('/kaggle/input/daigt-external-dataset/daigt_external_dataset.csv')
df['combined_text'] = df['text'].combine_first(df['source_text'])
df['label'] = df['text'].notna().astype(int)

In [11]:
# Tokenization & Dataset preparation
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 256  # Adjust this based on your dataset

In [12]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        inputs = tokenizer.encode_plus(text, 
                                       add_special_tokens=True, 
                                       max_length=max_length, 
                                       padding='max_length', 
                                       truncation=True, 
                                       return_tensors='pt')
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        return input_ids, attention_mask, label

In [13]:
# Define the train_size
train_size = int(0.8 * len(df))
val_size = len(df) - train_size

# Split the indices of the dataframe
indices = torch.randperm(len(df)).tolist()
train_indices = indices[:train_size]
val_indices = indices[train_size:]

# Extract train and validation dataframes using the indices
train_df = df.iloc[train_indices]
val_df = df.iloc[val_indices]

# Initialize your TextDataset and DataLoaders
train_dataset = TextDataset(train_df['combined_text'].tolist(), train_df['label'].tolist())
val_dataset = TextDataset(val_df['combined_text'].tolist(), val_df['label'].tolist())
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [14]:
# Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1).to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Loss & Optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * 3)

In [22]:
# Training loop
training_losses = []
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    # Wrap the DataLoader with tqdm for a progress bar
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for input_ids, attention_mask, labels in progress_bar:
        input_ids, attention_mask, labels = input_ids.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu')), attention_mask.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu')), labels.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu')).float()
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask).logits.squeeze(1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
        # Update tqdm description with the latest loss
        progress_bar.set_description(f"Epoch {epoch+1} - Loss: {loss.item():.4f}")
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")

Epoch 1 - Loss: 0.0046: 100%|██████████| 61/61 [00:32<00:00,  1.87it/s]


Epoch 1, Average Loss: 0.0048


Epoch 2 - Loss: 0.0047: 100%|██████████| 61/61 [00:33<00:00,  1.84it/s]


Epoch 2, Average Loss: 0.0047


Epoch 3 - Loss: 0.0046: 100%|██████████| 61/61 [00:32<00:00,  1.89it/s]

Epoch 3, Average Loss: 0.0047





In [17]:
# Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for input_ids, attention_mask, labels in val_loader:
        input_ids, attention_mask, labels = input_ids.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu')), attention_mask.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu')), labels.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu')).float()
        outputs = model(input_ids, attention_mask=attention_mask).logits.squeeze(1)
        rounded_preds = torch.round(torch.sigmoid(outputs))
        correct += (rounded_preds == labels).sum().item()
        total += len(labels)

print(f"Accuracy: {correct / total}")

Accuracy: 1.0
