In [1]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from IPython.display import display, clear_output
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup



In [2]:
# Data Preparation
df = pd.read_csv('/kaggle/input/daigt-external-dataset/daigt_external_dataset.csv')

# Use `text` where it is present; rows with a missing `text` fall back to `source_text`.
df['combined_text'] = df['text'].combine_first(df['source_text'])
# Label is 1 for rows that have a `text` value and 0 for rows that fell back to `source_text`.
df['label'] = df['text'].notna().astype(int)

# Sanity check: the labelling above yields two classes only if some rows have a
# missing `text`. If `text` is never null, every label is 1, `source_text` is
# never used, and a classifier trained on `df` is degenerate (it can still
# report near-zero loss and 100% accuracy).
# NOTE(review): if `text` and `source_text` hold the two classes side by side in
# every row, they should be stacked into separate rows with their own labels
# instead -- confirm against the dataset's column descriptions.
label_counts = df['label'].value_counts()
if label_counts.size < 2:
    warnings.warn(
        f"Only one class present in df['label'] ({label_counts.to_dict()}); "
        "deriving the label from df['text'].notna() is probably wrong for this dataset."
    )

In [3]:
# Tokenization & Dataset preparation
# Both names below are module-level globals that TextDataset.__getitem__ reads.

# Length every encoded example is padded or truncated to; tune for the dataset.
max_length = 256
# Tokenizer files are loaded from the local Kaggle input directory.
tokenizer = BertTokenizer.from_pretrained('/kaggle/input/bert-base-uncased')

In [4]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, lazily on access.

    Args:
        texts: Indexable sequence of raw strings.
        labels: Indexable sequence of labels aligned with ``texts``.
        tokenizer: Optional Hugging Face style callable tokenizer. When omitted,
            the notebook-level ``tokenizer`` global is looked up at item access.
        max_length: Optional length each example is padded/truncated to. When
            omitted, the notebook-level ``max_length`` global is used.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for example ``idx``.

        ``input_ids`` and ``attention_mask`` are the tokenizer's tensors with the
        leading batch dimension removed; ``label`` is returned unchanged.
        """
        # Fall back to the notebook globals so existing two-argument
        # TextDataset(texts, labels) calls behave exactly as before.
        tok = self.tokenizer if self.tokenizer is not None else tokenizer
        length = self.max_length if self.max_length is not None else max_length

        # Calling the tokenizer directly is the documented replacement for
        # the older encode_plus entry point.
        encoded = tok(
            self.texts[idx],
            add_special_tokens=True,
            max_length=length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # A single text comes back with a leading batch dimension of 1. Drop
        # only that dimension: a bare squeeze() would also collapse the
        # sequence dimension whenever it happens to have size 1.
        input_ids = encoded['input_ids'].squeeze(0)
        attention_mask = encoded['attention_mask'].squeeze(0)
        return input_ids, attention_mask, self.labels[idx]

In [5]:
# Train/validation split (80/20) and DataLoader construction.

# Fixed seed so the split is identical after every Restart & Run All; without
# it, validation rows differ between runs and metrics are not comparable.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)

# Define the train_size
train_size = int(0.8 * len(df))
val_size = len(df) - train_size  # kept for reference; not used below

# Shuffle row positions once with the seeded generator, then cut at train_size
indices = torch.randperm(len(df), generator=split_generator).tolist()
train_indices = indices[:train_size]
val_indices = indices[train_size:]

# Extract train and validation dataframes using the positional indices
train_df = df.iloc[train_indices]
val_df = df.iloc[val_indices]

# Initialize the TextDatasets and DataLoaders (only the training loader reshuffles per epoch)
train_dataset = TextDataset(train_df['combined_text'].tolist(), train_df['label'].tolist())
val_dataset = TextDataset(val_df['combined_text'].tolist(), val_df['label'].tolist())
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [6]:
# Model
# Two-output sequence classifier loaded from the local Kaggle input directory,
# then moved to the GPU when one is available (CPU otherwise).
model = BertForSequenceClassification.from_pretrained(
    '/kaggle/input/bert-base-uncased',
    num_labels=2,
)
model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

In [7]:
# Loss & Optimizer
# Binary cross-entropy on raw logits; the training loop feeds it logits[:, 1].
criterion = nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)
# Linear decay with no warmup. The factor 3 is the epoch count and must stay
# equal to `num_epochs` in the training loop, since the scheduler steps once per batch.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=3 * len(train_loader),
)

In [8]:
# Training loop
# Resolve the device once instead of rebuilding it three times per batch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Average training loss of each epoch, in order (previously created but never filled).
training_losses = []
# Must match the epoch factor used for the scheduler's num_training_steps.
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    # Wrap the DataLoader with tqdm for a progress bar
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for input_ids, attention_mask, labels in progress_bar:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        # BCEWithLogitsLoss needs float targets
        labels = labels.to(device).float()

        optimizer.zero_grad()
        # Get the logits from the model
        logits = model(input_ids, attention_mask=attention_mask).logits
        # Only the class-1 logit enters the loss; logits[:, 0] is never used in it
        outputs = logits[:, 1]
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Learning-rate schedule advances once per batch
        scheduler.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        # Update tqdm description with the latest loss
        progress_bar.set_description(f"Epoch {epoch+1} - Loss: {batch_loss:.4f}")

    avg_loss = total_loss / len(train_loader)
    training_losses.append(avg_loss)
    print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")

Epoch 1 - Loss: 0.0136: 100%|██████████| 61/61 [01:56<00:00,  1.92s/it]


Epoch 1, Average Loss: 0.1203


Epoch 2 - Loss: 0.0080: 100%|██████████| 61/61 [01:56<00:00,  1.91s/it]


Epoch 2, Average Loss: 0.0098


Epoch 3 - Loss: 0.0065: 100%|██████████| 61/61 [01:56<00:00,  1.91s/it]

Epoch 3, Average Loss: 0.0069





In [9]:
# Evaluation
# Resolve the device once instead of rebuilding it for every tensor.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.eval()
correct = 0
total = 0
with torch.no_grad():
    for input_ids, attention_mask, labels in val_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device).long()

        outputs = model(input_ids, attention_mask=attention_mask).logits
        # The training loop optimises BCE-with-logits on logits[:, 1] alone and
        # inference reports sigmoid(logits[:, 1]); logits[:, 0] never enters the
        # loss. An argmax over both columns would compare against that unused
        # logit, so predict from the class-1 logit instead:
        # logit > 0 is the same decision as sigmoid(logit) > 0.5.
        preds = (outputs[:, 1] > 0).long()
        correct += (preds == labels).sum().item()
        total += len(labels)

# Avoid a ZeroDivisionError when the validation loader yields no batches.
accuracy = correct / total if total else float('nan')
print(f"Accuracy: {accuracy}")

Accuracy: 1.0


In [10]:
# Load the competition essays. TextDataset requires labels, so zeros are
# passed as placeholders and ignored during inference.
test_df = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
placeholder_labels = [0] * len(test_df)
test_dataset = TextDataset(test_df['text'].tolist(), placeholder_labels)
test_loader = DataLoader(test_dataset, batch_size=32)

# Model Inference
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.eval()
ids = test_df['id'].tolist()
predictions = []

with torch.no_grad():
    for batch_ids, batch_mask, _ in test_loader:
        outputs = model(
            batch_ids.to(device),
            attention_mask=batch_mask.to(device),
        ).logits
        # Score = sigmoid of the class-1 logit, the same quantity the training loss uses
        probs = torch.sigmoid(outputs[:, 1]).cpu().numpy()
        predictions.extend(probs)

# Create submission.csv with one (id, generated) row per test essay
submission_df = pd.DataFrame({'id': ids, 'generated': predictions})
submission_df.to_csv('submission.csv', index=False)