In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import BertTokenizer, BertModel

In [None]:
# Load the essay dataset into a pandas DataFrame
df = pd.read_csv('/kaggle/input/daigt_external_dataset.csv')

# Combine student and AI texts into one list and create labels (0 for student, 1 for AI)
texts = list(df['text']) + list(df['source_text'])
labels = [0]*len(df['text']) + [1]*len(df['source_text'])

# Split data: hold out 20% of the examples using a seeded shuffle.
# The split is done with torch (already imported) because no other splitting
# helper is imported in this notebook; the fixed seed keeps the partition
# identical across kernel restarts.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
shuffled_idx = torch.randperm(len(texts), generator=split_generator).tolist()
n_test = int(len(texts) * TEST_FRACTION)
test_idx, train_idx = shuffled_idx[:n_test], shuffled_idx[n_test:]

train_texts = [texts[i] for i in train_idx]
test_texts = [texts[i] for i in test_idx]
train_labels = [labels[i] for i in train_idx]
test_labels = [labels[i] for i in test_idx]

# Tokenizer: use the BertTokenizer name imported at the top of the notebook
# (the bare `transformers` module itself is never imported).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_LEN = 256

In [None]:
class EssayDataset(Dataset):
    """Map-style dataset that tokenizes one essay per item, on demand.

    Args:
        essay_ids: Sequence of identifiers, parallel to ``texts``; returned
            unchanged with each item.
        texts: Sequence of essay texts (each is passed through ``str()``).
        labels: Sequence of numeric labels, parallel to ``texts``.
        tokenizer: Callable tokenizer accepting the Hugging Face keyword
            arguments used in ``__getitem__`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Token length every encoded essay is padded/truncated to.
    """

    def __init__(self, essay_ids, texts, labels, tokenizer, max_len):
        self.essay_ids = essay_ids
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of essays."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode essay ``idx`` and return its tensors, label and id.

        Returns:
            dict with keys ``'ids'`` and ``'mask'`` (flattened token-id and
            attention-mask tensors), ``'labels'`` (float tensor, as required
            by a BCE-style loss) and ``'essay_id'``.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]
        # Padding and truncation are requested explicitly so every item has
        # exactly `max_len` tokens and batches can be stacked; this replaces
        # the deprecated `pad_to_max_length` flag.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' adds a leading batch axis; flatten removes it.
            'ids': encoding['input_ids'].flatten(),
            'mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float),
            'essay_id': self.essay_ids[idx]
        }

In [None]:
class Classifier(nn.Module):
    """BERT encoder followed by dropout and a single-logit linear head.

    The forward pass returns raw logits (no sigmoid), which is what
    ``nn.BCEWithLogitsLoss`` expects.

    Args:
        model_name: Name or path of the pretrained BERT checkpoint to load.
        dropout: Dropout probability applied to the pooled representation.
    """

    def __init__(self, model_name='bert-base-uncased', dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder config instead of hard-coding 768
        # so other BERT checkpoints work too.
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, ids, mask):
        """Return one logit per input sequence.

        Args:
            ids: Token-id tensor passed to the encoder.
            mask: Attention-mask tensor passed as ``attention_mask``.
        """
        # The encoder returns a ModelOutput object; tuple-unpacking it would
        # yield its string keys, so read the pooled vector by attribute.
        pooled_output = self.bert(ids, attention_mask=mask).pooler_output
        output = self.dropout(pooled_output)
        return self.linear(output)

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Each batch is a mapping with ``"ids"``, ``"mask"`` and ``"labels"``
    tensors. Labels get a trailing axis added before the loss is computed.

    Returns:
        The mean of the per-batch loss values (sum of batch losses divided
        by the number of batches).
    """
    model.train()
    batch_losses = []
    for batch in data_loader:
        token_ids = batch["ids"].to(device)
        attention = batch["mask"].to(device)
        targets = batch["labels"].unsqueeze(1).to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(token_ids, attention)
        batch_loss = loss_fn(logits, targets)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(data_loader)

In [None]:
def predict_probabilities(model, data_loader, device):
    """Score every batch in ``data_loader`` and return ids with probabilities.

    Args:
        model: Module called as ``model(ids, mask)`` that returns logits.
        data_loader: Iterable of batches, each a mapping with ``"ids"``,
            ``"mask"`` and ``"essay_id"`` entries.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        ``(essay_ids, predictions)``: two flat Python lists in iteration
        order, where each prediction is ``sigmoid(logit)``.
    """
    model.eval()
    predictions = []
    essay_ids = []
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["ids"].to(device)
            mask = d["mask"].to(device)
            outputs = model(input_ids, mask)
            # reshape(-1) always yields a 1-D tensor, so a final batch of one
            # still converts to a list (squeeze() would give a bare float
            # that cannot be passed to extend()).
            probs = torch.sigmoid(outputs).reshape(-1).tolist()
            predictions.extend(probs)
            batch_ids = d["essay_id"]
            # Numeric ids arrive as a tensor after batching; convert them so
            # the result holds plain Python values rather than 0-d tensors.
            if torch.is_tensor(batch_ids):
                batch_ids = batch_ids.tolist()
            essay_ids.extend(batch_ids)
    return essay_ids, predictions

In [None]:
# Replace with your dataset path
train_df = pd.read_csv('your_training_dataset_path.csv')
test_df = pd.read_csv('your_test_dataset_path.csv')

# Seed torch's RNG (used for shuffling, dropout and head initialisation)
# so repeated runs start from the same state.
torch.manual_seed(42)

# Prepare data loaders
BATCH_SIZE = 16
# The combined training texts have no ids of their own; positional ids satisfy
# the EssayDataset interface (train_epoch never reads the 'essay_id' entry).
essay_ids = list(range(len(texts)))
train_dataset = EssayDataset(essay_ids, texts, labels, tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Classifier().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-5)
# The model emits raw logits, so the sigmoid is folded into the loss.
loss_fn = nn.BCEWithLogitsLoss()

EPOCHS = 5
for epoch in range(EPOCHS):
    loss = train_epoch(model, train_loader, loss_fn, optimizer, device)
    print(f"Epoch {epoch + 1}/{EPOCHS} | Loss: {loss:.4f}")

# After training, predict on test set
test_texts = test_df['text'].tolist()
test_ids = test_df['id'].tolist()
test_dataset = EssayDataset(test_ids, test_texts, [0]*len(test_texts), tokenizer, MAX_LEN)  # Dummy labels
# shuffle=False keeps predictions in the same order as test_df.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

ids, predictions = predict_probabilities(model, test_loader, device)

In [None]:
# Assemble the submission table: each test id paired with its predicted
# probability, stored under the 'generated' column.
submission_df = pd.DataFrame(dict(id=ids, generated=predictions))

# Write the table to disk without the pandas row index.
submission_df.to_csv('submission.csv', index=False)