## 📦 Imports and Setup

Imports all required libraries for data handling, PyTorch model building, and Hugging Face Transformers.

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from transformers.models.bert.modeling_bert import BertEncoder
from sklearn.metrics import roc_auc_score


## ⚙️ Device Setup

Selects the fastest available device (CUDA GPU → Apple MPS/Metal GPU → CPU) and clears any cached GPU memory.

In [None]:
# ---------------------------------------------------------
# Device selection: prefer CUDA, then Apple MPS, then CPU.
# The accelerator cache is emptied right after selection.
# ---------------------------------------------------------

if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.empty_cache()
    cuda_name = torch.cuda.get_device_name(0)
    print(f"‚úÖ Using CUDA GPU: {cuda_name}")
else:
    # MPS is only probed when CUDA is unavailable.
    if torch.backends.mps.is_available():
        device = torch.device("mps")
        torch.mps.empty_cache()
        print("‚úÖ Using Apple GPU (MPS)")
    else:
        device = torch.device("cpu")
        print("‚öôÔ∏è  Using CPU (no GPU detected)")

print("Device:", device)


## 📁 File Paths

Defines where to load and save model/data artifacts.

In [None]:
# Input CSV files read by the data-loading cell.
TRAIN_PATH = "./data/train_essays.csv"
TEST_PATH = "./data/test_essays.csv"
PROMPT_PATH = "./data/train_prompts.csv"
# Output directories where the tokenizer and the pretrained model are saved.
tokenizer_save_path = "./models/tokenizer"
model_save_path = "./models/bert-base"


## 📂 Load and Prepare Data

Reads CSV files, renames columns for clarity, and confirms dataset shapes.

In [None]:
# Read the three CSV inputs in the order train, test, prompts.
src_train, src_test, src_prompt = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH, PROMPT_PATH)
)
print("‚úÖ Files loaded:")
print(f"Train: {src_train.shape}, Test: {src_test.shape}, Prompts: {src_prompt.shape}")

# Normalise column names: the essay body becomes "essay_text" and the
# training target column "generated" becomes "label".
src_train = src_train.rename(columns={"text": "essay_text", "generated": "label"})
src_test = src_test.rename(columns={"text": "essay_text"})


## 🤗 Tokenizer and Pretrained Model

Loads BERT base uncased, moves its encoder to the selected device, and saves tokenizer + model locally for reuse.

In [None]:
# Load the tokenizer and the pretrained classification model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
pretrained_model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# Only the inner BERT encoder is used to produce embeddings; it is moved to
# the selected device.
embedding_model = pretrained_model.bert
embedding_model = embedding_model.to(device)

# Make sure both output directories exist, then persist the artifacts.
for save_dir in (tokenizer_save_path, model_save_path):
    os.makedirs(save_dir, exist_ok=True)
tokenizer.save_pretrained(tokenizer_save_path)
pretrained_model.save_pretrained(model_save_path)


## ⚙️ Training Parameters

Defines batch sizes, learning rate, and GAN/BERT architecture settings.

In [None]:
train_batch_size = 8    # batch size of the training DataLoader
test_batch_size = 16    # batch size of the hold-out and inference DataLoaders
lr = 1e-4               # Adam learning rate, shared by both optimizers
beta1 = 0.5             # first Adam beta, shared by both optimizers
nz = 100                # length of the noise vector fed to the Generator
num_epochs = 3          # number of passes over the training DataLoader
num_hidden_layers = 6   # num_hidden_layers passed to the shared BertConfig
train_ratio = 0.8       # fraction of the training CSV rows used for training


## 🧩 Dataset Class and Split

Creates a custom PyTorch Dataset and splits the training set into train/test subsets.

In [None]:
class GANDAIGDataset(Dataset):
    """Index-addressable collection of essay texts with optional labels.

    Args:
        texts: Indexable sequence of essay strings.
        labels: Optional indexable sequence aligned with ``texts``. When it
            is ``None`` the dataset yields bare texts instead of pairs.
    """

    def __init__(self, texts, labels=None):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(text, label)`` when labels exist, otherwise just the text."""
        sample = self.texts[idx]
        if self.labels is None:
            return sample
        return sample, self.labels[idx]

# Reproducible random split: `train_ratio` of the rows go to training, the
# remaining rows form the hold-out set used for AUC evaluation.
all_num = len(src_train)
train_num = int(all_num * train_ratio)
train_set = src_train.sample(n=train_num, random_state=42)
test_set = src_train.drop(index=train_set.index).reset_index(drop=True)

# Wrap both frames as (text, label) datasets.
train_dataset, test_dataset = (
    GANDAIGDataset(frame["essay_text"].tolist(), frame["label"].tolist())
    for frame in (train_set, test_set)
)

# Only the training loader shuffles; the hold-out loader keeps row order.
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False)


## 🧠 Model Definitions

Defines the Generator, Discriminator, and pooling layer for GAN-BERT.

In [None]:
# Encoder configuration used to build the Generator's and Discriminator's
# BertEncoder: library defaults except for the number of hidden layers.
config = BertConfig(num_hidden_layers=num_hidden_layers)

class Generator(nn.Module):
    """Map a noise vector to a sequence of 768-wide hidden states.

    The noise is projected by a linear layer, reshaped into a 256-channel
    signal of length 128, widened to 768 channels by two 1-D convolutions and
    finally passed through a ``BertEncoder`` built from the module-level
    ``config``.

    Args:
        input_dim: Size of the last dimension of the noise tensor.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.fc = nn.Linear(input_dim, 256 * 128)
        self.conv_net = nn.Sequential(
            nn.Conv1d(256, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(128, 768, kernel_size=3, padding=1),
            nn.ReLU(),
        )
        self.bert_encoder = BertEncoder(config)

    def forward(self, x):
        """Return the encoder output object for noise batch ``x``."""
        projected = self.fc(x)
        # (batch, 128, 256) -> (batch, 256, 128): Conv1d wants channels first.
        channels_first = projected.view(-1, 128, 256).transpose(1, 2)
        features = self.conv_net(channels_first)
        # (batch, 768, 128) -> (batch, 128, 768): sequence-major for the encoder.
        sequence = features.transpose(1, 2)
        return self.bert_encoder(sequence)

class SumBertPooler(nn.Module):
    """Pool per-token hidden states into one vector per sequence.

    The previous implementation divided the token sum by the *signed* sum of
    its own features, clamped to ``min=1e-9``. Whenever that sum was zero or
    negative the division was by ``1e-9``, which scaled the pooled vector by
    about a billion and saturated the downstream sigmoid. The pooled vector
    is now the mean over the token dimension, whose denominator is a token
    count and therefore never negative.
    """

    def forward(self, hidden_states, attention_mask=None):
        """Return the mean hidden state of every sequence.

        Args:
            hidden_states: Tensor whose dimension 1 indexes tokens and whose
                last dimension indexes features.
            attention_mask: Optional ``(batch, tokens)`` tensor with non-zero
                entries for tokens to include. When omitted, every token
                contributes equally.

        Returns:
            Tensor with the token dimension reduced away.
        """
        if attention_mask is None:
            # Guard against a zero-length sequence so the result is 0, not NaN.
            token_count = max(hidden_states.size(1), 1)
            return hidden_states.sum(dim=1) / token_count

        weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * weights).sum(dim=1)
        # A fully masked row would otherwise divide by zero.
        counts = torch.clamp(weights.sum(dim=1), min=1e-9)
        return summed / counts

class Discriminator(nn.Module):
    """Score a sequence of hidden states with a probability in ``[0, 1]``.

    The input passes through a ``BertEncoder`` whose layers are taken from
    the pretrained BERT model, is pooled to one vector per sequence and is
    classified by a small MLP followed by a sigmoid.
    """

    def __init__(self):
        super().__init__()
        self.bert_encoder = BertEncoder(config)
        # Take as many pretrained layers as the shared config asks for, so the
        # depth follows `config.num_hidden_layers` instead of a literal that
        # can silently disagree with it.
        # NOTE(review): the layer modules are reused by reference, not
        # copied, so optimizer updates to this discriminator also modify
        # `pretrained_model.bert` — confirm this sharing is intended.
        self.bert_encoder.layer = nn.ModuleList(
            list(pretrained_model.bert.encoder.layer[:config.num_hidden_layers])
        )
        self.pooler = SumBertPooler()
        self.classifier = nn.Sequential(
            nn.Linear(768, 256), nn.ReLU(), nn.Linear(256, 1)
        )

    def forward(self, input):
        """Return one sigmoid score per sequence as a 1-D tensor.

        Args:
            input: Hidden-state tensor accepted by ``BertEncoder``; its last
                dimension must be 768 to match the classifier.
        """
        encoded = self.bert_encoder(input)
        pooled = self.pooler(encoded.last_hidden_state)
        logits = self.classifier(pooled)
        # Flatten (batch, 1) to (batch,) so the scores line up with 1-D labels.
        return torch.sigmoid(logits).view(-1)


## 🧮 Helper Functions

Evaluates AUC score and creates text → embedding conversion utilities.

In [None]:
def eval_auc(model):
    """Compute ROC AUC of ``model`` on the module-level ``test_loader``.

    Each batch of texts is embedded with ``preparation_embedding`` and scored
    by ``model``; the scores are compared with the batch labels.

    The model is switched to eval mode for the duration of the call and every
    submodule is put back into the mode it had before, so calling this between
    epochs does not silently leave the model in eval mode for later training.

    Args:
        model: Module mapping an embedding tensor to one score per sample.

    Returns:
        The ROC AUC, or ``0.5`` when scikit-learn cannot compute it (for
        example when the hold-out labels contain a single class).
    """
    # Record per-module flags rather than one global flag: submodules may have
    # been in different modes before the call.
    previous_modes = [(module, module.training) for module in model.modules()]
    model.eval()
    preds, acts = [], []
    try:
        with torch.no_grad():
            for batch in test_loader:
                embed = preparation_embedding(batch[0]).last_hidden_state
                out = model(embed)
                preds.extend(out.cpu().numpy())
                acts.extend(batch[1].float().cpu().numpy())
    finally:
        for module, was_training in previous_modes:
            module.training = was_training

    try:
        auc = roc_auc_score(acts, preds)
    except ValueError as err:
        # Keep the best-effort fallback, but say why the score is a placeholder.
        print(f"AUC could not be computed ({err}); falling back to 0.5")
        auc = 0.5
    print("AUC:", auc)
    return auc

def preparation_embedding(texts):
    """Tokenize ``texts`` and run them through ``embedding_model``.

    Args:
        texts: Batch of strings accepted by the module-level ``tokenizer``.

    Returns:
        Whatever ``embedding_model`` returns for the padded, truncated batch
        after its tensors have been moved to ``device``.
    """
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    on_device = {}
    for key, tensor in encoded.items():
        on_device[key] = tensor.to(device)
    return embedding_model(**on_device)


## 🏋️‍♂️ Training Setup

Initializes networks, optimizers, and loss function.

In [None]:
# Build the generator first, then the discriminator, and place both on the
# selected device.
netG = Generator(input_dim=nz).to(device)
netD = Discriminator().to(device)

# Binary cross-entropy on the discriminator's sigmoid outputs.
criterion = nn.BCELoss()

# One Adam optimizer per network, configured identically.
optimizerD, optimizerG = (
    optim.Adam(network.parameters(), lr=lr, betas=(beta1, 0.999))
    for network in (netD, netG)
)


## 🔁 Training Loop

Performs alternating GAN updates and prints losses every 20 batches.

In [None]:
def GAN_step(optimizerG, optimizerD, netG, netD, real_data, label, epoch, i):
    """Run one discriminator update followed by one generator update.

    The discriminator is trained on ``real_data`` against ``label`` and on
    generated samples against an all-ones target. The generator is then
    trained so that the discriminator scores its samples towards zero.

    Unlike the earlier version, ``label`` is left untouched: the fake and
    generator targets are separate tensors instead of in-place overwrites of
    the caller's tensor.

    Args:
        optimizerG: Optimizer for ``netG``.
        optimizerD: Optimizer for ``netD``.
        netG: Generator; called on noise, its output must expose
            ``last_hidden_state``.
        netD: Discriminator returning one score per sample.
        real_data: Batch of real embeddings; its first dimension sets the
            number of generated samples.
        label: Float targets for ``real_data``, shaped like ``netD``'s output.
        epoch: Current epoch index, used only for logging.
        i: Current batch index; losses are printed when ``i % 20 == 0``.

    Returns:
        The tuple ``(optimizerG, optimizerD, netG, netD)``, unchanged objects
        handed back for the caller's convenience.
    """
    # ---- Discriminator update: real batch ----
    netD.zero_grad()
    out = netD(real_data)
    errD_real = criterion(out, label)
    errD_real.backward()
    D_x = out.mean().item()

    # ---- Discriminator update: generated batch ----
    noise = torch.randn(real_data.size(0), nz, device=device)
    fake_data = netG(noise).last_hidden_state
    fake_target = torch.ones_like(label)
    # detach(): the generator must not receive gradients from this pass.
    out = netD(fake_data.detach())
    errD_fake = criterion(out, fake_target)
    errD_fake.backward()
    D_G_z1 = out.mean().item()
    errD = errD_real + errD_fake
    optimizerD.step()

    # ---- Generator update: push the discriminator's score towards 0 ----
    netG.zero_grad()
    generator_target = torch.zeros_like(label)
    out = netD(fake_data)
    errG = criterion(out, generator_target)
    errG.backward()
    D_G_z2 = out.mean().item()
    optimizerG.step()

    if i % 20 == 0:
        print(f"[{epoch}/{num_epochs}][{i}/{len(train_loader)}] "
              f"Loss_D:{errD.item():.4f} Loss_G:{errG.item():.4f} "
              f"D(x):{D_x:.4f} D(G(z)):{D_G_z1:.4f}/{D_G_z2:.4f}")
    return optimizerG, optimizerD, netG, netD

# One {"epoch", "auc_score"} record is appended per epoch.
model_infos=[]
for epoch in range(num_epochs):
    for i, data in enumerate(train_loader):
        # data[0] is passed to the tokenizer and data[1] is used as the label
        # tensor. The embeddings are computed without gradient tracking.
        with torch.no_grad(): embed = preparation_embedding(data[0])
        # GAN_step hands back the same optimizer/network objects it received.
        optimizerG, optimizerD, netG, netD = GAN_step(
            optimizerG, optimizerD, netG, netD,
            real_data=embed.last_hidden_state.to(device),
            label=data[1].float().to(device),
            epoch=epoch, i=i)
    # Score the discriminator on the hold-out loader after every epoch.
    auc_score = eval_auc(netD)
    model_infos.append({"epoch": epoch, "auc_score": auc_score})
print("‚úÖ Training complete!")


## 🧾 Inference and Submission

Runs the trained discriminator on test essays and saves predictions.

In [None]:
# Unlabelled dataset over the test essays, iterated in file order so that
# predictions stay aligned with src_test["id"].
inference_dataset = GANDAIGDataset(src_test["essay_text"].tolist())
inference_loader = DataLoader(inference_dataset, batch_size=test_batch_size, shuffle=False)

netD.eval()
preds = []
with torch.no_grad():
    for batch in inference_loader:
        # Same tokenize -> device -> embedding_model path as the helper.
        embed = preparation_embedding(batch).last_hidden_state
        out = netD(embed)
        preds.extend(out.cpu().numpy())

# Write the submission file, creating the output directory if needed.
os.makedirs("outputs", exist_ok=True)
sub_df = pd.DataFrame({"id": src_test["id"], "prediction": preds})
sub_df.to_csv("outputs/submission.csv", index=False)
print("‚úÖ Inference complete! Saved to outputs/submission.csv")
