In [1]:
# Notebook-wide constants, grouped by what they control
class Config:
    """Single namespace holding every tunable constant used by the notebook."""

    # Experiment tracking (used by the commented-out wandb calls)
    PROJECT_NAME = 'detect-ai'

    # Input data and the fraction of rows that goes to the training split
    DATA_PATH = '/kaggle/input/mydataset.csv'
    TRAIN_RATIO = 0.8

    # Backbone checkpoint: hub identifier and a local copy.
    # NOTE(review): MODEL_PATH is not referenced by any visible cell.
    MODEL_NAME = 'microsoft/deberta-base'
    MODEL_PATH = '/kaggle/input/deberta-base/'

    # Optimisation hyper-parameters
    EPOCHS = 2
    BATCH_SIZE = 16
    LEARNING_RATE = 1e-5

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances

In [3]:
class DataAnalyzer:
    """Load a CSV of texts and derive simple hand-crafted statistics per row.

    The CSV must contain a ``text`` column. A ``prompt`` column is optional
    and, when present, drives the two prompt-related features. A ``generated``
    column (1 / 0) is only needed by :meth:`visualize_data`.
    """

    def __init__(self, dataframe_path):
        """Read the CSV at ``dataframe_path`` and declare the feature names."""
        self.df = pd.read_csv(dataframe_path)
        # Order matters: it must match the order of values returned by
        # ``extract_features``.
        self.features = [
            "word_count", "sentence_count", "avg_word_length",
            "ttr", "bigram_counts", "capitalized_words",
            "numeric_count"
        ]
        self.prompt_features = [
            "shared_vocab_ratio", "distance_from_prompt"
        ]
        # Filled by ``analyze_data``.
        self.extracted_data = None

    def extract_features(self, text, prompt):
        """Compute the feature vector for one ``(text, prompt)`` pair.

        Args:
            text: The document. Missing values (``None`` / NaN) are treated as
                an empty string; other non-string values are stringified.
            prompt: The prompt the text answers, or a null value when there is
                none.

        Returns:
            A list of nine values ordered as ``self.features`` followed by
            ``self.prompt_features``. Ratio features are ``0.0`` for an empty
            text. Prompt features are ``np.nan`` when no prompt is available;
            ``distance_from_prompt`` is also ``np.nan`` when TF-IDF cannot
            build a vocabulary.
        """
        # ``read_csv`` turns empty cells into NaN, so a missing text arrives
        # here as a float rather than a string.
        if not isinstance(text, str):
            text = "" if pd.isnull(text) else str(text)

        # Basic features
        tokens = text.split()
        word_count = len(tokens)
        sentence_count = text.count('.') + text.count('!') + text.count('?')
        if word_count:
            avg_word_length = sum(len(word) for word in tokens) / word_count
            ttr = len(set(tokens)) / word_count  # type-token ratio
        else:
            # Empty text: avoid dividing by zero.
            avg_word_length = 0.0
            ttr = 0.0
        bigram_counts = len(set(zip(tokens[:-1], tokens[1:])))
        # ``str.split()`` never yields empty tokens, so ``word[0]`` is safe.
        capitalized_words = sum(1 for word in tokens if word[0].isupper())
        numeric_count = sum(1 for char in text if char.isdigit())

        # Prompt related features
        if pd.notnull(prompt):
            prompt = str(prompt)
            prompt_vocab = set(prompt.split())
            # The small epsilon keeps the ratio finite for an empty prompt.
            shared_vocab_ratio = len(set(tokens).intersection(prompt_vocab)) / (len(prompt_vocab) + 1e-5)

            try:
                tfidf_matrix = TfidfVectorizer().fit_transform([text, prompt])
            except ValueError:
                # Raised when neither document yields a token (empty
                # vocabulary); the distance is undefined in that case.
                distance_from_prompt = np.nan
            else:
                distance_from_prompt = cosine_distances(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        else:
            shared_vocab_ratio, distance_from_prompt = np.nan, np.nan

        feature_values = [
            word_count, sentence_count, avg_word_length,
            ttr, bigram_counts, capitalized_words, numeric_count
        ]
        prompt_feature_values = [shared_vocab_ratio, distance_from_prompt]

        return feature_values + prompt_feature_values

    def analyze_data(self):
        """Run ``extract_features`` on every row of ``self.df``.

        Returns:
            A DataFrame indexed like ``self.df`` with one column per feature.
            It is also stored on ``self.extracted_data``.
        """
        if 'prompt' in self.df.columns:
            prompts = self.df['prompt']
        else:
            prompts = [None] * len(self.df)

        # A plain loop over the two columns works for an empty frame too,
        # where ``DataFrame.apply(axis=1)`` does not return a Series.
        rows = [
            self.extract_features(text, prompt)
            for text, prompt in zip(self.df['text'], prompts)
        ]
        self.extracted_data = pd.DataFrame(
            rows,
            columns=self.features + self.prompt_features,
            index=self.df.index
        )
        return self.extracted_data

    def visualize_data(self):
        """Plot one histogram per feature, split by the ``generated`` label."""
        # Make the method usable even if ``analyze_data`` was not called first.
        if self.extracted_data is None:
            self.analyze_data()

        all_features = self.features + self.prompt_features
        is_generated = self.df["generated"] == 1
        is_student = self.df["generated"] == 0

        fig, axs = plt.subplots(len(all_features), 1, figsize=(10, 30))
        for i, feature in enumerate(all_features):
            generated = self.extracted_data[is_generated][feature].dropna()
            student = self.extracted_data[is_student][feature].dropna()
            axs[i].hist([generated, student], bins=30, alpha=0.7, label=['Generated', 'Student'])
            axs[i].set_title(feature)
            axs[i].legend(loc='upper right')
        plt.tight_layout()
        plt.show()

In [4]:
# Load the dataset and compute the hand-crafted features for every row.
analyzer = DataAnalyzer(dataframe_path=Config.DATA_PATH)
analyzed_data = analyzer.analyze_data()
# analyzer.visualize_data()

In [5]:
import wandb
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import DebertaTokenizer, DebertaModel
from sklearn.metrics import roc_auc_score

In [6]:
# Custom dataset
class DetectAIDataset(Dataset):
    """Map-style dataset over four parallel, index-aligned sequences.

    The arguments are stored unchanged and indexed with the same ``idx``.
    In this notebook the ``texts`` / ``prompts`` slots are filled with the
    tokenizer's input ids and attention masks.
    """

    def __init__(self, texts, prompts, features, labels):
        self.texts, self.prompts = texts, prompts
        self.features, self.labels = features, labels

    def __len__(self):
        # The first sequence defines the number of samples.
        return len(self.texts)

    def __getitem__(self, idx):
        # Return the idx-th entry of every sequence, in constructor order.
        columns = (self.texts, self.prompts, self.features, self.labels)
        return tuple(column[idx] for column in columns)

In [7]:
# 1. Data Preparation
# Raw columns as plain Python lists.
texts = analyzer.df['text'].tolist()
labels = analyzer.df['generated'].tolist()
# Missing prompts become empty strings so every row has a string prompt.
prompts = analyzer.df['prompt'].fillna(value="").tolist()
# Hand-crafted features as an array; undefined (NaN) entries are zeroed.
features = analyzed_data.fillna(value=0).to_numpy()

In [8]:
# Tokenize every (text, prompt) pair, padded/truncated to 512 tokens
tokenizer = DebertaTokenizer.from_pretrained(Config.MODEL_NAME)
encoded_data = tokenizer(
    texts,
    prompts,
    max_length=512,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)
input_ids, attention_masks = encoded_data['input_ids'], encoded_data['attention_mask']

# The dataset's first two slots carry the token ids and attention masks.
dataset = DetectAIDataset(input_ids, attention_masks, features, labels)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [9]:
# Split data into training and validation
# A fixed seed keeps the split, and therefore the reported validation AUC,
# identical across kernel restarts.
SPLIT_SEED = 42

train_size = int(Config.TRAIN_RATIO * len(dataset))
val_size = len(dataset) - train_size  # remainder, so the sizes always sum to len(dataset)
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Only the training batches are shuffled; validation order stays fixed.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=Config.BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, batch_size=Config.BATCH_SIZE)

In [10]:
# 2. Model Definition
class DetectAIModel(torch.nn.Module):
    """DeBERTa encoder plus hand-crafted features feeding a small MLP head.

    ``forward`` returns raw logits of shape ``(batch, 1)``, not probabilities.
    That is the input ``torch.nn.BCEWithLogitsLoss`` expects, since it applies
    the sigmoid itself. Putting a ``Sigmoid`` in the head as well would squash
    the scores twice. Use ``torch.sigmoid(logits)`` when probabilities are
    needed; ranking metrics such as ROC-AUC can consume the logits directly.
    """

    def __init__(self, feature_size):
        """Build the encoder and the classification head.

        Args:
            feature_size: Number of hand-crafted features concatenated to the
                encoder output for each sample.
        """
        super(DetectAIModel, self).__init__()
        self.deberta = DebertaModel.from_pretrained(Config.MODEL_NAME)
        self.concat_dim = self.deberta.config.hidden_size + feature_size  # DeBERTa's embeddings + our features
        self.classifier = self._build_classifier(self.concat_dim)

    @staticmethod
    def _build_classifier(input_dim):
        """Return the MLP head mapping ``input_dim`` inputs to a single logit."""
        return torch.nn.Sequential(
            torch.nn.Linear(input_dim, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, 1),
        )

    def forward(self, input_ids, attention_mask, features):
        """Score a batch.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            features: Per-sample hand-crafted features, concatenated to the
                encoder output along dim 1.

        Returns:
            Tensor of raw logits with one column per sample.
        """
        outputs = self.deberta(input_ids, attention_mask)
        # Use the hidden state at the first position as the sequence summary.
        cls_output = outputs.last_hidden_state[:, 0, :]
        combined = torch.cat((cls_output, features), dim=1)
        return self.classifier(combined)

In [11]:
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm

In [12]:
# 3. Training
# Choose the device, clear the CUDA cache and create the AMP gradient scaler.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()
scaler = GradScaler()

# One extra input per hand-crafted feature column; weights kept in float32.
model = DetectAIModel(features.shape[1]).to(device=device, dtype=torch.float32)
optimizer = AdamW(params=model.parameters(), lr=Config.LEARNING_RATE)

# wandb.init(project=Config.PROJECT_NAME)

In [13]:
from tqdm.notebook import tqdm  # Use tqdm.notebook for Jupyter notebooks

# Define the loss function outside the loop for efficiency.
# NOTE(review): BCEWithLogitsLoss applies the sigmoid itself, so the model is
# expected to return raw logits here; its head should not end in a Sigmoid.
loss_function = torch.nn.BCEWithLogitsLoss()

# Training loop
for epoch in range(Config.EPOCHS):
    # Training step
    model.train()
    train_loss = 0.0

    # Tracking progress using tqdm
    train_progress = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{Config.EPOCHS} - Training", leave=True)
    # Batch variables carry a `batch_` prefix so they do not overwrite the
    # notebook-level `input_ids` / `features` / `labels` that earlier cells
    # rely on when re-run.
    for step, (batch_input_ids, batch_attention_mask, batch_features, batch_labels) in enumerate(train_progress, start=1):
        batch_input_ids = batch_input_ids.to(device)
        batch_attention_mask = batch_attention_mask.to(device)
        batch_features = batch_features.to(device).to(torch.float32)
        batch_labels = batch_labels.to(device).to(torch.float32)

        optimizer.zero_grad()
        with autocast():
            batch_outputs = model(batch_input_ids, batch_attention_mask, batch_features)
            # Labels are (batch,), model output is (batch, 1).
            loss = loss_function(batch_outputs, batch_labels.unsqueeze(1))

        # Scale the loss and call backward
        scaler.scale(loss).backward()
        scaler.step(optimizer)  # Unscale the gradients and update model parameters
        scaler.update()

        train_loss += loss.item()
        # Count steps explicitly: tqdm's `.n` is not guaranteed to be
        # refreshed on every iteration, which would skew the running mean.
        train_progress.set_postfix({'Training Loss': train_loss / step})

    avg_train_loss = train_loss / len(train_dataloader)
    # wandb.log({"train_loss": avg_train_loss})

    # Validation step
    model.eval()
    all_predictions = []
    all_labels = []
    val_progress = tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{Config.EPOCHS} - Validating", leave=True)
    with torch.no_grad():
        for batch_input_ids, batch_attention_mask, batch_features, batch_labels in val_progress:
            batch_input_ids = batch_input_ids.to(device)
            batch_attention_mask = batch_attention_mask.to(device)
            batch_features = batch_features.to(device).to(torch.float32)
            batch_labels = batch_labels.to(device).to(torch.float32)

            # reshape(-1) always yields a 1-D tensor, so `.tolist()` returns a
            # list even when the final batch holds a single sample; a bare
            # `.squeeze()` would collapse it to a scalar and break `extend`.
            batch_scores = model(batch_input_ids, batch_attention_mask, batch_features).reshape(-1).tolist()
            all_predictions.extend(batch_scores)
            all_labels.extend(batch_labels.tolist())

    # ROC-AUC only depends on the ranking of the scores.
    auc_score = roc_auc_score(all_labels, all_predictions)
    print(f"Epoch {epoch+1}/{Config.EPOCHS} - Validation AUC: {auc_score:.4f}")
    # wandb.log({"val_auc": auc_score})

# wandb.finish()

Epoch 1/2 - Training:   0%|          | 0/2488 [00:00<?, ?it/s]

KeyboardInterrupt: 