In [3]:
import pandas as pd
import json

# Load the original full dataset
df = pd.read_json("../data/codegptsensor/python/train.jsonl", lines=True)

# Create a small test set (1,000 samples)
df_small = df.sample(n=1000, random_state=42)

# Save it properly
df_small.to_json('../data/test_dataset_small.jsonl', orient='records', lines=True)

print(f"✓ Small dataset created: {len(df_small)} samples")
print(f"Saved to: ../data/test_dataset_small.jsonl")


✓ Small dataset created: 1000 samples
Saved to: ../data/test_dataset_small.jsonl


In [4]:
import numpy as np
import pandas as pd

# Load your small dataset again
df_small = pd.read_json('../data/test_dataset_small.jsonl', lines=True)

# Create triplets: (anchor, positive, negative)
# For each pair, we'll create training examples that compare them
triplets = []
labels_contrastive = []

for _, row in df_small.iterrows():
    code = row['code']
    contrast = row['contrast']
    label = row['label']
    
    # Store the pair with labels
    if label == 0:
        # code is human, contrast is AI
        triplets.append({
            'human': code,
            'ai': contrast
        })
    else:
        # code is AI, contrast is human
        triplets.append({
            'human': contrast,
            'ai': code
        })

print(f"Created {len(triplets)} pairs for contrastive learning")
print("\nExample pair:")
print(f"Human code: {triplets[0]['human'][:100]}...")
print(f"AI code: {triplets[0]['ai'][:100]}...")


Created 1000 pairs for contrastive learning

Example pair:
Human code: def save_file(filename, data, mk_parents=True):
    """Save file to disk.
    Paramaters
    -------...
AI code: import pathlib

def save_to_disk(filename: pathlib.Path, data: str, mk_parents: bool = False) -> Non...


### Contrastive Classifier

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ContrastiveClassifier(nn.Module):
    def __init__(self, input_dim=768, hidden_dim=256):
        super(ContrastiveClassifier, self).__init__()
        # Projection head for contrastive learning
        self.projection = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 128)  # Project to 128-dim space
        )
        
        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 2)
        )
    
    def forward(self, x, return_projection=False):
        if return_projection:
            return self.projection(x)
        return self.classifier(x)

# Create model
contrastive_model = ContrastiveClassifier()
print(contrastive_model)
print(f"Total parameters: {sum(p.numel() for p in contrastive_model.parameters())}")


ContrastiveClassifier(
  (projection): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=256, out_features=128, bias=True)
  )
  (classifier): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=256, out_features=2, bias=True)
  )
)
Total parameters: 427138


### Contrastive Loss Function

In [6]:
class ContrastiveLoss(nn.Module):
    def __init__(self, temperature=0.5):
        super(ContrastiveLoss, self).__init__()
        self.temperature = temperature
    
    def forward(self, human_proj, ai_proj):
        # Normalize embeddings
        human_proj = F.normalize(human_proj, dim=1)
        ai_proj = F.normalize(ai_proj, dim=1)
        
        # Compute similarity
        similarity = torch.matmul(human_proj, ai_proj.T) / self.temperature
        
        # Labels: diagonal elements are positive pairs
        batch_size = human_proj.shape[0]
        labels = torch.arange(batch_size).to(human_proj.device)
        
        # Cross-entropy loss
        loss = F.cross_entropy(similarity, labels)
        return loss

contrastive_criterion = ContrastiveLoss(temperature=0.5)
print("✓ Contrastive loss function created")


✓ Contrastive loss function created


### Prepare Paired Dataset

In [None]:
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer

class PairedCodeDataset(Dataset):
    def __init__(self, triplets, tokenizer, model, max_length=256):
        self.triplets = triplets
        self.tokenizer = tokenizer
        self.model = model
        self.max_length = max_length
        
        # Pre-compute embeddings for speed
        print("Pre-computing embeddings...")
        self.human_embeddings = []
        self.ai_embeddings = []
        
        model.eval()
        with torch.no_grad():
            for pair in tqdm(triplets):
                # Get human code embedding
                h_inputs = tokenizer(pair['human'], padding='max_length', 
                                    truncation=True, max_length=max_length, 
                                    return_tensors="pt")
                h_output = model(**h_inputs)
                h_emb = h_output.last_hidden_state[:, 0, :].squeeze()
                
                # Get AI code embedding
                a_inputs = tokenizer(pair['ai'], padding='max_length', 
                                    truncation=True, max_length=max_length, 
                                    return_tensors="pt")
                a_output = model(**a_inputs)
                a_emb = a_output.last_hidden_state[:, 0, :].squeeze()
                
                self.human_embeddings.append(h_emb)
                self.ai_embeddings.append(a_emb)
    
    def __len__(self):
        return len(self.triplets)
    
    def __getitem__(self, idx):
        return self.human_embeddings[idx], self.ai_embeddings[idx]

# Create dataset (this will take 2-3 minutes)
paired_dataset = PairedCodeDataset(triplets, tokenizer, model, max_length=256)
paired_loader = DataLoader(paired_dataset, batch_size=32, shuffle=True)

print(f"✓ Dataset ready: {len(paired_dataset)} pairs")


NameError: name 'tokenizer' is not defined