In [2]:
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel


# Never truncate long cell values (e.g. whole code snippets) when a DataFrame is displayed
pd.set_option("display.max_colwidth", None)
# Load the training split; the file is JSON Lines, i.e. one JSON record per line
df = pd.read_json(
    "../data/codegptsensor/python/train.jsonl",
    lines=True,
)

  from .autonotebook import tqdm as notebook_tqdm


##### Creating a small sample dataset

In [None]:
# Draw a fixed-seed random subset of 1000 rows so later cells run quickly.
df_small = df.sample(n=1000, random_state=42)

# Persist the subset as JSON Lines:
#   orient="records" -> every DataFrame row is serialised as one JSON object
#                       (without lines=True this would be a single JSON array);
#   lines=True       -> each object is written on its own line.
df_small.to_json(
    "../data/codegptsensor/python/train_small.jsonl",
    orient="records",
    lines=True,
)

##### Verifying the small dataset


In [4]:
# Reload the subset from disk so the following cells work from the saved file
df_small = pd.read_json(
    "../data/codegptsensor/python/train_small.jsonl",
    lines=True,
)
# Preview the first three records (last expression -> rich notebook display)
df_small.head(n=3)


Unnamed: 0,index,code,contrast,label
0,gp130060,"def save_file(filename, data, mk_parents=True):\n """"""Save file to disk.\n Paramaters\n ----------\n filename : pathlib.Path\n Path to the file.\n data : str\n File contents.\n mk_parents : bool, optional\n If to create parent directories.\n """"""\n parent = filename.parent\n if not parent.exists() and mk_parents:\n logger.debug(""Creating directory: %s"", parent.as_posix())\n parent.mkdir(parents=True)\n with open(filename, mode=""w"") as f:\n logger.debug(""Saving file: %s"", filename.as_posix())\n f.write(data)","import pathlib\n\ndef save_to_disk(filename: pathlib.Path, data: str, mk_parents: bool = False) -> None:\n if mk_parents:\n filename.parent.mkdir(parents=True, exist_ok=True)\n with open(filename, 'w') as f:\n f.write(data)\n",0
1,gp191806,"import functools\nimport logging\n\ndef wrap_callbacks(callback_fn):\n @functools.wraps(callback_fn)\n def wrapper(*args, **kwargs):\n try:\n return callback_fn(*args, **kwargs)\n except Exception as e:\n logging.exception(f""Error in callback: {e}"")\n return ""An error occurred in the callback""\n return wrapper\n","def dont_crash(fn):\n """"""\n Wraps callbacks: a simple information is raised in place of a program crash.\n """"""\n def safe_exec(self, *args, **kwargs):\n try:\n return fn(self, *args, **kwargs)\n except Exception as e:\n logging.exception(e)\n QMessageBox.information(\n self, type(e).__name__, "" "".join(str(x) for x in e.args)\n )\n return safe_exec",1
2,gp166948,"def normalizeGlyphUnicodes(value):\n """"""\n Normalizes glyph unicodes.\n * **value** must be a ``list``.\n * **value** items must normalize as glyph unicodes with\n :func:`normalizeGlyphUnicode`.\n * **value** must not repeat unicode values.\n * Returned value will be a ``tuple`` of ints.\n """"""\n if not isinstance(value, (tuple, list)):\n raise TypeError(""Glyph unicodes must be a list, not %s.""\n % type(value).__name__)\n values = [normalizeGlyphUnicode(v) for v in value]\n duplicates = [v for v, count in Counter(value).items() if count > 1]\n if len(duplicates) != 0:\n raise ValueError(""Duplicate unicode values are not allowed."")\n return tuple(values)","def normalize_glyph_unicodes(value):\n """"""\n Normalizes glyph unicodes.\n * **value** must be a ``list``.\n * **value** items must normalize as glyph unicodes with\n :func:`normalizeGlyphUnicode`.\n * **value** must not repeat unicode values.\n * Returned value will be a ``tuple`` of ints.\n """"""\n from fontTools.misc.transform import Transform\n glyphs = []\n for glyph in value:\n glyph_norm = normalizeGlyphUnicode(glyph)\n if glyph_norm not in glyphs:\n glyphs.append(glyph_norm)\n return tuple(map(ord, glyphs))\n",0


##### Load UniXcoder

In [5]:
# Both the tokenizer and the encoder come from the same pretrained checkpoint.
checkpoint = "microsoft/unixcoder-base"

# The tokenizer turns source text into the integer ids the network consumes.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# UniXcoder itself: the neural network that maps those ids to hidden states.
model = AutoModel.from_pretrained(checkpoint)

# Quick sanity report: confirm the load, show the concrete class and the size.
print("✓ Model loaded successfully!")
print(f"Model type: {type(model)}")
print(f"Model size: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M parameters")

✓ Model loaded successfully!
Model type: <class 'transformers.models.roberta.modeling_roberta.RobertaModel'>
Model size: 125.93M parameters


##### Test with a simple code snippet

In [6]:
# Smoke test: encode a tiny function and inspect the shape of the id tensor.
test_code = "def hello():\n    print('Hello world')"
inputs = tokenizer(
    test_code,
    max_length=512,      # cut off anything beyond 512 tokens
    truncation=True,
    return_tensors="pt",  # return PyTorch tensors
)

print("\n✓ Tokenizer working!")
print(f"Input shape: {inputs['input_ids'].shape}")


✓ Tokenizer working!
Input shape: torch.Size([1, 12])


##### Embeddings

In [None]:
# Run the encoder once on the smoke-test input; no gradients are needed for inference.
with torch.no_grad():
    outputs = model(**inputs)

# One hidden-state vector per input token.
embeddings = outputs.last_hidden_state

print("✓ Model forward pass working!")
print(f"Embedding shape: {embeddings.shape}")

✓ Model forward pass working!
Embedding shape: torch.Size([1, 12, 768])


##### Split samples into human-written and AI-generated code

In [8]:
# Create data for both columns
human_samples = []
ai_samples = []
for _, row in df_small.iterrows():
    if row['label'] == 0:
        human_samples.append(row['code'])
        ai_samples.append(row['contrast'])
    else:
        ai_samples.append(row['code'])
        human_samples.append(row['contrast'])

# Create a label list (same length as the samples)
samples = human_samples + ai_samples
labels = [0]*len(human_samples) + [1]*len(ai_samples)

print(f"Total samples: {len(samples)} (Human: {len(human_samples)}, AI: {len(ai_samples)})")
print(f"Label counts: {pd.Series(labels).value_counts().to_dict()}")

Total samples: 2000 (Human: 1000, AI: 1000)
Label counts: {0: 1000, 1: 1000}


##### Generate embeddings

In [9]:
model.eval()  # Set model to evaluation mode

embeddings = []
batch_size = 32  # Small batch for safety
max_length = 256  # Snippets longer than this many tokens are truncated
# Send inputs to whatever device the encoder currently lives on, so the forward
# pass works unchanged whether the model is on CPU or has been moved to a GPU.
device = next(model.parameters()).device

with torch.no_grad():
    for i in tqdm(range(0, len(samples), batch_size)):
        batch_text = samples[i:i+batch_size]
        # Tokenize and pad only up to the longest snippet in this batch
        # (padding=True). Padding every batch to the full max_length makes the
        # encoder process padding tokens that the attention mask ignores anyway.
        inputs = tokenizer(
            batch_text,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        ).to(device)
        # Forward pass
        outputs = model(**inputs)
        # Use the first token of each sequence ([CLS]-style token) as the
        # snippet embedding; move it to CPU so it can be converted to NumPy.
        batch_emb = outputs.last_hidden_state[:, 0, :].cpu()  # shape: [batch, 768]
        embeddings.append(batch_emb)

# Concatenate over all batches
X = torch.cat(embeddings, dim=0).numpy()
y = labels

# Guard against silent misalignment between features and labels.
assert X.shape[0] == len(y), "number of embeddings must match number of labels"

print(f"X shape: {X.shape}")  # Should be (2000, 768)
print(f"y length: {len(y)}")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 63/63 [01:48<00:00,  1.72s/it]

X shape: (2000, 768)
y length: 2000





##### Train a Simple Classifier

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the embedded samples for testing; the fixed seed keeps the
# split identical between runs. X and y come from the embedding cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")

# Fit a logistic-regression baseline on the embeddings.
# NOTE: this rebinds the name `model`, which previously referred to the
# UniXcoder encoder; after this cell `model` is the scikit-learn classifier.
model = LogisticRegression(max_iter=1000, verbose=1).fit(X_train, y_train)
model

Train: (1600, 768), Test: (400, 768)


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [12]:
from sklearn.metrics import classification_report, accuracy_score

# Score the fitted baseline on the held-out split.
y_pred = model.predict(X_test)

# Overall accuracy first, then per-class precision / recall / F1.
print(f"Test accuracy: {accuracy_score(y_test, y_pred):.3f}")
report = classification_report(y_test, y_pred, target_names=['Human', 'AI'])
print(report)


Test accuracy: 0.850
              precision    recall  f1-score   support

       Human       0.85      0.85      0.85       199
          AI       0.85      0.85      0.85       201

    accuracy                           0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400



##### Neural Network Training

In [13]:
import torch.nn as nn

class CodeClassifier(nn.Module):
    """Two-layer MLP head that classifies fixed-size code embeddings.

    Architecture: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_dim: Size of each input embedding vector.
        hidden_dim: Width of the hidden layer.
        num_classes: Number of output classes (size of the logit vector).
        dropout: Dropout probability applied after the hidden activation.
    """

    def __init__(self, input_dim=768, hidden_dim=256, num_classes=2, dropout=0.3):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return unnormalised class scores (logits) for a batch of embeddings.

        Args:
            x: Tensor whose last dimension is ``input_dim``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of
            ``num_classes``. No softmax is applied.
        """
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# Seed PyTorch's global RNG before building the network so that the initial
# weights are the same every time this cell runs.
torch.manual_seed(42)

# Create the model
classifier = CodeClassifier()
print(classifier)
print(f"Total parameters: {sum(p.numel() for p in classifier.parameters())}")


CodeClassifier(
  (fc1): Linear(in_features=768, out_features=256, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=256, out_features=2, bias=True)
)
Total parameters: 197378


## Paper Recreation

In [14]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Convert to PyTorch tensors with explicit dtypes: float32 features for the
# linear layers, int64 class indices as required by CrossEntropyLoss.
# torch.tensor(..., dtype=...) replaces the legacy FloatTensor/LongTensor constructors.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Create datasets (each item is a (features, label) pair)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create data loaders. The training loader shuffles with its own seeded
# generator so the batch order is reproducible regardless of what else has
# consumed the global RNG; the test loader keeps the original order.
shuffle_generator = torch.Generator().manual_seed(42)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, generator=shuffle_generator)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print(f"Train batches: {len(train_loader)}")
print(f"Test batches: {len(test_loader)}")


Train batches: 50
Test batches: 13


In [15]:
import torch.optim as optim
from tqdm import tqdm

# Optimisation setup: cross-entropy over the two classes, Adam on the MLP head.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=0.001)
num_epochs = 10

for epoch in range(num_epochs):
    # Training mode enables dropout.
    classifier.train()
    train_loss, correct, total = 0, 0, 0

    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for features, targets in progress:
        # Forward: logits and batch loss
        logits = classifier(features)
        loss = criterion(logits, targets)

        # Backward: clear stale gradients, backpropagate, update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Running statistics for this epoch
        train_loss += loss.item()
        predicted = logits.argmax(dim=1)  # index of the highest-scoring class
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

    # Report mean batch loss and training accuracy for the epoch
    train_acc = 100 * correct / total
    print(f"Epoch {epoch+1}: Loss = {train_loss/len(train_loader):.4f}, Accuracy = {train_acc:.2f}%")


Epoch 1/10: 100%|██████████| 50/50 [00:00<00:00, 148.33it/s]


Epoch 1: Loss = 0.4578, Accuracy = 77.06%


Epoch 2/10: 100%|██████████| 50/50 [00:00<00:00, 1282.15it/s]


Epoch 2: Loss = 0.2853, Accuracy = 88.38%


Epoch 3/10: 100%|██████████| 50/50 [00:00<00:00, 1225.59it/s]


Epoch 3: Loss = 0.2167, Accuracy = 91.38%


Epoch 4/10: 100%|██████████| 50/50 [00:00<00:00, 1206.30it/s]


Epoch 4: Loss = 0.1689, Accuracy = 93.50%


Epoch 5/10: 100%|██████████| 50/50 [00:00<00:00, 1207.93it/s]


Epoch 5: Loss = 0.1311, Accuracy = 94.69%


Epoch 6/10: 100%|██████████| 50/50 [00:00<00:00, 1338.90it/s]


Epoch 6: Loss = 0.1044, Accuracy = 96.25%


Epoch 7/10: 100%|██████████| 50/50 [00:00<00:00, 1306.55it/s]


Epoch 7: Loss = 0.1017, Accuracy = 96.38%


Epoch 8/10: 100%|██████████| 50/50 [00:00<00:00, 1356.34it/s]


Epoch 8: Loss = 0.0676, Accuracy = 98.31%


Epoch 9/10: 100%|██████████| 50/50 [00:00<00:00, 1323.23it/s]


Epoch 9: Loss = 0.0632, Accuracy = 98.12%


Epoch 10/10: 100%|██████████| 50/50 [00:00<00:00, 1238.70it/s]

Epoch 10: Loss = 0.0372, Accuracy = 98.88%





In [16]:
# Evaluation mode disables dropout so predictions are deterministic.
classifier.eval()
correct, total = 0, 0
all_preds, all_labels = [], []

with torch.no_grad():
    for features, targets in test_loader:
        # Predicted class = index of the largest logit
        predicted = classifier(features).argmax(dim=1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()
        # Keep every prediction/label for the per-class report below
        all_preds.extend(predicted.numpy())
        all_labels.extend(targets.numpy())

test_acc = 100 * correct / total
print(f"\nTest Accuracy: {test_acc:.2f}%")

# Per-class precision / recall / F1
from sklearn.metrics import classification_report
report = classification_report(all_labels, all_preds, target_names=['Human', 'AI'])
print(report)



Test Accuracy: 85.50%
              precision    recall  f1-score   support

       Human       0.84      0.87      0.86       199
          AI       0.87      0.84      0.85       201

    accuracy                           0.85       400
   macro avg       0.86      0.86      0.85       400
weighted avg       0.86      0.85      0.85       400

