CPU VERSION --

In [None]:
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_scheduler
from torch import nn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tqdm import tqdm
import re
import keyword
import pickle

# Run everything on the CPU.
device = torch.device("cpu")

# Pretrained CodeBERT checkpoint: load its tokenizer and encoder weights.
model_name = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Module.to() returns the module itself, so the move is chained onto the load.
base_model = AutoModel.from_pretrained(model_name).to(device)

# Stylometric Features
# A selection of Python and Java reserved words; word tokens found in this set
# are counted as keywords by the stylometric feature extractor.
JAVA_PYTHON_KEYWORDS = {
    # Python-flavoured keywords
    "def", "return", "if", "else", "elif", "while", "for", "break",
    "continue", "try", "except", "import", "from", "as", "class", "pass",
    "with", "yield", "lambda", "global", "nonlocal", "assert",
    # Java-flavoured keywords
    "public", "private", "protected", "static", "final", "void", "int",
    "double", "float", "char", "boolean", "new", "catch", "finally",
    "throws", "throw", "switch", "case", "package", "interface",
    "implements", "extends",
}

def extract_stylometric_features(code: str) -> np.ndarray:
    """Describe the surface style of a code snippet as 17 float32 features.

    Feature order in the returned vector:
        0  number of lines            9  assignment-like matches (word then '=')
        1  mean line length           10 function-definition-like matches
        2  blank lines                11 whitespace characters / total characters
        3  word tokens                12 1 if the snippet contains a tab, else 0
        4  mean word-token length     13 variance of indentation (non-blank lines)
        5  keyword tokens             14 largest indentation (non-blank lines)
        6  keyword tokens / tokens    15 count of (), [] and {} characters
        7  comment lines              16 total characters
        8  comment lines / lines

    Args:
        code: Source text to analyse.

    Returns:
        A float32 array of shape (17,). It is all zeros when ``code`` is not a
        string or contains only whitespace.
    """
    if not (isinstance(code, str) and code.strip()):
        return np.zeros(17, dtype=np.float32)

    total_chars = len(code)

    # --- line-level statistics -------------------------------------------
    rows = code.split('\n')
    row_count = len(rows)
    non_blank_rows = [row for row in rows if row.strip()]
    mean_row_length = np.mean([len(row) for row in rows])
    blank_row_count = row_count - len(non_blank_rows)

    # A comment line is one whose first non-blank characters open a Python or
    # Java/C-style comment (or continue a block comment with '*').
    comment_row_count = sum(
        row.lstrip().startswith(("#", "//", "/*", "*")) for row in rows
    )
    comment_share = comment_row_count / row_count

    # --- token-level statistics ------------------------------------------
    words = re.findall(r'\b\w+\b', code)
    word_count = len(words)
    if words:
        mean_word_length = np.mean([len(word) for word in words])
        keyword_count = sum(word in JAVA_PYTHON_KEYWORDS for word in words)
        keyword_share = keyword_count / word_count
    else:
        mean_word_length = 0
        keyword_count = 0
        keyword_share = 0

    # --- pattern counts ---------------------------------------------------
    assignment_count = sum(1 for _ in re.finditer(r'\w+\s*=+', code))
    function_def_count = sum(
        1 for _ in re.finditer(r'\b(def|void|public\s+|private\s+|protected\s+).*?\(', code)
    )
    bracket_count = sum(code.count(symbol) for symbol in "{}()[]")

    # --- whitespace and indentation --------------------------------------
    whitespace_share = sum(1 for _ in re.finditer(r'\s', code)) / total_chars
    has_tab = 1 if '\t' in code else 0
    # The guard above guarantees at least one non-blank line, so this list is
    # never empty.
    indents = [len(row) - len(row.lstrip()) for row in non_blank_rows]
    indent_spread = np.var(indents)
    deepest_indent = max(indents)

    feature_values = [
        row_count,
        mean_row_length,
        blank_row_count,
        word_count,
        mean_word_length,
        keyword_count,
        keyword_share,
        comment_row_count,
        comment_share,
        assignment_count,
        function_def_count,
        whitespace_share,
        has_tab,
        indent_spread,
        deepest_indent,
        bracket_count,
        total_chars,
    ]
    return np.array(feature_values, dtype=np.float32)

# Classifier using CodeBERT
class CodeBERTClassifier(nn.Module):
    """Encoder followed by one linear classification layer on the first token.

    Args:
        base_model: Encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and must return an object
            exposing ``last_hidden_state``.
        hidden_size: Size of the encoder's hidden vectors, i.e. the input
            width of the linear layer.
        num_labels: Number of output classes.
    """

    def __init__(self, base_model, hidden_size=768, num_labels=2):
        super().__init__()
        self.encoder = base_model
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Classify a batch and optionally compute the cross-entropy loss.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            labels: Optional class indices; when given, the loss is computed.

        Returns:
            A dict with ``"logits"`` and ``"loss"``; ``"loss"`` is ``None``
            when no labels are supplied.
        """
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis is used as the summary vector.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(first_token_state)
        if labels is None:
            return {"logits": logits, "loss": None}
        return {"logits": logits, "loss": nn.functional.cross_entropy(logits, labels)}

# Attach the classification layer to the encoder and place the result on the
# chosen device (Module.to() returns the module itself).
model = CodeBERTClassifier(base_model).to(device)

# Dataset for Fine-tuning
class CodeDataset(Dataset):
    """Fine-tuning dataset that tokenizes every snippet up front.

    Rows with a missing ``clean_code`` or ``label``, and rows whose code is
    only whitespace, are dropped. Each remaining snippet is tokenized once in
    the constructor (``padding="max_length"``, ``truncation=True``) and kept
    in ``self.samples`` as ``(tokens, label)`` pairs.

    Args:
        dataframe: Table with ``clean_code`` and ``label`` columns.
        tokenizer: Callable applied to each snippet; its result must provide
            ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

        complete_rows = dataframe.dropna(subset=["clean_code", "label"])
        has_code = complete_rows["clean_code"].str.strip().astype(bool)
        self.data = complete_rows[has_code]

        self.samples = []
        for snippet, target in zip(self.data["clean_code"], self.data["label"]):
            encoding = self.tokenizer(
                snippet,
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
            )
            # Skip encodings with no ids or with nothing to attend to.
            if len(encoding["input_ids"]) == 0 or sum(encoding["attention_mask"]) <= 0:
                continue
            self.samples.append((encoding, target))

    def __len__(self):
        """Return the number of tokenized samples that were kept."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return sample ``idx`` as tensors keyed the way the model expects."""
        encoding, target = self.samples[idx]
        return {
            "input_ids": torch.tensor(encoding["input_ids"]),
            "attention_mask": torch.tensor(encoding["attention_mask"]),
            "labels": torch.tensor(target, dtype=torch.long),
        }

# Training data: read only the two columns the pipeline consumes, tokenize
# them for fine-tuning, and serve shuffled mini-batches of 8.
train_df = pd.read_csv("Train.csv", usecols=["clean_code", "label"])
train_dataset = CodeDataset(dataframe=train_df, tokenizer=tokenizer)
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)

# Optimizer and Scheduler
# Single source of truth for the epoch count: the linear schedule must decay
# over exactly as many optimizer steps as the loop below performs, so both the
# schedule length and the loop range are derived from this constant.
NUM_EPOCHS = 3
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_training_steps = len(train_loader) * NUM_EPOCHS
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Fine-tune
model.train()
for epoch in range(NUM_EPOCHS):
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        # Move every tensor of the batch to the target device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
        loss = outputs["loss"]
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        # One scheduler step per optimizer step, matching num_training_steps.
        lr_scheduler.step()
        optimizer.zero_grad()
    # Reported value is the sum of the per-batch losses, not their mean.
    print(f"Epoch {epoch+1} loss: {total_loss:.4f}")

# Dataset for Feature Extraction
class EmbeddingDataset(Dataset):
    """Dataset for feature extraction; tokenizes each snippet on access.

    Rows with a missing ``clean_code`` or ``label``, and rows whose code is
    only whitespace, are dropped in the constructor. Items carry the token
    tensors, the stylometric feature vector of the snippet and its raw label.

    Args:
        dataframe: Table with ``clean_code`` and ``label`` columns.
        tokenizer: Callable applied to a snippet with ``return_tensors="pt"``;
            its result must provide ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        complete_rows = dataframe.dropna(subset=["clean_code", "label"])
        has_code = complete_rows["clean_code"].str.strip().astype(bool)
        self.data = complete_rows[has_code]

    def __len__(self):
        """Return the number of rows that survived filtering."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the ``idx``-th filtered row as model inputs plus features."""
        record = self.data.iloc[idx]
        snippet = record["clean_code"]
        style_vector = extract_stylometric_features(snippet)
        encoding = self.tokenizer(
            snippet,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # squeeze(0) removes the leading size-1 axis of the tokenizer output.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "stylometric": torch.tensor(style_vector, dtype=torch.float32),
            "label": record["label"],
        }

# Feature Extraction
def extract_features(model, dataset):
    """Build the classifier inputs: first-token embedding plus stylometric features.

    The model is switched to evaluation mode and its ``encoder`` is run
    without gradient tracking over the dataset in batches of 8. For every
    sample the encoder's first-position hidden state is concatenated with the
    sample's stylometric vector.

    Args:
        model: Object exposing ``eval()`` and an ``encoder`` callable that
            accepts ``input_ids``/``attention_mask`` and returns an object
            with ``last_hidden_state``.
        dataset: Dataset whose items provide ``"input_ids"``,
            ``"attention_mask"``, ``"stylometric"`` and ``"label"``.

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays: one feature row per
        sample and a 1-D array of the matching labels.

    Raises:
        ValueError: If the dataset yields no samples.
    """
    loader = DataLoader(dataset, batch_size=8)
    model.eval()
    feature_chunks = []
    label_chunks = []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Extracting Features"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            stylometric = batch["stylometric"].numpy()
            label_batch = batch["label"]

            outputs = model.encoder(input_ids=input_ids, attention_mask=attention_mask)
            cls_embed = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            feature_chunks.append(np.concatenate([cls_embed, stylometric], axis=1))

            # Convert each label batch once. Collation yields a tensor for
            # numeric labels and a plain sequence otherwise; accumulating
            # 0-d tensors in a list and relying on np.array to unpack them
            # element by element is slow and fragile.
            if torch.is_tensor(label_batch):
                label_chunks.append(label_batch.cpu().numpy())
            else:
                label_chunks.append(np.asarray(label_batch))

    if not feature_chunks:
        # Same exception type np.vstack would raise on an empty list, with a
        # message that points at the actual cause.
        raise ValueError("extract_features: the dataset produced no samples")
    return np.vstack(feature_chunks), np.concatenate(label_chunks)

# Train RF
# The random forest is fitted on encoder embeddings concatenated with the
# stylometric features of the training snippets.
embedding_train_dataset = EmbeddingDataset(train_df, tokenizer)
X_train, y_train = extract_features(model, embedding_train_dataset)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Save classifier
# Persist right after fitting so that a failure during evaluation (for
# example a missing or malformed Test_<i>.csv) cannot discard the trained
# classifier. Prediction does not modify the forest, so the pickle content is
# the same as if it were written after the evaluation loop.
# NOTE(review): only the random forest is saved here; its inputs are produced
# by the fine-tuned encoder in `model`, which this script does not persist.
# Confirm whether the encoder weights need to be saved alongside the pickle.
with open("human_ai_classifier.pkl", "wb") as f:
    pickle.dump(rf, f)

# Evaluate
# Each of the ten test files is scored independently with the same fitted
# forest; precision/recall/F1 use scikit-learn's default settings.
for i in range(10):
    test_path = f"Test_{i}.csv"
    test_df = pd.read_csv(test_path, usecols=["clean_code", "label"])
    test_dataset = EmbeddingDataset(test_df, tokenizer)
    X_test, y_test = extract_features(model, test_dataset)
    y_pred = rf.predict(X_test)
    print(f"\n=== Results for {test_path} ===")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))



  from .autonotebook import tqdm as notebook_tqdm
Epoch 1: 100%|██████████| 749/749 [52:10<00:00,  4.18s/it]


Epoch 1 loss: 278.5767


Epoch 2: 100%|██████████| 749/749 [52:11<00:00,  4.18s/it]


Epoch 2 loss: 181.4119


Epoch 3: 100%|██████████| 749/749 [52:18<00:00,  4.19s/it]


Epoch 3 loss: 131.7644


Extracting Features: 100%|██████████| 749/749 [13:24<00:00,  1.07s/it]
Extracting Features: 100%|██████████| 122/122 [02:11<00:00,  1.08s/it]



=== Results for Test_0.csv ===
Accuracy: 0.8135245901639344
Precision: 0.836864406779661
Recall: 0.79
F1 Score: 0.8127572016460906
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.84      0.81       476
           1       0.84      0.79      0.81       500

    accuracy                           0.81       976
   macro avg       0.81      0.81      0.81       976
weighted avg       0.81      0.81      0.81       976



Extracting Features: 100%|██████████| 125/125 [02:14<00:00,  1.07s/it]



=== Results for Test_1.csv ===
Accuracy: 0.77
Precision: 0.7596153846153846
Recall: 0.79
F1 Score: 0.7745098039215687
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.75      0.77       500
           1       0.76      0.79      0.77       500

    accuracy                           0.77      1000
   macro avg       0.77      0.77      0.77      1000
weighted avg       0.77      0.77      0.77      1000



Extracting Features: 100%|██████████| 126/126 [02:14<00:00,  1.07s/it]



=== Results for Test_2.csv ===
Accuracy: 0.7392607392607392
Precision: 0.7168784029038112
Recall: 0.79
F1 Score: 0.7516650808753568
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.69      0.73       501
           1       0.72      0.79      0.75       500

    accuracy                           0.74      1001
   macro avg       0.74      0.74      0.74      1001
weighted avg       0.74      0.74      0.74      1001



Extracting Features: 100%|██████████| 125/125 [02:14<00:00,  1.07s/it]



=== Results for Test_3.csv ===
Accuracy: 0.7377377377377378
Precision: 0.7155797101449275
Recall: 0.79
F1 Score: 0.7509505703422054
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.69      0.72       499
           1       0.72      0.79      0.75       500

    accuracy                           0.74       999
   macro avg       0.74      0.74      0.74       999
weighted avg       0.74      0.74      0.74       999



Extracting Features: 100%|██████████| 125/125 [02:14<00:00,  1.07s/it]



=== Results for Test_4.csv ===
Accuracy: 0.734
Precision: 0.710431654676259
Recall: 0.79
F1 Score: 0.7481060606060606
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.68      0.72       500
           1       0.71      0.79      0.75       500

    accuracy                           0.73      1000
   macro avg       0.74      0.73      0.73      1000
weighted avg       0.74      0.73      0.73      1000



Extracting Features: 100%|██████████| 125/125 [02:15<00:00,  1.08s/it]



=== Results for Test_5.csv ===
Accuracy: 0.841
Precision: 0.8797327394209354
Recall: 0.79
F1 Score: 0.8324552160168599
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       500
           1       0.88      0.79      0.83       500

    accuracy                           0.84      1000
   macro avg       0.84      0.84      0.84      1000
weighted avg       0.84      0.84      0.84      1000



Extracting Features: 100%|██████████| 120/120 [02:08<00:00,  1.07s/it]



=== Results for Test_6.csv ===
Accuracy: 0.6073298429319371
Precision: 0.5939849624060151
Recall: 0.79
F1 Score: 0.6781115879828327
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.41      0.50       455
           1       0.59      0.79      0.68       500

    accuracy                           0.61       955
   macro avg       0.62      0.60      0.59       955
weighted avg       0.61      0.61      0.59       955



Extracting Features: 100%|██████████| 122/122 [02:09<00:00,  1.06s/it]



=== Results for Test_7.csv ===
Accuracy: 0.6618556701030928
Precision: 0.63915857605178
Recall: 0.79
F1 Score: 0.7066189624329159
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.53      0.60       470
           1       0.64      0.79      0.71       500

    accuracy                           0.66       970
   macro avg       0.67      0.66      0.65       970
weighted avg       0.67      0.66      0.66       970



Extracting Features: 100%|██████████| 125/125 [02:13<00:00,  1.07s/it]



=== Results for Test_8.csv ===
Accuracy: 0.7226130653266332
Precision: 0.6978798586572438
Recall: 0.79
F1 Score: 0.7410881801125704
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.65      0.70       495
           1       0.70      0.79      0.74       500

    accuracy                           0.72       995
   macro avg       0.73      0.72      0.72       995
weighted avg       0.73      0.72      0.72       995



Extracting Features: 100%|██████████| 125/125 [02:14<00:00,  1.07s/it]


=== Results for Test_9.csv ===
Accuracy: 0.6683366733466933
Precision: 0.6360708534621579
Recall: 0.79
F1 Score: 0.7047279214986619
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.55      0.62       498
           1       0.64      0.79      0.70       500

    accuracy                           0.67       998
   macro avg       0.68      0.67      0.66       998
weighted avg       0.68      0.67      0.66       998




