In [1]:
%pip install pylint
%pip install transformers datasets torch pandas
%pip install torch torchvision torchaudio
%pip install transformers[torch] accelerate>=0.26.0



Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import torch
import json
import numpy as np
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from joblib import dump
import subprocess

# Ensure required folders exist
os.makedirs("./models", exist_ok=True)
os.makedirs("./codebert_finetuned", exist_ok=True)

# Load tokenizer
MODEL_NAME = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

# Define directories for training data
good_code_dir = os.path.abspath(r"C:\Users\KIIT\Desktop\MinorProject\good_code")
bad_code_dir = os.path.abspath(r"C:\Users\KIIT\Desktop\MinorProject\bad_code")

# Function to load Python files from a directory
def load_code_from_directory(directory):
    code_samples = []
    for filename in os.listdir(directory):
        if filename.endswith(".py"):
            file_path = os.path.join(directory, filename)
            with open(file_path, "r", encoding="utf-8") as f:
                code_samples.append(f.read())
    return code_samples

# Load good and bad code samples
good_code_samples = load_code_from_directory(good_code_dir)
bad_code_samples = load_code_from_directory(bad_code_dir)

# Labels: 1 for bad code, 0 for good code
code_samples = good_code_samples + bad_code_samples
labels = [0] * len(good_code_samples) + [1] * len(bad_code_samples)

# Tokenize code
tokenized_inputs = tokenizer(code_samples, padding="max_length", truncation=True, max_length=512, return_tensors="pt")

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(tokenized_inputs["input_ids"], labels, test_size=0.2, random_state=42)

# Define dataset class
class CodeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Create datasets
train_dataset = CodeDataset({"input_ids": X_train}, y_train)
test_dataset = CodeDataset({"input_ids": X_test}, y_test)

# Load CodeBERT model
codebert_model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./codebert_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
)

# Trainer setup
trainer = Trainer(
    model=codebert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train CodeBERT
trainer.train()

# Save fine-tuned CodeBERT model
codebert_model.save_pretrained("./codebert_finetuned")
print("✅ CodeBERT model fine-tuned and saved.")

# Function to run Pylint and extract features
def extract_pylint_features(code_samples):
    features = []
    
    for idx, code in enumerate(code_samples):
        file_path = f"./temp_code_{idx}.py"
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(code)
        
        try:
            result = subprocess.run(
                ['pylint', file_path, '--output-format=json'],
                capture_output=True, text=True
            )
            output = result.stdout.strip() if result.stdout.strip() else result.stderr.strip()
            
            pylint_json = json.loads(output) if output else []
            errors = {"E": 0, "W": 0, "C": 0}
            
            for entry in pylint_json:
                msg_id = entry.get("message-id", "").upper()
                if msg_id.startswith("E"):  # Error
                    errors['E'] += 1
                elif msg_id.startswith("W"):  # Warning
                    errors['W'] += 1
                elif msg_id.startswith("C"):  # Convention
                    errors['C'] += 1
            
            features.append([errors['E'], errors['W'], errors['C']])
        except Exception as e:
            print(f"Error running pylint: {e}")
            features.append([0, 0, 0])  # Default to no errors if Pylint fails
        
        os.remove(file_path)
    
    return np.array(features)

# Extract features for Logistic Regression
X_features = extract_pylint_features(code_samples)

# Train Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_features, labels)

# Save trained model
dump(log_reg, "./models/trained_model.joblib")
print("✅ Logistic Regression model trained and saved.")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch,Training Loss,Validation Loss
1,No log,0.63122
2,No log,0.652028
3,No log,0.667275


✅ CodeBERT model fine-tuned and saved.
✅ Logistic Regression model trained and saved.
