In [1]:
# Python Bug Detection Model
# Author: Jagriti Srivastava

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Load dataset
df = pd.read_csv("/kaggle/input/python-code-snippets-for-bug-detection/dataset_realistic_bug.csv")

# Prepare features and labels
X = df['snippet']
y = df['label']

# Vectorize the code snippets
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", max_features=5000)
X_vec = vectorizer.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Save the model and vectorizer
joblib.dump(model, "bug_detector_model.pkl")
joblib.dump(vectorizer, "bug_vectorizer.pkl")

print("Model and vectorizer saved successfully!")


Classification Report:
               precision    recall  f1-score   support

           0       0.20      0.40      0.27        15
           1       0.40      0.20      0.27        30

    accuracy                           0.27        45
   macro avg       0.30      0.30      0.27        45
weighted avg       0.33      0.27      0.27        45

Confusion Matrix:
 [[ 6  9]
 [24  6]]
Model and vectorizer saved successfully!
