In [1]:
import pandas as pd
from scipy.io import arff

# Parse the JM1 ARFF file from the working directory; loadarff returns the
# records together with the attribute metadata.
data, meta = arff.loadarff("JM1.arff")
# Tabular view of the records. The next cell compares the `label` column
# against b'Y', so nominal values presumably arrive as byte strings.
df = pd.DataFrame(data)

In [2]:
# Encode the nominal `label` attribute as a binary integer target named
# `defects` (1 where the label equals the byte string b'Y', 0 otherwise) and
# drop the original column.
# The guard keeps the cell idempotent: once `label` has been dropped, running
# the cell again is a no-op instead of raising KeyError on df['label'].
if 'label' in df.columns:
    # Vectorized comparison instead of a per-row Python lambda.
    df['defects'] = (df['label'] == b'Y').astype('int64')
    df = df.drop(columns='label')

In [3]:
# Print the column schema and non-null counts, then display how many rows
# fall into each target class (the cell's last expression is what is shown).
df.info()
df['defects'].value_counts()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7720 entries, 0 to 7719
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   LOC_BLANK              7720 non-null   float64
 1   BRANCH_COUNT           7720 non-null   float64
 2   LOC_CODE_AND_COMMENT   7720 non-null   float64
 3   LOC_COMMENTS           7720 non-null   float64
 4   CYCLOMATIC_COMPLEXITY  7720 non-null   float64
 5   DESIGN_COMPLEXITY      7720 non-null   float64
 6   ESSENTIAL_COMPLEXITY   7720 non-null   float64
 7   LOC_EXECUTABLE         7720 non-null   float64
 8   HALSTEAD_CONTENT       7720 non-null   float64
 9   HALSTEAD_DIFFICULTY    7720 non-null   float64
 10  HALSTEAD_EFFORT        7720 non-null   float64
 11  HALSTEAD_ERROR_EST     7720 non-null   float64
 12  HALSTEAD_LENGTH        7720 non-null   float64
 13  HALSTEAD_LEVEL         7720 non-null   float64
 14  HALSTEAD_PROG_TIME     7720 non-null   float64
 15  HALS

defects
0    6108
1    1612
Name: count, dtype: int64

In [4]:
# Target vector: the binary defect flag. Feature matrix: every other column.
y = df['defects']
X = df.drop('defects', axis=1)

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [6]:
from imblearn.over_sampling import SMOTE
# Oversample the training split only (the test split is left untouched) with
# a seeded SMOTE instance.
# NOTE(review): resampling happens here on the raw features; the StandardScaler
# is fitted afterwards on the resampled data. Confirm that this ordering is
# intended, since the features are on very different scales.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [7]:
# Sanity check: class counts of the training target after oversampling.
y_train_resampled.value_counts()

defects
1    4886
0    4886
Name: count, dtype: int64

In [8]:
from sklearn.preprocessing import StandardScaler
# Learn the standardization statistics from the resampled training features
# only, then apply that same transformation to both the training and the
# held-out features.
scaler = StandardScaler()
scaler.fit(X_train_resampled)
X_train_resampled_scaled = scaler.transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [9]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Baseline forest, fitted on the SMOTE-balanced and standardized training set.
# class_weight additionally weights the defect class 3x on top of the
# oversampling. This is the estimator the later evaluation and persistence
# cells use.
model = RandomForestClassifier(random_state=42, n_estimators=300, n_jobs=-1, class_weight={0:1, 1:3})

model.fit(X_train_resampled_scaled, y_train_resampled)

# Hyper-parameter search.
# The search is run on the ORIGINAL training split with SMOTE and scaling
# inside an imblearn Pipeline, so resampling and standardization are re-fitted
# on the training part of every CV fold. Cross-validating on data that was
# oversampled beforehand lets synthetic points derived from validation rows
# leak into the training folds and inflates the recall estimate.
# Step order (SMOTE, then scaling) mirrors the preprocessing cells above.
param_grid = {
    'clf__n_estimators': [200, 300],
    'clf__max_depth': [None, 15, 25],
    'clf__min_samples_leaf': [1, 3, 5]
}

search_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('scale', StandardScaler()),
    ('clf', RandomForestClassifier(
        random_state=42,
        class_weight={0:1, 1:3}
    )),
])

grid = GridSearchCV(
    search_pipeline,
    param_grid,
    scoring='recall',
    cv=3,
    n_jobs=-1
)

grid.fit(X_train, y_train)

# The refit pipeline resamples and scales the full training split with the
# same seeds as the preprocessing cells, so its forest step expects inputs
# scaled the same way as X_test_scaled.
best_model = grid.best_estimator_.named_steps['clf']

# Surface the outcome of the search instead of discarding it silently.
print("Best params:", grid.best_params_)
print("Best CV recall:", grid.best_score_)
# NOTE(review): the evaluation and saving cells below use `model`, not
# `best_model`; decide which of the two should be evaluated and persisted.

In [10]:
# Take column 1 of predict_proba as the defect-class score, then label a row
# as defective (1) when that score is at least 0.3, otherwise 0.
class_probabilities = model.predict_proba(X_test_scaled)
y_prob = class_probabilities[:, 1]
y_pred_custom = (y_prob >= 0.3).astype(int)

In [11]:
from sklearn.metrics import classification_report, confusion_matrix
# Per-class precision / recall / F1 of the thresholded predictions on the
# held-out split.
report_text = classification_report(y_true=y_test, y_pred=y_pred_custom)
print(report_text)

              precision    recall  f1-score   support

           0       0.88      0.68      0.76      1222
           1       0.34      0.65      0.45       322

    accuracy                           0.67      1544
   macro avg       0.61      0.66      0.61      1544
weighted avg       0.77      0.67      0.70      1544



In [12]:
from sklearn.metrics import roc_auc_score

# Threshold-independent check: ROC-AUC computed from the defect-class scores
# (column 1 of predict_proba) on the held-out split.
y_prob = model.predict_proba(X_test_scaled)[:, 1]
roc_auc = roc_auc_score(y_true=y_test, y_score=y_prob)

print("ROC-AUC:", roc_auc)


ROC-AUC: 0.7097200902705065


In [13]:
# Save the fitted classifier to disk.
import joblib
joblib.dump(value=model, filename='random_forest_jm1_model.pkl')

# Save the fitted scaler next to it. This call is the cell's last expression,
# so its return value is what the notebook displays.
joblib.dump(value=scaler, filename='scaler_jm1.pkl')

['scaler_jm1.pkl']

In [14]:
# Train a simplified model using only the features that lizard can extract:
# LOC_EXECUTABLE and CYCLOMATIC_COMPLEXITY.
# NOTE(review): the recorded run of this cell reports a ROC-AUC of about 0.53,
# i.e. close to chance; confirm the model is useful before shipping it.

LIZARD_FEATURES = ["LOC_EXECUTABLE", "CYCLOMATIC_COMPLEXITY"]

# Output artifacts of this cell. The paths are named once so that the dump
# calls and the confirmation message below cannot drift apart.
LIZARD_MODEL_PATH = 'final_model.pkl'
LIZARD_SCALER_PATH = 'scaler_final.pkl'

X_lizard = df[LIZARD_FEATURES]
y_lizard = df['defects']

# Stratified 80/20 split with a fixed seed.
X_train_lz, X_test_lz, y_train_lz, y_test_lz = train_test_split(
    X_lizard, y_lizard, test_size=0.2, random_state=42, stratify=y_lizard
)

# Apply SMOTE to the training split only
smote_lz = SMOTE(random_state=42)
X_train_lz_resampled, y_train_lz_resampled = smote_lz.fit_resample(X_train_lz, y_train_lz)

# Scale: statistics are learned from the resampled training data and reused
# unchanged for the test split
scaler_lizard = StandardScaler()
X_train_lz_scaled = scaler_lizard.fit_transform(X_train_lz_resampled)
X_test_lz_scaled = scaler_lizard.transform(X_test_lz)

# Train model
model_lizard = RandomForestClassifier(
    random_state=42, 
    n_estimators=300, 
    class_weight={0:1, 1:3}
)
model_lizard.fit(X_train_lz_scaled, y_train_lz_resampled)

# Evaluate: rows whose defect-class score is at least 0.3 are labelled 1
y_prob_lz = model_lizard.predict_proba(X_test_lz_scaled)[:, 1]
y_pred_lz = (y_prob_lz >= 0.3).astype(int)
print("Simplified Model (lizard-compatible):")
print(classification_report(y_test_lz, y_pred_lz))
print("ROC-AUC:", roc_auc_score(y_test_lz, y_prob_lz))

# Save the lizard-compatible model and its scaler
joblib.dump(model_lizard, LIZARD_MODEL_PATH)
joblib.dump(scaler_lizard, LIZARD_SCALER_PATH)
# Report the files that were actually written (the previous message named the
# full-feature model's artifacts instead).
print(f"\nSaved: {LIZARD_MODEL_PATH} and {LIZARD_SCALER_PATH}")

Simplified Model (lizard-compatible):
              precision    recall  f1-score   support

           0       0.79      0.26      0.39      1222
           1       0.21      0.73      0.32       322

    accuracy                           0.36      1544
   macro avg       0.50      0.50      0.36      1544
weighted avg       0.67      0.36      0.38      1544

ROC-AUC: 0.5312808144676785

Saved: random_forest_jm1_model.pkl and scaler_jm1.pkl
