In [2]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, roc_auc_score
import random

class HoneyBadgerOptimizer:
    def __init__(self, n_badgers=10, max_iter=50, lb=0, ub=1):
        self.n_badgers = n_badgers
        self.max_iter = max_iter
        self.lb = lb
        self.ub = ub
        
    def initialize_population(self, n_features):
        return np.random.uniform(self.lb, self.ub, (self.n_badgers, n_features))
    
    def binary_conversion(self, positions):
        return np.where(positions > 0.5, 1, 0)
    
    def fitness_function(self, X_train, X_val, y_train, y_val, position):
        binary_pos = self.binary_conversion(position)
        if np.sum(binary_pos) == 0:
            return 0
        
        selected_features_train = X_train[:, binary_pos == 1]
        selected_features_val = X_val[:, binary_pos == 1]
        
        # Train Naive Bayes
        nb = GaussianNB()
        nb.fit(selected_features_train, y_train)
        
        # Predict and calculate accuracy
        y_pred = nb.predict(selected_features_val)
        accuracy = accuracy_score(y_val, y_pred)
        
        # Penalty for using too many features
        feature_penalty = 0.001 * np.sum(binary_pos)
        
        return accuracy - feature_penalty
    
    def optimize(self, X_train, X_val, y_train, y_val):
        n_features = X_train.shape[1]
        population = self.initialize_population(n_features)
        best_position = None
        best_fitness = float('-inf')
        
        for iteration in range(self.max_iter):
            # Evaluate current population
            for i in range(self.n_badgers):
                current_fitness = self.fitness_function(X_train, X_val, y_train, y_val, population[i])
                
                if current_fitness > best_fitness:
                    best_fitness = current_fitness
                    best_position = population[i].copy()
            
            # Update positions
            for i in range(self.n_badgers):
                # Random walk
                r1 = random.random()
                r2 = random.random()
                
                if r1 > 0.5:  # Exploitation
                    population[i] = population[i] + r2 * (best_position - population[i])
                else:  # Exploration
                    population[i] = population[i] + r2 * (self.ub - self.lb) * \
                                  (np.random.random(n_features) - 0.5) * 2
                
                # Ensure bounds
                population[i] = np.clip(population[i], self.lb, self.ub)
        
        return best_position

def evaluate_model(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label='Y')
    rec = recall_score(y_true, y_pred, pos_label='Y')
    f1 = f1_score(y_true, y_pred, pos_label='Y')
    conf_matrix = confusion_matrix(y_true, y_pred)
    auc = roc_auc_score(y_true == 'Y', y_pred == 'Y')
    
    return {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1,
        'confusion_matrix': conf_matrix,
        'auc_score': auc
    }

# Load and preprocess data
def main():
    # Read training and test data
    train_data = pd.read_csv('Training Data/CM1_FS_TrainData.csv')
    test_data = pd.read_csv('Testing Data/CM1_FS_TestData.csv')
    
    # Separate features and target for both datasets
    X_train = train_data.drop('Defective', axis=1).values
    y_train = train_data['Defective'].values
    X_test = test_data.drop('Defective', axis=1).values
    y_test = test_data['Defective'].values
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Split training data for validation during feature selection
    train_size = int(0.8 * len(X_train_scaled))
    X_train_opt = X_train_scaled[:train_size]
    X_val_opt = X_train_scaled[train_size:]
    y_train_opt = y_train[:train_size]
    y_val_opt = y_train[train_size:]
    
    # Initialize and run HBDA
    hbda = HoneyBadgerOptimizer(n_badgers=20, max_iter=100)
    best_position = hbda.optimize(X_train_opt, X_val_opt, y_train_opt, y_val_opt)
    
    # Get selected features
    selected_features = hbda.binary_conversion(best_position)
    X_train_selected = X_train_scaled[:, selected_features == 1]
    X_test_selected = X_test_scaled[:, selected_features == 1]
    
    # Train final model using all training data
    nb = GaussianNB()
    nb.fit(X_train_selected, y_train)
    
    # Make predictions on test set
    y_pred = nb.predict(X_test_selected)
    
    # Evaluate model
    metrics = evaluate_model(y_test, y_pred)
    
    # Print results
    print("Model Performance Metrics on Test Data:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1-Score: {metrics['f1_score']:.4f}")
    print(f"AUC Score: {metrics['auc_score']:.4f}")
    print("\nConfusion Matrix:")
    print(metrics['confusion_matrix'])
    
    # Print selected features
    feature_names = train_data.drop('Defective', axis=1).columns
    selected_feature_names = feature_names[selected_features == 1]
    print("\nSelected Features:")
    for feature in selected_feature_names:
        print(f"- {feature}")

if __name__ == "__main__":
    main()

Model Performance Metrics on Test Data:
Accuracy: 0.8485
Precision: 0.3333
Recall: 0.1538
F1-Score: 0.2105
AUC Score: 0.5537

Confusion Matrix:
[[82  4]
 [11  2]]

Selected Features:
- CALL_PAIRS
- LOC_COMMENTS
- HALSTEAD_CONTENT


In [4]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, roc_auc_score
import random

class HoneyBadgerOptimizer:
    def __init__(self, n_badgers=10, max_iter=50, lb=0, ub=1):
        self.n_badgers = n_badgers
        self.max_iter = max_iter
        self.lb = lb
        self.ub = ub
        
    def initialize_population(self, n_features):
        return np.random.uniform(self.lb, self.ub, (self.n_badgers, n_features))
    
    def binary_conversion(self, positions):
        return np.where(positions > 0.5, 1, 0)
    
    def fitness_function(self, X_train, X_val, y_train, y_val, position):
        binary_pos = self.binary_conversion(position)
        if np.sum(binary_pos) == 0:
            return 0
        
        selected_features_train = X_train[:, binary_pos == 1]
        selected_features_val = X_val[:, binary_pos == 1]
        
        # Train Naive Bayes
        nb = GaussianNB()
        nb.fit(selected_features_train, y_train)
        
        # Predict and calculate accuracy
        y_pred = nb.predict(selected_features_val)
        accuracy = accuracy_score(y_val, y_pred)
        
        # Penalty for using too many features
        feature_penalty = 0.001 * np.sum(binary_pos)
        
        return accuracy - feature_penalty
    
    def optimize(self, X_train, X_val, y_train, y_val):
        n_features = X_train.shape[1]
        population = self.initialize_population(n_features)
        best_position = None
        best_fitness = float('-inf')
        
        for iteration in range(self.max_iter):
            # Evaluate current population
            for i in range(self.n_badgers):
                current_fitness = self.fitness_function(X_train, X_val, y_train, y_val, population[i])
                
                if current_fitness > best_fitness:
                    best_fitness = current_fitness
                    best_position = population[i].copy()
            
            # Update positions
            for i in range(self.n_badgers):
                # Random walk
                r1 = random.random()
                r2 = random.random()
                
                if r1 > 0.5:  # Exploitation
                    population[i] = population[i] + r2 * (best_position - population[i])
                else:  # Exploration
                    population[i] = population[i] + r2 * (self.ub - self.lb) * \
                                  (np.random.random(n_features) - 0.5) * 2
                
                # Ensure bounds
                population[i] = np.clip(population[i], self.lb, self.ub)
        
        return best_position

def evaluate_model(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label='Y')
    rec = recall_score(y_true, y_pred, pos_label='Y')
    f1 = f1_score(y_true, y_pred, pos_label='Y')
    conf_matrix = confusion_matrix(y_true, y_pred)
    auc = roc_auc_score(y_true == 'Y', y_pred == 'Y')
    
    return {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1,
        'confusion_matrix': conf_matrix,
        'auc_score': auc
    }

# Load and preprocess data
def main():
    # Read training and test data
    train_data = pd.read_csv('Training Data/MC2_FS_TrainData.csv')
    test_data = pd.read_csv('Testing Data/MC2_FS_TestData.csv')
    
    # Separate features and target for both datasets
    X_train = train_data.drop('Defective', axis=1).values
    y_train = train_data['Defective'].values
    X_test = test_data.drop('Defective', axis=1).values
    y_test = test_data['Defective'].values
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Split training data for validation during feature selection
    train_size = int(0.8 * len(X_train_scaled))
    X_train_opt = X_train_scaled[:train_size]
    X_val_opt = X_train_scaled[train_size:]
    y_train_opt = y_train[:train_size]
    y_val_opt = y_train[train_size:]
    
    # Initialize and run HBDA
    hbda = HoneyBadgerOptimizer(n_badgers=20, max_iter=100)
    best_position = hbda.optimize(X_train_opt, X_val_opt, y_train_opt, y_val_opt)
    
    # Get selected features
    selected_features = hbda.binary_conversion(best_position)
    X_train_selected = X_train_scaled[:, selected_features == 1]
    X_test_selected = X_test_scaled[:, selected_features == 1]
    
    # Train final model using all training data
    nb = GaussianNB()
    nb.fit(X_train_selected, y_train)
    
    # Make predictions on test set
    y_pred = nb.predict(X_test_selected)
    
    # Evaluate model
    metrics = evaluate_model(y_test, y_pred)
    
    # Print results
    print("Model Performance Metrics on Test Data:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1-Score: {metrics['f1_score']:.4f}")
    print(f"AUC Score: {metrics['auc_score']:.4f}")
    print("\nConfusion Matrix:")
    print(metrics['confusion_matrix'])
    
    # Print selected features
    feature_names = train_data.drop('Defective', axis=1).columns
    selected_feature_names = feature_names[selected_features == 1]
    print("\nSelected Features:")
    for feature in selected_feature_names:
        print(f"- {feature}")

if __name__ == "__main__":
    main()

Model Performance Metrics on Test Data:
Accuracy: 0.6316
Precision: 0.4286
Recall: 0.2308
F1-Score: 0.3000
AUC Score: 0.5354

Confusion Matrix:
[[21  4]
 [10  3]]

Selected Features:
- LOC_BLANK


In [5]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, roc_auc_score
import random

class HoneyBadgerOptimizer:
    def __init__(self, n_badgers=10, max_iter=50, lb=0, ub=1):
        self.n_badgers = n_badgers
        self.max_iter = max_iter
        self.lb = lb
        self.ub = ub
        
    def initialize_population(self, n_features):
        return np.random.uniform(self.lb, self.ub, (self.n_badgers, n_features))
    
    def binary_conversion(self, positions):
        return np.where(positions > 0.5, 1, 0)
    
    def fitness_function(self, X_train, X_val, y_train, y_val, position):
        binary_pos = self.binary_conversion(position)
        if np.sum(binary_pos) == 0:
            return 0
        
        selected_features_train = X_train[:, binary_pos == 1]
        selected_features_val = X_val[:, binary_pos == 1]
        
        # Train Naive Bayes
        nb = GaussianNB()
        nb.fit(selected_features_train, y_train)
        
        # Predict and calculate accuracy
        y_pred = nb.predict(selected_features_val)
        accuracy = accuracy_score(y_val, y_pred)
        
        # Penalty for using too many features
        feature_penalty = 0.001 * np.sum(binary_pos)
        
        return accuracy - feature_penalty
    
    def optimize(self, X_train, X_val, y_train, y_val):
        n_features = X_train.shape[1]
        population = self.initialize_population(n_features)
        best_position = None
        best_fitness = float('-inf')
        
        for iteration in range(self.max_iter):
            # Evaluate current population
            for i in range(self.n_badgers):
                current_fitness = self.fitness_function(X_train, X_val, y_train, y_val, population[i])
                
                if current_fitness > best_fitness:
                    best_fitness = current_fitness
                    best_position = population[i].copy()
            
            # Update positions
            for i in range(self.n_badgers):
                # Random walk
                r1 = random.random()
                r2 = random.random()
                
                if r1 > 0.5:  # Exploitation
                    population[i] = population[i] + r2 * (best_position - population[i])
                else:  # Exploration
                    population[i] = population[i] + r2 * (self.ub - self.lb) * \
                                  (np.random.random(n_features) - 0.5) * 2
                
                # Ensure bounds
                population[i] = np.clip(population[i], self.lb, self.ub)
        
        return best_position

def evaluate_model(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label='Y')
    rec = recall_score(y_true, y_pred, pos_label='Y')
    f1 = f1_score(y_true, y_pred, pos_label='Y')
    conf_matrix = confusion_matrix(y_true, y_pred)
    auc = roc_auc_score(y_true == 'Y', y_pred == 'Y')
    
    return {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1,
        'confusion_matrix': conf_matrix,
        'auc_score': auc
    }

# Load and preprocess data
def main():
    # Read training and test data
    train_data = pd.read_csv('Training Data/MW1_FS_TrainData.csv')
    test_data = pd.read_csv('Testing Data/MW1_FS_TestData.csv')
    
    # Separate features and target for both datasets
    X_train = train_data.drop('Defective', axis=1).values
    y_train = train_data['Defective'].values
    X_test = test_data.drop('Defective', axis=1).values
    y_test = test_data['Defective'].values
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Split training data for validation during feature selection
    train_size = int(0.8 * len(X_train_scaled))
    X_train_opt = X_train_scaled[:train_size]
    X_val_opt = X_train_scaled[train_size:]
    y_train_opt = y_train[:train_size]
    y_val_opt = y_train[train_size:]
    
    # Initialize and run HBDA
    hbda = HoneyBadgerOptimizer(n_badgers=20, max_iter=100)
    best_position = hbda.optimize(X_train_opt, X_val_opt, y_train_opt, y_val_opt)
    
    # Get selected features
    selected_features = hbda.binary_conversion(best_position)
    X_train_selected = X_train_scaled[:, selected_features == 1]
    X_test_selected = X_test_scaled[:, selected_features == 1]
    
    # Train final model using all training data
    nb = GaussianNB()
    nb.fit(X_train_selected, y_train)
    
    # Make predictions on test set
    y_pred = nb.predict(X_test_selected)
    
    # Evaluate model
    metrics = evaluate_model(y_test, y_pred)
    
    # Print results
    print("Model Performance Metrics on Test Data:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1-Score: {metrics['f1_score']:.4f}")
    print(f"AUC Score: {metrics['auc_score']:.4f}")
    print("\nConfusion Matrix:")
    print(metrics['confusion_matrix'])
    
    # Print selected features
    feature_names = train_data.drop('Defective', axis=1).columns
    selected_feature_names = feature_names[selected_features == 1]
    print("\nSelected Features:")
    for feature in selected_feature_names:
        print(f"- {feature}")

if __name__ == "__main__":
    main()

Model Performance Metrics on Test Data:
Accuracy: 0.8800
Precision: 0.4000
Recall: 0.2500
F1-Score: 0.3077
AUC Score: 0.6026

Confusion Matrix:
[[64  3]
 [ 6  2]]

Selected Features:
- CALL_PAIRS


In [6]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, roc_auc_score
import random

class HoneyBadgerOptimizer:
    def __init__(self, n_badgers=10, max_iter=50, lb=0, ub=1):
        self.n_badgers = n_badgers
        self.max_iter = max_iter
        self.lb = lb
        self.ub = ub
        
    def initialize_population(self, n_features):
        return np.random.uniform(self.lb, self.ub, (self.n_badgers, n_features))
    
    def binary_conversion(self, positions):
        return np.where(positions > 0.5, 1, 0)
    
    def fitness_function(self, X_train, X_val, y_train, y_val, position):
        binary_pos = self.binary_conversion(position)
        if np.sum(binary_pos) == 0:
            return 0
        
        selected_features_train = X_train[:, binary_pos == 1]
        selected_features_val = X_val[:, binary_pos == 1]
        
        # Train Naive Bayes
        nb = GaussianNB()
        nb.fit(selected_features_train, y_train)
        
        # Predict and calculate accuracy
        y_pred = nb.predict(selected_features_val)
        accuracy = accuracy_score(y_val, y_pred)
        
        # Penalty for using too many features
        feature_penalty = 0.001 * np.sum(binary_pos)
        
        return accuracy - feature_penalty
    
    def optimize(self, X_train, X_val, y_train, y_val):
        n_features = X_train.shape[1]
        population = self.initialize_population(n_features)
        best_position = None
        best_fitness = float('-inf')
        
        for iteration in range(self.max_iter):
            # Evaluate current population
            for i in range(self.n_badgers):
                current_fitness = self.fitness_function(X_train, X_val, y_train, y_val, population[i])
                
                if current_fitness > best_fitness:
                    best_fitness = current_fitness
                    best_position = population[i].copy()
            
            # Update positions
            for i in range(self.n_badgers):
                # Random walk
                r1 = random.random()
                r2 = random.random()
                
                if r1 > 0.5:  # Exploitation
                    population[i] = population[i] + r2 * (best_position - population[i])
                else:  # Exploration
                    population[i] = population[i] + r2 * (self.ub - self.lb) * \
                                  (np.random.random(n_features) - 0.5) * 2
                
                # Ensure bounds
                population[i] = np.clip(population[i], self.lb, self.ub)
        
        return best_position

def evaluate_model(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label='Y')
    rec = recall_score(y_true, y_pred, pos_label='Y')
    f1 = f1_score(y_true, y_pred, pos_label='Y')
    conf_matrix = confusion_matrix(y_true, y_pred)
    auc = roc_auc_score(y_true == 'Y', y_pred == 'Y')
    
    return {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1,
        'confusion_matrix': conf_matrix,
        'auc_score': auc
    }

# Load and preprocess data
def main():
    # Read training and test data
    train_data = pd.read_csv('Training Data/PC1_FS_TrainData.csv')
    test_data = pd.read_csv('Testing Data/PC1_FS_TestData.csv')
    
    # Separate features and target for both datasets
    X_train = train_data.drop('Defective', axis=1).values
    y_train = train_data['Defective'].values
    X_test = test_data.drop('Defective', axis=1).values
    y_test = test_data['Defective'].values
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Split training data for validation during feature selection
    train_size = int(0.8 * len(X_train_scaled))
    X_train_opt = X_train_scaled[:train_size]
    X_val_opt = X_train_scaled[train_size:]
    y_train_opt = y_train[:train_size]
    y_val_opt = y_train[train_size:]
    
    # Initialize and run HBDA
    hbda = HoneyBadgerOptimizer(n_badgers=20, max_iter=100)
    best_position = hbda.optimize(X_train_opt, X_val_opt, y_train_opt, y_val_opt)
    
    # Get selected features
    selected_features = hbda.binary_conversion(best_position)
    X_train_selected = X_train_scaled[:, selected_features == 1]
    X_test_selected = X_test_scaled[:, selected_features == 1]
    
    # Train final model using all training data
    nb = GaussianNB()
    nb.fit(X_train_selected, y_train)
    
    # Make predictions on test set
    y_pred = nb.predict(X_test_selected)
    
    # Evaluate model
    metrics = evaluate_model(y_test, y_pred)
    
    # Print results
    print("Model Performance Metrics on Test Data:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1-Score: {metrics['f1_score']:.4f}")
    print(f"AUC Score: {metrics['auc_score']:.4f}")
    print("\nConfusion Matrix:")
    print(metrics['confusion_matrix'])
    
    # Print selected features
    feature_names = train_data.drop('Defective', axis=1).columns
    selected_feature_names = feature_names[selected_features == 1]
    print("\nSelected Features:")
    for feature in selected_feature_names:
        print(f"- {feature}")

if __name__ == "__main__":
    main()

Model Performance Metrics on Test Data:
Accuracy: 0.9363
Precision: 0.7000
Recall: 0.4118
F1-Score: 0.5185
AUC Score: 0.6979

Confusion Matrix:
[[184   3]
 [ 10   7]]

Selected Features:
- LOC_TOTAL


In [7]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, roc_auc_score
import random

class HoneyBadgerOptimizer:
    def __init__(self, n_badgers=10, max_iter=50, lb=0, ub=1):
        self.n_badgers = n_badgers
        self.max_iter = max_iter
        self.lb = lb
        self.ub = ub
        
    def initialize_population(self, n_features):
        return np.random.uniform(self.lb, self.ub, (self.n_badgers, n_features))
    
    def binary_conversion(self, positions):
        return np.where(positions > 0.5, 1, 0)
    
    def fitness_function(self, X_train, X_val, y_train, y_val, position):
        binary_pos = self.binary_conversion(position)
        if np.sum(binary_pos) == 0:
            return 0
        
        selected_features_train = X_train[:, binary_pos == 1]
        selected_features_val = X_val[:, binary_pos == 1]
        
        # Train Naive Bayes
        nb = GaussianNB()
        nb.fit(selected_features_train, y_train)
        
        # Predict and calculate accuracy
        y_pred = nb.predict(selected_features_val)
        accuracy = accuracy_score(y_val, y_pred)
        
        # Penalty for using too many features
        feature_penalty = 0.001 * np.sum(binary_pos)
        
        return accuracy - feature_penalty
    
    def optimize(self, X_train, X_val, y_train, y_val):
        n_features = X_train.shape[1]
        population = self.initialize_population(n_features)
        best_position = None
        best_fitness = float('-inf')
        
        for iteration in range(self.max_iter):
            # Evaluate current population
            for i in range(self.n_badgers):
                current_fitness = self.fitness_function(X_train, X_val, y_train, y_val, population[i])
                
                if current_fitness > best_fitness:
                    best_fitness = current_fitness
                    best_position = population[i].copy()
            
            # Update positions
            for i in range(self.n_badgers):
                # Random walk
                r1 = random.random()
                r2 = random.random()
                
                if r1 > 0.5:  # Exploitation
                    population[i] = population[i] + r2 * (best_position - population[i])
                else:  # Exploration
                    population[i] = population[i] + r2 * (self.ub - self.lb) * \
                                  (np.random.random(n_features) - 0.5) * 2
                
                # Ensure bounds
                population[i] = np.clip(population[i], self.lb, self.ub)
        
        return best_position

def evaluate_model(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label='Y')
    rec = recall_score(y_true, y_pred, pos_label='Y')
    f1 = f1_score(y_true, y_pred, pos_label='Y')
    conf_matrix = confusion_matrix(y_true, y_pred)
    auc = roc_auc_score(y_true == 'Y', y_pred == 'Y')
    
    return {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1,
        'confusion_matrix': conf_matrix,
        'auc_score': auc
    }

# Load and preprocess data
def main():
    # Read training and test data
    train_data = pd.read_csv('Training Data/PC3_FS_TrainData.csv')
    test_data = pd.read_csv('Testing Data/PC3_FS_TestData.csv')
    
    # Separate features and target for both datasets
    X_train = train_data.drop('Defective', axis=1).values
    y_train = train_data['Defective'].values
    X_test = test_data.drop('Defective', axis=1).values
    y_test = test_data['Defective'].values
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Split training data for validation during feature selection
    train_size = int(0.8 * len(X_train_scaled))
    X_train_opt = X_train_scaled[:train_size]
    X_val_opt = X_train_scaled[train_size:]
    y_train_opt = y_train[:train_size]
    y_val_opt = y_train[train_size:]
    
    # Initialize and run HBDA
    hbda = HoneyBadgerOptimizer(n_badgers=20, max_iter=100)
    best_position = hbda.optimize(X_train_opt, X_val_opt, y_train_opt, y_val_opt)
    
    # Get selected features
    selected_features = hbda.binary_conversion(best_position)
    X_train_selected = X_train_scaled[:, selected_features == 1]
    X_test_selected = X_test_scaled[:, selected_features == 1]
    
    # Train final model using all training data
    nb = GaussianNB()
    nb.fit(X_train_selected, y_train)
    
    # Make predictions on test set
    y_pred = nb.predict(X_test_selected)
    
    # Evaluate model
    metrics = evaluate_model(y_test, y_pred)
    
    # Print results
    print("Model Performance Metrics on Test Data:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1-Score: {metrics['f1_score']:.4f}")
    print(f"AUC Score: {metrics['auc_score']:.4f}")
    print("\nConfusion Matrix:")
    print(metrics['confusion_matrix'])
    
    # Print selected features
    feature_names = train_data.drop('Defective', axis=1).columns
    selected_feature_names = feature_names[selected_features == 1]
    print("\nSelected Features:")
    for feature in selected_feature_names:
        print(f"- {feature}")

if __name__ == "__main__":
    main()

Model Performance Metrics on Test Data:
Accuracy: 0.8418
Precision: 0.3659
Recall: 0.3846
F1-Score: 0.3750
AUC Score: 0.6454

Confusion Matrix:
[[251  26]
 [ 24  15]]

Selected Features:
- HALSTEAD_CONTENT
- NORMALIZED_CYLOMATIC_COMPLEXITY
- PERCENT_COMMENTS


In [8]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, roc_auc_score
import random

class HoneyBadgerOptimizer:
    def __init__(self, n_badgers=10, max_iter=50, lb=0, ub=1):
        self.n_badgers = n_badgers
        self.max_iter = max_iter
        self.lb = lb
        self.ub = ub
        
    def initialize_population(self, n_features):
        return np.random.uniform(self.lb, self.ub, (self.n_badgers, n_features))
    
    def binary_conversion(self, positions):
        return np.where(positions > 0.5, 1, 0)
    
    def fitness_function(self, X_train, X_val, y_train, y_val, position):
        binary_pos = self.binary_conversion(position)
        if np.sum(binary_pos) == 0:
            return 0
        
        selected_features_train = X_train[:, binary_pos == 1]
        selected_features_val = X_val[:, binary_pos == 1]
        
        # Train Naive Bayes
        nb = GaussianNB()
        nb.fit(selected_features_train, y_train)
        
        # Predict and calculate accuracy
        y_pred = nb.predict(selected_features_val)
        accuracy = accuracy_score(y_val, y_pred)
        
        # Penalty for using too many features
        feature_penalty = 0.001 * np.sum(binary_pos)
        
        return accuracy - feature_penalty
    
    def optimize(self, X_train, X_val, y_train, y_val):
        n_features = X_train.shape[1]
        population = self.initialize_population(n_features)
        best_position = None
        best_fitness = float('-inf')
        
        for iteration in range(self.max_iter):
            # Evaluate current population
            for i in range(self.n_badgers):
                current_fitness = self.fitness_function(X_train, X_val, y_train, y_val, population[i])
                
                if current_fitness > best_fitness:
                    best_fitness = current_fitness
                    best_position = population[i].copy()
            
            # Update positions
            for i in range(self.n_badgers):
                # Random walk
                r1 = random.random()
                r2 = random.random()
                
                if r1 > 0.5:  # Exploitation
                    population[i] = population[i] + r2 * (best_position - population[i])
                else:  # Exploration
                    population[i] = population[i] + r2 * (self.ub - self.lb) * \
                                  (np.random.random(n_features) - 0.5) * 2
                
                # Ensure bounds
                population[i] = np.clip(population[i], self.lb, self.ub)
        
        return best_position

def evaluate_model(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label='Y')
    rec = recall_score(y_true, y_pred, pos_label='Y')
    f1 = f1_score(y_true, y_pred, pos_label='Y')
    conf_matrix = confusion_matrix(y_true, y_pred)
    auc = roc_auc_score(y_true == 'Y', y_pred == 'Y')
    
    return {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1,
        'confusion_matrix': conf_matrix,
        'auc_score': auc
    }

# Load and preprocess data
def main():
    # Read training and test data
    train_data = pd.read_csv('Training Data/PC4_FS_TrainData.csv')
    test_data = pd.read_csv('Testing Data/PC4_FS_TestData.csv')
    
    # Separate features and target for both datasets
    X_train = train_data.drop('Defective', axis=1).values
    y_train = train_data['Defective'].values
    X_test = test_data.drop('Defective', axis=1).values
    y_test = test_data['Defective'].values
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Split training data for validation during feature selection
    train_size = int(0.8 * len(X_train_scaled))
    X_train_opt = X_train_scaled[:train_size]
    X_val_opt = X_train_scaled[train_size:]
    y_train_opt = y_train[:train_size]
    y_val_opt = y_train[train_size:]
    
    # Initialize and run HBDA
    hbda = HoneyBadgerOptimizer(n_badgers=20, max_iter=100)
    best_position = hbda.optimize(X_train_opt, X_val_opt, y_train_opt, y_val_opt)
    
    # Get selected features
    selected_features = hbda.binary_conversion(best_position)
    X_train_selected = X_train_scaled[:, selected_features == 1]
    X_test_selected = X_test_scaled[:, selected_features == 1]
    
    # Train final model using all training data
    nb = GaussianNB()
    nb.fit(X_train_selected, y_train)
    
    # Make predictions on test set
    y_pred = nb.predict(X_test_selected)
    
    # Evaluate model
    metrics = evaluate_model(y_test, y_pred)
    
    # Print results
    print("Model Performance Metrics on Test Data:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1-Score: {metrics['f1_score']:.4f}")
    print(f"AUC Score: {metrics['auc_score']:.4f}")
    print("\nConfusion Matrix:")
    print(metrics['confusion_matrix'])
    
    # Print selected features
    feature_names = train_data.drop('Defective', axis=1).columns
    selected_feature_names = feature_names[selected_features == 1]
    print("\nSelected Features:")
    for feature in selected_feature_names:
        print(f"- {feature}")

if __name__ == "__main__":
    main()

Model Performance Metrics on Test Data:
Accuracy: 0.8976
Precision: 0.7917
Recall: 0.3585
F1-Score: 0.4935
AUC Score: 0.6716

Confusion Matrix:
[[323   5]
 [ 34  19]]

Selected Features:
- LOC_CODE_AND_COMMENT
- ESSENTIAL_COMPLEXITY
