In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

class HoneyBadgerFeatureSelection:
    """Population-based wrapper feature selector working on binary masks.

    Every individual is a 0/1 vector with one entry per feature column.
    ``fit`` repeatedly scores all individuals with ``fitness_function``,
    remembers the best mask seen so far and then moves every individual
    with ``honey_badger_movement``.

    Parameters
    ----------
    n_features_to_select : int
        Minimum number of features switched on in each *initial* mask
        (capped at the number of available columns). Later moves are not
        constrained, so the final mask may hold fewer or more features.
    max_iter : int
        Number of evaluate-then-move iterations performed by ``fit``.
    population_size : int
        Number of candidate masks in the population.

    After ``fit``, ``best_features`` holds the best mask found (``None`` if
    no mask could be scored) and ``best_fitness`` its fitness value.
    """

    def __init__(self, n_features_to_select=5, max_iter=50, population_size=20):
        self.n_features = n_features_to_select
        self.max_iter = max_iter
        self.population_size = population_size
        self.best_features = None
        self.best_fitness = float('-inf')

    def fitness_function(self, selected_features, X, y, classifier):
        """Score a mask by the classifier's accuracy on the masked columns.

        The classifier is fitted and evaluated on the same ``X``/``y``, i.e.
        the score is a resubstitution (training) accuracy.

        Returns ``-inf`` for an all-zero mask or when the classifier fails.
        """
        if np.sum(selected_features) == 0:
            return float('-inf')

        X_selected = X[:, selected_features == 1]
        try:
            classifier.fit(X_selected, y)
            y_pred = classifier.predict(X_selected)
            return accuracy_score(y, y_pred)
        except Exception:
            # Best effort: a mask the classifier cannot handle is simply the
            # worst candidate. ``Exception`` (not a bare ``except``) keeps
            # KeyboardInterrupt/SystemExit working during long searches.
            return float('-inf')

    def honey_badger_movement(self, current_position, best_position):
        """Return a new mask derived from ``current_position``.

        With probability 0.5 each bit is flipped with probability 0.3
        (exploration); otherwise each bit is copied from ``best_position``
        with probability 0.5 (exploitation). When no best position exists
        yet (``None``) the exploration move is always used, so that ``None``
        never leaks into a mask.
        """
        n_bits = len(current_position)
        explore = np.random.random() < 0.5
        if explore or best_position is None:
            flip = np.random.random(n_bits) < 0.3
            return np.where(flip, 1 - current_position, current_position)
        follow_best = np.random.random(n_bits) < 0.5
        return np.where(follow_best, best_position, current_position)

    def _init_population(self, n_total_features):
        """Create random masks that each switch on at least ``n_features`` bits."""
        population = np.random.randint(2, size=(self.population_size, n_total_features))
        # Cannot switch on more features than there are columns.
        min_on = min(self.n_features, n_total_features)
        for mask in population:  # rows are views, so edits update ``population``
            shortfall = min_on - int(mask.sum())
            if shortfall > 0:
                # Draw only from bits that are still off; sampling from all
                # columns could hit bits already set and leave the mask short.
                off_bits = np.flatnonzero(mask == 0)
                mask[np.random.choice(off_bits, shortfall, replace=False)] = 1
        return population

    def fit(self, X, y, classifier):
        """Search for the best feature mask and store it on the instance.

        ``X`` must support ``X.shape[1]`` and boolean column indexing
        (``X[:, mask]``); ``classifier`` must provide ``fit`` and ``predict``.
        Returns ``self``.
        """
        n_total_features = X.shape[1]
        population = self._init_population(n_total_features)

        global_best_position = None
        global_best_fitness = float('-inf')

        for _ in range(self.max_iter):
            fitness_values = np.array([
                self.fitness_function(position, X, y, classifier)
                for position in population
            ])

            current_best_idx = np.argmax(fitness_values)
            # Strict '>' keeps the earliest mask among equally good ones.
            if fitness_values[current_best_idx] > global_best_fitness:
                global_best_fitness = fitness_values[current_best_idx]
                global_best_position = population[current_best_idx].copy()

            population = np.array([
                self.honey_badger_movement(position, global_best_position)
                for position in population
            ])

        self.best_features = global_best_position
        self.best_fitness = global_best_fitness
        return self

def load_and_preprocess_data(train_path='Training Data/CM1_FS_TrainData.csv',
                             test_path='Testing Data/CM1_FS_TestData.csv',
                             target_column='Defective'):
    """Load the train/test CSV files and split them into features and labels.

    Parameters
    ----------
    train_path, test_path : str
        Locations of the training and testing CSV files. The defaults are
        the CM1 files this cell was written for.
    target_column : str
        Name of the label column present in both files.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        All columns except ``target_column``.
    y_train, y_test
        Labels encoded by a ``LabelEncoder`` fitted on the training labels
        only; the test labels are transformed with that same encoder.
    """
    # Read training and testing data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate features and target for both datasets
    X_train = train_data.drop(target_column, axis=1)
    y_train = train_data[target_column]
    X_test = test_data.drop(target_column, axis=1)
    y_test = test_data[target_column]

    # Convert target to numeric (encoder is fitted on the training labels)
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    return X_train, X_test, y_train, y_test

def train_and_evaluate_defect_predictor(X_train, X_test, y_train, y_test, n_features=5):
    """Select features, train an RBF SVM on them and score it on the test set.

    Parameters
    ----------
    X_train, X_test
        Feature tables; ``X_train`` must expose ``.columns`` because the
        selected column names are looked up there.
    y_train, y_test
        Class labels for the two tables.
    n_features : int
        Forwarded to ``HoneyBadgerFeatureSelection`` as
        ``n_features_to_select``.

    Returns
    -------
    dict
        Keys: ``model``, ``selected_features``, ``accuracy``, ``precision``,
        ``recall``, ``f_measure``, ``auc_score``, ``confusion_matrix``,
        ``scaler`` and ``selected_indices``.

    Raises
    ------
    ValueError
        If the feature search ends without any selected feature.
    """
    # Scale features (statistics come from the training data only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # SVM used inside the search. The selector only calls fit/predict, so
    # probability estimates are left off: enabling them makes every one of
    # the many fits run an extra internal cross-validation for nothing.
    svm = SVC(kernel='rbf', random_state=42)

    # Feature selection using training data
    print("Starting feature selection...")
    hb = HoneyBadgerFeatureSelection(n_features_to_select=n_features)
    hb.fit(X_train_scaled, y_train, svm)

    # Get selected features (also yields an empty array if best_features is None)
    selected_features = np.where(hb.best_features == 1)[0]
    if selected_features.size == 0:
        raise ValueError("Feature selection did not select any feature")
    selected_feature_names = X_train.columns[selected_features].tolist()

    # Train final model with selected features
    X_train_selected = X_train_scaled[:, selected_features]
    X_test_selected = X_test_scaled[:, selected_features]

    # probability=True is needed here because predict_proba feeds the AUC.
    final_svm = SVC(kernel='rbf', random_state=42, probability=True)
    final_svm.fit(X_train_selected, y_train)

    # Make predictions on test set
    y_pred = final_svm.predict(X_test_selected)
    y_pred_proba = final_svm.predict_proba(X_test_selected)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f_measure = f1_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred_proba)
    conf_matrix = confusion_matrix(y_test, y_pred)

    return {
        'model': final_svm,
        'selected_features': selected_feature_names,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f_measure': f_measure,
        'auc_score': auc_score,
        'confusion_matrix': conf_matrix,
        'scaler': scaler,
        'selected_indices': selected_features
    }

if __name__ == "__main__":
    # Fetch the train/test split used by this experiment.
    X_train, X_test, y_train, y_test = load_and_preprocess_data()

    # Run feature selection, fit the final model and collect its scores.
    results = train_and_evaluate_defect_predictor(
        X_train, X_test, y_train, y_test, n_features=5
    )

    # Numbered list of the chosen feature names.
    print("\nSelected Features:")
    chosen_names = results['selected_features']
    for rank, name in enumerate(chosen_names, start=1):
        print(f"{rank}. {name}")

    # Scalar scores, each shown with four decimals.
    print("\nModel Performance Metrics:")
    for label, key in (
        ("Accuracy", 'accuracy'),
        ("Precision", 'precision'),
        ("Recall", 'recall'),
        ("F-measure", 'f_measure'),
        ("AUC Score", 'auc_score'),
    ):
        print(f"{label}: {results[key]:.4f}")

    # Two legend rows, then the matrix itself.
    print("\nConfusion Matrix:")
    print("[True Negatives  False Positives]")
    print("[False Negatives True Positives]")
    print(results['confusion_matrix'])

    # Pair every selected name with its column position.
    print("\nFeature Importance Analysis:")
    for name, column_idx in zip(chosen_names, results['selected_indices']):
        print(f"{name}: Original index {column_idx}")

Starting feature selection...

Selected Features:
1. HALSTEAD_CONTENT
2. NUMBER_OF_LINES

Model Performance Metrics:
Accuracy: 0.8485
Precision: 0.0000
Recall: 0.0000
F-measure: 0.0000
AUC Score: 0.3855

Confusion Matrix:
[True Negatives  False Positives]
[False Negatives True Positives]
[[84  2]
 [13  0]]

Feature Importance Analysis:
HALSTEAD_CONTENT: Original index 2
NUMBER_OF_LINES: Original index 6


In [10]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

class HoneyBadgerFeatureSelection:
    """Population-based wrapper feature selector working on binary masks.

    Every individual is a 0/1 vector with one entry per feature column.
    ``fit`` repeatedly scores all individuals with ``fitness_function``,
    remembers the best mask seen so far and then moves every individual
    with ``honey_badger_movement``. The number of selected features is not
    fixed; the fitness carries a small penalty per selected feature.

    Parameters
    ----------
    max_iter : int
        Number of evaluate-then-move iterations performed by ``fit``.
    population_size : int
        Number of candidate masks in the population.

    After ``fit``, ``best_features`` holds the best mask found (``None`` if
    no mask could be scored) and ``best_fitness`` its fitness value.
    """

    def __init__(self, max_iter=50, population_size=20):
        self.max_iter = max_iter
        self.population_size = population_size
        self.best_features = None
        self.best_fitness = float('-inf')

    def fitness_function(self, selected_features, X, y, classifier):
        """Score a mask: training accuracy minus a feature-count penalty.

        The classifier is fitted and evaluated on the same ``X``/``y``. The
        penalty is ``0.01 * (selected bits / total bits)``.

        Returns ``-inf`` for an all-zero mask or when the classifier fails.
        """
        if np.sum(selected_features) == 0:
            return float('-inf')

        X_selected = X[:, selected_features == 1]
        try:
            classifier.fit(X_selected, y)
            y_pred = classifier.predict(X_selected)
            # Modified fitness function to balance accuracy and feature count
            accuracy = accuracy_score(y, y_pred)
            feature_penalty = 0.01 * np.sum(selected_features) / len(selected_features)
            return accuracy - feature_penalty
        except Exception:
            # Best effort: a mask the classifier cannot handle is simply the
            # worst candidate. ``Exception`` (not a bare ``except``) keeps
            # KeyboardInterrupt/SystemExit working during long searches.
            return float('-inf')

    def honey_badger_movement(self, current_position, best_position):
        """Return a new mask derived from ``current_position``.

        With probability 0.5 each bit is flipped with probability 0.3
        (exploration); otherwise each bit is copied from ``best_position``
        with probability 0.5 (exploitation). When no best position exists
        yet (``None``) the exploration move is always used, so that ``None``
        never leaks into a mask.
        """
        n_bits = len(current_position)
        explore = np.random.random() < 0.5
        if explore or best_position is None:
            # Exploration phase: flip bits with 30% probability
            flip = np.random.random(n_bits) < 0.3
            return np.where(flip, 1 - current_position, current_position)
        # Exploitation phase: follow best position with 50% probability
        follow_best = np.random.random(n_bits) < 0.5
        return np.where(follow_best, best_position, current_position)

    def fit(self, X, y, classifier):
        """Search for the best feature mask and store it on the instance.

        ``X`` must support ``X.shape[1]`` and boolean column indexing
        (``X[:, mask]``); ``classifier`` must provide ``fit`` and ``predict``.
        Returns ``self``.
        """
        n_total_features = X.shape[1]
        # Initialize population with random binary vectors
        population = np.random.randint(2, size=(self.population_size, n_total_features))

        global_best_position = None
        global_best_fitness = float('-inf')

        for _ in range(self.max_iter):
            fitness_values = np.array([
                self.fitness_function(position, X, y, classifier)
                for position in population
            ])

            current_best_idx = np.argmax(fitness_values)
            # Strict '>' keeps the earliest mask among equally good ones.
            if fitness_values[current_best_idx] > global_best_fitness:
                global_best_fitness = fitness_values[current_best_idx]
                global_best_position = population[current_best_idx].copy()

            population = np.array([
                self.honey_badger_movement(position, global_best_position)
                for position in population
            ])

        self.best_features = global_best_position
        self.best_fitness = global_best_fitness
        return self

def load_and_preprocess_data(train_path='Training Data/MC2_FS_TrainData.csv',
                             test_path='Testing Data/MC2_FS_TestData.csv',
                             target_column='Defective'):
    """Load the train/test CSV files and split them into features and labels.

    Parameters
    ----------
    train_path, test_path : str
        Locations of the training and testing CSV files. The defaults are
        the MC2 files this cell was written for.
    target_column : str
        Name of the label column present in both files.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        All columns except ``target_column``.
    y_train, y_test
        Labels encoded by a ``LabelEncoder`` fitted on the training labels
        only; the test labels are transformed with that same encoder.
    """
    # Read training and testing data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate features and target for both datasets
    X_train = train_data.drop(target_column, axis=1)
    y_train = train_data[target_column]
    X_test = test_data.drop(target_column, axis=1)
    y_test = test_data[target_column]

    # Convert target to numeric (encoder is fitted on the training labels)
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    return X_train, X_test, y_train, y_test

def train_and_evaluate_defect_predictor(X_train, X_test, y_train, y_test):
    """Select features, train an RBF SVM on them and score it on the test set.

    Parameters
    ----------
    X_train, X_test
        Feature tables; ``X_train`` must expose ``.columns`` because the
        selected column names are looked up there.
    y_train, y_test
        Class labels for the two tables.

    Returns
    -------
    dict
        Keys: ``model``, ``selected_features``, ``accuracy``, ``precision``,
        ``recall``, ``f_measure``, ``auc_score``, ``confusion_matrix``,
        ``scaler`` and ``selected_indices``.

    Raises
    ------
    ValueError
        If the feature search ends without any selected feature.
    """
    # Scale features (statistics come from the training data only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # SVM used inside the search. The selector only calls fit/predict, so
    # probability estimates are left off: enabling them makes every one of
    # the many fits run an extra internal cross-validation for nothing.
    svm = SVC(kernel='rbf', random_state=42)

    # Feature selection using training data
    print("Starting feature selection...")
    hb = HoneyBadgerFeatureSelection()
    hb.fit(X_train_scaled, y_train, svm)

    # Get selected features (also yields an empty array if best_features is None)
    selected_features = np.where(hb.best_features == 1)[0]
    if selected_features.size == 0:
        raise ValueError("Feature selection did not select any feature")
    selected_feature_names = X_train.columns[selected_features].tolist()

    # Train final model with selected features
    X_train_selected = X_train_scaled[:, selected_features]
    X_test_selected = X_test_scaled[:, selected_features]

    # probability=True is needed here because predict_proba feeds the AUC.
    final_svm = SVC(kernel='rbf', random_state=42, probability=True)
    final_svm.fit(X_train_selected, y_train)

    # Make predictions on test set
    y_pred = final_svm.predict(X_test_selected)
    y_pred_proba = final_svm.predict_proba(X_test_selected)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f_measure = f1_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred_proba)
    conf_matrix = confusion_matrix(y_test, y_pred)

    return {
        'model': final_svm,
        'selected_features': selected_feature_names,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f_measure': f_measure,
        'auc_score': auc_score,
        'confusion_matrix': conf_matrix,
        'scaler': scaler,
        'selected_indices': selected_features
    }

if __name__ == "__main__":
    # Fetch the train/test split used by this experiment.
    X_train, X_test, y_train, y_test = load_and_preprocess_data()

    # Run feature selection, fit the final model and collect its scores.
    results = train_and_evaluate_defect_predictor(
        X_train, X_test, y_train, y_test
    )

    # How many features were kept, then a numbered list of their names.
    print("\nSelected Features:")
    chosen_names = results['selected_features']
    print(f"Number of features selected: {len(chosen_names)}")
    for rank, name in enumerate(chosen_names, start=1):
        print(f"{rank}. {name}")

    # Scalar scores, each shown with four decimals.
    print("\nModel Performance Metrics:")
    for label, key in (
        ("Accuracy", 'accuracy'),
        ("Precision", 'precision'),
        ("Recall", 'recall'),
        ("F-measure", 'f_measure'),
        ("AUC Score", 'auc_score'),
    ):
        print(f"{label}: {results[key]:.4f}")

    # Two legend rows, then the matrix itself.
    print("\nConfusion Matrix:")
    print("[True Negatives  False Positives]")
    print("[False Negatives True Positives]")
    print(results['confusion_matrix'])

    # Pair every selected name with its column position.
    print("\nFeature Importance Analysis:")
    for name, column_idx in zip(chosen_names, results['selected_indices']):
        print(f"{name}: Original index {column_idx}")

Starting feature selection...

Selected Features:
Number of features selected: 4
1. LOC_COMMENTS
2. DESIGN_COMPLEXITY
3. HALSTEAD_EFFORT
4. MULTIPLE_CONDITION_COUNT

Model Performance Metrics:
Accuracy: 0.6053
Precision: 0.4000
Recall: 0.3077
F-measure: 0.3478
AUC Score: 0.6185

Confusion Matrix:
[True Negatives  False Positives]
[False Negatives True Positives]
[[19  6]
 [ 9  4]]

Feature Importance Analysis:
LOC_COMMENTS: Original index 1
DESIGN_COMPLEXITY: Original index 2
HALSTEAD_EFFORT: Original index 6
MULTIPLE_CONDITION_COUNT: Original index 7


In [3]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

class HoneyBadgerFeatureSelection:
    """Population-based wrapper feature selector working on binary masks.

    Every individual is a 0/1 vector with one entry per feature column.
    ``fit`` repeatedly scores all individuals with ``fitness_function``,
    remembers the best mask seen so far and then moves every individual
    with ``honey_badger_movement``.

    Parameters
    ----------
    n_features_to_select : int
        Minimum number of features switched on in each *initial* mask
        (capped at the number of available columns). Later moves are not
        constrained, so the final mask may hold fewer or more features.
    max_iter : int
        Number of evaluate-then-move iterations performed by ``fit``.
    population_size : int
        Number of candidate masks in the population.

    After ``fit``, ``best_features`` holds the best mask found (``None`` if
    no mask could be scored) and ``best_fitness`` its fitness value.
    """

    def __init__(self, n_features_to_select=5, max_iter=50, population_size=20):
        self.n_features = n_features_to_select
        self.max_iter = max_iter
        self.population_size = population_size
        self.best_features = None
        self.best_fitness = float('-inf')

    def fitness_function(self, selected_features, X, y, classifier):
        """Score a mask by the classifier's accuracy on the masked columns.

        The classifier is fitted and evaluated on the same ``X``/``y``, i.e.
        the score is a resubstitution (training) accuracy.

        Returns ``-inf`` for an all-zero mask or when the classifier fails.
        """
        if np.sum(selected_features) == 0:
            return float('-inf')

        X_selected = X[:, selected_features == 1]
        try:
            classifier.fit(X_selected, y)
            y_pred = classifier.predict(X_selected)
            return accuracy_score(y, y_pred)
        except Exception:
            # Best effort: a mask the classifier cannot handle is simply the
            # worst candidate. ``Exception`` (not a bare ``except``) keeps
            # KeyboardInterrupt/SystemExit working during long searches.
            return float('-inf')

    def honey_badger_movement(self, current_position, best_position):
        """Return a new mask derived from ``current_position``.

        With probability 0.5 each bit is flipped with probability 0.3
        (exploration); otherwise each bit is copied from ``best_position``
        with probability 0.5 (exploitation). When no best position exists
        yet (``None``) the exploration move is always used, so that ``None``
        never leaks into a mask.
        """
        n_bits = len(current_position)
        explore = np.random.random() < 0.5
        if explore or best_position is None:
            flip = np.random.random(n_bits) < 0.3
            return np.where(flip, 1 - current_position, current_position)
        follow_best = np.random.random(n_bits) < 0.5
        return np.where(follow_best, best_position, current_position)

    def _init_population(self, n_total_features):
        """Create random masks that each switch on at least ``n_features`` bits."""
        population = np.random.randint(2, size=(self.population_size, n_total_features))
        # Cannot switch on more features than there are columns.
        min_on = min(self.n_features, n_total_features)
        for mask in population:  # rows are views, so edits update ``population``
            shortfall = min_on - int(mask.sum())
            if shortfall > 0:
                # Draw only from bits that are still off; sampling from all
                # columns could hit bits already set and leave the mask short.
                off_bits = np.flatnonzero(mask == 0)
                mask[np.random.choice(off_bits, shortfall, replace=False)] = 1
        return population

    def fit(self, X, y, classifier):
        """Search for the best feature mask and store it on the instance.

        ``X`` must support ``X.shape[1]`` and boolean column indexing
        (``X[:, mask]``); ``classifier`` must provide ``fit`` and ``predict``.
        Returns ``self``.
        """
        n_total_features = X.shape[1]
        population = self._init_population(n_total_features)

        global_best_position = None
        global_best_fitness = float('-inf')

        for _ in range(self.max_iter):
            fitness_values = np.array([
                self.fitness_function(position, X, y, classifier)
                for position in population
            ])

            current_best_idx = np.argmax(fitness_values)
            # Strict '>' keeps the earliest mask among equally good ones.
            if fitness_values[current_best_idx] > global_best_fitness:
                global_best_fitness = fitness_values[current_best_idx]
                global_best_position = population[current_best_idx].copy()

            population = np.array([
                self.honey_badger_movement(position, global_best_position)
                for position in population
            ])

        self.best_features = global_best_position
        self.best_fitness = global_best_fitness
        return self

def load_and_preprocess_data(train_path='Training Data/MW1_FS_TrainData.csv',
                             test_path='Testing Data/MW1_FS_TestData.csv',
                             target_column='Defective'):
    """Load the train/test CSV files and split them into features and labels.

    Parameters
    ----------
    train_path, test_path : str
        Locations of the training and testing CSV files. The defaults are
        the MW1 files this cell was written for.
    target_column : str
        Name of the label column present in both files.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        All columns except ``target_column``.
    y_train, y_test
        Labels encoded by a ``LabelEncoder`` fitted on the training labels
        only; the test labels are transformed with that same encoder.
    """
    # Read training and testing data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate features and target for both datasets
    X_train = train_data.drop(target_column, axis=1)
    y_train = train_data[target_column]
    X_test = test_data.drop(target_column, axis=1)
    y_test = test_data[target_column]

    # Convert target to numeric (encoder is fitted on the training labels)
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    return X_train, X_test, y_train, y_test

def train_and_evaluate_defect_predictor(X_train, X_test, y_train, y_test, n_features=5):
    """Select features, train an RBF SVM on them and score it on the test set.

    Parameters
    ----------
    X_train, X_test
        Feature tables; ``X_train`` must expose ``.columns`` because the
        selected column names are looked up there.
    y_train, y_test
        Class labels for the two tables.
    n_features : int
        Forwarded to ``HoneyBadgerFeatureSelection`` as
        ``n_features_to_select``.

    Returns
    -------
    dict
        Keys: ``model``, ``selected_features``, ``accuracy``, ``precision``,
        ``recall``, ``f_measure``, ``auc_score``, ``confusion_matrix``,
        ``scaler`` and ``selected_indices``.

    Raises
    ------
    ValueError
        If the feature search ends without any selected feature.
    """
    # Scale features (statistics come from the training data only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # SVM used inside the search. The selector only calls fit/predict, so
    # probability estimates are left off: enabling them makes every one of
    # the many fits run an extra internal cross-validation for nothing.
    svm = SVC(kernel='rbf', random_state=42)

    # Feature selection using training data
    print("Starting feature selection...")
    hb = HoneyBadgerFeatureSelection(n_features_to_select=n_features)
    hb.fit(X_train_scaled, y_train, svm)

    # Get selected features (also yields an empty array if best_features is None)
    selected_features = np.where(hb.best_features == 1)[0]
    if selected_features.size == 0:
        raise ValueError("Feature selection did not select any feature")
    selected_feature_names = X_train.columns[selected_features].tolist()

    # Train final model with selected features
    X_train_selected = X_train_scaled[:, selected_features]
    X_test_selected = X_test_scaled[:, selected_features]

    # probability=True is needed here because predict_proba feeds the AUC.
    final_svm = SVC(kernel='rbf', random_state=42, probability=True)
    final_svm.fit(X_train_selected, y_train)

    # Make predictions on test set
    y_pred = final_svm.predict(X_test_selected)
    y_pred_proba = final_svm.predict_proba(X_test_selected)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f_measure = f1_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred_proba)
    conf_matrix = confusion_matrix(y_test, y_pred)

    return {
        'model': final_svm,
        'selected_features': selected_feature_names,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f_measure': f_measure,
        'auc_score': auc_score,
        'confusion_matrix': conf_matrix,
        'scaler': scaler,
        'selected_indices': selected_features
    }

if __name__ == "__main__":
    # Fetch the train/test split used by this experiment.
    X_train, X_test, y_train, y_test = load_and_preprocess_data()

    # Run feature selection, fit the final model and collect its scores.
    results = train_and_evaluate_defect_predictor(
        X_train, X_test, y_train, y_test, n_features=5
    )

    # Numbered list of the chosen feature names.
    print("\nSelected Features:")
    chosen_names = results['selected_features']
    for rank, name in enumerate(chosen_names, start=1):
        print(f"{rank}. {name}")

    # Scalar scores, each shown with four decimals.
    print("\nModel Performance Metrics:")
    for label, key in (
        ("Accuracy", 'accuracy'),
        ("Precision", 'precision'),
        ("Recall", 'recall'),
        ("F-measure", 'f_measure'),
        ("AUC Score", 'auc_score'),
    ):
        print(f"{label}: {results[key]:.4f}")

    # Two legend rows, then the matrix itself.
    print("\nConfusion Matrix:")
    print("[True Negatives  False Positives]")
    print("[False Negatives True Positives]")
    print(results['confusion_matrix'])

    # Pair every selected name with its column position.
    print("\nFeature Importance Analysis:")
    for name, column_idx in zip(chosen_names, results['selected_indices']):
        print(f"{name}: Original index {column_idx}")

Starting feature selection...

Selected Features:
1. CALL_PAIRS
2. EDGE_COUNT
3. HALSTEAD_LENGTH
4. NODE_COUNT
5. LOC_TOTAL

Model Performance Metrics:
Accuracy: 0.9067
Precision: 1.0000
Recall: 0.1250
F-measure: 0.2222
AUC Score: 0.7631

Confusion Matrix:
[True Negatives  False Positives]
[False Negatives True Positives]
[[67  0]
 [ 7  1]]

Feature Importance Analysis:
CALL_PAIRS: Original index 0
EDGE_COUNT: Original index 2
HALSTEAD_LENGTH: Original index 4
NODE_COUNT: Original index 5
LOC_TOTAL: Original index 7


In [4]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

class HoneyBadgerFeatureSelection:
    """Population-based wrapper feature selector working on binary masks.

    Every individual is a 0/1 vector with one entry per feature column.
    ``fit`` repeatedly scores all individuals with ``fitness_function``,
    remembers the best mask seen so far and then moves every individual
    with ``honey_badger_movement``.

    Parameters
    ----------
    n_features_to_select : int
        Minimum number of features switched on in each *initial* mask
        (capped at the number of available columns). Later moves are not
        constrained, so the final mask may hold fewer or more features.
    max_iter : int
        Number of evaluate-then-move iterations performed by ``fit``.
    population_size : int
        Number of candidate masks in the population.

    After ``fit``, ``best_features`` holds the best mask found (``None`` if
    no mask could be scored) and ``best_fitness`` its fitness value.
    """

    def __init__(self, n_features_to_select=5, max_iter=50, population_size=20):
        self.n_features = n_features_to_select
        self.max_iter = max_iter
        self.population_size = population_size
        self.best_features = None
        self.best_fitness = float('-inf')

    def fitness_function(self, selected_features, X, y, classifier):
        """Score a mask by the classifier's accuracy on the masked columns.

        The classifier is fitted and evaluated on the same ``X``/``y``, i.e.
        the score is a resubstitution (training) accuracy.

        Returns ``-inf`` for an all-zero mask or when the classifier fails.
        """
        if np.sum(selected_features) == 0:
            return float('-inf')

        X_selected = X[:, selected_features == 1]
        try:
            classifier.fit(X_selected, y)
            y_pred = classifier.predict(X_selected)
            return accuracy_score(y, y_pred)
        except Exception:
            # Best effort: a mask the classifier cannot handle is simply the
            # worst candidate. ``Exception`` (not a bare ``except``) keeps
            # KeyboardInterrupt/SystemExit working during long searches.
            return float('-inf')

    def honey_badger_movement(self, current_position, best_position):
        """Return a new mask derived from ``current_position``.

        With probability 0.5 each bit is flipped with probability 0.3
        (exploration); otherwise each bit is copied from ``best_position``
        with probability 0.5 (exploitation). When no best position exists
        yet (``None``) the exploration move is always used, so that ``None``
        never leaks into a mask.
        """
        n_bits = len(current_position)
        explore = np.random.random() < 0.5
        if explore or best_position is None:
            flip = np.random.random(n_bits) < 0.3
            return np.where(flip, 1 - current_position, current_position)
        follow_best = np.random.random(n_bits) < 0.5
        return np.where(follow_best, best_position, current_position)

    def _init_population(self, n_total_features):
        """Create random masks that each switch on at least ``n_features`` bits."""
        population = np.random.randint(2, size=(self.population_size, n_total_features))
        # Cannot switch on more features than there are columns.
        min_on = min(self.n_features, n_total_features)
        for mask in population:  # rows are views, so edits update ``population``
            shortfall = min_on - int(mask.sum())
            if shortfall > 0:
                # Draw only from bits that are still off; sampling from all
                # columns could hit bits already set and leave the mask short.
                off_bits = np.flatnonzero(mask == 0)
                mask[np.random.choice(off_bits, shortfall, replace=False)] = 1
        return population

    def fit(self, X, y, classifier):
        """Search for the best feature mask and store it on the instance.

        ``X`` must support ``X.shape[1]`` and boolean column indexing
        (``X[:, mask]``); ``classifier`` must provide ``fit`` and ``predict``.
        Returns ``self``.
        """
        n_total_features = X.shape[1]
        population = self._init_population(n_total_features)

        global_best_position = None
        global_best_fitness = float('-inf')

        for _ in range(self.max_iter):
            fitness_values = np.array([
                self.fitness_function(position, X, y, classifier)
                for position in population
            ])

            current_best_idx = np.argmax(fitness_values)
            # Strict '>' keeps the earliest mask among equally good ones.
            if fitness_values[current_best_idx] > global_best_fitness:
                global_best_fitness = fitness_values[current_best_idx]
                global_best_position = population[current_best_idx].copy()

            population = np.array([
                self.honey_badger_movement(position, global_best_position)
                for position in population
            ])

        self.best_features = global_best_position
        self.best_fitness = global_best_fitness
        return self

def load_and_preprocess_data(train_path='Training Data/PC1_FS_TrainData.csv',
                             test_path='Testing Data/PC1_FS_TestData.csv',
                             target_column='Defective'):
    """Load the train/test CSV files and split them into features and labels.

    Parameters
    ----------
    train_path, test_path : str
        Locations of the training and testing CSV files. The defaults are
        the PC1 files this cell was written for.
    target_column : str
        Name of the label column present in both files.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        All columns except ``target_column``.
    y_train, y_test
        Labels encoded by a ``LabelEncoder`` fitted on the training labels
        only; the test labels are transformed with that same encoder.
    """
    # Read training and testing data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate features and target for both datasets
    X_train = train_data.drop(target_column, axis=1)
    y_train = train_data[target_column]
    X_test = test_data.drop(target_column, axis=1)
    y_test = test_data[target_column]

    # Convert target to numeric (encoder is fitted on the training labels)
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    return X_train, X_test, y_train, y_test

def train_and_evaluate_defect_predictor(X_train, X_test, y_train, y_test, n_features=5):
    """Select features, train an RBF SVM on them and score it on the test set.

    Parameters
    ----------
    X_train, X_test
        Feature tables; ``X_train`` must expose ``.columns`` because the
        selected column names are looked up there.
    y_train, y_test
        Class labels for the two tables.
    n_features : int
        Forwarded to ``HoneyBadgerFeatureSelection`` as
        ``n_features_to_select``.

    Returns
    -------
    dict
        Keys: ``model``, ``selected_features``, ``accuracy``, ``precision``,
        ``recall``, ``f_measure``, ``auc_score``, ``confusion_matrix``,
        ``scaler`` and ``selected_indices``.

    Raises
    ------
    ValueError
        If the feature search ends without any selected feature.
    """
    # Scale features (statistics come from the training data only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # SVM used inside the search. The selector only calls fit/predict, so
    # probability estimates are left off: enabling them makes every one of
    # the many fits run an extra internal cross-validation for nothing.
    svm = SVC(kernel='rbf', random_state=42)

    # Feature selection using training data
    print("Starting feature selection...")
    hb = HoneyBadgerFeatureSelection(n_features_to_select=n_features)
    hb.fit(X_train_scaled, y_train, svm)

    # Get selected features (also yields an empty array if best_features is None)
    selected_features = np.where(hb.best_features == 1)[0]
    if selected_features.size == 0:
        raise ValueError("Feature selection did not select any feature")
    selected_feature_names = X_train.columns[selected_features].tolist()

    # Train final model with selected features
    X_train_selected = X_train_scaled[:, selected_features]
    X_test_selected = X_test_scaled[:, selected_features]

    # probability=True is needed here because predict_proba feeds the AUC.
    final_svm = SVC(kernel='rbf', random_state=42, probability=True)
    final_svm.fit(X_train_selected, y_train)

    # Make predictions on test set
    y_pred = final_svm.predict(X_test_selected)
    y_pred_proba = final_svm.predict_proba(X_test_selected)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f_measure = f1_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred_proba)
    conf_matrix = confusion_matrix(y_test, y_pred)

    return {
        'model': final_svm,
        'selected_features': selected_feature_names,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f_measure': f_measure,
        'auc_score': auc_score,
        'confusion_matrix': conf_matrix,
        'scaler': scaler,
        'selected_indices': selected_features
    }

if __name__ == "__main__":
    # Fetch the train/test split used by this experiment.
    X_train, X_test, y_train, y_test = load_and_preprocess_data()

    # Run feature selection, fit the final model and collect its scores.
    results = train_and_evaluate_defect_predictor(
        X_train, X_test, y_train, y_test, n_features=5
    )

    # Numbered list of the chosen feature names.
    print("\nSelected Features:")
    chosen_names = results['selected_features']
    for rank, name in enumerate(chosen_names, start=1):
        print(f"{rank}. {name}")

    # Scalar scores, each shown with four decimals.
    print("\nModel Performance Metrics:")
    for label, key in (
        ("Accuracy", 'accuracy'),
        ("Precision", 'precision'),
        ("Recall", 'recall'),
        ("F-measure", 'f_measure'),
        ("AUC Score", 'auc_score'),
    ):
        print(f"{label}: {results[key]:.4f}")

    # Two legend rows, then the matrix itself.
    print("\nConfusion Matrix:")
    print("[True Negatives  False Positives]")
    print("[False Negatives True Positives]")
    print(results['confusion_matrix'])

    # Pair every selected name with its column position.
    print("\nFeature Importance Analysis:")
    for name, column_idx in zip(chosen_names, results['selected_indices']):
        print(f"{name}: Original index {column_idx}")

Starting feature selection...

Selected Features:
1. LOC_CODE_AND_COMMENT
2. NUMBER_OF_LINES
3. LOC_TOTAL

Model Performance Metrics:
Accuracy: 0.9314
Precision: 1.0000
Recall: 0.1765
F-measure: 0.3000
AUC Score: 0.7408

Confusion Matrix:
[True Negatives  False Positives]
[False Negatives True Positives]
[[187   0]
 [ 14   3]]

Feature Importance Analysis:
LOC_CODE_AND_COMMENT: Original index 1
NUMBER_OF_LINES: Original index 5
LOC_TOTAL: Original index 6


In [5]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

class HoneyBadgerFeatureSelection:
    """Population-based wrapper feature selector working on binary masks.

    Every individual is a 0/1 vector with one entry per feature column.
    ``fit`` repeatedly scores all individuals with ``fitness_function``,
    remembers the best mask seen so far and then moves every individual
    with ``honey_badger_movement``.

    Parameters
    ----------
    n_features_to_select : int
        Minimum number of features switched on in each *initial* mask
        (capped at the number of available columns). Later moves are not
        constrained, so the final mask may hold fewer or more features.
    max_iter : int
        Number of evaluate-then-move iterations performed by ``fit``.
    population_size : int
        Number of candidate masks in the population.

    After ``fit``, ``best_features`` holds the best mask found (``None`` if
    no mask could be scored) and ``best_fitness`` its fitness value.
    """

    def __init__(self, n_features_to_select=5, max_iter=50, population_size=20):
        self.n_features = n_features_to_select
        self.max_iter = max_iter
        self.population_size = population_size
        self.best_features = None
        self.best_fitness = float('-inf')

    def fitness_function(self, selected_features, X, y, classifier):
        """Score a mask by the classifier's accuracy on the masked columns.

        The classifier is fitted and evaluated on the same ``X``/``y``, i.e.
        the score is a resubstitution (training) accuracy.

        Returns ``-inf`` for an all-zero mask or when the classifier fails.
        """
        if np.sum(selected_features) == 0:
            return float('-inf')

        X_selected = X[:, selected_features == 1]
        try:
            classifier.fit(X_selected, y)
            y_pred = classifier.predict(X_selected)
            return accuracy_score(y, y_pred)
        except Exception:
            # Best effort: a mask the classifier cannot handle is simply the
            # worst candidate. ``Exception`` (not a bare ``except``) keeps
            # KeyboardInterrupt/SystemExit working during long searches.
            return float('-inf')

    def honey_badger_movement(self, current_position, best_position):
        """Return a new mask derived from ``current_position``.

        With probability 0.5 each bit is flipped with probability 0.3
        (exploration); otherwise each bit is copied from ``best_position``
        with probability 0.5 (exploitation). When no best position exists
        yet (``None``) the exploration move is always used, so that ``None``
        never leaks into a mask.
        """
        n_bits = len(current_position)
        explore = np.random.random() < 0.5
        if explore or best_position is None:
            flip = np.random.random(n_bits) < 0.3
            return np.where(flip, 1 - current_position, current_position)
        follow_best = np.random.random(n_bits) < 0.5
        return np.where(follow_best, best_position, current_position)

    def _init_population(self, n_total_features):
        """Create random masks that each switch on at least ``n_features`` bits."""
        population = np.random.randint(2, size=(self.population_size, n_total_features))
        # Cannot switch on more features than there are columns.
        min_on = min(self.n_features, n_total_features)
        for mask in population:  # rows are views, so edits update ``population``
            shortfall = min_on - int(mask.sum())
            if shortfall > 0:
                # Draw only from bits that are still off; sampling from all
                # columns could hit bits already set and leave the mask short.
                off_bits = np.flatnonzero(mask == 0)
                mask[np.random.choice(off_bits, shortfall, replace=False)] = 1
        return population

    def fit(self, X, y, classifier):
        """Search for the best feature mask and store it on the instance.

        ``X`` must support ``X.shape[1]`` and boolean column indexing
        (``X[:, mask]``); ``classifier`` must provide ``fit`` and ``predict``.
        Returns ``self``.
        """
        n_total_features = X.shape[1]
        population = self._init_population(n_total_features)

        global_best_position = None
        global_best_fitness = float('-inf')

        for _ in range(self.max_iter):
            fitness_values = np.array([
                self.fitness_function(position, X, y, classifier)
                for position in population
            ])

            current_best_idx = np.argmax(fitness_values)
            # Strict '>' keeps the earliest mask among equally good ones.
            if fitness_values[current_best_idx] > global_best_fitness:
                global_best_fitness = fitness_values[current_best_idx]
                global_best_position = population[current_best_idx].copy()

            population = np.array([
                self.honey_badger_movement(position, global_best_position)
                for position in population
            ])

        self.best_features = global_best_position
        self.best_fitness = global_best_fitness
        return self

def load_and_preprocess_data(train_path='Training Data/PC3_FS_TrainData.csv',
                             test_path='Testing Data/PC3_FS_TestData.csv',
                             target_column='Defective'):
    """Load the train/test CSV files and split them into features and labels.

    Parameters
    ----------
    train_path, test_path : str
        Locations of the training and testing CSV files. The defaults are
        the PC3 files this cell was written for.
    target_column : str
        Name of the label column present in both files.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        All columns except ``target_column``.
    y_train, y_test
        Labels encoded by a ``LabelEncoder`` fitted on the training labels
        only; the test labels are transformed with that same encoder.
    """
    # Read training and testing data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate features and target for both datasets
    X_train = train_data.drop(target_column, axis=1)
    y_train = train_data[target_column]
    X_test = test_data.drop(target_column, axis=1)
    y_test = test_data[target_column]

    # Convert target to numeric (encoder is fitted on the training labels)
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    return X_train, X_test, y_train, y_test

def train_and_evaluate_defect_predictor(X_train, X_test, y_train, y_test, n_features=5):
    """Select features with the honey badger search, then train and score an RBF SVM.

    Args:
        X_train: Training features; must expose ``.columns`` (it is indexed to
            recover the selected feature names).
        X_test: Testing features with the same column layout as ``X_train``.
        y_train: Encoded training labels.
        y_test: Encoded testing labels.
        n_features: Passed to ``HoneyBadgerFeatureSelection`` as
            ``n_features_to_select``.

    Returns:
        A dict with the fitted final model, the fitted scaler, the selected
        feature names and their column indices, and the test-set accuracy,
        precision, recall, F-measure, AUC and confusion matrix.

    Raises:
        ValueError: If the feature search finishes without any feature mask.
    """
    # Scale features; the scaler statistics come from the training set only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # The search only calls fit/predict on this classifier, so probability
    # calibration is switched off here: it would add extra internal training
    # to every fitness evaluation without affecting the predicted labels.
    search_svm = SVC(kernel='rbf', random_state=42, probability=False)

    # Feature selection using training data
    print("Starting feature selection...")
    hb = HoneyBadgerFeatureSelection(n_features_to_select=n_features)
    hb.fit(X_train_scaled, y_train, search_svm)

    if hb.best_features is None:
        raise ValueError("Feature selection did not produce a feature subset")

    # Column indices whose mask bit is set
    selected_features = np.where(hb.best_features == 1)[0]
    selected_feature_names = X_train.columns[selected_features].tolist()

    # Train final model with selected features
    X_train_selected = X_train_scaled[:, selected_features]
    X_test_selected = X_test_scaled[:, selected_features]

    # The final model keeps probability=True because AUC needs predict_proba
    final_svm = SVC(kernel='rbf', random_state=42, probability=True)
    final_svm.fit(X_train_selected, y_train)

    # Make predictions on test set
    y_pred = final_svm.predict(X_test_selected)
    # Column 1 is used as the positive-class score (assumes a binary target)
    y_pred_proba = final_svm.predict_proba(X_test_selected)[:, 1]

    return {
        'model': final_svm,
        'selected_features': selected_feature_names,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f_measure': f1_score(y_test, y_pred),
        'auc_score': roc_auc_score(y_test, y_pred_proba),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'scaler': scaler,
        'selected_indices': selected_features
    }

if __name__ == "__main__":
    # Load the data, run selection + training, then print a plain-text report.
    X_train, X_test, y_train, y_test = load_and_preprocess_data()
    results = train_and_evaluate_defect_predictor(X_train, X_test, y_train, y_test, n_features=5)

    chosen_names = results['selected_features']

    print("\nSelected Features:")
    for rank, name in enumerate(chosen_names, start=1):
        print(f"{rank}. {name}")

    # (label, results key) pairs, printed in this order with four decimals
    metric_rows = (
        ("Accuracy", 'accuracy'),
        ("Precision", 'precision'),
        ("Recall", 'recall'),
        ("F-measure", 'f_measure'),
        ("AUC Score", 'auc_score'),
    )
    print("\nModel Performance Metrics:")
    for label, key in metric_rows:
        print(f"{label}: {results[key]:.4f}")

    # Legend lines describe the layout of the 2x2 matrix printed below them
    print("\nConfusion Matrix:")
    print("[True Negatives  False Positives]")
    print("[False Negatives True Positives]")
    print(results['confusion_matrix'])

    # Map each selected feature name back to its column position
    print("\nFeature Importance Analysis:")
    for name, column_index in zip(chosen_names, results['selected_indices']):
        print(f"{name}: Original index {column_index}")

Starting feature selection...

Selected Features:
1. LOC_BLANK
2. LOC_CODE_AND_COMMENT
3. HALSTEAD_CONTENT
4. NORMALIZED_CYLOMATIC_COMPLEXITY
5. NUM_UNIQUE_OPERATORS
6. NUMBER_OF_LINES

Model Performance Metrics:
Accuracy: 0.8766
Precision: 0.0000
Recall: 0.0000
F-measure: 0.0000
AUC Score: 0.7561

Confusion Matrix:
[True Negatives  False Positives]
[False Negatives True Positives]
[[277   0]
 [ 39   0]]

Feature Importance Analysis:
LOC_BLANK: Original index 0
LOC_CODE_AND_COMMENT: Original index 1
HALSTEAD_CONTENT: Original index 2
NORMALIZED_CYLOMATIC_COMPLEXITY: Original index 3
NUM_UNIQUE_OPERATORS: Original index 6
NUMBER_OF_LINES: Original index 7


In [6]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

class HoneyBadgerFeatureSelection:
    """Wrapper feature selector driven by a simple population-based binary search.

    Every candidate is a 0/1 mask over the columns of ``X``. A candidate is
    scored by fitting the supplied classifier on the masked columns and
    measuring accuracy on that same data. After ``fit`` the best mask seen is
    stored in ``best_features`` and its score in ``best_fitness``.

    Note: ``n_features_to_select`` is only enforced as a minimum on the
    initial population. Later moves may add or drop features, so the final
    mask can contain more or fewer features than requested.
    """

    def __init__(self, n_features_to_select=5, max_iter=50, population_size=20):
        """Store the search settings.

        Args:
            n_features_to_select: Minimum number of set bits in each initial mask.
            max_iter: Number of evaluate-then-move iterations run by ``fit``.
            population_size: Number of candidate masks kept per iteration.
        """
        self.n_features = n_features_to_select
        self.max_iter = max_iter
        self.population_size = population_size
        self.best_features = None
        self.best_fitness = float('-inf')

    def fitness_function(self, selected_features, X, y, classifier):
        """Return the accuracy of ``classifier`` on the columns chosen by the mask.

        The classifier is fitted and scored on the same ``X``/``y``. An empty
        mask, or any error raised while fitting or predicting, scores ``-inf``
        so the candidate can never become the best one.
        """
        if np.sum(selected_features) == 0:
            return float('-inf')

        X_selected = X[:, selected_features == 1]
        try:
            classifier.fit(X_selected, y)
            y_pred = classifier.predict(X_selected)
            return accuracy_score(y, y_pred)
        except Exception:
            # Best-effort scoring: a failing candidate is just a bad candidate.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            return float('-inf')

    def honey_badger_movement(self, current_position, best_position):
        """Produce a new mask from ``current_position``.

        With probability 0.5 each bit is flipped with probability 0.3;
        otherwise each bit is replaced by the matching bit of
        ``best_position`` with probability 0.5. When no best position exists
        yet (``None``) the bit-flip move is always used.
        """
        r = np.random.random()
        draws = np.random.random(len(current_position))
        if r < 0.5 or best_position is None:
            return np.where(draws < 0.3, 1 - current_position, current_position)
        return np.where(draws < 0.5, best_position, current_position)

    def _initial_population(self, n_total_features):
        """Build random masks that each have at least ``n_features`` bits set."""
        population = np.random.randint(2, size=(self.population_size, n_total_features))
        # A mask cannot hold more set bits than there are columns.
        target = min(self.n_features, n_total_features)
        for row in population:
            deficit = target - int(row.sum())
            if deficit > 0:
                # Draw only from columns that are still unset, so every pick
                # really adds a feature and the minimum is actually reached.
                unset = np.flatnonzero(row == 0)
                row[np.random.choice(unset, deficit, replace=False)] = 1
        return population

    def fit(self, X, y, classifier):
        """Run the search and record the best mask found.

        Args:
            X: 2-D array indexed as ``X[:, mask]``; columns are the candidate features.
            y: Labels passed to ``classifier.fit`` and to the accuracy score.
            classifier: Object exposing ``fit(X, y)`` and ``predict(X)``; it is
                refitted for every candidate evaluation.

        Returns:
            ``self``, with ``best_features`` and ``best_fitness`` updated.
            ``best_features`` stays ``None`` if no candidate ever scored
            above ``-inf``.
        """
        population = self._initial_population(X.shape[1])

        global_best_position = None
        global_best_fitness = float('-inf')

        for iteration in range(self.max_iter):
            fitness_values = np.array([
                self.fitness_function(position, X, y, classifier)
                for position in population
            ])

            current_best_idx = np.argmax(fitness_values)
            if fitness_values[current_best_idx] > global_best_fitness:
                global_best_fitness = fitness_values[current_best_idx]
                # Copy: the population array is replaced right below.
                global_best_position = population[current_best_idx].copy()

            population = np.array([
                self.honey_badger_movement(position, global_best_position)
                for position in population
            ])

        self.best_features = global_best_position
        self.best_fitness = global_best_fitness
        return self

def load_and_preprocess_data(train_path='Training Data/PC4_FS_TrainData.csv',
                             test_path='Testing Data/PC4_FS_TestData.csv',
                             target_column='Defective'):
    """Load the train/test CSV files and split them into features and labels.

    Args:
        train_path: Path of the training CSV. Defaults to the PC4 training file.
        test_path: Path of the testing CSV. Defaults to the PC4 testing file.
        target_column: Name of the label column present in both files.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``. The ``X`` values are
        DataFrames with the target column removed; the ``y`` values are the
        integer-encoded labels produced by a ``LabelEncoder``.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate features and target for both datasets
    X_train = train_data.drop(target_column, axis=1)
    y_train = train_data[target_column]
    X_test = test_data.drop(target_column, axis=1)
    y_test = test_data[target_column]

    # The encoder is fitted on the training labels only; the test labels are
    # encoded with that same mapping so both sets share one class numbering.
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    return X_train, X_test, y_train, y_test

def train_and_evaluate_defect_predictor(X_train, X_test, y_train, y_test, n_features=5):
    """Select features with the honey badger search, then train and score an RBF SVM.

    Args:
        X_train: Training features; must expose ``.columns`` (it is indexed to
            recover the selected feature names).
        X_test: Testing features with the same column layout as ``X_train``.
        y_train: Encoded training labels.
        y_test: Encoded testing labels.
        n_features: Passed to ``HoneyBadgerFeatureSelection`` as
            ``n_features_to_select``.

    Returns:
        A dict with the fitted final model, the fitted scaler, the selected
        feature names and their column indices, and the test-set accuracy,
        precision, recall, F-measure, AUC and confusion matrix.

    Raises:
        ValueError: If the feature search finishes without any feature mask.
    """
    # Scale features; the scaler statistics come from the training set only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # The search only calls fit/predict on this classifier, so probability
    # calibration is switched off here: it would add extra internal training
    # to every fitness evaluation without affecting the predicted labels.
    search_svm = SVC(kernel='rbf', random_state=42, probability=False)

    # Feature selection using training data
    print("Starting feature selection...")
    hb = HoneyBadgerFeatureSelection(n_features_to_select=n_features)
    hb.fit(X_train_scaled, y_train, search_svm)

    if hb.best_features is None:
        raise ValueError("Feature selection did not produce a feature subset")

    # Column indices whose mask bit is set
    selected_features = np.where(hb.best_features == 1)[0]
    selected_feature_names = X_train.columns[selected_features].tolist()

    # Train final model with selected features
    X_train_selected = X_train_scaled[:, selected_features]
    X_test_selected = X_test_scaled[:, selected_features]

    # The final model keeps probability=True because AUC needs predict_proba
    final_svm = SVC(kernel='rbf', random_state=42, probability=True)
    final_svm.fit(X_train_selected, y_train)

    # Make predictions on test set
    y_pred = final_svm.predict(X_test_selected)
    # Column 1 is used as the positive-class score (assumes a binary target)
    y_pred_proba = final_svm.predict_proba(X_test_selected)[:, 1]

    return {
        'model': final_svm,
        'selected_features': selected_feature_names,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f_measure': f1_score(y_test, y_pred),
        'auc_score': roc_auc_score(y_test, y_pred_proba),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'scaler': scaler,
        'selected_indices': selected_features
    }

if __name__ == "__main__":
    # Load the data, run selection + training, then print a plain-text report.
    X_train, X_test, y_train, y_test = load_and_preprocess_data()
    results = train_and_evaluate_defect_predictor(X_train, X_test, y_train, y_test, n_features=5)

    chosen_names = results['selected_features']

    print("\nSelected Features:")
    for rank, name in enumerate(chosen_names, start=1):
        print(f"{rank}. {name}")

    # (label, results key) pairs, printed in this order with four decimals
    metric_rows = (
        ("Accuracy", 'accuracy'),
        ("Precision", 'precision'),
        ("Recall", 'recall'),
        ("F-measure", 'f_measure'),
        ("AUC Score", 'auc_score'),
    )
    print("\nModel Performance Metrics:")
    for label, key in metric_rows:
        print(f"{label}: {results[key]:.4f}")

    # Legend lines describe the layout of the 2x2 matrix printed below them
    print("\nConfusion Matrix:")
    print("[True Negatives  False Positives]")
    print("[False Negatives True Positives]")
    print(results['confusion_matrix'])

    # Map each selected feature name back to its column position
    print("\nFeature Importance Analysis:")
    for name, column_index in zip(chosen_names, results['selected_indices']):
        print(f"{name}: Original index {column_index}")

Starting feature selection...

Selected Features:
1. LOC_BLANK
2. LOC_CODE_AND_COMMENT
3. HALSTEAD_LENGTH
4. MODIFIED_CONDITION_COUNT

Model Performance Metrics:
Accuracy: 0.8845
Precision: 0.8462
Recall: 0.2075
F-measure: 0.3333
AUC Score: 0.6689

Confusion Matrix:
[True Negatives  False Positives]
[False Negatives True Positives]
[[326   2]
 [ 42  11]]

Feature Importance Analysis:
LOC_BLANK: Original index 0
LOC_CODE_AND_COMMENT: Original index 1
HALSTEAD_LENGTH: Original index 4
MODIFIED_CONDITION_COUNT: Original index 5
