In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score, 
                           recall_score, f1_score, roc_auc_score)
import warnings
warnings.filterwarnings('ignore')
import time

class HoneyBadgerFeatureSelection:
    """Population-based wrapper feature selection over binary feature masks.

    Every candidate is a 0/1 vector with one entry per column of ``X``.  In
    each round all candidates are scored with ``fitness_function``, the
    best-scoring mask seen so far is remembered, and every candidate is then
    perturbed by ``honey_badger_movement``.

    Attributes set by ``fit``:
        best_features: best 0/1 mask found, or ``None`` if no candidate could
            be scored.
        best_fitness: fitness of ``best_features`` (``-inf`` if there is none).

    NOTE(review): ``n_features_to_select`` is only enforced as a lower bound
    on the number of active features in the *initial* population.  Later
    moves switch features on and off freely, so the final mask may hold more
    or fewer features than requested.
    """

    def __init__(self, n_features_to_select=5, max_iter=50, population_size=20):
        """Store the search settings.

        Args:
            n_features_to_select: minimum number of active features every
                candidate starts with (capped at the number of columns).
            max_iter: number of evaluate-then-move rounds run by ``fit``.
            population_size: number of candidate masks kept per round.
        """
        self.n_features = n_features_to_select
        self.max_iter = max_iter
        self.population_size = population_size
        self.best_features = None
        self.best_fitness = float('-inf')

    def fitness_function(self, selected_features, X, y, classifier):
        """Score one mask by fitting ``classifier`` on the selected columns.

        Returns the accuracy of the fitted classifier on the same rows it was
        trained on, or ``-inf`` when the mask selects nothing or the
        classifier raises.

        NOTE(review): the score is measured on the training rows themselves,
        so flexible models can score near-perfectly on almost any subset;
        consider a held-out or cross-validated score instead.
        """
        selected_features = np.asarray(selected_features)
        if np.sum(selected_features) == 0:
            return float('-inf')

        X_selected = X[:, selected_features == 1]
        try:
            classifier.fit(X_selected, y)
            y_pred = classifier.predict(X_selected)
            return accuracy_score(y, y_pred)
        except Exception:
            # Best-effort by design: a subset the classifier cannot handle is
            # simply ranked last.  Catching Exception (not a bare except)
            # lets Ctrl-C / SystemExit still abort the long-running search.
            return float('-inf')

    def honey_badger_movement(self, current_position, best_position):
        """Return a perturbed copy of ``current_position``.

        With probability 0.5 (or always, while no best mask exists yet) each
        bit is flipped with probability 0.3.  Otherwise each bit is replaced
        by the corresponding bit of ``best_position`` with probability 0.5.
        """
        current_position = np.asarray(current_position)
        # Without a best mask there is nothing to move towards; copying from
        # None would put None entries into the population.
        explore = best_position is None or np.random.random() < 0.5
        draws = np.random.random(len(current_position))
        if explore:
            return np.where(draws < 0.3, 1 - current_position, current_position)
        return np.where(draws < 0.5, best_position, current_position)

    def _init_population(self, n_total_features):
        """Draw random 0/1 masks and top each up to the minimum feature count."""
        population = np.random.randint(2, size=(self.population_size, n_total_features))
        # Cannot switch on more features than there are columns.
        minimum_active = min(self.n_features, n_total_features)
        for row in population:  # rows are views, so edits land in `population`
            missing = minimum_active - int(row.sum())
            if missing > 0:
                # Pick only among bits that are still 0; sampling from all
                # columns could hit already-active bits and leave the row short.
                inactive = np.flatnonzero(row == 0)
                row[np.random.choice(inactive, missing, replace=False)] = 1
        return population

    def fit(self, X, y, classifier):
        """Search for the best-scoring feature mask.

        Args:
            X: 2-D feature table (converted with ``np.asarray``).
            y: labels passed through to ``classifier``.
            classifier: object with ``fit(X, y)`` and ``predict(X)``; it is
                refitted for every candidate evaluation.

        Returns:
            ``self``, with ``best_features`` and ``best_fitness`` updated.
        """
        X = np.asarray(X)
        population = self._init_population(X.shape[1])

        global_best_position = None
        global_best_fitness = float('-inf')

        for _ in range(self.max_iter):
            fitness_values = np.array([
                self.fitness_function(position, X, y, classifier)
                for position in population
            ])

            # Strict '>' keeps the earliest mask on ties and never accepts -inf.
            current_best_idx = np.argmax(fitness_values)
            if fitness_values[current_best_idx] > global_best_fitness:
                global_best_fitness = fitness_values[current_best_idx]
                global_best_position = population[current_best_idx].copy()

            population = np.array([
                self.honey_badger_movement(position, global_best_position)
                for position in population
            ])

        self.best_features = global_best_position
        self.best_fitness = global_best_fitness
        return self

def load_and_preprocess_data(train_path='Training Data/CM1_FS_TrainData.csv',
                             test_path='Testing Data/CM1_FS_TestData.csv',
                             target_column='Defective'):
    """Load a train/test CSV pair and split each into features and encoded labels.

    Args:
        train_path: CSV file holding the training rows (defaults to the CM1 split).
        test_path: CSV file holding the test rows (defaults to the CM1 split).
        target_column: name of the label column present in both files.

    Returns:
        ``(X_train, X_test, y_train, y_test)``: the ``X`` values are the
        loaded tables without ``target_column``; the ``y`` values are the
        labels encoded by a ``LabelEncoder``.
    """
    # Read training and testing data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate features and target for both datasets
    X_train = train_data.drop(target_column, axis=1)
    y_train = train_data[target_column]
    X_test = test_data.drop(target_column, axis=1)
    y_test = test_data[target_column]

    # Convert target to numeric.  The encoder is fitted on the training labels
    # only, so both splits share one label-to-integer mapping.
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    return X_train, X_test, y_train, y_test

def train_and_evaluate_defect_predictor(X_train, X_test, y_train, y_test, n_features=5,
                                        n_estimators=100, random_state=42):
    """Select features on the training split, then fit and score a random forest.

    Steps: standardise the features (scaler fitted on the training split
    only), run ``HoneyBadgerFeatureSelection`` on the scaled training data,
    refit a fresh forest on the selected columns and score it on the test
    split.

    Args:
        X_train, X_test: feature tables with the same columns (DataFrame or
            2-D array).
        y_train, y_test: class labels; precision/recall/F1 use their default
            settings and AUC uses column 1 of ``predict_proba``.
        n_features: passed to the selector as ``n_features_to_select``.
        n_estimators: number of trees in both forests (search and final).
        random_state: seed for both forests.

    Returns:
        dict with keys ``model``, ``selected_features``, ``accuracy``,
        ``precision``, ``recall``, ``f1``, ``auc``, ``confusion_matrix``,
        ``selected_indices`` and ``feature_importance``.

    Raises:
        ValueError: if feature selection yields no usable feature subset.

    NOTE(review): ``random_state`` only seeds the forests; nothing in this
    function seeds the ``np.random`` stream.
    """
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initialize Random Forest used to score candidate subsets
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

    # Feature selection using training data
    print(f"Starting feature selection at: {time.time()}")
    hb = HoneyBadgerFeatureSelection(n_features_to_select=n_features)
    hb.fit(X_train_scaled, y_train, rf)

    # Fail early with a clear message instead of letting the forest choke on
    # a zero-column matrix further down.
    best_mask = hb.best_features
    if best_mask is None or not np.any(best_mask == 1):
        raise ValueError(
            "Feature selection did not produce any usable feature subset; "
            "check that the classifier can be fitted on the training data."
        )

    # Get selected features
    selected_features = np.where(best_mask == 1)[0]
    columns = getattr(X_train, 'columns', None)
    if columns is not None:
        selected_feature_names = columns[selected_features].tolist()
    else:
        # Plain arrays carry no column labels; fall back to positional names.
        selected_feature_names = [f"feature_{i}" for i in selected_features]

    # Train final model with selected features
    X_train_selected = X_train_scaled[:, selected_features]
    X_test_selected = X_test_scaled[:, selected_features]

    final_rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    final_rf.fit(X_train_selected, y_train)

    # Make predictions
    y_pred = final_rf.predict(X_test_selected)
    y_pred_proba = final_rf.predict_proba(X_test_selected)[:, 1]  # For AUC score

    # Calculate metrics
    metrics = {
        'model': final_rf,
        'selected_features': selected_feature_names,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, y_pred_proba),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'selected_indices': selected_features,
        'feature_importance': final_rf.feature_importances_
    }

    return metrics

if __name__ == "__main__":
    # Read the train/test split and encode the labels.
    X_train, X_test, y_train, y_test = load_and_preprocess_data()

    # Feature selection, final model fit and test-set scoring in one call.
    results = train_and_evaluate_defect_predictor(
        X_train, X_test, y_train, y_test, n_features=5
    )

    # Numbered list of the chosen features with the final model's importances.
    print("\nSelected Features:")
    for i, (feature, importance) in enumerate(
            zip(results['selected_features'], results['feature_importance']),
            start=1):
        print(f"{i}. {feature} (Importance: {importance:.4f})")

    # Headline scores; a single print with a newline separator emits the
    # same six lines as six separate prints.
    print(
        f"\nModel Performance Metrics:",
        f"Accuracy: {results['accuracy']:.4f}",
        f"Precision: {results['precision']:.4f}",
        f"Recall: {results['recall']:.4f}",
        f"F1-measure: {results['f1']:.4f}",
        f"AUC Score: {results['auc']:.4f}",
        sep="\n",
    )

    print("\nConfusion Matrix:", results['confusion_matrix'], sep="\n")

    # Same importances again, now next to each feature's column index in
    # the input table.
    print("\nFeature Importance Analysis:")
    for feature, idx, importance in zip(
            results['selected_features'],
            results['selected_indices'],
            results['feature_importance']):
        print(f"{feature}: Original index {idx}, Importance {importance:.4f}")

Starting feature selection at: 1737296827.1214557

Selected Features:
1. LOC_COMMENTS (Importance: 0.2507)
2. HALSTEAD_CONTENT (Importance: 0.3138)
3. HALSTEAD_ERROR_EST (Importance: 0.2191)
4. NUMBER_OF_LINES (Importance: 0.2164)

Model Performance Metrics:
Accuracy: 0.8485
Precision: 0.2500
Recall: 0.0769
F1-measure: 0.1176
AUC Score: 0.6377

Confusion Matrix:
[[83  3]
 [12  1]]

Feature Importance Analysis:
LOC_COMMENTS: Original index 1, Importance 0.2507
HALSTEAD_CONTENT: Original index 2, Importance 0.3138
HALSTEAD_ERROR_EST: Original index 3, Importance 0.2191
NUMBER_OF_LINES: Original index 6, Importance 0.2164


In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score, 
                           recall_score, f1_score, roc_auc_score)
import warnings
warnings.filterwarnings('ignore')
import time

class HoneyBadgerFeatureSelection:
    """Population-based wrapper feature selection over binary feature masks.

    Every candidate is a 0/1 vector with one entry per column of ``X``.  In
    each round all candidates are scored with ``fitness_function``, the
    best-scoring mask seen so far is remembered, and every candidate is then
    perturbed by ``honey_badger_movement``.

    Attributes set by ``fit``:
        best_features: best 0/1 mask found, or ``None`` if no candidate could
            be scored.
        best_fitness: fitness of ``best_features`` (``-inf`` if there is none).

    NOTE(review): ``n_features_to_select`` is only enforced as a lower bound
    on the number of active features in the *initial* population.  Later
    moves switch features on and off freely, so the final mask may hold more
    or fewer features than requested.
    """

    def __init__(self, n_features_to_select=5, max_iter=50, population_size=20):
        """Store the search settings.

        Args:
            n_features_to_select: minimum number of active features every
                candidate starts with (capped at the number of columns).
            max_iter: number of evaluate-then-move rounds run by ``fit``.
            population_size: number of candidate masks kept per round.
        """
        self.n_features = n_features_to_select
        self.max_iter = max_iter
        self.population_size = population_size
        self.best_features = None
        self.best_fitness = float('-inf')

    def fitness_function(self, selected_features, X, y, classifier):
        """Score one mask by fitting ``classifier`` on the selected columns.

        Returns the accuracy of the fitted classifier on the same rows it was
        trained on, or ``-inf`` when the mask selects nothing or the
        classifier raises.

        NOTE(review): the score is measured on the training rows themselves,
        so flexible models can score near-perfectly on almost any subset;
        consider a held-out or cross-validated score instead.
        """
        selected_features = np.asarray(selected_features)
        if np.sum(selected_features) == 0:
            return float('-inf')

        X_selected = X[:, selected_features == 1]
        try:
            classifier.fit(X_selected, y)
            y_pred = classifier.predict(X_selected)
            return accuracy_score(y, y_pred)
        except Exception:
            # Best-effort by design: a subset the classifier cannot handle is
            # simply ranked last.  Catching Exception (not a bare except)
            # lets Ctrl-C / SystemExit still abort the long-running search.
            return float('-inf')

    def honey_badger_movement(self, current_position, best_position):
        """Return a perturbed copy of ``current_position``.

        With probability 0.5 (or always, while no best mask exists yet) each
        bit is flipped with probability 0.3.  Otherwise each bit is replaced
        by the corresponding bit of ``best_position`` with probability 0.5.
        """
        current_position = np.asarray(current_position)
        # Without a best mask there is nothing to move towards; copying from
        # None would put None entries into the population.
        explore = best_position is None or np.random.random() < 0.5
        draws = np.random.random(len(current_position))
        if explore:
            return np.where(draws < 0.3, 1 - current_position, current_position)
        return np.where(draws < 0.5, best_position, current_position)

    def _init_population(self, n_total_features):
        """Draw random 0/1 masks and top each up to the minimum feature count."""
        population = np.random.randint(2, size=(self.population_size, n_total_features))
        # Cannot switch on more features than there are columns.
        minimum_active = min(self.n_features, n_total_features)
        for row in population:  # rows are views, so edits land in `population`
            missing = minimum_active - int(row.sum())
            if missing > 0:
                # Pick only among bits that are still 0; sampling from all
                # columns could hit already-active bits and leave the row short.
                inactive = np.flatnonzero(row == 0)
                row[np.random.choice(inactive, missing, replace=False)] = 1
        return population

    def fit(self, X, y, classifier):
        """Search for the best-scoring feature mask.

        Args:
            X: 2-D feature table (converted with ``np.asarray``).
            y: labels passed through to ``classifier``.
            classifier: object with ``fit(X, y)`` and ``predict(X)``; it is
                refitted for every candidate evaluation.

        Returns:
            ``self``, with ``best_features`` and ``best_fitness`` updated.
        """
        X = np.asarray(X)
        population = self._init_population(X.shape[1])

        global_best_position = None
        global_best_fitness = float('-inf')

        for _ in range(self.max_iter):
            fitness_values = np.array([
                self.fitness_function(position, X, y, classifier)
                for position in population
            ])

            # Strict '>' keeps the earliest mask on ties and never accepts -inf.
            current_best_idx = np.argmax(fitness_values)
            if fitness_values[current_best_idx] > global_best_fitness:
                global_best_fitness = fitness_values[current_best_idx]
                global_best_position = population[current_best_idx].copy()

            population = np.array([
                self.honey_badger_movement(position, global_best_position)
                for position in population
            ])

        self.best_features = global_best_position
        self.best_fitness = global_best_fitness
        return self

def load_and_preprocess_data(train_path='Training Data/MC2_FS_TrainData.csv',
                             test_path='Testing Data/MC2_FS_TestData.csv',
                             target_column='Defective'):
    """Load a train/test CSV pair and split each into features and encoded labels.

    Args:
        train_path: CSV file holding the training rows (defaults to the MC2 split).
        test_path: CSV file holding the test rows (defaults to the MC2 split).
        target_column: name of the label column present in both files.

    Returns:
        ``(X_train, X_test, y_train, y_test)``: the ``X`` values are the
        loaded tables without ``target_column``; the ``y`` values are the
        labels encoded by a ``LabelEncoder``.
    """
    # Read training and testing data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate features and target for both datasets
    X_train = train_data.drop(target_column, axis=1)
    y_train = train_data[target_column]
    X_test = test_data.drop(target_column, axis=1)
    y_test = test_data[target_column]

    # Convert target to numeric.  The encoder is fitted on the training labels
    # only, so both splits share one label-to-integer mapping.
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    return X_train, X_test, y_train, y_test

def train_and_evaluate_defect_predictor(X_train, X_test, y_train, y_test, n_features=5,
                                        n_estimators=100, random_state=42):
    """Select features on the training split, then fit and score a random forest.

    Steps: standardise the features (scaler fitted on the training split
    only), run ``HoneyBadgerFeatureSelection`` on the scaled training data,
    refit a fresh forest on the selected columns and score it on the test
    split.

    Args:
        X_train, X_test: feature tables with the same columns (DataFrame or
            2-D array).
        y_train, y_test: class labels; precision/recall/F1 use their default
            settings and AUC uses column 1 of ``predict_proba``.
        n_features: passed to the selector as ``n_features_to_select``.
        n_estimators: number of trees in both forests (search and final).
        random_state: seed for both forests.

    Returns:
        dict with keys ``model``, ``selected_features``, ``accuracy``,
        ``precision``, ``recall``, ``f1``, ``auc``, ``confusion_matrix``,
        ``selected_indices`` and ``feature_importance``.

    Raises:
        ValueError: if feature selection yields no usable feature subset.

    NOTE(review): ``random_state`` only seeds the forests; nothing in this
    function seeds the ``np.random`` stream.
    """
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initialize Random Forest used to score candidate subsets
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

    # Feature selection using training data
    print(f"Starting feature selection at: {time.time()}")
    hb = HoneyBadgerFeatureSelection(n_features_to_select=n_features)
    hb.fit(X_train_scaled, y_train, rf)

    # Fail early with a clear message instead of letting the forest choke on
    # a zero-column matrix further down.
    best_mask = hb.best_features
    if best_mask is None or not np.any(best_mask == 1):
        raise ValueError(
            "Feature selection did not produce any usable feature subset; "
            "check that the classifier can be fitted on the training data."
        )

    # Get selected features
    selected_features = np.where(best_mask == 1)[0]
    columns = getattr(X_train, 'columns', None)
    if columns is not None:
        selected_feature_names = columns[selected_features].tolist()
    else:
        # Plain arrays carry no column labels; fall back to positional names.
        selected_feature_names = [f"feature_{i}" for i in selected_features]

    # Train final model with selected features
    X_train_selected = X_train_scaled[:, selected_features]
    X_test_selected = X_test_scaled[:, selected_features]

    final_rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    final_rf.fit(X_train_selected, y_train)

    # Make predictions
    y_pred = final_rf.predict(X_test_selected)
    y_pred_proba = final_rf.predict_proba(X_test_selected)[:, 1]  # For AUC score

    # Calculate metrics
    metrics = {
        'model': final_rf,
        'selected_features': selected_feature_names,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, y_pred_proba),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'selected_indices': selected_features,
        'feature_importance': final_rf.feature_importances_
    }

    return metrics

if __name__ == "__main__":
    # Read the train/test split and encode the labels.
    X_train, X_test, y_train, y_test = load_and_preprocess_data()

    # Feature selection, final model fit and test-set scoring in one call.
    results = train_and_evaluate_defect_predictor(
        X_train, X_test, y_train, y_test, n_features=5
    )

    # Numbered list of the chosen features with the final model's importances.
    print("\nSelected Features:")
    for i, (feature, importance) in enumerate(
            zip(results['selected_features'], results['feature_importance']),
            start=1):
        print(f"{i}. {feature} (Importance: {importance:.4f})")

    # Headline scores; a single print with a newline separator emits the
    # same six lines as six separate prints.
    print(
        f"\nModel Performance Metrics:",
        f"Accuracy: {results['accuracy']:.4f}",
        f"Precision: {results['precision']:.4f}",
        f"Recall: {results['recall']:.4f}",
        f"F1-measure: {results['f1']:.4f}",
        f"AUC Score: {results['auc']:.4f}",
        sep="\n",
    )

    print("\nConfusion Matrix:", results['confusion_matrix'], sep="\n")

    # Same importances again, now next to each feature's column index in
    # the input table.
    print("\nFeature Importance Analysis:")
    for feature, idx, importance in zip(
            results['selected_features'],
            results['selected_indices'],
            results['feature_importance']):
        print(f"{feature}: Original index {idx}, Importance {importance:.4f}")

Starting feature selection at: 1737297090.29487

Selected Features:
1. LOC_BLANK (Importance: 0.1988)
2. LOC_COMMENTS (Importance: 0.1786)
3. DESIGN_COMPLEXITY (Importance: 0.1805)
4. GLOBAL_DATA_COMPLEXITY (Importance: 0.1960)
5. LOC_TOTAL (Importance: 0.2462)

Model Performance Metrics:
Accuracy: 0.6053
Precision: 0.3333
Recall: 0.1538
F1-measure: 0.2105
AUC Score: 0.5277

Confusion Matrix:
[[21  4]
 [11  2]]

Feature Importance Analysis:
LOC_BLANK: Original index 0, Importance 0.1988
LOC_COMMENTS: Original index 1, Importance 0.1786
DESIGN_COMPLEXITY: Original index 2, Importance 0.1805
GLOBAL_DATA_COMPLEXITY: Original index 5, Importance 0.1960
LOC_TOTAL: Original index 9, Importance 0.2462


In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score, 
                           recall_score, f1_score, roc_auc_score)
import warnings
warnings.filterwarnings('ignore')
import time

class HoneyBadgerFeatureSelection:
    """Population-based wrapper feature selection over binary feature masks.

    Every candidate is a 0/1 vector with one entry per column of ``X``.  In
    each round all candidates are scored with ``fitness_function``, the
    best-scoring mask seen so far is remembered, and every candidate is then
    perturbed by ``honey_badger_movement``.

    Attributes set by ``fit``:
        best_features: best 0/1 mask found, or ``None`` if no candidate could
            be scored.
        best_fitness: fitness of ``best_features`` (``-inf`` if there is none).

    NOTE(review): ``n_features_to_select`` is only enforced as a lower bound
    on the number of active features in the *initial* population.  Later
    moves switch features on and off freely, so the final mask may hold more
    or fewer features than requested.
    """

    def __init__(self, n_features_to_select=5, max_iter=50, population_size=20):
        """Store the search settings.

        Args:
            n_features_to_select: minimum number of active features every
                candidate starts with (capped at the number of columns).
            max_iter: number of evaluate-then-move rounds run by ``fit``.
            population_size: number of candidate masks kept per round.
        """
        self.n_features = n_features_to_select
        self.max_iter = max_iter
        self.population_size = population_size
        self.best_features = None
        self.best_fitness = float('-inf')

    def fitness_function(self, selected_features, X, y, classifier):
        """Score one mask by fitting ``classifier`` on the selected columns.

        Returns the accuracy of the fitted classifier on the same rows it was
        trained on, or ``-inf`` when the mask selects nothing or the
        classifier raises.

        NOTE(review): the score is measured on the training rows themselves,
        so flexible models can score near-perfectly on almost any subset;
        consider a held-out or cross-validated score instead.
        """
        selected_features = np.asarray(selected_features)
        if np.sum(selected_features) == 0:
            return float('-inf')

        X_selected = X[:, selected_features == 1]
        try:
            classifier.fit(X_selected, y)
            y_pred = classifier.predict(X_selected)
            return accuracy_score(y, y_pred)
        except Exception:
            # Best-effort by design: a subset the classifier cannot handle is
            # simply ranked last.  Catching Exception (not a bare except)
            # lets Ctrl-C / SystemExit still abort the long-running search.
            return float('-inf')

    def honey_badger_movement(self, current_position, best_position):
        """Return a perturbed copy of ``current_position``.

        With probability 0.5 (or always, while no best mask exists yet) each
        bit is flipped with probability 0.3.  Otherwise each bit is replaced
        by the corresponding bit of ``best_position`` with probability 0.5.
        """
        current_position = np.asarray(current_position)
        # Without a best mask there is nothing to move towards; copying from
        # None would put None entries into the population.
        explore = best_position is None or np.random.random() < 0.5
        draws = np.random.random(len(current_position))
        if explore:
            return np.where(draws < 0.3, 1 - current_position, current_position)
        return np.where(draws < 0.5, best_position, current_position)

    def _init_population(self, n_total_features):
        """Draw random 0/1 masks and top each up to the minimum feature count."""
        population = np.random.randint(2, size=(self.population_size, n_total_features))
        # Cannot switch on more features than there are columns.
        minimum_active = min(self.n_features, n_total_features)
        for row in population:  # rows are views, so edits land in `population`
            missing = minimum_active - int(row.sum())
            if missing > 0:
                # Pick only among bits that are still 0; sampling from all
                # columns could hit already-active bits and leave the row short.
                inactive = np.flatnonzero(row == 0)
                row[np.random.choice(inactive, missing, replace=False)] = 1
        return population

    def fit(self, X, y, classifier):
        """Search for the best-scoring feature mask.

        Args:
            X: 2-D feature table (converted with ``np.asarray``).
            y: labels passed through to ``classifier``.
            classifier: object with ``fit(X, y)`` and ``predict(X)``; it is
                refitted for every candidate evaluation.

        Returns:
            ``self``, with ``best_features`` and ``best_fitness`` updated.
        """
        X = np.asarray(X)
        population = self._init_population(X.shape[1])

        global_best_position = None
        global_best_fitness = float('-inf')

        for _ in range(self.max_iter):
            fitness_values = np.array([
                self.fitness_function(position, X, y, classifier)
                for position in population
            ])

            # Strict '>' keeps the earliest mask on ties and never accepts -inf.
            current_best_idx = np.argmax(fitness_values)
            if fitness_values[current_best_idx] > global_best_fitness:
                global_best_fitness = fitness_values[current_best_idx]
                global_best_position = population[current_best_idx].copy()

            population = np.array([
                self.honey_badger_movement(position, global_best_position)
                for position in population
            ])

        self.best_features = global_best_position
        self.best_fitness = global_best_fitness
        return self

def load_and_preprocess_data(train_path='Training Data/MW1_FS_TrainData.csv',
                             test_path='Testing Data/MW1_FS_TestData.csv',
                             target_column='Defective'):
    """Load a train/test CSV pair and split each into features and encoded labels.

    Args:
        train_path: CSV file holding the training rows (defaults to the MW1 split).
        test_path: CSV file holding the test rows (defaults to the MW1 split).
        target_column: name of the label column present in both files.

    Returns:
        ``(X_train, X_test, y_train, y_test)``: the ``X`` values are the
        loaded tables without ``target_column``; the ``y`` values are the
        labels encoded by a ``LabelEncoder``.
    """
    # Read training and testing data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate features and target for both datasets
    X_train = train_data.drop(target_column, axis=1)
    y_train = train_data[target_column]
    X_test = test_data.drop(target_column, axis=1)
    y_test = test_data[target_column]

    # Convert target to numeric.  The encoder is fitted on the training labels
    # only, so both splits share one label-to-integer mapping.
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    return X_train, X_test, y_train, y_test

def train_and_evaluate_defect_predictor(X_train, X_test, y_train, y_test, n_features=5,
                                        n_estimators=100, random_state=42):
    """Select features on the training split, then fit and score a random forest.

    Steps: standardise the features (scaler fitted on the training split
    only), run ``HoneyBadgerFeatureSelection`` on the scaled training data,
    refit a fresh forest on the selected columns and score it on the test
    split.

    Args:
        X_train, X_test: feature tables with the same columns (DataFrame or
            2-D array).
        y_train, y_test: class labels; precision/recall/F1 use their default
            settings and AUC uses column 1 of ``predict_proba``.
        n_features: passed to the selector as ``n_features_to_select``.
        n_estimators: number of trees in both forests (search and final).
        random_state: seed for both forests.

    Returns:
        dict with keys ``model``, ``selected_features``, ``accuracy``,
        ``precision``, ``recall``, ``f1``, ``auc``, ``confusion_matrix``,
        ``selected_indices`` and ``feature_importance``.

    Raises:
        ValueError: if feature selection yields no usable feature subset.

    NOTE(review): ``random_state`` only seeds the forests; nothing in this
    function seeds the ``np.random`` stream.
    """
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initialize Random Forest used to score candidate subsets
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

    # Feature selection using training data
    print(f"Starting feature selection at: {time.time()}")
    hb = HoneyBadgerFeatureSelection(n_features_to_select=n_features)
    hb.fit(X_train_scaled, y_train, rf)

    # Fail early with a clear message instead of letting the forest choke on
    # a zero-column matrix further down.
    best_mask = hb.best_features
    if best_mask is None or not np.any(best_mask == 1):
        raise ValueError(
            "Feature selection did not produce any usable feature subset; "
            "check that the classifier can be fitted on the training data."
        )

    # Get selected features
    selected_features = np.where(best_mask == 1)[0]
    columns = getattr(X_train, 'columns', None)
    if columns is not None:
        selected_feature_names = columns[selected_features].tolist()
    else:
        # Plain arrays carry no column labels; fall back to positional names.
        selected_feature_names = [f"feature_{i}" for i in selected_features]

    # Train final model with selected features
    X_train_selected = X_train_scaled[:, selected_features]
    X_test_selected = X_test_scaled[:, selected_features]

    final_rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    final_rf.fit(X_train_selected, y_train)

    # Make predictions
    y_pred = final_rf.predict(X_test_selected)
    y_pred_proba = final_rf.predict_proba(X_test_selected)[:, 1]  # For AUC score

    # Calculate metrics
    metrics = {
        'model': final_rf,
        'selected_features': selected_feature_names,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, y_pred_proba),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'selected_indices': selected_features,
        'feature_importance': final_rf.feature_importances_
    }

    return metrics

if __name__ == "__main__":
    # Read the train/test split and encode the labels.
    X_train, X_test, y_train, y_test = load_and_preprocess_data()

    # Feature selection, final model fit and test-set scoring in one call.
    results = train_and_evaluate_defect_predictor(
        X_train, X_test, y_train, y_test, n_features=5
    )

    # Numbered list of the chosen features with the final model's importances.
    print("\nSelected Features:")
    for i, (feature, importance) in enumerate(
            zip(results['selected_features'], results['feature_importance']),
            start=1):
        print(f"{i}. {feature} (Importance: {importance:.4f})")

    # Headline scores; a single print with a newline separator emits the
    # same six lines as six separate prints.
    print(
        f"\nModel Performance Metrics:",
        f"Accuracy: {results['accuracy']:.4f}",
        f"Precision: {results['precision']:.4f}",
        f"Recall: {results['recall']:.4f}",
        f"F1-measure: {results['f1']:.4f}",
        f"AUC Score: {results['auc']:.4f}",
        sep="\n",
    )

    print("\nConfusion Matrix:", results['confusion_matrix'], sep="\n")

    # Same importances again, now next to each feature's column index in
    # the input table.
    print("\nFeature Importance Analysis:")
    for feature, idx, importance in zip(
            results['selected_features'],
            results['selected_indices'],
            results['feature_importance']):
        print(f"{feature}: Original index {idx}, Importance {importance:.4f}")

Starting feature selection at: 1737297331.954764

Selected Features:
1. DESIGN_COMPLEXITY (Importance: 0.1504)
2. HALSTEAD_LENGTH (Importance: 0.2664)
3. NUMBER_OF_LINES (Importance: 0.3129)
4. LOC_TOTAL (Importance: 0.2702)

Model Performance Metrics:
Accuracy: 0.8933
Precision: 0.5000
Recall: 0.1250
F1-measure: 0.2000
AUC Score: 0.7043

Confusion Matrix:
[[66  1]
 [ 7  1]]

Feature Importance Analysis:
DESIGN_COMPLEXITY: Original index 1, Importance 0.1504
HALSTEAD_LENGTH: Original index 4, Importance 0.2664
NUMBER_OF_LINES: Original index 6, Importance 0.3129
LOC_TOTAL: Original index 7, Importance 0.2702


In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score, 
                           recall_score, f1_score, roc_auc_score)
import warnings
warnings.filterwarnings('ignore')
import time

class HoneyBadgerFeatureSelection:
    """Population-based wrapper feature selection over binary feature masks.

    Every candidate is a 0/1 vector with one entry per column of ``X``.  In
    each round all candidates are scored with ``fitness_function``, the
    best-scoring mask seen so far is remembered, and every candidate is then
    perturbed by ``honey_badger_movement``.

    Attributes set by ``fit``:
        best_features: best 0/1 mask found, or ``None`` if no candidate could
            be scored.
        best_fitness: fitness of ``best_features`` (``-inf`` if there is none).

    NOTE(review): ``n_features_to_select`` is only enforced as a lower bound
    on the number of active features in the *initial* population.  Later
    moves switch features on and off freely, so the final mask may hold more
    or fewer features than requested.
    """

    def __init__(self, n_features_to_select=5, max_iter=50, population_size=20):
        """Store the search settings.

        Args:
            n_features_to_select: minimum number of active features every
                candidate starts with (capped at the number of columns).
            max_iter: number of evaluate-then-move rounds run by ``fit``.
            population_size: number of candidate masks kept per round.
        """
        self.n_features = n_features_to_select
        self.max_iter = max_iter
        self.population_size = population_size
        self.best_features = None
        self.best_fitness = float('-inf')

    def fitness_function(self, selected_features, X, y, classifier):
        """Score one mask by fitting ``classifier`` on the selected columns.

        Returns the accuracy of the fitted classifier on the same rows it was
        trained on, or ``-inf`` when the mask selects nothing or the
        classifier raises.

        NOTE(review): the score is measured on the training rows themselves,
        so flexible models can score near-perfectly on almost any subset;
        consider a held-out or cross-validated score instead.
        """
        selected_features = np.asarray(selected_features)
        if np.sum(selected_features) == 0:
            return float('-inf')

        X_selected = X[:, selected_features == 1]
        try:
            classifier.fit(X_selected, y)
            y_pred = classifier.predict(X_selected)
            return accuracy_score(y, y_pred)
        except Exception:
            # Best-effort by design: a subset the classifier cannot handle is
            # simply ranked last.  Catching Exception (not a bare except)
            # lets Ctrl-C / SystemExit still abort the long-running search.
            return float('-inf')

    def honey_badger_movement(self, current_position, best_position):
        """Return a perturbed copy of ``current_position``.

        With probability 0.5 (or always, while no best mask exists yet) each
        bit is flipped with probability 0.3.  Otherwise each bit is replaced
        by the corresponding bit of ``best_position`` with probability 0.5.
        """
        current_position = np.asarray(current_position)
        # Without a best mask there is nothing to move towards; copying from
        # None would put None entries into the population.
        explore = best_position is None or np.random.random() < 0.5
        draws = np.random.random(len(current_position))
        if explore:
            return np.where(draws < 0.3, 1 - current_position, current_position)
        return np.where(draws < 0.5, best_position, current_position)

    def _init_population(self, n_total_features):
        """Draw random 0/1 masks and top each up to the minimum feature count."""
        population = np.random.randint(2, size=(self.population_size, n_total_features))
        # Cannot switch on more features than there are columns.
        minimum_active = min(self.n_features, n_total_features)
        for row in population:  # rows are views, so edits land in `population`
            missing = minimum_active - int(row.sum())
            if missing > 0:
                # Pick only among bits that are still 0; sampling from all
                # columns could hit already-active bits and leave the row short.
                inactive = np.flatnonzero(row == 0)
                row[np.random.choice(inactive, missing, replace=False)] = 1
        return population

    def fit(self, X, y, classifier):
        """Search for the best-scoring feature mask.

        Args:
            X: 2-D feature table (converted with ``np.asarray``).
            y: labels passed through to ``classifier``.
            classifier: object with ``fit(X, y)`` and ``predict(X)``; it is
                refitted for every candidate evaluation.

        Returns:
            ``self``, with ``best_features`` and ``best_fitness`` updated.
        """
        X = np.asarray(X)
        population = self._init_population(X.shape[1])

        global_best_position = None
        global_best_fitness = float('-inf')

        for _ in range(self.max_iter):
            fitness_values = np.array([
                self.fitness_function(position, X, y, classifier)
                for position in population
            ])

            # Strict '>' keeps the earliest mask on ties and never accepts -inf.
            current_best_idx = np.argmax(fitness_values)
            if fitness_values[current_best_idx] > global_best_fitness:
                global_best_fitness = fitness_values[current_best_idx]
                global_best_position = population[current_best_idx].copy()

            population = np.array([
                self.honey_badger_movement(position, global_best_position)
                for position in population
            ])

        self.best_features = global_best_position
        self.best_fitness = global_best_fitness
        return self

def load_and_preprocess_data(train_path='Training Data/PC1_FS_TrainData.csv',
                             test_path='Testing Data/PC1_FS_TestData.csv',
                             target_column='Defective'):
    """Load a train/test CSV pair and split each into features and encoded labels.

    Args:
        train_path: CSV file holding the training rows (defaults to the PC1 split).
        test_path: CSV file holding the test rows (defaults to the PC1 split).
        target_column: name of the label column present in both files.

    Returns:
        ``(X_train, X_test, y_train, y_test)``: the ``X`` values are the
        loaded tables without ``target_column``; the ``y`` values are the
        labels encoded by a ``LabelEncoder``.
    """
    # Read training and testing data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate features and target for both datasets
    X_train = train_data.drop(target_column, axis=1)
    y_train = train_data[target_column]
    X_test = test_data.drop(target_column, axis=1)
    y_test = test_data[target_column]

    # Convert target to numeric.  The encoder is fitted on the training labels
    # only, so both splits share one label-to-integer mapping.
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    return X_train, X_test, y_train, y_test

def train_and_evaluate_defect_predictor(X_train, X_test, y_train, y_test, n_features=5,
                                        n_estimators=100, random_state=42):
    """Select features on the training split, then fit and score a random forest.

    Steps: standardise the features (scaler fitted on the training split
    only), run ``HoneyBadgerFeatureSelection`` on the scaled training data,
    refit a fresh forest on the selected columns and score it on the test
    split.

    Args:
        X_train, X_test: feature tables with the same columns (DataFrame or
            2-D array).
        y_train, y_test: class labels; precision/recall/F1 use their default
            settings and AUC uses column 1 of ``predict_proba``.
        n_features: passed to the selector as ``n_features_to_select``.
        n_estimators: number of trees in both forests (search and final).
        random_state: seed for both forests.

    Returns:
        dict with keys ``model``, ``selected_features``, ``accuracy``,
        ``precision``, ``recall``, ``f1``, ``auc``, ``confusion_matrix``,
        ``selected_indices`` and ``feature_importance``.

    Raises:
        ValueError: if feature selection yields no usable feature subset.

    NOTE(review): ``random_state`` only seeds the forests; nothing in this
    function seeds the ``np.random`` stream.
    """
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initialize Random Forest used to score candidate subsets
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

    # Feature selection using training data
    print(f"Starting feature selection at: {time.time()}")
    hb = HoneyBadgerFeatureSelection(n_features_to_select=n_features)
    hb.fit(X_train_scaled, y_train, rf)

    # Fail early with a clear message instead of letting the forest choke on
    # a zero-column matrix further down.
    best_mask = hb.best_features
    if best_mask is None or not np.any(best_mask == 1):
        raise ValueError(
            "Feature selection did not produce any usable feature subset; "
            "check that the classifier can be fitted on the training data."
        )

    # Get selected features
    selected_features = np.where(best_mask == 1)[0]
    columns = getattr(X_train, 'columns', None)
    if columns is not None:
        selected_feature_names = columns[selected_features].tolist()
    else:
        # Plain arrays carry no column labels; fall back to positional names.
        selected_feature_names = [f"feature_{i}" for i in selected_features]

    # Train final model with selected features
    X_train_selected = X_train_scaled[:, selected_features]
    X_test_selected = X_test_scaled[:, selected_features]

    final_rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    final_rf.fit(X_train_selected, y_train)

    # Make predictions
    y_pred = final_rf.predict(X_test_selected)
    y_pred_proba = final_rf.predict_proba(X_test_selected)[:, 1]  # For AUC score

    # Calculate metrics
    metrics = {
        'model': final_rf,
        'selected_features': selected_feature_names,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, y_pred_proba),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'selected_indices': selected_features,
        'feature_importance': final_rf.feature_importances_
    }

    return metrics

if __name__ == "__main__":
    # Read the train/test split and encode the labels.
    X_train, X_test, y_train, y_test = load_and_preprocess_data()

    # Feature selection, final model fit and test-set scoring in one call.
    results = train_and_evaluate_defect_predictor(
        X_train, X_test, y_train, y_test, n_features=5
    )

    # Numbered list of the chosen features with the final model's importances.
    print("\nSelected Features:")
    for i, (feature, importance) in enumerate(
            zip(results['selected_features'], results['feature_importance']),
            start=1):
        print(f"{i}. {feature} (Importance: {importance:.4f})")

    # Headline scores; a single print with a newline separator emits the
    # same six lines as six separate prints.
    print(
        f"\nModel Performance Metrics:",
        f"Accuracy: {results['accuracy']:.4f}",
        f"Precision: {results['precision']:.4f}",
        f"Recall: {results['recall']:.4f}",
        f"F1-measure: {results['f1']:.4f}",
        f"AUC Score: {results['auc']:.4f}",
        sep="\n",
    )

    print("\nConfusion Matrix:", results['confusion_matrix'], sep="\n")

    # Same importances again, now next to each feature's column index in
    # the input table.
    print("\nFeature Importance Analysis:")
    for feature, idx, importance in zip(
            results['selected_features'],
            results['selected_indices'],
            results['feature_importance']):
        print(f"{feature}: Original index {idx}, Importance {importance:.4f}")

Starting feature selection at: 1737297586.5775292

Selected Features:
1. LOC_CODE_AND_COMMENT (Importance: 0.1798)
2. EDGE_COUNT (Importance: 0.2385)
3. HALSTEAD_CONTENT (Importance: 0.2815)
4. NUMBER_OF_LINES (Importance: 0.3002)

Model Performance Metrics:
Accuracy: 0.9216
Precision: 0.5714
Recall: 0.2353
F1-measure: 0.3333
AUC Score: 0.8989

Confusion Matrix:
[[184   3]
 [ 13   4]]

Feature Importance Analysis:
LOC_CODE_AND_COMMENT: Original index 1, Importance 0.1798
EDGE_COUNT: Original index 2, Importance 0.2385
HALSTEAD_CONTENT: Original index 3, Importance 0.2815
NUMBER_OF_LINES: Original index 5, Importance 0.3002


In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score, 
                           recall_score, f1_score, roc_auc_score)
import warnings
warnings.filterwarnings('ignore')
import time

class HoneyBadgerFeatureSelection:
    """Population-based wrapper feature selection over binary feature masks.

    Every candidate is a 0/1 vector with one entry per column of ``X``.  In
    each round all candidates are scored with ``fitness_function``, the
    best-scoring mask seen so far is remembered, and every candidate is then
    perturbed by ``honey_badger_movement``.

    Attributes set by ``fit``:
        best_features: best 0/1 mask found, or ``None`` if no candidate could
            be scored.
        best_fitness: fitness of ``best_features`` (``-inf`` if there is none).

    NOTE(review): ``n_features_to_select`` is only enforced as a lower bound
    on the number of active features in the *initial* population.  Later
    moves switch features on and off freely, so the final mask may hold more
    or fewer features than requested.
    """

    def __init__(self, n_features_to_select=5, max_iter=50, population_size=20):
        """Store the search settings.

        Args:
            n_features_to_select: minimum number of active features every
                candidate starts with (capped at the number of columns).
            max_iter: number of evaluate-then-move rounds run by ``fit``.
            population_size: number of candidate masks kept per round.
        """
        self.n_features = n_features_to_select
        self.max_iter = max_iter
        self.population_size = population_size
        self.best_features = None
        self.best_fitness = float('-inf')

    def fitness_function(self, selected_features, X, y, classifier):
        """Score one mask by fitting ``classifier`` on the selected columns.

        Returns the accuracy of the fitted classifier on the same rows it was
        trained on, or ``-inf`` when the mask selects nothing or the
        classifier raises.

        NOTE(review): the score is measured on the training rows themselves,
        so flexible models can score near-perfectly on almost any subset;
        consider a held-out or cross-validated score instead.
        """
        selected_features = np.asarray(selected_features)
        if np.sum(selected_features) == 0:
            return float('-inf')

        X_selected = X[:, selected_features == 1]
        try:
            classifier.fit(X_selected, y)
            y_pred = classifier.predict(X_selected)
            return accuracy_score(y, y_pred)
        except Exception:
            # Best-effort by design: a subset the classifier cannot handle is
            # simply ranked last.  Catching Exception (not a bare except)
            # lets Ctrl-C / SystemExit still abort the long-running search.
            return float('-inf')

    def honey_badger_movement(self, current_position, best_position):
        """Return a perturbed copy of ``current_position``.

        With probability 0.5 (or always, while no best mask exists yet) each
        bit is flipped with probability 0.3.  Otherwise each bit is replaced
        by the corresponding bit of ``best_position`` with probability 0.5.
        """
        current_position = np.asarray(current_position)
        # Without a best mask there is nothing to move towards; copying from
        # None would put None entries into the population.
        explore = best_position is None or np.random.random() < 0.5
        draws = np.random.random(len(current_position))
        if explore:
            return np.where(draws < 0.3, 1 - current_position, current_position)
        return np.where(draws < 0.5, best_position, current_position)

    def _init_population(self, n_total_features):
        """Draw random 0/1 masks and top each up to the minimum feature count."""
        population = np.random.randint(2, size=(self.population_size, n_total_features))
        # Cannot switch on more features than there are columns.
        minimum_active = min(self.n_features, n_total_features)
        for row in population:  # rows are views, so edits land in `population`
            missing = minimum_active - int(row.sum())
            if missing > 0:
                # Pick only among bits that are still 0; sampling from all
                # columns could hit already-active bits and leave the row short.
                inactive = np.flatnonzero(row == 0)
                row[np.random.choice(inactive, missing, replace=False)] = 1
        return population

    def fit(self, X, y, classifier):
        """Search for the best-scoring feature mask.

        Args:
            X: 2-D feature table (converted with ``np.asarray``).
            y: labels passed through to ``classifier``.
            classifier: object with ``fit(X, y)`` and ``predict(X)``; it is
                refitted for every candidate evaluation.

        Returns:
            ``self``, with ``best_features`` and ``best_fitness`` updated.
        """
        X = np.asarray(X)
        population = self._init_population(X.shape[1])

        global_best_position = None
        global_best_fitness = float('-inf')

        for _ in range(self.max_iter):
            fitness_values = np.array([
                self.fitness_function(position, X, y, classifier)
                for position in population
            ])

            # Strict '>' keeps the earliest mask on ties and never accepts -inf.
            current_best_idx = np.argmax(fitness_values)
            if fitness_values[current_best_idx] > global_best_fitness:
                global_best_fitness = fitness_values[current_best_idx]
                global_best_position = population[current_best_idx].copy()

            population = np.array([
                self.honey_badger_movement(position, global_best_position)
                for position in population
            ])

        self.best_features = global_best_position
        self.best_fitness = global_best_fitness
        return self

def load_and_preprocess_data(train_path='Training Data/PC3_FS_TrainData.csv',
                             test_path='Testing Data/PC3_FS_TestData.csv',
                             target_column='Defective'):
    """Load the train/test CSV files and split them into features and labels.

    The defaults reproduce the original hard-coded PC3 dataset, so calling
    the function with no arguments behaves exactly as before; other datasets
    can be loaded by passing different paths instead of copying the function.

    Args:
        train_path: CSV file holding the training rows.
        test_path: CSV file holding the testing rows.
        target_column: Name of the label column present in both files.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where the ``X`` values are
        DataFrames without the target column and the ``y`` values are the
        label-encoded targets.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Everything except the label column is treated as a feature.
    X_train = train_data.drop(target_column, axis=1)
    X_test = test_data.drop(target_column, axis=1)

    # The encoder is fitted on the training labels only and then reused for
    # the test labels so both splits share one label -> integer mapping.
    le = LabelEncoder()
    y_train = le.fit_transform(train_data[target_column])
    y_test = le.transform(test_data[target_column])

    return X_train, X_test, y_train, y_test

def train_and_evaluate_defect_predictor(X_train, X_test, y_train, y_test, n_features=5):
    """Select features on the training split, then train and score a forest.

    Args:
        X_train: Training features; must expose ``.columns`` (names are looked
            up from it).
        X_test: Testing features with the same columns as ``X_train``.
        y_train: Training labels.
        y_test: Testing labels.
        n_features: Forwarded to ``HoneyBadgerFeatureSelection`` as
            ``n_features_to_select``.

    Returns:
        dict with the fitted model, the selected feature names and indices,
        accuracy / precision / recall / F1 / AUC on the test split, the
        confusion matrix, and the model's feature importances.
    """
    # Standardise with statistics learned from the training split only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # This forest is handed to the selector, which uses it to score subsets.
    rf = RandomForestClassifier(n_estimators=100, random_state=42)

    print(f"Starting feature selection at: {time.time()}")
    hb = HoneyBadgerFeatureSelection(n_features_to_select=n_features)
    hb.fit(X_train_scaled, y_train, rf)

    # Turn the winning 0/1 mask into column positions and column names.
    selected_features = np.where(hb.best_features == 1)[0]
    selected_feature_names = X_train.columns[selected_features].tolist()

    # A fresh forest is trained on just the chosen columns.
    final_rf = RandomForestClassifier(n_estimators=100, random_state=42)
    final_rf.fit(X_train_scaled[:, selected_features], y_train)

    X_test_selected = X_test_scaled[:, selected_features]
    y_pred = final_rf.predict(X_test_selected)
    # Probability column at index 1 feeds the AUC computation.
    y_pred_proba = final_rf.predict_proba(X_test_selected)[:, 1]

    metrics = dict(
        model=final_rf,
        selected_features=selected_feature_names,
        accuracy=accuracy_score(y_test, y_pred),
        precision=precision_score(y_test, y_pred),
        recall=recall_score(y_test, y_pred),
        f1=f1_score(y_test, y_pred),
        auc=roc_auc_score(y_test, y_pred_proba),
        confusion_matrix=confusion_matrix(y_test, y_pred),
        selected_indices=selected_features,
        feature_importance=final_rf.feature_importances_,
    )
    return metrics

if __name__ == "__main__":
    # End-to-end run: load the split, select features, train, then report.
    X_train, X_test, y_train, y_test = load_and_preprocess_data()
    results = train_and_evaluate_defect_predictor(
        X_train, X_test, y_train, y_test, n_features=5
    )

    # Numbered list of the chosen features with their importance in the model.
    print("\nSelected Features:")
    name_importance_pairs = zip(results['selected_features'],
                                results['feature_importance'])
    for i, (feature, importance) in enumerate(name_importance_pairs, 1):
        print(f"{i}. {feature} (Importance: {importance:.4f})")

    # Scalar test-set scores, four decimals each.
    print(f"\nModel Performance Metrics:")
    print(f"Accuracy: {results['accuracy']:.4f}")
    print(f"Precision: {results['precision']:.4f}")
    print(f"Recall: {results['recall']:.4f}")
    print(f"F1-measure: {results['f1']:.4f}")
    print(f"AUC Score: {results['auc']:.4f}")

    print("\nConfusion Matrix:")
    print(results['confusion_matrix'])

    # Same features again, this time with their column position in the input.
    print("\nFeature Importance Analysis:")
    per_feature_rows = zip(
        results['selected_features'],
        results['selected_indices'],
        results['feature_importance'],
    )
    for feature, idx, importance in per_feature_rows:
        print(f"{feature}: Original index {idx}, Importance {importance:.4f}")

Starting feature selection at: 1737297874.7572331

Selected Features:
1. LOC_BLANK (Importance: 0.2189)
2. LOC_CODE_AND_COMMENT (Importance: 0.1092)
3. HALSTEAD_CONTENT (Importance: 0.2477)
4. NUM_OPERANDS (Importance: 0.1908)
5. PERCENT_COMMENTS (Importance: 0.2335)

Model Performance Metrics:
Accuracy: 0.8861
Precision: 0.6000
Recall: 0.2308
F1-measure: 0.3333
AUC Score: 0.7814

Confusion Matrix:
[[271   6]
 [ 30   9]]

Feature Importance Analysis:
LOC_BLANK: Original index 0, Importance 0.2189
LOC_CODE_AND_COMMENT: Original index 1, Importance 0.1092
HALSTEAD_CONTENT: Original index 2, Importance 0.2477
NUM_OPERANDS: Original index 4, Importance 0.1908
PERCENT_COMMENTS: Original index 8, Importance 0.2335


In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score, 
                           recall_score, f1_score, roc_auc_score)
import warnings
warnings.filterwarnings('ignore')
import time

class HoneyBadgerFeatureSelection:
    """Wrapper-style feature selector using a stochastic binary population search.

    Each candidate is a 0/1 mask over the columns of ``X``. Candidates are
    scored by fitting the supplied classifier on the masked columns, and are
    moved each generation by random bit flips or by copying bits from the
    best mask seen so far.

    Attributes set by ``fit``:
        best_features: best 0/1 mask found, or ``None`` if no candidate ever
            obtained a finite score.
        best_fitness: score of ``best_features`` (``-inf`` if none).
    """

    def __init__(self, n_features_to_select=5, max_iter=50, population_size=20):
        """Store the search settings.

        Args:
            n_features_to_select: Minimum number of features every individual
                of the *initial* population selects. Later moves are not
                constrained, so the final mask may select more or fewer.
            max_iter: Number of generations to run.
            population_size: Number of candidate masks per generation.
        """
        self.n_features = n_features_to_select
        self.max_iter = max_iter
        self.population_size = population_size
        self.best_features = None
        self.best_fitness = float('-inf')

    def fitness_function(self, selected_features, X, y, classifier):
        """Score one feature mask.

        The classifier is fitted on the selected columns and its accuracy is
        measured on the very same rows it was fitted on.

        NOTE(review): because fit and predict use the same data, this is a
        resubstitution score rather than a held-out one; confirm that this is
        the intended objective.

        Args:
            selected_features: 1-D 0/1 mask over the columns of ``X``.
            X: 2-D feature array.
            y: Target values.
            classifier: Object with ``fit(X, y)`` and ``predict(X)``.

        Returns:
            The accuracy, or ``-inf`` for an empty mask or when fitting or
            predicting fails.
        """
        if np.sum(selected_features) == 0:
            return float('-inf')

        X_selected = X[:, selected_features == 1]
        try:
            classifier.fit(X_selected, y)
            y_pred = classifier.predict(X_selected)
            return accuracy_score(y, y_pred)
        except Exception:
            # Best effort: a subset the classifier cannot handle is simply
            # ranked last. ``Exception`` (not a bare ``except``) is used so
            # that KeyboardInterrupt / SystemExit still stop a long search.
            return float('-inf')

    def honey_badger_movement(self, current_position, best_position):
        """Return the next position for one individual.

        With probability 0.5 each bit of ``current_position`` is flipped
        independently with probability 0.3; otherwise each bit is replaced by
        the matching bit of ``best_position`` independently with probability
        0.5. The inputs are not modified.
        """
        r = np.random.random()
        per_bit_draw = np.random.random(len(current_position))
        if r < 0.5:
            return np.where(per_bit_draw < 0.3,
                            1 - current_position,
                            current_position)
        return np.where(per_bit_draw < 0.5,
                        best_position,
                        current_position)

    def fit(self, X, y, classifier):
        """Run the population search and record the best feature mask found.

        Args:
            X: 2-D array indexed as ``X[:, mask]``; columns are features.
            y: Target values forwarded to ``fitness_function``.
            classifier: Estimator forwarded to ``fitness_function``.

        Returns:
            ``self``, with ``best_features`` and ``best_fitness`` updated.
        """
        n_total_features = X.shape[1]
        # A mask cannot select more features than exist.
        min_selected = min(self.n_features, n_total_features)

        population = np.random.randint(2, size=(self.population_size, n_total_features))
        for individual in population:  # rows are views, so edits write through
            shortfall = min_selected - int(individual.sum())
            if shortfall > 0:
                # Sample only among positions that are still 0; sampling from
                # every index could land on bits that are already set and
                # leave the individual below the requested minimum.
                unset = np.flatnonzero(individual == 0)
                individual[np.random.choice(unset, shortfall, replace=False)] = 1

        global_best_position = None
        global_best_fitness = float('-inf')

        for _ in range(self.max_iter):
            fitness_values = np.array([
                self.fitness_function(position, X, y, classifier)
                for position in population
            ])

            current_best_idx = np.argmax(fitness_values)
            if fitness_values[current_best_idx] > global_best_fitness:
                global_best_fitness = fitness_values[current_best_idx]
                global_best_position = population[current_best_idx].copy()

            # Until some candidate has a finite score there is no leader to
            # follow. Each individual then acts as its own attractor so the
            # population stays a valid 0/1 array instead of being filled
            # with None.
            population = np.array([
                self.honey_badger_movement(
                    position,
                    position if global_best_position is None else global_best_position,
                )
                for position in population
            ])

        self.best_features = global_best_position
        self.best_fitness = global_best_fitness
        return self

def load_and_preprocess_data(train_path='Training Data/PC4_FS_TrainData.csv',
                             test_path='Testing Data/PC4_FS_TestData.csv',
                             target_column='Defective'):
    """Load the train/test CSV files and split them into features and labels.

    The defaults reproduce the original hard-coded PC4 dataset, so calling
    the function with no arguments behaves exactly as before; other datasets
    can be loaded by passing different paths instead of copying the function.

    Args:
        train_path: CSV file holding the training rows.
        test_path: CSV file holding the testing rows.
        target_column: Name of the label column present in both files.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where the ``X`` values are
        DataFrames without the target column and the ``y`` values are the
        label-encoded targets.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Everything except the label column is treated as a feature.
    X_train = train_data.drop(target_column, axis=1)
    X_test = test_data.drop(target_column, axis=1)

    # The encoder is fitted on the training labels only and then reused for
    # the test labels so both splits share one label -> integer mapping.
    le = LabelEncoder()
    y_train = le.fit_transform(train_data[target_column])
    y_test = le.transform(test_data[target_column])

    return X_train, X_test, y_train, y_test

def train_and_evaluate_defect_predictor(X_train, X_test, y_train, y_test, n_features=5):
    """Select features on the training split, then train and score a forest.

    Args:
        X_train: Training features; must expose ``.columns`` (names are looked
            up from it).
        X_test: Testing features with the same columns as ``X_train``.
        y_train: Training labels.
        y_test: Testing labels.
        n_features: Forwarded to ``HoneyBadgerFeatureSelection`` as
            ``n_features_to_select``.

    Returns:
        dict with the fitted model, the selected feature names and indices,
        accuracy / precision / recall / F1 / AUC on the test split, the
        confusion matrix, and the model's feature importances.
    """
    # Standardise with statistics learned from the training split only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # This forest is handed to the selector, which uses it to score subsets.
    rf = RandomForestClassifier(n_estimators=100, random_state=42)

    print(f"Starting feature selection at: {time.time()}")
    hb = HoneyBadgerFeatureSelection(n_features_to_select=n_features)
    hb.fit(X_train_scaled, y_train, rf)

    # Turn the winning 0/1 mask into column positions and column names.
    selected_features = np.where(hb.best_features == 1)[0]
    selected_feature_names = X_train.columns[selected_features].tolist()

    # A fresh forest is trained on just the chosen columns.
    final_rf = RandomForestClassifier(n_estimators=100, random_state=42)
    final_rf.fit(X_train_scaled[:, selected_features], y_train)

    X_test_selected = X_test_scaled[:, selected_features]
    y_pred = final_rf.predict(X_test_selected)
    # Probability column at index 1 feeds the AUC computation.
    y_pred_proba = final_rf.predict_proba(X_test_selected)[:, 1]

    metrics = dict(
        model=final_rf,
        selected_features=selected_feature_names,
        accuracy=accuracy_score(y_test, y_pred),
        precision=precision_score(y_test, y_pred),
        recall=recall_score(y_test, y_pred),
        f1=f1_score(y_test, y_pred),
        auc=roc_auc_score(y_test, y_pred_proba),
        confusion_matrix=confusion_matrix(y_test, y_pred),
        selected_indices=selected_features,
        feature_importance=final_rf.feature_importances_,
    )
    return metrics

if __name__ == "__main__":
    # End-to-end run: load the split, select features, train, then report.
    X_train, X_test, y_train, y_test = load_and_preprocess_data()
    results = train_and_evaluate_defect_predictor(
        X_train, X_test, y_train, y_test, n_features=5
    )

    # Numbered list of the chosen features with their importance in the model.
    print("\nSelected Features:")
    name_importance_pairs = zip(results['selected_features'],
                                results['feature_importance'])
    for i, (feature, importance) in enumerate(name_importance_pairs, 1):
        print(f"{i}. {feature} (Importance: {importance:.4f})")

    # Scalar test-set scores, four decimals each.
    print(f"\nModel Performance Metrics:")
    print(f"Accuracy: {results['accuracy']:.4f}")
    print(f"Precision: {results['precision']:.4f}")
    print(f"Recall: {results['recall']:.4f}")
    print(f"F1-measure: {results['f1']:.4f}")
    print(f"AUC Score: {results['auc']:.4f}")

    print("\nConfusion Matrix:")
    print(results['confusion_matrix'])

    # Same features again, this time with their column position in the input.
    print("\nFeature Importance Analysis:")
    per_feature_rows = zip(
        results['selected_features'],
        results['selected_indices'],
        results['feature_importance'],
    )
    for feature, idx, importance in per_feature_rows:
        print(f"{feature}: Original index {idx}, Importance {importance:.4f}")

Starting feature selection at: 1737298210.4255514

Selected Features:
1. LOC_CODE_AND_COMMENT (Importance: 0.2584)
2. ESSENTIAL_COMPLEXITY (Importance: 0.0462)
3. HALSTEAD_LENGTH (Importance: 0.1781)
4. MODIFIED_CONDITION_COUNT (Importance: 0.0784)
5. MULTIPLE_CONDITION_COUNT (Importance: 0.0940)
6. NORMALIZED_CYLOMATIC_COMPLEXITY (Importance: 0.1306)
7. PERCENT_COMMENTS (Importance: 0.2142)

Model Performance Metrics:
Accuracy: 0.8819
Precision: 0.6053
Recall: 0.4340
F1-measure: 0.5055
AUC Score: 0.9015

Confusion Matrix:
[[313  15]
 [ 30  23]]

Feature Importance Analysis:
LOC_CODE_AND_COMMENT: Original index 1, Importance 0.2584
ESSENTIAL_COMPLEXITY: Original index 3, Importance 0.0462
HALSTEAD_LENGTH: Original index 4, Importance 0.1781
MODIFIED_CONDITION_COUNT: Original index 5, Importance 0.0784
MULTIPLE_CONDITION_COUNT: Original index 6, Importance 0.0940
NORMALIZED_CYLOMATIC_COMPLEXITY: Original index 7, Importance 0.1306
PERCENT_COMMENTS: Original index 8, Importance 0.2142
