In [1]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt

In [2]:
# Step 1: Load Data
def load_data(filepath):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, using ``read_csv`` defaults.
    """
    return pd.read_csv(filepath)

In [3]:
# Step 2: Preprocessing
def preprocess_data(data):
    """Turn a raw dataset into a scaled numeric feature frame and encoded labels.

    Steps, in order:

    1. Drop rows whose ``'Label'`` is missing.
    2. Drop the ``'Attack Type'`` column if it is present.
    3. Split ``'Label'`` off, then keep only the numeric feature columns.
    4. Fill missing feature values with the column mean.
    5. Encode the labels as integers with ``LabelEncoder``.
    6. Standardize the features with ``StandardScaler``.

    The input frame is not modified; all work happens on derived copies.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw dataset. Must contain a ``'Label'`` column (numeric or not).

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The scaled features and the integer-encoded labels (named
        ``'Label'``). Both carry the same index - the index of the rows that
        survived step 1 - so boolean masks built from the labels line up with
        the feature rows.

    Raises
    ------
    KeyError
        If ``data`` has no ``'Label'`` column.
    """
    if 'Label' not in data.columns:
        raise KeyError("The 'Label' column is missing from the dataset.")

    # Non-inplace calls return new frames, so the caller's DataFrame is untouched.
    cleaned = (
        data
        .dropna(subset=['Label'])
        .drop(columns=['Attack Type'], errors='ignore')
    )

    # Separate the target BEFORE filtering to numeric dtypes: a string-valued
    # 'Label' would otherwise be discarded together with the other non-numeric
    # columns and could no longer be encoded.
    labels = cleaned['Label']
    features = cleaned.drop(columns=['Label']).select_dtypes(include=[np.number])

    # Impute missing feature values with the per-column mean.
    features = features.fillna(features.mean())

    # Label encoding for binary classification
    encoded_labels = pd.Series(
        LabelEncoder().fit_transform(labels),
        index=labels.index,
        name='Label',
    )

    # Scaling
    scaled_features = StandardScaler().fit_transform(features)

    # Reuse the surviving rows' index so features and labels stay aligned even
    # when rows were dropped in the first step.
    scaled_frame = pd.DataFrame(
        scaled_features, columns=features.columns, index=features.index
    )
    return scaled_frame, encoded_labels

In [8]:
class HiddenNaiveBayes:
    """Naive Bayes classifier with a per-class Gaussian likelihood per feature.

    NOTE(review): despite the class name, no hidden-parent structure is
    modelled here; each feature is treated as conditionally independent given
    the class.

    Attributes
    ----------
    class_priors : dict
        Maps each class label seen in ``fit`` to its relative frequency.
    conditional_probs : dict
        Maps each class label to ``{feature_index: (mean, variance)}``, the
        Gaussian parameters estimated from that class's training rows.
    """

    # Added to every variance so a feature that is constant within a class
    # does not cause a division by zero in the likelihood.
    _VAR_FLOOR = 1e-9

    def __init__(self):
        self.class_priors = {}
        self.conditional_probs = {}

    def fit(self, X, y):
        """Estimate class priors and per-class Gaussian parameters.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric training features (DataFrame or array).
        y : array-like of shape (n_samples,)
            Class label for each row of ``X``, matched by position.

        Returns
        -------
        HiddenNaiveBayes
            ``self``, to allow call chaining.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``y``.
        """
        X = np.asarray(X, dtype=float)
        # Rows are matched by position, so pandas index labels on X and y
        # do not need to agree.
        y = np.asarray(y).ravel()

        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features).")
        num_samples, num_features = X.shape
        if num_samples == 0:
            raise ValueError("Cannot fit on an empty dataset.")
        if y.shape[0] != num_samples:
            raise ValueError("X and y must contain the same number of samples.")

        classes = np.unique(y)

        # Calculate class priors
        self.class_priors = {c: np.sum(y == c) / num_samples for c in classes}

        # Calculate per-class, per-feature Gaussian parameters
        self.conditional_probs = {}
        for c in classes:
            class_samples = X[y == c]
            means = class_samples.mean(axis=0)
            variances = class_samples.var(axis=0) + self._VAR_FLOOR
            self.conditional_probs[c] = {
                i: (means[i], variances[i]) for i in range(num_features)
            }

        return self

    def predict(self, X):
        """Predict the most probable class for each row of ``X``.

        For every class the log prior is added to the sum of the per-feature
        Gaussian log-likelihoods of the row; the class with the largest total
        wins. Ties go to the class that comes first in ``class_priors``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric features with the same column count used in ``fit``.

        Returns
        -------
        numpy.ndarray
            Predicted class label for each row, in input order.

        Raises
        ------
        ValueError
            If the model has not been fitted, ``X`` is not 2-D, or its
            feature count differs from the fitted one.
        """
        if not self.class_priors:
            raise ValueError("The model must be fitted before calling predict.")

        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features).")

        classes = list(self.class_priors)
        num_features = X.shape[1]
        log_posteriors = np.empty((X.shape[0], len(classes)))

        for k, c in enumerate(classes):
            params = self.conditional_probs[c]
            if len(params) != num_features:
                raise ValueError(
                    "X has a different number of features than the fitted model."
                )
            means = np.array([params[i][0] for i in range(num_features)])
            variances = np.array([params[i][1] for i in range(num_features)])

            # Log of the Gaussian density, evaluated for every row and feature.
            log_likelihood = -0.5 * (
                np.log(2.0 * np.pi * variances) + (X - means) ** 2 / variances
            )
            log_posteriors[:, k] = (
                np.log(self.class_priors[c]) + log_likelihood.sum(axis=1)
            )

        # argmax returns the first maximum, i.e. the earliest class on ties.
        return np.array(classes)[np.argmax(log_posteriors, axis=1)]

In [9]:
# Default dataset location. This is a machine-specific absolute path; pass a
# different ``filepath`` to ``main`` when running elsewhere.
DEFAULT_DATA_PATH = r"C:\Users\HP\Documents\GitHub\intrusion-detection-fewshot-vs-traditional\cleaned_dataset.csv"


def main(filepath=DEFAULT_DATA_PATH):
    """Load the dataset and hand it back to the caller.

    Parameters
    ----------
    filepath : str or path-like, optional
        CSV file to load. Defaults to ``DEFAULT_DATA_PATH``.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``load_data(filepath)``.
    """
    data = load_data(filepath)
    return data


if __name__ == "__main__":
    main()