# ***0. Data Loading***

In [None]:
import pandas as pd

# Load the coursework training split into the module-level frame `df`,
# which the exploratory cells below read directly.
df = pd.read_csv("UNSWNB15_training_coursework.csv")

# Notebook display: preview the first rows.
df.head()

In [None]:
# Print the frame summary produced by DataFrame.info().
df.info()

In [None]:
# Print the descriptive statistics produced by DataFrame.describe().
print(df.describe())

# ***1. Data Pre-Processing (Task 1)***

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Treat every column with fewer than 25 distinct values as categorical,
# then list the matches in column order.
categori_col = [name for name in df.columns if df[name].nunique() < 25]
for name in categori_col:
    print(name)

In [None]:
# Dump the value distribution of each low-cardinality column.
for column_name in categori_col:
    print(f"Column: {column_name}")
    distribution = df[column_name].value_counts()
    print(distribution)
    print("=====")

In [None]:
# Frequency of each value in the 'proto' column.
df["proto"].value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Columns that are expanded into indicator columns by one_hot_encode below.
categorical_cols = ['service', 'state']

def one_hot_encode(df, categorical_cols):
    """
    Replace each listed categorical column with one indicator column per value
    df: input DataFrame (left unmodified; a copy is encoded)
    categorical_cols: names of the columns to expand; names that are not
                      present in df are skipped instead of raising KeyError
    Returns: new DataFrame where every encoded column is dropped and its
             indicator columns ("<column>_<value>") are appended at the end
    """
    df_encoded = df.copy()

    for col in categorical_cols:
        # A frame lacking one of the columns must not abort the whole
        # preprocessing step.
        if col not in df_encoded.columns:
            continue
        dummies = pd.get_dummies(df_encoded[col], prefix=col, drop_first=False)
        df_encoded = pd.concat([df_encoded.drop(columns=col), dummies], axis=1)

    return df_encoded

# Encode the 'service' and 'state' columns and print the resulting frame summary.
df_encoded = one_hot_encode(df, categorical_cols)
df_encoded.info()

In [None]:
import numpy as np
import pandas as pd

# 1. Identify the most common protocols and group the rest
def encode_protocol(df, column='proto', top_n=10):
    """
    One-hot encode a high-cardinality column, keeping only its most frequent values
    df: input DataFrame (left unmodified)
    column: name of the column to encode; if it is absent an unchanged copy
            of df is returned
    top_n: number of most frequent values that keep their own indicator column;
           every other value (including missing values) is grouped as 'other'
    Returns: new DataFrame without `column` and with the indicator columns
             ("<column>_<value>") appended at the end
    """
    if column not in df.columns:
        return df.copy()

    # The top_n most frequent values are taken from df itself.
    top_values = df[column].value_counts().head(top_n).index

    # Vectorized replacement of every non-top value with 'other'
    grouped = df[column].where(df[column].isin(top_values), 'other')

    # One-hot encode the grouped column
    proto_encoded = pd.get_dummies(grouped, prefix=column, drop_first=False)

    # Swap the original column for its indicator columns
    return pd.concat([df.drop(columns=column), proto_encoded], axis=1)

# Apply the protocol encoding on top of the already one-hot-encoded frame
df_encoded = encode_protocol(df_encoded, top_n=10)

# List the generated indicator columns and the resulting frame shape
proto_columns = [col for col in df_encoded.columns if col.startswith('proto_')]
print(f"Protocol encoded columns: {proto_columns}")
print(f"Shape after encoding proto: {df_encoded.shape}")

In [None]:
def check_feature_correlations(df, target='label'):
    """
    Correlate every numeric/boolean column with the target and print the extremes
    df: DataFrame that contains the target column
    target: name of the (numeric) target column
    Returns: Series of correlations with the target, sorted in descending order
    """
    # Restrict to numeric and boolean columns so a leftover text column
    # cannot make corr() raise.
    numeric = df.select_dtypes(include=['number', 'bool'])
    correlations = numeric.corr()[target].sort_values(ascending=False)

    print("Top positive correlations with target:")
    print(correlations[correlations > 0].head(10))
    print("\nTop negative correlations with target:")
    # The series is sorted descending, so the strongest negative values are at
    # the tail; re-sort ascending to list the most negative first.
    print(correlations[correlations < 0].sort_values().head(10))

    return correlations

# Module-level correlation series; main(center) reads it for feature selection.
target_correlations = check_feature_correlations(df_encoded)

In [None]:
# Class balance of the target column.
df["label"].value_counts()

In [None]:
import numpy as np
import pandas as pd
import math
from collections import defaultdict
import random
import time

def standardize_features(df, exclude_cols=None):
    """
    Standardize features to have zero mean and unit variance
    df: input DataFrame (left unmodified)
    exclude_cols: list of columns to exclude from standardization (e.g., target variable)
    Returns: (standardized copy of df,
              {column: {'mean': ..., 'std': ...}} for every standardized column)
    """
    if exclude_cols is None:
        exclude_cols = []

    df_std = df.copy()
    scaler_params = {}

    for col in df.columns:
        if col in exclude_cols:
            continue

        # Compute the statistics once and reuse them for both the scaling and
        # the stored parameters (needed later to scale test data identically).
        mean = df[col].mean()
        std = df[col].std()
        scaler_params[col] = {'mean': mean, 'std': std}

        # A zero (or undefined) std would divide by zero; such a column carries
        # no information, so it is set to 0.
        if std > 0:
            df_std[col] = (df[col] - mean) / std
        else:
            df_std[col] = 0

    return df_std, scaler_params

In [None]:
def train_validation_split(df, val_ratio=0.2, random_state=87):
    """
    Shuffle the rows and cut them into a training part and a validation part
    df: DataFrame to split
    val_ratio: fraction of the rows that goes to the validation part
    random_state: seed applied to NumPy's global random generator
    Returns: (train_df, val_df), both with a fresh 0..n-1 index
    """
    # Seeding the global generator makes the shuffle reproducible (and leaves
    # the global generator in a seeded state afterwards).
    np.random.seed(random_state)

    total_rows = len(df)
    order = np.arange(total_rows)
    np.random.shuffle(order)

    # The first n_val shuffled positions form the validation part.
    n_val = int(val_ratio * total_rows)
    val_part = df.iloc[order[:n_val]]
    train_part = df.iloc[order[n_val:]]

    return train_part.reset_index(drop=True), val_part.reset_index(drop=True)

In [None]:
class RBFN:
    """
    Radial basis function network for binary (0/1) classification.

    Centers are found with K-means, hidden activations are Gaussians sharing
    one width (sigma), and the linear output layer (weights + bias) is fitted
    in closed form by ridge-regularized least squares against the targets.
    """

    def __init__(self, num_centers, sigma=1.0):
        """
        Initialize RBFN
        num_centers: number of RBF neurons/centers
        sigma: width parameter for RBF neurons
        """
        self.num_centers = num_centers
        self.sigma = sigma
        self.centers = None
        self.weights = None
        self.bias = None

    def _kmeans(self, X, max_iters=200, tol=1e-4):
        """
        Implement K-means algorithm for finding centers
        X: samples [n_samples, n_features]
        max_iters: maximum number of assignment/update rounds
        tol: stop once the summed squared movement of all centers is below this
        Returns: centers [num_centers, n_features]
        """
        # Work in floating point: with an integer X the centroid updates below
        # would be truncated when written back into the centers array.
        X = np.asarray(X, dtype=float)
        n_samples = X.shape[0]

        # Initialize centers by randomly selecting distinct data points
        # (uses NumPy's global random generator).
        random_indices = np.random.choice(n_samples, self.num_centers, replace=False)
        centers = X[random_indices].copy()

        for _ in range(max_iters):
            # Squared Euclidean distance of every sample to every center
            distances = np.empty((n_samples, self.num_centers))
            for i, center in enumerate(centers):
                diff = X - center
                distances[:, i] = np.sum(diff * diff, axis=1)

            cluster_assignments = np.argmin(distances, axis=1)
            old_centers = centers.copy()

            # Move each center to the mean of its cluster; an empty cluster
            # keeps its previous center.
            for i in range(self.num_centers):
                cluster_points = X[cluster_assignments == i]
                if len(cluster_points) > 0:
                    centers[i] = np.mean(cluster_points, axis=0)

            # Check for convergence
            if np.sum((centers - old_centers) ** 2) < tol:
                break

        return centers

    def _calculate_sigma(self, centers):
        """
        Calculate sigma based on average distance between centers
        centers: array [n_centers, n_features]
        Returns: mean nearest-center distance / sqrt(2 * num_centers), floored
                 at 1e-10; the current self.sigma if no positive distance exists
        """
        n_centers = centers.shape[0]
        distances = np.zeros((n_centers, n_centers))

        # Pairwise Euclidean distances between centers (symmetric matrix)
        for i in range(n_centers):
            for j in range(i + 1, n_centers):
                dist = np.sqrt(np.sum((centers[i] - centers[j]) ** 2))
                distances[i, j] = distances[j, i] = dist

        # Distance from each center to its closest distinct center
        min_distances = []
        for i in range(n_centers):
            # Ignore the self-distance and coinciding centers (distance 0)
            center_distances = [d for j, d in enumerate(distances[i]) if j != i and d > 0]
            if center_distances:
                min_distances.append(min(center_distances))

        # If can't calculate, use default value
        if not min_distances:
            return self.sigma

        sigma = np.mean(min_distances) / math.sqrt(2 * self.num_centers)
        return max(sigma, 1e-10)  # Avoid too small sigma

    def _rbf(self, x, center):
        """
        Apply Gaussian RBF to a data point
        """
        dist = np.sum((x - center) ** 2)
        return np.exp(-dist / (2 * self.sigma ** 2))

    def _calculate_interpolation_matrix(self, X):
        """
        Calculate the interpolation matrix (RBF outputs)
        X: samples [n_samples, n_features]
        Returns: G [n_samples, num_centers] with G[i, j] = rbf(X[i], centers[j])
        """
        X = np.asarray(X, dtype=float)
        G = np.empty((X.shape[0], self.num_centers))
        denom = 2 * self.sigma ** 2

        # One vectorized pass per center instead of one Python call per
        # (sample, center) pair.
        for j in range(self.num_centers):
            diff = X - self.centers[j]
            G[:, j] = np.exp(-np.sum(diff * diff, axis=1) / denom)

        return G

    def _raw_output(self, X):
        """
        Linear output of the network (before thresholding / squashing)
        Raises: RuntimeError if the model has not been fitted
        """
        if self.centers is None or self.weights is None:
            raise RuntimeError("RBFN must be fitted before it can predict")
        G = self._calculate_interpolation_matrix(X)
        return np.dot(G, self.weights) + self.bias

    def fit(self, X, y):
        """
        Train the RBFN
        X: input features [n_samples, n_features]
        y: target values [n_samples]
        """
        X = np.asarray(X, dtype=float)
        y = np.array(y)

        # Find centers using K-means
        print("Finding centers using K-means...")
        self.centers = self._kmeans(X)

        # Calculate interpolation matrix
        print("Calculating interpolation matrix...")
        G = self._calculate_interpolation_matrix(X)

        # Add a column of ones for bias
        G_with_bias = np.column_stack((G, np.ones(X.shape[0])))

        # Solve the ridge-regularized normal equations
        #   (G^T G + 1e-6 I) w = G^T y
        # with a linear solve rather than an explicit matrix inverse.
        print("Solving for weights...")
        GTG = G_with_bias.T @ G_with_bias
        regularized = GTG + np.eye(GTG.shape[0]) * 1e-6
        try:
            solution = np.linalg.solve(regularized, G_with_bias.T @ y)
        except np.linalg.LinAlgError:
            # Fall back to a least-squares solution instead of leaving the
            # model with meaningless weights.
            print("Warning: regularized normal equations are singular; "
                  "falling back to a least-squares solution.")
            solution = np.linalg.lstsq(G_with_bias, y, rcond=None)[0]

        # The last coefficient belongs to the column of ones (bias)
        self.weights = solution[:-1]
        self.bias = solution[-1]

        # Calculate training error
        y_pred = self.predict(X)
        accuracy = np.mean(y_pred == y)
        print(f"Training accuracy: {accuracy:.4f}")

    def predict(self, X):
        """
        Make predictions using the trained RBFN
        X: input features [n_samples, n_features]
        Returns: predicted classes [n_samples] (1 where the raw output exceeds 0.5)
        """
        return (self._raw_output(X) > 0.5).astype(int)

    def predict_proba(self, X):
        """
        Predict probability estimates
        X: input features [n_samples, n_features]
        Returns: sigmoid of the raw network output [n_samples]

        NOTE(review): predict() thresholds the raw output at 0.5, which
        corresponds to sigmoid(0.5) ~= 0.62 here, so a 0.5 cut on these values
        is not the same decision boundary as predict().
        """
        return 1 / (1 + np.exp(-self._raw_output(X)))

In [None]:
def evaluate_model(y_true, y_pred):
    """
    Calculate performance metrics for binary (0/1) labels
    y_true: true labels (array, list or Series)
    y_pred: predicted labels, same number of elements as y_true
    Returns: dict with 'accuracy', 'precision', 'recall', 'f1_score',
             'balanced_accuracy' and 'confusion_matrix' (tp/tn/fp/fn counts)
    Raises: ValueError if the two inputs differ in number of elements
    """
    # Coerce to flat arrays: comparing two plain lists with == yields a single
    # bool, and (n,) vs (n, 1) arrays would broadcast to an n x n comparison.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}"
        )

    # Accuracy
    accuracy = np.mean(y_true == y_pred)

    # Confusion matrix elements
    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))

    # Ratios fall back to 0 when their denominator is empty
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    # Balanced accuracy: mean of the recall on each class
    balanced_acc = (recall + specificity) / 2

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'balanced_accuracy': balanced_acc,
        'confusion_matrix': {
            'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn
        }
    }

In [None]:
def evaluate_model(y_true, y_pred):
    """
    Calculate performance metrics for binary (0/1) labels
    y_true: true labels (array, list or Series)
    y_pred: predicted labels, same number of elements as y_true
    Returns: dict with 'accuracy', 'precision', 'recall', 'f1_score',
             'balanced_accuracy' and 'confusion_matrix' (tp/tn/fp/fn counts)
    Raises: ValueError if the two inputs differ in number of elements
    """
    # Coerce to flat arrays: comparing two plain lists with == yields a single
    # bool, and (n,) vs (n, 1) arrays would broadcast to an n x n comparison.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}"
        )

    # Accuracy
    accuracy = np.mean(y_true == y_pred)

    # Confusion matrix elements
    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))

    # Ratios fall back to 0 when their denominator is empty
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    # Balanced accuracy: mean of the recall on each class
    balanced_acc = (recall + specificity) / 2

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'balanced_accuracy': balanced_acc,
        'confusion_matrix': {
            'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn
        }
    }

In [None]:
def evaluate_model(y_true, y_pred):
    """
    Calculate performance metrics for binary (0/1) labels
    y_true: true labels (array, list or Series)
    y_pred: predicted labels, same number of elements as y_true
    Returns: dict with 'accuracy', 'precision', 'recall', 'f1_score',
             'balanced_accuracy' and 'confusion_matrix' (tp/tn/fp/fn counts)
    Raises: ValueError if the two inputs differ in number of elements
    """
    # Coerce to flat arrays: comparing two plain lists with == yields a single
    # bool, and (n,) vs (n, 1) arrays would broadcast to an n x n comparison.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}"
        )

    # Accuracy
    accuracy = np.mean(y_true == y_pred)

    # Confusion matrix elements
    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))

    # Ratios fall back to 0 when their denominator is empty
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    # Balanced accuracy: mean of the recall on each class
    balanced_acc = (recall + specificity) / 2

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'balanced_accuracy': balanced_acc,
        'confusion_matrix': {
            'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn
        }
    }

In [None]:
def grid_search_rbfn(X_train, y_train, X_val, y_val, param_grid):
    """
    Perform grid search to find optimal hyperparameters for RBFN

    Parameters:
    -----------
    X_train, y_train: Training data and labels
    X_val, y_val: Validation data and labels
    param_grid: Dictionary with hyperparameters to search
                Example: {'num_centers': [10, 30, 50], 'sigma': [0.1, 1.0, 5.0]}
                'num_centers' is required for a combination to train;
                'sigma' defaults to 1.0 when absent.

    Returns:
    --------
    best_params: Dictionary with best parameters (None if no combination trained)
    best_model: Trained model with best parameters (None if no combination trained)
    results: List with one dict per combination: metrics and training time on
             success, or the error message on failure
    """
    import itertools  # function-scope import: only needed to build the grid

    print("\nStarting grid search for RBFN hyperparameters...")

    # Initialize tracking variables
    best_accuracy = 0.0
    best_params = None
    best_model = None
    results = []

    # Cartesian product of all value lists; the first key varies slowest
    param_names = list(param_grid.keys())
    param_combinations = [
        dict(zip(param_names, values))
        for values in itertools.product(*(param_grid[name] for name in param_names))
    ]

    total_combinations = len(param_combinations)
    print(f"Testing {total_combinations} parameter combinations")

    # Loop through all parameter combinations
    for i, params in enumerate(param_combinations):
        print(f"\nTesting combination {i+1}/{total_combinations}:")
        print(f"Parameters: {params}")

        # One failing combination must not abort the whole search, so any
        # error is recorded in the results and the loop continues.
        try:
            start_time = time.time()

            # Initialize model with current parameters
            model = RBFN(num_centers=params['num_centers'], sigma=params.get('sigma', 1.0))

            # Train the model
            model.fit(X_train, y_train)

            # Evaluate on validation set
            y_val_pred = model.predict(X_val)
            val_metrics = evaluate_model(y_val, y_val_pred)
            accuracy = val_metrics['accuracy']

            # Record results
            train_time = time.time() - start_time
            result = {
                'params': params.copy(),
                'accuracy': accuracy,
                'precision': val_metrics['precision'],
                'recall': val_metrics['recall'],
                'f1_score': val_metrics['f1_score'],
                'balanced_accuracy': val_metrics['balanced_accuracy'],
                'training_time': train_time
            }
            results.append(result)

            print(f"Accuracy: {accuracy:.4f}, Training time: {train_time:.2f}s")

            # Update best if improved. The first successful model is always
            # accepted, so a model scoring exactly 0.0 still yields a result
            # instead of leaving best_model as None.
            if best_model is None or accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params = params.copy()
                best_model = model
                print(f"New best model found! Accuracy: {best_accuracy:.4f}")

        except Exception as e:
            print(f"Error with parameters {params}: {str(e)}")
            results.append({
                'params': params.copy(),
                'error': str(e)
            })

    # Rank the successful runs by validation accuracy
    valid_results = [r for r in results if 'accuracy' in r]
    sorted_results = sorted(valid_results, key=lambda x: x['accuracy'], reverse=True)

    print("\nGrid Search Results:")
    print(f"Best parameters: {best_params}")
    print(f"Best validation accuracy: {best_accuracy:.4f}")

    print("\nTop 5 parameter combinations:")
    for i, result in enumerate(sorted_results[:5]):
        print(f"{i+1}. {result['params']} - Accuracy: {result['accuracy']:.4f}")

    return best_params, best_model, results

In [None]:
def test_best_model(best_model, best_params, model_params, X_cols):
    """
    Test the best model from grid search on the test set 1

    Parameters:
    -----------
    best_model: Trained RBFN model with best parameters
    best_params: Dictionary of best parameters (only printed)
    model_params: Dictionary with scaler parameters under 'scaler_params'
    X_cols: List of feature columns used for training

    Returns:
    --------
    test_metrics: Dictionary with test metrics at the model's default
                  threshold, or None if any step raised (the error and
                  traceback are printed)
    """
    print("\nTesting best model on test set 1...")
    print(f"Best parameters: {best_params}")

    try:
        # Load test data
        test_data = pd.read_csv('UNSWNB15_testing1_coursework.csv')
        print(f"Loaded test set with {test_data.shape[0]} samples")

        # Perform one-hot encoding for categorical variables
        categorical_cols = ['service', 'state']
        test_encoded = one_hot_encode(test_data, categorical_cols)

        # Encode protocol column
        # NOTE(review): the top-10 protocols are recomputed from the test file
        # here, so the 'other' grouping may differ from the one the training
        # features were built with -- confirm this is acceptable.
        test_encoded = encode_protocol(test_encoded, column='proto', top_n=10)

        feature_set = set(X_cols)

        # Handle missing columns from training data
        for col in X_cols:
            if col not in test_encoded.columns and col != 'label':
                print(f"Adding missing column: {col}")
                test_encoded[col] = 0

        # Handle extra columns in test data
        extra_cols = [col for col in test_encoded.columns
                      if col not in feature_set and col != 'label']
        if extra_cols:
            print(f"Dropping extra columns: {extra_cols}")
            test_encoded.drop(columns=extra_cols, inplace=True)

        # Ensure all X_cols are present
        missing_cols = [col for col in X_cols if col not in test_encoded.columns]
        if missing_cols:
            raise ValueError(f"Missing columns in test data: {missing_cols}")

        # Standardize features using training parameters.
        # The frame is created with the test index up front so that a scalar
        # assignment (zero-variance column) fills every row even when it is
        # the first column written.
        test_std = pd.DataFrame(index=test_encoded.index)
        scaler_params = model_params['scaler_params']

        for col in X_cols:
            if col in scaler_params:
                mean = scaler_params[col]['mean']
                std = scaler_params[col]['std']
                if std > 0:
                    test_std[col] = (test_encoded[col] - mean) / std
                else:
                    test_std[col] = 0
            else:
                test_std[col] = test_encoded[col]

        # Extract features and target
        X_test = test_std[X_cols].values
        y_test = test_data['label'].values

        # Make predictions
        y_pred = best_model.predict(X_test)

        # Evaluate
        test_metrics = evaluate_model(y_test, y_pred)

        # Print results
        print("\nTest Set Results:")
        print(f"Accuracy: {test_metrics['accuracy']:.4f}")
        print(f"Precision: {test_metrics['precision']:.4f}")
        print(f"Recall: {test_metrics['recall']:.4f}")
        print(f"F1 Score: {test_metrics['f1_score']:.4f}")
        print(f"Balanced Accuracy: {test_metrics['balanced_accuracy']:.4f}")
        print("Confusion Matrix:")
        print(f"TP: {test_metrics['confusion_matrix']['tp']}, TN: {test_metrics['confusion_matrix']['tn']}")
        print(f"FP: {test_metrics['confusion_matrix']['fp']}, FN: {test_metrics['confusion_matrix']['fn']}")

        # Calculate threshold requirements
        required_accuracy = 0.85
        achieved = test_metrics['accuracy'] >= required_accuracy
        print(f"\nRequired accuracy: {required_accuracy:.2f}")
        print(f"Achieved accuracy: {test_metrics['accuracy']:.4f}")
        print(f"Requirement met: {achieved}")

        # Bonus: Try to find optimal threshold for classification
        # NOTE(review): the threshold is selected using the test labels, so the
        # "optimized" figures below are optimistic; they are only printed and
        # are not part of the returned metrics.
        if hasattr(best_model, 'predict_proba'):
            print("\nOptimizing decision threshold...")
            probs = best_model.predict_proba(X_test)

            # Test different thresholds
            thresholds = np.linspace(0.1, 0.9, 9)
            threshold_results = []

            for threshold in thresholds:
                y_pred_threshold = (probs > threshold).astype(int)
                metrics = evaluate_model(y_test, y_pred_threshold)
                threshold_results.append({
                    'threshold': threshold,
                    'accuracy': metrics['accuracy'],
                    'balanced_accuracy': metrics['balanced_accuracy'],
                    'f1_score': metrics['f1_score']
                })

            # Find best threshold (first one with the highest accuracy)
            best_threshold = max(threshold_results, key=lambda x: x['accuracy'])
            print(f"Best threshold: {best_threshold['threshold']:.2f} with accuracy: {best_threshold['accuracy']:.4f}")

            # Apply best threshold
            y_pred_best = (probs > best_threshold['threshold']).astype(int)
            optimized_metrics = evaluate_model(y_test, y_pred_best)

            print("\nResults with optimized threshold:")
            print(f"Accuracy: {optimized_metrics['accuracy']:.4f}")
            print(f"Precision: {optimized_metrics['precision']:.4f}")
            print(f"Recall: {optimized_metrics['recall']:.4f}")
            print(f"F1 Score: {optimized_metrics['f1_score']:.4f}")
            print(f"Balanced Accuracy: {optimized_metrics['balanced_accuracy']:.4f}")

        return test_metrics

    except Exception as e:
        # Best-effort step: report the failure and let the caller continue.
        print(f"Error testing model: {str(e)}")
        import traceback
        traceback.print_exc()
        return None


In [None]:
def predict_test_set2(best_model, model_params, X_cols):
    """
    Generate predictions for test set 2 (no labels)

    Parameters:
    -----------
    best_model: Trained RBFN model
    model_params: Dictionary with scaler parameters under 'scaler_params'
    X_cols: List of feature columns used for training

    Returns:
    --------
    y_pred: Array of predicted labels (also written to
            'test_set2_predictions.csv'), or None if any step raised
            (the error and traceback are printed)
    """
    print("\nPredicting on test set 2...")

    try:
        # Load test data
        test_data = pd.read_csv('UNSWNB15_testing2_coursework_no_label.csv')
        print(f"Loaded test set 2 with {test_data.shape[0]} samples")

        # Perform one-hot encoding for categorical variables
        categorical_cols = ['service', 'state']
        test_encoded = one_hot_encode(test_data, categorical_cols)

        # Encode protocol column
        # NOTE(review): the top-10 protocols are recomputed from this file, so
        # the 'other' grouping may differ from the training features -- confirm.
        test_encoded = encode_protocol(test_encoded, column='proto', top_n=10)

        # Add training columns that this file did not produce
        for col in X_cols:
            if col not in test_encoded.columns:
                print(f"Adding missing column: {col}")
                test_encoded[col] = 0

        # Keep only the necessary columns
        feature_set = set(X_cols)
        test_columns = [col for col in test_encoded.columns if col in feature_set]
        test_encoded = test_encoded[test_columns]

        # Check if all features are present
        missing_cols = [col for col in X_cols if col not in test_encoded.columns]
        if missing_cols:
            raise ValueError(f"Missing columns in test data: {missing_cols}")

        # Standardize features with the training statistics.
        # The frame is created with the test index up front so that a scalar
        # assignment (zero-variance column) fills every row even when it is
        # the first column written.
        test_std = pd.DataFrame(index=test_encoded.index)
        scaler_params = model_params['scaler_params']

        for col in X_cols:
            if col in scaler_params:
                mean = scaler_params[col]['mean']
                std = scaler_params[col]['std']
                if std > 0:
                    test_std[col] = (test_encoded[col] - mean) / std
                else:
                    test_std[col] = 0
            else:
                test_std[col] = test_encoded[col]

        # Extract features
        X_test = test_std[X_cols].values

        # Make predictions
        y_pred = best_model.predict(X_test)

        # Print predictions
        print("\nPredictions for test set 2:")
        print(y_pred)

        # Create results dataframe (Index is the row position in the file)
        results_df = pd.DataFrame({
            'Index': range(len(y_pred)),
            'Predicted_Label': y_pred
        })

        # Save to CSV
        results_path = 'test_set2_predictions.csv'
        results_df.to_csv(results_path, index=False)
        print(f"Saved predictions to {results_path}")

        return y_pred

    except Exception as e:
        # Best-effort step: report the failure and let the caller continue.
        print(f"Error predicting on test set 2: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

In [None]:
def main(center):
    """
    Run the full RBFN pipeline on the already-encoded training frame

    Reads the module-level `df_encoded` and `target_correlations`, selects
    features by absolute correlation with the label, standardizes, splits,
    grid-searches the RBFN hyperparameters, and evaluates the best model on
    the validation split and on both test files.

    center: not used by the current implementation (the number of centers
            comes from the parameter grid); kept so existing calls still work
    Returns: (rbfn, model_params, val_metrics, test_metrics, test2_predictions)
    Raises: RuntimeError if no parameter combination could be trained

    NOTE(review): the correlations and the standardization statistics are both
    computed on all rows before the train/validation split, so the validation
    rows influence feature selection and scaling.
    """
    print("Loading training data...")
    train_data = df_encoded.copy()

    print("\nPerforming feature selection based on correlations...")
    # Select features with absolute correlation above a threshold
    correlation_threshold = 0.1  # Adjust this threshold as needed
    important_features = target_correlations.abs().sort_values(ascending=False)
    selected_features = important_features[important_features > correlation_threshold].index.tolist()

    # Remove 'label' from selected features if present
    if 'label' in selected_features:
        selected_features.remove('label')

    print(f"Selected {len(selected_features)} features with correlation > {correlation_threshold}")
    print("Top selected features:")
    print(selected_features[:10])  # Print top 10 features

    # Only the selected features are fed to the model
    X_cols = selected_features
    y_col = 'label'

    # Standardize features
    print("Standardizing features...")
    train_data_std, scaler_params = standardize_features(train_data, exclude_cols=[y_col])

    # Split into train and validation
    print("Splitting data...")
    train_df, val_df = train_validation_split(train_data_std, val_ratio=0.2)

    # Prepare data
    X_train = train_df[X_cols].values
    y_train = train_df[y_col].values
    X_val = val_df[X_cols].values
    y_val = val_df[y_col].values

    print(f"Training set: {X_train.shape[0]} samples, {X_train.shape[1]} features")
    print(f"Validation set: {X_val.shape[0]} samples, {X_val.shape[1]} features")

    # Define parameter grid for search
    param_grid = {
        'num_centers': [70, 100, 120, 140],
        'sigma': [2.0, 5.0, 7.0, 8.0, 9.0]
    }

    # Perform grid search
    best_params, best_model, grid_results = grid_search_rbfn(X_train, y_train, X_val, y_val, param_grid)

    # Fail with a clear message instead of an AttributeError on None below
    if best_model is None:
        raise RuntimeError("Grid search did not produce a trained model; see the errors printed above.")

    # Use the best model for final evaluation
    rbfn = best_model

    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Validation set: {X_val.shape[0]} samples")

    # Evaluate on validation set
    print("Evaluating on validation set...")
    y_val_pred = rbfn.predict(X_val)
    val_metrics = evaluate_model(y_val, y_val_pred)

    print("\nValidation Results:")
    print(f"Accuracy: {val_metrics['accuracy']:.4f}")
    print(f"Precision: {val_metrics['precision']:.4f}")
    print(f"Recall: {val_metrics['recall']:.4f}")
    print(f"F1 Score: {val_metrics['f1_score']:.4f}")
    print(f"Balanced Accuracy: {val_metrics['balanced_accuracy']:.4f}")
    print("Confusion Matrix:")
    print(f"TP: {val_metrics['confusion_matrix']['tp']}, TN: {val_metrics['confusion_matrix']['tn']}")
    print(f"FP: {val_metrics['confusion_matrix']['fp']}, FN: {val_metrics['confusion_matrix']['fn']}")

    # Bundle the fitted model parameters with the scaler statistics that the
    # test-set helpers need to standardize their inputs.
    model_params = {
        'centers': rbfn.centers,
        'weights': rbfn.weights,
        'bias': rbfn.bias,
        'sigma': rbfn.sigma,
        'scaler_params': scaler_params
    }

    test_metrics = test_best_model(best_model, best_params, model_params, X_cols)
    test2_predictions = predict_test_set2(best_model, model_params, X_cols)

    return rbfn, model_params, val_metrics, test_metrics, test2_predictions

In [None]:
# rbfn, model_params, val_metrics, test_metrics, test2_predictions = main(40)

In [None]:
# rbfn, model_params, val_metrics, test_metrics, test2_predictions = main(40)

In [None]:
import numpy as np
import pandas as pd
import math
from collections import defaultdict
import random
import time
import matplotlib.pyplot as plt
import seaborn as sns

# Encoding functions
def one_hot_encode(df, categorical_cols):
    """
    Expand each listed column that exists in df into indicator columns.

    Works on a copy of df; absent column names are ignored. Every encoded
    column is removed and its "<column>_<value>" indicators are appended.
    """
    result = df.copy()
    for name in categorical_cols:
        if name not in result.columns:
            continue
        indicators = pd.get_dummies(result[name], prefix=name, drop_first=False)
        # Append the indicators, then discard the source column.
        result = pd.concat([result, indicators], axis=1).drop(name, axis=1)

    return result

def encode_protocol(df, column='proto', top_n=10):
    """
    One-hot encode `column`, keeping only its top_n most frequent values.

    Values outside the top_n are relabelled 'other' before encoding. If the
    column is absent, df itself is returned unchanged; otherwise df is left
    untouched and a new frame is returned with `column` replaced by its
    "<column>_<value>" indicator columns.
    """
    if column not in df.columns:
        return df

    # Most frequent values, in descending frequency order
    frequent = df[column].value_counts().head(top_n).index.tolist()

    result = df.copy()
    result[column] = result[column].apply(
        lambda value: value if value in frequent else 'other'
    )

    indicators = pd.get_dummies(result[column], prefix=column, drop_first=False)

    # Append the indicators, then discard the source column.
    return pd.concat([result, indicators], axis=1).drop(column, axis=1)

# 5. Main execution
def main():
    """
    Run the RBFN pipeline: load, encode, standardize, train, validate, test.

    Reads 'UNSWNB15_training_coursework.csv', one-hot encodes the 'service',
    'state' and 'proto' columns, prints the feature correlations with 'label'
    and saves a bar plot of them to 'correlations.png', standardizes the
    features, trains an RBFN on a train/validation split (20% validation) and
    prints the validation metrics. If 'UNSWNB15_testing1_coursework.csv' can
    be loaded and processed, the model is also evaluated on it; any error in
    that optional step is printed and otherwise ignored.

    Relies on ``standardize_features``, ``train_validation_split``, ``RBFN``
    and ``evaluate_model`` being defined in the notebook session.

    Returns:
        tuple: ``(rbfn, model_params, val_metrics)`` - the fitted model, a
        dict with its centers/weights/bias/sigma plus the scaler parameters
        and feature column names, and the validation metrics dict.
    """
    # Number of protocol values that keep their own indicator column
    proto_top_n = 10

    # Load training data
    print("Loading training data...")
    train_data = pd.read_csv('UNSWNB15_training_coursework.csv')

    # Display basic information
    print("Original data info:")
    train_data.info()

    # Encode categorical columns
    print("\nPerforming one-hot encoding for categorical variables...")
    categorical_cols = ['service', 'state']
    train_data_encoded = one_hot_encode(train_data, categorical_cols)

    # Encode protocol column
    print("\nEncoding protocol column...")
    train_data_encoded = encode_protocol(train_data_encoded, column='proto', top_n=proto_top_n)

    # Display encoded data info
    print("\nEncoded data info:")
    train_data_encoded.info()

    # Check protocol columns
    proto_columns = [col for col in train_data_encoded.columns if col.startswith('proto_')]
    print(f"Protocol encoded columns: {proto_columns}")
    print(f"Shape after encoding: {train_data_encoded.shape}")

    # Check correlation with target (optional)
    print("\nChecking correlations with target variable...")
    correlations = train_data_encoded.corr()['label'].sort_values(ascending=False)
    print("Top positive correlations:")
    print(correlations[correlations > 0].head(10))
    print("\nTop negative correlations:")
    print(correlations[correlations < 0].head(10))

    # Visualize top correlations (optional)
    plt.figure(figsize=(12, 8))
    top_features = correlations.abs().sort_values(ascending=False).head(20).index
    sns.barplot(x=correlations[top_features], y=top_features)
    plt.title('Top 20 Feature Correlations with Target')
    plt.tight_layout()
    plt.savefig('correlations.png')  # Save for later reference

    # Prepare for RBFN
    X_cols = [col for col in train_data_encoded.columns if col != 'label']
    y_col = 'label'

    # Standardize features
    print("\nStandardizing features...")
    train_data_std, scaler_params = standardize_features(train_data_encoded, exclude_cols=[y_col])

    # Split into train and validation
    print("Splitting data...")
    train_df, val_df = train_validation_split(train_data_std, val_ratio=0.2)

    # Prepare data
    X_train = train_df[X_cols].values
    y_train = train_df[y_col].values
    X_val = val_df[X_cols].values
    y_val = val_df[y_col].values

    print(f"Training set: {X_train.shape[0]} samples, {X_train.shape[1]} features")
    print(f"Validation set: {X_val.shape[0]} samples, {X_val.shape[1]} features")

    # Train RBFN model
    print("\nTraining RBFN model...")
    start_time = time.time()
    rbfn = RBFN(num_centers=140, sigma=7.0)  # Adjust num_centers / sigma as needed
    rbfn.fit(X_train, y_train)
    training_time = time.time() - start_time
    print(f"Training completed in {training_time:.2f} seconds")

    # Evaluate on validation set
    print("\nEvaluating on validation set...")
    y_val_pred = rbfn.predict(X_val)
    val_metrics = evaluate_model(y_val, y_val_pred)

    print("\nValidation Results:")
    print(f"Accuracy: {val_metrics['accuracy']:.4f}")
    print(f"Precision: {val_metrics['precision']:.4f}")
    print(f"Recall: {val_metrics['recall']:.4f}")
    print(f"F1 Score: {val_metrics['f1_score']:.4f}")
    print(f"Balanced Accuracy: {val_metrics['balanced_accuracy']:.4f}")
    print("Confusion Matrix:")
    print(f"TP: {val_metrics['confusion_matrix']['tp']}, TN: {val_metrics['confusion_matrix']['tn']}")
    print(f"FP: {val_metrics['confusion_matrix']['fp']}, FN: {val_metrics['confusion_matrix']['fn']}")

    # Test on test set 1 if available. This step is best-effort: a missing
    # file or a processing error is reported and the function still returns.
    try:
        print("\nLoading and evaluating on test set 1...")
        test_data1 = pd.read_csv('UNSWNB15_testing1_coursework.csv')

        # Group test protocols using the categories learned from the training
        # data. Re-ranking the top-N on the test set would create indicator
        # columns that do not line up with the ones the model was trained on.
        if 'proto' in train_data.columns and 'proto' in test_data1.columns:
            train_protos = train_data['proto'].value_counts().head(proto_top_n).index
            test_protos = test_data1['proto'].astype(object)
            test_data1['proto'] = test_protos.where(test_protos.isin(train_protos), 'other')
            # +1 so the 'other' bucket never pushes a training category out
            test_top_n = len(train_protos) + 1
        else:
            test_top_n = proto_top_n

        # Apply the same preprocessing
        test_data1_encoded = one_hot_encode(test_data1, categorical_cols)
        test_data1_encoded = encode_protocol(test_data1_encoded, column='proto', top_n=test_top_n)

        # Handle missing columns (if any); X_cols never contains 'label'
        for col in X_cols:
            if col not in test_data1_encoded.columns:
                test_data1_encoded[col] = 0  # Add missing columns with zeros

        # Ensure same column order as training data
        test_data1_encoded = test_data1_encoded[X_cols + ['label']]

        # Standardize using training parameters. The frame is created with
        # the test index up front so a scalar assignment (constant training
        # column) fills every row instead of creating a zero-row column that
        # turns into NaN once the first real column is added.
        test_data1_std = pd.DataFrame(index=test_data1_encoded.index)
        for col in X_cols:
            if col in scaler_params:
                mean = scaler_params[col]['mean']
                std = scaler_params[col]['std']
                if std > 0:
                    test_data1_std[col] = (test_data1_encoded[col] - mean) / std
                else:
                    test_data1_std[col] = 0
            else:
                test_data1_std[col] = test_data1_encoded[col]

        test_data1_std['label'] = test_data1_encoded['label']

        # Evaluate
        X_test1 = test_data1_std[X_cols].values
        y_test1 = test_data1_std['label'].values
        y_test1_pred = rbfn.predict(X_test1)

        test1_metrics = evaluate_model(y_test1, y_test1_pred)

        print("\nTest Set 1 Results:")
        print(f"Accuracy: {test1_metrics['accuracy']:.4f}")
        print(f"Precision: {test1_metrics['precision']:.4f}")
        print(f"Recall: {test1_metrics['recall']:.4f}")
        print(f"F1 Score: {test1_metrics['f1_score']:.4f}")
        print(f"Balanced Accuracy: {test1_metrics['balanced_accuracy']:.4f}")

    except Exception as e:
        print(f"Error evaluating test set 1: {e}")

    # Save the model parameters for later use
    model_params = {
        'centers': rbfn.centers,
        'weights': rbfn.weights,
        'bias': rbfn.bias,
        'sigma': rbfn.sigma,
        'scaler_params': scaler_params,
        'X_cols': X_cols  # Save column names for consistent preprocessing
    }

    return rbfn, model_params, val_metrics

# if __name__ == "__main__":
#     rbfn_model, model_params, validation_metrics = main()

In [None]:
import numpy as np
import pandas as pd
import math
from collections import defaultdict
import random
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold

# Encoding functions
def one_hot_encode(df, categorical_cols):
    """
    Return a copy of ``df`` with the listed categorical columns one-hot encoded.

    Every requested column that exists in the frame is replaced by indicator
    columns named ``<column>_<value>`` (all levels are kept), appended after
    the remaining columns. Requested columns that are not present are
    ignored. The input frame itself is left untouched.
    """
    result = df.copy()
    for name in categorical_cols:
        # Silently skip columns the frame does not have
        if name not in result.columns:
            continue
        indicators = pd.get_dummies(result[name], prefix=name, drop_first=False)
        # Swap the raw column for its indicator columns
        result = pd.concat([result.drop(columns=name), indicators], axis=1)

    return result

def encode_protocol(df, column='proto', top_n=10):
    """
    One-hot encode a high-cardinality column, keeping only its most frequent values.

    The ``top_n`` most frequent values of ``column`` (ranked by
    ``value_counts``, which ignores missing values) each get their own
    indicator column; every other value, including missing ones, is grouped
    under ``'other'``. Indicator columns are named ``<column>_<value>`` and
    are appended after the remaining columns; the original column is removed.

    Note that the ranking is computed from ``df`` itself, so encoding two
    datasets separately can yield different sets of indicator columns.

    Args:
        df: Input DataFrame; it is never modified.
        column: Name of the column to encode.
        top_n: Number of most frequent values that keep a dedicated column.

    Returns:
        A new DataFrame. When ``column`` is absent, an unmodified copy of
        ``df`` is returned so the result never aliases the input.
    """
    if column not in df.columns:
        return df.copy()

    # Most frequent values, in value_counts order
    top_values = df[column].value_counts().head(top_n).index

    # Vectorized grouping: anything outside the top values (NaN included)
    # becomes 'other'. Cast to object so the replacement label is always legal.
    values = df[column].astype(object)
    grouped = values.where(values.isin(top_values), 'other')

    # One-hot encode the grouped values and swap them in for the raw column
    indicators = pd.get_dummies(grouped, prefix=column, drop_first=False)
    return pd.concat([df.drop(columns=column), indicators], axis=1)

def perform_cross_validation(X, y, X_cols, rbfn_class, n_splits=5, num_centers=140, sigma=7.0):
    """
    Run shuffled k-fold cross-validation of an RBFN-style classifier.

    For each fold a fresh ``rbfn_class(num_centers=..., sigma=...)`` is
    fitted on the training part and scored on the held-out part with
    ``evaluate_model``; the fit duration is stored under ``'training_time'``.
    ``X`` and ``y`` are indexed with integer index arrays. ``X_cols`` is
    accepted but not used here.

    Returns a dict with the per-fold metric dicts (``'fold_metrics'``), their
    means (``'avg_metrics'``, including training time) and their standard
    deviations (``'std_metrics'``).
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    per_fold = []

    print(f"\nPerforming {n_splits}-fold cross-validation...")

    for fold_no, (fit_idx, holdout_idx) in enumerate(splitter.split(X), start=1):
        print(f"\nTraining on fold {fold_no}/{n_splits}...")

        X_fit, X_holdout = X[fit_idx], X[holdout_idx]
        y_fit, y_holdout = y[fit_idx], y[holdout_idx]

        # Fresh model per fold; only the fit itself is timed
        model = rbfn_class(num_centers=num_centers, sigma=sigma)
        tic = time.time()
        model.fit(X_fit, y_fit)
        elapsed = time.time() - tic

        # Score the held-out part
        scores = evaluate_model(y_holdout, model.predict(X_holdout))
        scores['training_time'] = elapsed
        per_fold.append(scores)

        print(f"Fold {fold_no} results:")
        print(f"  Accuracy: {scores['accuracy']:.4f}")
        print(f"  Precision: {scores['precision']:.4f}")
        print(f"  Recall: {scores['recall']:.4f}")
        print(f"  F1 Score: {scores['f1_score']:.4f}")
        print(f"  Training time: {scores['training_time']:.2f} seconds")

    score_names = ('accuracy', 'precision', 'recall', 'f1_score', 'balanced_accuracy')

    # Mean of every score (plus training time) across folds
    avg_metrics = {
        key: np.mean([fold[key] for fold in per_fold])
        for key in score_names + ('training_time',)
    }

    # Spread of every score across folds
    std_metrics = {
        key: np.std([fold[key] for fold in per_fold])
        for key in score_names
    }

    return {
        'fold_metrics': per_fold,
        'avg_metrics': avg_metrics,
        'std_metrics': std_metrics
    }

# 5. Main execution
def main():
    """
    Run the RBFN pipeline with k-fold cross-validation, validation and test.

    Reads 'UNSWNB15_training_coursework.csv', one-hot encodes the 'service',
    'state' and 'proto' columns, prints the feature correlations with 'label'
    and saves a bar plot of them to 'correlations.png', standardizes the
    features, runs 5-fold cross-validation (bar plot saved to
    'cv_results.png'), then trains an RBFN on a train/validation split (20%
    validation) and prints the validation metrics. If
    'UNSWNB15_testing1_coursework.csv' can be loaded and processed, the model
    is also evaluated on it; any error in that optional step is printed and
    otherwise ignored.

    Relies on ``standardize_features``, ``train_validation_split``, ``RBFN``
    and ``evaluate_model`` being defined in the notebook session.

    Returns:
        tuple: ``(rbfn, model_params, val_metrics)`` - the fitted model, a
        dict with its centers/weights/bias/sigma plus the scaler parameters,
        feature column names and cross-validation results, and the
        validation metrics dict.
    """
    # Number of protocol values that keep their own indicator column
    proto_top_n = 10

    # Load training data
    print("Loading training data...")
    train_data = pd.read_csv('UNSWNB15_training_coursework.csv')

    # Display basic information
    print("Original data info:")
    train_data.info()

    # Encode categorical columns
    print("\nPerforming one-hot encoding for categorical variables...")
    categorical_cols = ['service', 'state']
    train_data_encoded = one_hot_encode(train_data, categorical_cols)

    # Encode protocol column
    print("\nEncoding protocol column...")
    train_data_encoded = encode_protocol(train_data_encoded, column='proto', top_n=proto_top_n)

    # Display encoded data info
    print("\nEncoded data info:")
    train_data_encoded.info()

    # Check protocol columns
    proto_columns = [col for col in train_data_encoded.columns if col.startswith('proto_')]
    print(f"Protocol encoded columns: {proto_columns}")
    print(f"Shape after encoding: {train_data_encoded.shape}")

    # Check correlation with target (optional)
    print("\nChecking correlations with target variable...")
    correlations = train_data_encoded.corr()['label'].sort_values(ascending=False)
    print("Top positive correlations:")
    print(correlations[correlations > 0].head(10))
    print("\nTop negative correlations:")
    print(correlations[correlations < 0].head(10))

    # Visualize top correlations (optional)
    plt.figure(figsize=(12, 8))
    top_features = correlations.abs().sort_values(ascending=False).head(20).index
    sns.barplot(x=correlations[top_features], y=top_features)
    plt.title('Top 20 Feature Correlations with Target')
    plt.tight_layout()
    plt.savefig('correlations.png')  # Save for later reference

    # Prepare for RBFN
    X_cols = [col for col in train_data_encoded.columns if col != 'label']
    y_col = 'label'

    # Standardize features
    print("\nStandardizing features...")
    train_data_std, scaler_params = standardize_features(train_data_encoded, exclude_cols=[y_col])

    # Extract X and y for cross-validation
    X = train_data_std[X_cols].values
    y = train_data_std[y_col].values

    # Perform cross-validation using the RBFN model with pre-tuned parameters
    print("\nPerforming cross-validation with pre-tuned parameters (centers=140, sigma=7.0)...")
    cv_results = perform_cross_validation(X, y, X_cols, RBFN, n_splits=5, num_centers=140, sigma=7.0)

    # Print cross-validation results
    print("\nCross-Validation Results Summary:")
    print(f"Average Accuracy: {cv_results['avg_metrics']['accuracy']:.4f} ± {cv_results['std_metrics']['accuracy']:.4f}")
    print(f"Average Precision: {cv_results['avg_metrics']['precision']:.4f} ± {cv_results['std_metrics']['precision']:.4f}")
    print(f"Average Recall: {cv_results['avg_metrics']['recall']:.4f} ± {cv_results['std_metrics']['recall']:.4f}")
    print(f"Average F1 Score: {cv_results['avg_metrics']['f1_score']:.4f} ± {cv_results['std_metrics']['f1_score']:.4f}")
    print(f"Average Balanced Accuracy: {cv_results['avg_metrics']['balanced_accuracy']:.4f} ± {cv_results['std_metrics']['balanced_accuracy']:.4f}")
    print(f"Average Training Time: {cv_results['avg_metrics']['training_time']:.2f} seconds")

    # Visualize cross-validation results (mean per metric, std as error bar)
    plt.figure(figsize=(10, 6))
    metrics = ['accuracy', 'precision', 'recall', 'f1_score', 'balanced_accuracy']
    values = [cv_results['avg_metrics'][m] for m in metrics]
    errors = [cv_results['std_metrics'][m] for m in metrics]

    plt.bar(metrics, values, yerr=errors, capsize=10)
    plt.ylim(0, 1.1)
    plt.title('Cross-Validation Performance Metrics')
    plt.ylabel('Score')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.savefig('cv_results.png')

    # Split into train and validation
    print("Splitting data...")
    train_df, val_df = train_validation_split(train_data_std, val_ratio=0.2)

    # Prepare data
    X_train = train_df[X_cols].values
    y_train = train_df[y_col].values
    X_val = val_df[X_cols].values
    y_val = val_df[y_col].values

    print(f"Training set: {X_train.shape[0]} samples, {X_train.shape[1]} features")
    print(f"Validation set: {X_val.shape[0]} samples, {X_val.shape[1]} features")

    # Train RBFN model with pre-tuned parameters
    print("\nTraining RBFN model...")
    start_time = time.time()
    rbfn = RBFN(num_centers=140, sigma=7.0)  # Use pre-tuned parameters
    rbfn.fit(X_train, y_train)
    training_time = time.time() - start_time
    print(f"Training completed in {training_time:.2f} seconds")

    # Evaluate on validation set
    print("\nEvaluating on validation set...")
    y_val_pred = rbfn.predict(X_val)
    val_metrics = evaluate_model(y_val, y_val_pred)

    print("\nValidation Results:")
    print(f"Accuracy: {val_metrics['accuracy']:.4f}")
    print(f"Precision: {val_metrics['precision']:.4f}")
    print(f"Recall: {val_metrics['recall']:.4f}")
    print(f"F1 Score: {val_metrics['f1_score']:.4f}")
    print(f"Balanced Accuracy: {val_metrics['balanced_accuracy']:.4f}")
    print("Confusion Matrix:")
    print(f"TP: {val_metrics['confusion_matrix']['tp']}, TN: {val_metrics['confusion_matrix']['tn']}")
    print(f"FP: {val_metrics['confusion_matrix']['fp']}, FN: {val_metrics['confusion_matrix']['fn']}")

    # Test on test set 1 if available. This step is best-effort: a missing
    # file or a processing error is reported and the function still returns.
    try:
        print("\nLoading and evaluating on test set 1...")
        test_data1 = pd.read_csv('UNSWNB15_testing1_coursework.csv')

        # Group test protocols using the categories learned from the training
        # data. Re-ranking the top-N on the test set would create indicator
        # columns that do not line up with the ones the model was trained on.
        if 'proto' in train_data.columns and 'proto' in test_data1.columns:
            train_protos = train_data['proto'].value_counts().head(proto_top_n).index
            test_protos = test_data1['proto'].astype(object)
            test_data1['proto'] = test_protos.where(test_protos.isin(train_protos), 'other')
            # +1 so the 'other' bucket never pushes a training category out
            test_top_n = len(train_protos) + 1
        else:
            test_top_n = proto_top_n

        # Apply the same preprocessing
        test_data1_encoded = one_hot_encode(test_data1, categorical_cols)
        test_data1_encoded = encode_protocol(test_data1_encoded, column='proto', top_n=test_top_n)

        # Handle missing columns (if any); X_cols never contains 'label'
        for col in X_cols:
            if col not in test_data1_encoded.columns:
                test_data1_encoded[col] = 0  # Add missing columns with zeros

        # Ensure same column order as training data
        test_data1_encoded = test_data1_encoded[X_cols + ['label']]

        # Standardize using training parameters. The frame is created with
        # the test index up front so a scalar assignment (constant training
        # column) fills every row instead of creating a zero-row column that
        # turns into NaN once the first real column is added.
        test_data1_std = pd.DataFrame(index=test_data1_encoded.index)
        for col in X_cols:
            if col in scaler_params:
                mean = scaler_params[col]['mean']
                std = scaler_params[col]['std']
                if std > 0:
                    test_data1_std[col] = (test_data1_encoded[col] - mean) / std
                else:
                    test_data1_std[col] = 0
            else:
                test_data1_std[col] = test_data1_encoded[col]

        test_data1_std['label'] = test_data1_encoded['label']

        # Evaluate
        X_test1 = test_data1_std[X_cols].values
        y_test1 = test_data1_std['label'].values
        y_test1_pred = rbfn.predict(X_test1)

        test1_metrics = evaluate_model(y_test1, y_test1_pred)

        print("\nTest Set 1 Results:")
        print(f"Accuracy: {test1_metrics['accuracy']:.4f}")
        print(f"Precision: {test1_metrics['precision']:.4f}")
        print(f"Recall: {test1_metrics['recall']:.4f}")
        print(f"F1 Score: {test1_metrics['f1_score']:.4f}")
        print(f"Balanced Accuracy: {test1_metrics['balanced_accuracy']:.4f}")

    except Exception as e:
        print(f"Error evaluating test set 1: {e}")

    # Save the model parameters for later use
    model_params = {
        'centers': rbfn.centers,
        'weights': rbfn.weights,
        'bias': rbfn.bias,
        'sigma': rbfn.sigma,
        'scaler_params': scaler_params,
        'X_cols': X_cols,  # Save column names for consistent preprocessing
        'cv_results': cv_results  # Include cross-validation results
    }

    return rbfn, model_params, val_metrics



In [None]:
# if __name__ == "__main__":
#     rbfn_model, model_params, validation_metrics = main()

In [None]:
import numpy as np
import pandas as pd
import math
from collections import defaultdict
import random
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold

# Encoding functions
def one_hot_encode(df, categorical_cols):
    """
    Return a copy of ``df`` with the listed categorical columns one-hot encoded.

    Every requested column that exists in the frame is replaced by indicator
    columns named ``<column>_<value>`` (all levels are kept), appended after
    the remaining columns. Requested columns that are not present are
    ignored. The input frame itself is left untouched.
    """
    result = df.copy()
    for name in categorical_cols:
        # Silently skip columns the frame does not have
        if name not in result.columns:
            continue
        indicators = pd.get_dummies(result[name], prefix=name, drop_first=False)
        # Swap the raw column for its indicator columns
        result = pd.concat([result.drop(columns=name), indicators], axis=1)

    return result

def encode_protocol(df, column='proto', top_n=10):
    """
    One-hot encode a high-cardinality column, keeping only its most frequent values.

    The ``top_n`` most frequent values of ``column`` (ranked by
    ``value_counts``, which ignores missing values) each get their own
    indicator column; every other value, including missing ones, is grouped
    under ``'other'``. Indicator columns are named ``<column>_<value>`` and
    are appended after the remaining columns; the original column is removed.

    Note that the ranking is computed from ``df`` itself, so encoding two
    datasets separately can yield different sets of indicator columns.

    Args:
        df: Input DataFrame; it is never modified.
        column: Name of the column to encode.
        top_n: Number of most frequent values that keep a dedicated column.

    Returns:
        A new DataFrame. When ``column`` is absent, an unmodified copy of
        ``df`` is returned so the result never aliases the input.
    """
    if column not in df.columns:
        return df.copy()

    # Most frequent values, in value_counts order
    top_values = df[column].value_counts().head(top_n).index

    # Vectorized grouping: anything outside the top values (NaN included)
    # becomes 'other'. Cast to object so the replacement label is always legal.
    values = df[column].astype(object)
    grouped = values.where(values.isin(top_values), 'other')

    # One-hot encode the grouped values and swap them in for the raw column
    indicators = pd.get_dummies(grouped, prefix=column, drop_first=False)
    return pd.concat([df.drop(columns=column), indicators], axis=1)

# Add a class for RBFN with probability support
class RBFN:
    """
    Radial basis function network for binary classification.

    Centers are chosen with k-means (or are the training samples themselves
    when ``num_centers`` is not smaller than the sample count), hidden
    activations are Gaussian kernels of width ``sigma``, and the output
    weights plus bias are the least-squares solution obtained with the
    pseudoinverse. ``predict_proba`` passes the linear output through a
    sigmoid and ``predict`` thresholds that value.
    """

    def __init__(self, num_centers=100, sigma=1.0):
        self.num_centers = num_centers  # requested number of hidden units
        self.sigma = sigma              # width of the Gaussian kernels
        self.centers = None             # set by fit()
        self.weights = None             # set by fit()
        self.bias = None                # set by fit()

    def _rbf_kernel(self, x, center):
        """Compute the Gaussian RBF kernel between one sample and one center"""
        return np.exp(-np.linalg.norm(x - center)**2 / (2 * self.sigma**2))

    def _compute_activations(self, X):
        """
        Compute the activation of every center for all samples in X.

        Returns an array with one row per sample and one column per stored
        center. Each center is handled with a single vectorized pass over all
        samples instead of a Python-level loop over every (sample, center)
        pair, which keeps memory at one samples-by-features temporary.
        """
        # Cast to float so object-dtype inputs (e.g. mixed bool/float frames)
        # go through the numeric code path
        X = np.asarray(X, dtype=float)
        centers = np.asarray(self.centers, dtype=float)

        scale = 1.0 / (2 * self.sigma**2)
        activations = np.empty((X.shape[0], centers.shape[0]))

        for j, center in enumerate(centers):
            diff = X - center
            activations[:, j] = np.exp(-scale * np.sum(diff * diff, axis=1))

        return activations

    def fit(self, X, y):
        """
        Fit the model: k-means for the centers, least squares for the weights.

        ``X`` must expose ``.shape`` as (n_samples, n_features). Returns
        ``self`` so calls can be chained.
        """
        n_samples, n_features = X.shape

        # Select centers using k-means, or fall back to the samples themselves
        if self.num_centers < n_samples:
            # Imported lazily so the class can be defined without scikit-learn
            from sklearn.cluster import KMeans
            kmeans = KMeans(n_clusters=self.num_centers, random_state=42)
            kmeans.fit(X)
            self.centers = kmeans.cluster_centers_
        else:
            # Use all training examples as centers
            self.num_centers = n_samples
            self.centers = X.copy()

        # Compute activations using RBF kernel
        activations = self._compute_activations(X)

        # Add bias term as a constant column
        activations_bias = np.column_stack((activations, np.ones(n_samples)))

        # Calculate weights using least squares (pseudoinverse)
        pseudoinv = np.linalg.pinv(activations_bias)
        weights_bias = np.dot(pseudoinv, y)

        # Separate weights and bias (bias is the last coefficient)
        self.weights = weights_bias[:-1]
        self.bias = weights_bias[-1]

        return self

    def predict_proba(self, X):
        """
        Return the sigmoid of the network's linear output for each sample.

        NOTE(review): the weights are fitted by least squares directly on
        ``y``, so the sigmoid is applied to a regression output rather than
        to a logit; ``sigmoid(raw) >= 0.5`` is equivalent to ``raw >= 0``.
        Choose thresholds with that mapping in mind.
        """
        # Compute activations for the given samples
        activations = self._compute_activations(X)

        # Compute raw output (dot product with weights and add bias)
        raw_output = np.dot(activations, self.weights) + self.bias

        # Apply sigmoid function
        probabilities = 1 / (1 + np.exp(-raw_output))

        return probabilities

    def predict(self, X, threshold=0.5):
        """Predict class labels (0/1) by thresholding ``predict_proba``"""
        probabilities = self.predict_proba(X)
        return (probabilities >= threshold).astype(int)

def standardize_features(df, exclude_cols=None):
    """
    Z-score every column of ``df`` that is not listed in ``exclude_cols``.

    Returns a standardized copy of the frame together with a dict mapping
    each processed column to its ``{'mean': ..., 'std': ...}`` so the same
    scaling can be re-applied later. Columns whose standard deviation is not
    positive are recorded in the dict but left unscaled.
    """
    skipped = [] if exclude_cols is None else exclude_cols

    scaled = df.copy()
    scaler_params = {}

    for name in df.columns:
        if name in skipped:
            continue

        series = df[name]
        center = series.mean()
        spread = series.std()

        # Remember the statistics even when the column cannot be scaled
        scaler_params[name] = {'mean': center, 'std': spread}

        # A constant column stays exactly as it was
        if spread > 0:
            scaled[name] = (series - center) / spread

    return scaled, scaler_params

def train_validation_split(df, val_ratio=0.2, random_state=42):
    """
    Randomly partition ``df`` into a training frame and a validation frame.

    NumPy's global random generator is seeded with ``random_state`` and the
    row positions are shuffled; the first ``int(val_ratio * n_rows)``
    shuffled rows form the validation frame and the remainder the training
    frame. Both results are independent copies, returned as
    ``(train_df, val_df)``.
    """
    np.random.seed(random_state)

    n_rows = df.shape[0]
    shuffled = np.random.permutation(n_rows)
    n_val = int(val_ratio * n_rows)

    # Leading slice of the shuffle is held out, the rest is used for training
    holdout_part = df.iloc[shuffled[:n_val]].copy()
    fitting_part = df.iloc[shuffled[n_val:]].copy()

    return fitting_part, holdout_part

def evaluate_model(y_true, y_pred):
    """
    Compute binary-classification metrics from true and predicted labels.

    Both inputs may be any array-like of 0/1 labels (list, NumPy array,
    pandas Series); they are flattened before comparison so a column vector
    and a flat vector are matched element by element instead of being
    broadcast against each other.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same number of elements as ``y_true``.

    Returns:
        dict with 'accuracy', 'precision', 'recall', 'f1_score',
        'balanced_accuracy' and a 'confusion_matrix' dict holding the integer
        counts 'tp', 'tn', 'fp', 'fn'. A metric whose denominator is zero is
        reported as 0.

    Raises:
        ValueError: If the two inputs do not contain the same number of labels.
    """
    # Plain lists would make `y_true == 1` a scalar False, and an (n, 1)
    # array against an (n,) array would broadcast to an n-by-n matrix.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )

    # Calculate confusion matrix components
    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))

    # Calculate metrics, guarding every division against an empty denominator
    accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Balanced accuracy is the mean of sensitivity (recall) and specificity
    sensitivity = recall
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    balanced_accuracy = (sensitivity + specificity) / 2

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'balanced_accuracy': balanced_accuracy,
        'confusion_matrix': {
            'tp': int(tp),
            'tn': int(tn),
            'fp': int(fp),
            'fn': int(fn)
        }
    }

def perform_cross_validation(X, y, X_cols, rbfn_class, n_splits=5, num_centers=140, sigma=7.0, threshold=0.6):
    """
    Run shuffled k-fold cross-validation of an RBFN-style classifier.

    For each fold a fresh ``rbfn_class(num_centers=..., sigma=...)`` is
    fitted on the training part; predictions for the held-out part are made
    with the given decision ``threshold`` and scored with ``evaluate_model``.
    The fit duration is stored under ``'training_time'``. ``X`` and ``y``
    are indexed with integer index arrays. ``X_cols`` is accepted but not
    used here.

    Returns a dict with the per-fold metric dicts (``'fold_metrics'``), their
    means (``'avg_metrics'``, including training time) and their standard
    deviations (``'std_metrics'``).
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    per_fold = []

    print(f"\nPerforming {n_splits}-fold cross-validation...")

    for fold_no, (fit_idx, holdout_idx) in enumerate(splitter.split(X), start=1):
        print(f"\nTraining on fold {fold_no}/{n_splits}...")

        X_fit, X_holdout = X[fit_idx], X[holdout_idx]
        y_fit, y_holdout = y[fit_idx], y[holdout_idx]

        # Fresh model per fold; only the fit itself is timed
        model = rbfn_class(num_centers=num_centers, sigma=sigma)
        tic = time.time()
        model.fit(X_fit, y_fit)
        elapsed = time.time() - tic

        # Score the held-out part at the requested decision threshold
        holdout_pred = model.predict(X_holdout, threshold=threshold)
        scores = evaluate_model(y_holdout, holdout_pred)
        scores['training_time'] = elapsed
        per_fold.append(scores)

        print(f"Fold {fold_no} results (threshold={threshold}):")
        print(f"  Accuracy: {scores['accuracy']:.4f}")
        print(f"  Precision: {scores['precision']:.4f}")
        print(f"  Recall: {scores['recall']:.4f}")
        print(f"  F1 Score: {scores['f1_score']:.4f}")
        print(f"  Training time: {scores['training_time']:.2f} seconds")

    score_names = ('accuracy', 'precision', 'recall', 'f1_score', 'balanced_accuracy')

    # Mean of every score (plus training time) across folds
    avg_metrics = {
        key: np.mean([fold[key] for fold in per_fold])
        for key in score_names + ('training_time',)
    }

    # Spread of every score across folds
    std_metrics = {
        key: np.std([fold[key] for fold in per_fold])
        for key in score_names
    }

    return {
        'fold_metrics': per_fold,
        'avg_metrics': avg_metrics,
        'std_metrics': std_metrics
    }

# 5. Main execution
def main(threshold=0.6, num_centers=140, sigma=7.0,
         train_path='UNSWNB15_training_coursework.csv',
         test_path='UNSWNB15_testing1_coursework.csv'):
    """Run the full RBFN pipeline: preprocess, cross-validate, train, evaluate.

    Steps, in order:
      1. Load the training CSV and one-hot encode 'service', 'state' and
         the 10 most frequent 'proto' values (the rest become 'other').
      2. Print/plot feature correlations with the 'label' column.
      3. Standardize the features and run 5-fold cross-validation.
      4. Fit a final RBFN on a train split and evaluate it on a held-out
         validation split.
      5. Best-effort evaluation on test set 1, preprocessed with the
         encoding layout and scaler parameters learned on training data.

    Args:
        threshold: Probability cut-off at or above which a sample is
            predicted as class 1.
        num_centers: Number of RBF centers passed to ``RBFN``.
        sigma: RBF width passed to ``RBFN``.
        train_path: Path of the training CSV.
        test_path: Path of the test CSV; failures while loading or
            evaluating it are reported and do not abort the run.

    Returns:
        Tuple ``(rbfn, model_params, val_metrics)``: the fitted model, a
        dict of its parameters plus preprocessing state and CV results,
        and the validation-set metrics dict from ``evaluate_model``.
    """
    # Load training data
    print("Loading training data...")
    train_data = pd.read_csv(train_path)

    # Display basic information
    print("Original data info:")
    train_data.info()

    # Encode categorical columns
    print("\nPerforming one-hot encoding for categorical variables...")
    categorical_cols = ['service', 'state']
    train_data_encoded = one_hot_encode(train_data, categorical_cols)

    # Encode protocol column
    print("\nEncoding protocol column...")
    train_data_encoded = encode_protocol(train_data_encoded, column='proto', top_n=10)

    # Display encoded data info
    print("\nEncoded data info:")
    train_data_encoded.info()

    # Check protocol columns
    proto_columns = [col for col in train_data_encoded.columns if col.startswith('proto_')]
    print(f"Protocol encoded columns: {proto_columns}")
    print(f"Shape after encoding: {train_data_encoded.shape}")

    # Check correlation with target (optional).
    # The target's correlation with itself (always 1.0) is dropped so that
    # it does not occupy a slot in the listings or in the plot.
    print("\nChecking correlations with target variable...")
    correlations = (
        train_data_encoded.corr()['label']
        .drop('label')
        .sort_values(ascending=False)
    )
    print("Top positive correlations:")
    print(correlations[correlations > 0].head(10))
    print("\nTop negative correlations:")
    # Re-sort ascending: the series is ordered descending, so a plain
    # head(10) on the negatives would list the *weakest* ones.
    print(correlations[correlations < 0].sort_values().head(10))

    # Visualize top correlations (optional)
    plt.figure(figsize=(12, 8))
    top_features = correlations.abs().sort_values(ascending=False).head(20).index
    sns.barplot(x=correlations[top_features], y=top_features)
    plt.title('Top 20 Feature Correlations with Target')
    plt.tight_layout()
    plt.savefig('correlations.png')  # Save for later reference

    # Prepare for RBFN
    y_col = 'label'
    X_cols = [col for col in train_data_encoded.columns if col != y_col]

    # Standardize features
    print("\nStandardizing features...")
    train_data_std, scaler_params = standardize_features(train_data_encoded, exclude_cols=[y_col])

    # Extract X and y for cross-validation
    X = train_data_std[X_cols].values
    y = train_data_std[y_col].values

    # Perform cross-validation using RBFN model with pre-tuned parameters and threshold
    print(f"\nPerforming cross-validation with pre-tuned parameters (centers={num_centers}, sigma={sigma}, threshold={threshold})...")
    cv_results = perform_cross_validation(
        X, y, X_cols, RBFN,
        n_splits=5, num_centers=num_centers, sigma=sigma, threshold=threshold,
    )

    # Print cross-validation results
    print("\nCross-Validation Results Summary (with threshold):")
    print(f"Threshold: {threshold}")
    print(f"Average Accuracy: {cv_results['avg_metrics']['accuracy']:.4f} ± {cv_results['std_metrics']['accuracy']:.4f}")
    print(f"Average Precision: {cv_results['avg_metrics']['precision']:.4f} ± {cv_results['std_metrics']['precision']:.4f}")
    print(f"Average Recall: {cv_results['avg_metrics']['recall']:.4f} ± {cv_results['std_metrics']['recall']:.4f}")
    print(f"Average F1 Score: {cv_results['avg_metrics']['f1_score']:.4f} ± {cv_results['std_metrics']['f1_score']:.4f}")
    print(f"Average Balanced Accuracy: {cv_results['avg_metrics']['balanced_accuracy']:.4f} ± {cv_results['std_metrics']['balanced_accuracy']:.4f}")
    print(f"Average Training Time: {cv_results['avg_metrics']['training_time']:.2f} seconds")

    # Visualize cross-validation results
    plt.figure(figsize=(10, 6))
    metrics = ['accuracy', 'precision', 'recall', 'f1_score', 'balanced_accuracy']
    values = [cv_results['avg_metrics'][m] for m in metrics]
    errors = [cv_results['std_metrics'][m] for m in metrics]

    plt.bar(metrics, values, yerr=errors, capsize=10)
    plt.ylim(0, 1.1)
    plt.title(f'Cross-Validation Performance Metrics (Threshold = {threshold})')
    plt.ylabel('Score')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.savefig('cv_results_threshold.png')

    # Split into train and validation
    print("Splitting data...")
    train_df, val_df = train_validation_split(train_data_std, val_ratio=0.2)

    # Prepare data
    X_train = train_df[X_cols].values
    y_train = train_df[y_col].values
    X_val = val_df[X_cols].values
    y_val = val_df[y_col].values

    print(f"Training set: {X_train.shape[0]} samples, {X_train.shape[1]} features")
    print(f"Validation set: {X_val.shape[0]} samples, {X_val.shape[1]} features")

    # Train RBFN model with pre-tuned parameters
    print("\nTraining RBFN model...")
    start_time = time.time()
    rbfn = RBFN(num_centers=num_centers, sigma=sigma)
    rbfn.fit(X_train, y_train)
    training_time = time.time() - start_time
    print(f"Training completed in {training_time:.2f} seconds")

    # Get probability predictions
    print(f"\nEvaluating on validation set with threshold {threshold}...")
    val_proba = rbfn.predict_proba(X_val)
    y_val_pred = (val_proba >= threshold).astype(int)
    val_metrics = evaluate_model(y_val, y_val_pred)

    # Visualize probability distributions
    plt.figure(figsize=(10, 6))
    plt.hist(val_proba[y_val == 0], bins=50, alpha=0.5, label='Negative class (0)')
    plt.hist(val_proba[y_val == 1], bins=50, alpha=0.5, label='Positive class (1)')
    plt.axvline(x=threshold, color='red', linestyle='--', label=f'Threshold = {threshold}')
    plt.title('Probability Distribution by Class')
    plt.xlabel('Probability')
    plt.ylabel('Count')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.savefig('probability_distribution.png')

    print("\nValidation Results:")
    print(f"Threshold: {threshold}")
    print(f"Accuracy: {val_metrics['accuracy']:.4f}")
    print(f"Precision: {val_metrics['precision']:.4f}")
    print(f"Recall: {val_metrics['recall']:.4f}")
    print(f"F1 Score: {val_metrics['f1_score']:.4f}")
    print(f"Balanced Accuracy: {val_metrics['balanced_accuracy']:.4f}")
    print("Confusion Matrix:")
    print(f"TP: {val_metrics['confusion_matrix']['tp']}, TN: {val_metrics['confusion_matrix']['tn']}")
    print(f"FP: {val_metrics['confusion_matrix']['fp']}, FN: {val_metrics['confusion_matrix']['fn']}")

    # Test on test set 1 if available.
    # Deliberately best-effort: a missing or malformed test file must not
    # discard the model that has already been trained and validated.
    try:
        print("\nLoading and evaluating on test set 1...")
        test_data1 = pd.read_csv(test_path)

        # Apply the same preprocessing
        test_data1_encoded = one_hot_encode(test_data1, categorical_cols)

        # Bucket protocols exactly as the training set did. Letting
        # encode_protocol pick the test set's own top-10 would give a
        # protocol its own column in one set and 'other' in the other.
        # Relies on encode_protocol naming its catch-all bucket 'other'.
        kept_protocols = {col[len('proto_'):] for col in proto_columns} - {'other'}
        proto = test_data1_encoded['proto']
        test_data1_encoded['proto'] = proto.where(proto.isin(kept_protocols), 'other')
        # top_n covers every remaining distinct value (kept protocols plus
        # 'other'), so encode_protocol does no further regrouping.
        test_data1_encoded = encode_protocol(
            test_data1_encoded, column='proto', top_n=len(kept_protocols) + 1
        )

        # Align to the training feature layout: same columns, same order,
        # zeros for dummy columns that never occur in the test data.
        test_features = test_data1_encoded.reindex(columns=X_cols, fill_value=0)

        # Standardize using training parameters
        test_data1_std = _apply_training_scaler(test_features, X_cols, scaler_params)
        test_data1_std['label'] = test_data1_encoded['label']

        # Evaluate with threshold
        X_test1 = test_data1_std[X_cols].values
        y_test1 = test_data1_std['label'].values

        # Get probability predictions and apply threshold
        test_proba = rbfn.predict_proba(X_test1)
        y_test1_pred = (test_proba >= threshold).astype(int)

        test1_metrics = evaluate_model(y_test1, y_test1_pred)

        print("\nTest Set 1 Results:")
        print(f"Threshold: {threshold}")
        print(f"Accuracy: {test1_metrics['accuracy']:.4f}")
        print(f"Precision: {test1_metrics['precision']:.4f}")
        print(f"Recall: {test1_metrics['recall']:.4f}")
        print(f"F1 Score: {test1_metrics['f1_score']:.4f}")
        print(f"Balanced Accuracy: {test1_metrics['balanced_accuracy']:.4f}")

        # Visualize probability distributions for test set
        plt.figure(figsize=(10, 6))
        plt.hist(test_proba[y_test1 == 0], bins=50, alpha=0.5, label='Negative class (0)')
        plt.hist(test_proba[y_test1 == 1], bins=50, alpha=0.5, label='Positive class (1)')
        plt.axvline(x=threshold, color='red', linestyle='--', label=f'Threshold = {threshold}')
        plt.title('Test Set Probability Distribution by Class')
        plt.xlabel('Probability')
        plt.ylabel('Count')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.savefig('test_probability_distribution.png')

    except Exception as e:
        print(f"Error evaluating test set 1: {e}")

    # Save the model parameters for later use
    model_params = {
        'centers': rbfn.centers,
        'weights': rbfn.weights,
        'bias': rbfn.bias,
        'sigma': rbfn.sigma,
        'threshold': threshold,  # Save the threshold
        'scaler_params': scaler_params,
        'X_cols': X_cols,  # Save column names for consistent preprocessing
        'cv_results': cv_results  # Include cross-validation results
    }

    return rbfn, model_params, val_metrics


def _apply_training_scaler(features, X_cols, scaler_params):
    """Standardize ``features`` with per-column mean/std from training data.

    Columns without an entry in ``scaler_params`` are passed through
    unchanged; columns whose training std is not positive become all zeros.
    """
    scaled = {}
    for col in X_cols:
        if col not in scaler_params:
            scaled[col] = features[col]
            continue
        mean = scaler_params[col]['mean']
        std = scaler_params[col]['std']
        scaled[col] = (features[col] - mean) / std if std > 0 else 0
    # Building the frame in one go with an explicit index keeps scalar
    # zero columns row-aligned and avoids column-by-column insertion.
    return pd.DataFrame(scaled, index=features.index)

# Script entry point: run the pipeline and keep the three values returned by
# main() (fitted model, model parameter dict, validation metrics) as
# module-level names.
if __name__ == "__main__":
    rbfn_model, model_params, validation_metrics = main()