In [28]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def save_cusum_parameters_from_preprocessed_data(preprocessed_data, output_file, threshold_scale=3):
    """
    Calculates and saves reference values and thresholds for each feature from preprocessed data.

    Parameters:
        preprocessed_data (pd.DataFrame): Preprocessed data (encoded and scaled).
        output_file (str): Path to save the reference parameters as a CSV.
        threshold_scale (float): Scaling factor for the decision threshold based on standard deviation.
    """

  
    # Calculate parameters for each featuáre
    parameters = {
        "Feature": [],
        "ReferenceValue": [],
        "DriftThreshold": [],
        "DecisionThreshold": [],
    }

    for column_name in preprocessed_data.columns:
        values = preprocessed_data[column_name]
        reference_value = values.mean()
        drift_threshold = values.std() * 0.1
        decision_threshold = values.std() * threshold_scale

        parameters["Feature"].append(column_name)
        parameters["ReferenceValue"].append(reference_value)
        parameters["DriftThreshold"].append(drift_threshold)
        parameters["DecisionThreshold"].append(decision_threshold)

    # Save to CSV
    pd.DataFrame(parameters).to_csv(output_file, index=False)
    print(f"Parameters saved to {output_file}")

def load_cusum_parameters(file_path):
    """
    Loads CUSUM parameters from a CSV file.

    Parameters:
        file_path (str): Path to the CUSUM parameters CSV file.

    Returns:
        pd.DataFrame: DataFrame containing the CUSUM parameters.
    """
    return pd.read_csv(file_path)

def test_cusum_results(attack_data, cusum_parameters):
    """
    Tests CUSUM results on the attack dataset.

    Parameters:
        attack_data (pd.DataFrame): The attack dataset with preprocessed features.
        cusum_parameters (pd.DataFrame): DataFrame containing the CUSUM reference parameters.

    Returns:
        pd.DataFrame: Results indicating whether each feature exceeds thresholds.
    """
    results = {
        "Feature": [],
        "ReferenceValue": [],
        "DriftThreshold": [],
        "DecisionThreshold": [],
        "MaxDeviation": [],
        "DriftDetected": [],
        "DecisionThresholdExceeded": []
    }

    for _, row in cusum_parameters.iterrows():
        feature = row["Feature"]
        reference_value = row["ReferenceValue"]
        drift_threshold = row["DriftThreshold"]
        decision_threshold = row["DecisionThreshold"]

        if feature in attack_data.columns:
            deviations = attack_data[feature] - reference_value
            max_deviation = deviations.abs().max()

            results["Feature"].append(feature)
            results["ReferenceValue"].append(reference_value)
            results["DriftThreshold"].append(drift_threshold)
            results["DecisionThreshold"].append(decision_threshold)
            results["MaxDeviation"].append(max_deviation)
            results["DriftDetected"].append(max_deviation > drift_threshold)
            results["DecisionThresholdExceeded"].append(max_deviation > decision_threshold)

    return pd.DataFrame(results)

def evaluate_cusum_results(results, attack_labels):
    """
    Compares CUSUM results with attack labels and computes metrics.

    Parameters:
        results (pd.DataFrame): CUSUM results with DriftDetected.
        attack_labels (pd.Series): True labels for the attack dataset.

    Returns:
        dict: Dictionary containing accuracy, precision, recall, and F1-score.
    """
    # Aggregate drift detections across all features
    overall_drift_detected = results.groupby(results.index)["DriftDetected"].any()

    # Calculate metrics
    accuracy = accuracy_score(attack_labels, overall_drift_detected)
    precision = precision_score(attack_labels, overall_drift_detected)
    recall = recall_score(attack_labels, overall_drift_detected)
    f1 = f1_score(attack_labels, overall_drift_detected)

    return {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

# Example usage
preprocessed_normal_data = pd.DataFrame({  # Example DataFrame; replace with your actual preprocessed data
    "feature1": [0.1, 0.2, 0.3, 0.4],
    "feature2": [0.5, 0.6, 0.7, 0.8],
    "feature3": [0.9, 1.0, 1.1, 1.2]
})

output_csv = "reference_params.csv"
save_cusum_parameters_from_preprocessed_data(preprocessed_normal_data, output_csv)

CUSUM_PARAMETERS = load_cusum_parameters("reference_params.csv")
attack_data = pd.read_csv('../Preprocessing/preprocessed_attack_data.csv', encoding='utf-8')

# Separate labels from the dataset
attack_labels = attack_data.pop("Normal/Attack")
print(attack_labels.value_counts())
# Test CUSUM results
results = test_cusum_results(attack_data, CUSUM_PARAMETERS)

# Evaluate results
metrics = evaluate_cusum_results(results, attack_labels)
print("Evaluation Metrics:", metrics)


Parameters saved to reference_params.csv
Normal/Attack
Normal    7219
Attack    2781
Name: count, dtype: int64


ValueError: Found input variables with inconsistent numbers of samples: [10000, 0]

In [18]:
results.head()


Unnamed: 0,Feature,ReferenceValue,DriftThreshold,DecisionThreshold,MaxDeviation,DriftDetected,DecisionThresholdExceeded
