In [None]:
#@title Imports

import json
import zipfile
import os
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from typing import List, Tuple, Dict
import logging
from joblib import Parallel, delayed
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Configure the root logger once for the whole notebook: INFO level, timestamped lines.
# Note: logging.basicConfig() is a no-op when the root logger already has handlers
# (pass force=True to override) — the functions below also print() their messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [None]:
#@title Load and pre-process data

def load_and_preprocess_data(dataset_choice: str = 'ransomware_headers') -> tuple[pd.DataFrame, pd.Series]:
    """
    Load and preprocess one of the supported datasets.

    Args:
        dataset_choice (str): Which dataset to load. ``'pe'`` and
            ``'ransomware_headers'`` both select the Ransomware_headers.csv
            dataset; ``'riss'`` selects the RISS dataset.

    Returns:
        Tuple[pd.DataFrame, pd.Series]: Features (X) and target variable (y),
        exactly as returned by the selected loader.

    Raises:
        ValueError: If ``dataset_choice`` is not one of the supported names.
    """
    # 'pe' is the short name used by the driver cells; 'ransomware_headers' is the
    # documented default. Both must dispatch to the same loader, otherwise calling
    # this function with no argument fails.
    if dataset_choice in ('pe', 'ransomware_headers'):
        return load_ransomware_headers()
    if dataset_choice == 'riss':
        return load_riss_dataset()
    raise ValueError(
        f"Invalid dataset choice {dataset_choice!r}. "
        "Choose 'pe' (alias 'ransomware_headers') or 'riss'."
    )

In [None]:
#@title Load PE Dataset

def load_ransomware_headers(csv_path: str = '/PE-Dataset/Ransomware_headers.csv') -> tuple[pd.DataFrame, pd.Series]:
    """Load and preprocess the Ransomware_headers.csv dataset.

    Args:
        csv_path (str): Location of the CSV file. Defaults to the path this
            notebook has always used, so existing calls are unaffected.

    Returns:
        Tuple[pd.DataFrame, pd.Series]: The result of ``preprocess_features``
        applied to the feature columns and the label column.
    """
    logging.info("Loading data from Ransomware_headers.csv")
    print("Loading data from Ransomware_headers.csv")
    df = pd.read_csv(csv_path)

    # Drop the columns at positions 0, 1 and 3 (presumably identifiers/metadata —
    # TODO confirm against the CSV header; adjust as needed).
    df = df.drop(df.columns[[0, 1, 3]], axis=1)

    # After the drop, the first remaining column (originally position 2) is the
    # target and everything to its right is a feature.
    X = df.iloc[:, 1:]
    y = df.iloc[:, 0]

    return preprocess_features(X, y)

In [None]:
#@title Load RISS Dataset

def load_riss_dataset() -> tuple[pd.DataFrame, pd.Series]:
    """Load and preprocess the RISS dataset.

    Clones the dataset repository into ``riss/`` when that directory is absent,
    extracts ``riss/RansomwareData.zip`` into ``riss/data/`` when the CSV is not
    there yet, names the columns from ``riss/VariableNames.txt`` and hands the
    feature/label split to ``preprocess_features``.

    Returns:
        Tuple[pd.DataFrame, pd.Series]: Features (X) and target variable (y).

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits with a non-zero status.
        FileNotFoundError: If ``git`` or one of the expected dataset files is missing.
    """
    import subprocess  # local import: only needed for the one-off clone

    logging.info("Loading data from RISS dataset")
    print("Loading data from RISS dataset")

    # Clone the repository if it doesn't exist. check=True surfaces a failed clone
    # here instead of as a confusing missing-file error further down.
    if not os.path.exists('riss'):
        subprocess.run(
            ['git', 'clone', 'https://github.com/rissgrouphub/ransomwaredataset2016', 'riss'],
            check=True,
        )

    zip_file_path = 'riss/RansomwareData.zip'
    extracted_folder = 'riss/data/'
    csv_path = 'riss/data/RansomwareData.csv'

    # Extract when the CSV itself is missing (not merely the folder), so an
    # interrupted earlier extraction is repaired on the next call.
    if not os.path.exists(csv_path):
        os.makedirs(extracted_folder, exist_ok=True)
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extracted_folder)

    # Load header mapping: each line is "<1-based index>;<column name>", where the
    # name itself may contain ';'.
    header_file = 'riss/VariableNames.txt'
    header_mapping = {}
    with open(header_file, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines instead of failing on int('')
            index, _, column_name = line.partition(';')
            header_mapping[int(index) - 1] = column_name

    # Load the dataset; columns without a mapping entry get a placeholder name.
    df = pd.read_csv(csv_path, header=None)
    df.columns = [header_mapping.get(i, f'Unknown_{i}') for i in range(len(df.columns))]

    # Drop the first column, then the column that is second *after* that drop
    # (i.e. original positions 0 and 2); original position 1 remains as the label.
    df = df.drop(df.columns[0], axis=1)
    df = df.drop(df.columns[1], axis=1)

    # Separate features and target
    X = df.iloc[:, 1:]  # The features start from column 1
    y = df.iloc[:, 0]   # The label is in column 0

    return preprocess_features(X, y)

In [None]:
#@title Preprocess Features

def preprocess_features(X: pd.DataFrame, y: pd.Series) -> tuple[pd.DataFrame, pd.Series]:
    """Impute missing feature values (if any) and standardize every feature.

    Args:
        X (pd.DataFrame): Raw feature matrix.
        y (pd.Series): Target variable; returned unchanged.

    Returns:
        Tuple[pd.DataFrame, pd.Series]: The standardized features, carrying the
        same row index as the input ``X``, and the untouched ``y``.

    Note:
        The imputer and scaler are fitted on all rows passed in. When this runs
        before a train/test split (as the loaders in this notebook do), test-set
        statistics influence the transformation.
    """
    # Check for missing values
    missing_values = X.isnull().sum()
    if missing_values.sum() > 0:
        logging.warning(f"Missing values found:\n{missing_values[missing_values > 0]}")
        print(f"Missing values found:\n{missing_values[missing_values > 0]}")

        # With its default settings, mean imputation discards columns that have no
        # observed value at all, which would leave fewer output columns than
        # X.columns. Drop them explicitly so names and data stay in step.
        empty_columns = X.columns[X.isnull().all(axis=0)]
        if len(empty_columns) > 0:
            logging.warning(f"Dropping {len(empty_columns)} column(s) with no observed values")
            X = X.drop(columns=empty_columns)

        # Impute missing values
        imputer = SimpleImputer(strategy='mean')
        X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

    # Normalize features. Keep the original row index so X stays aligned with y
    # for label-based operations downstream.
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

    logging.info(f"Data preprocessed. Shape: {X.shape}")
    print(f"Data preprocessed. Shape: {X.shape}")
    return X, y

In [None]:
#@title Find Optimal Features

def find_optimal_features(X: pd.DataFrame, y: pd.Series, clf, max_features: int = 1000, step: int = 50) -> int:
    """
    Find the optimal number of features using cross-validation.

    Candidate counts are ``step, 2*step, ...`` up to ``min(n_features, max_features)``.
    When fewer than ``step`` features are usable, that upper bound is evaluated as
    the single candidate instead of failing on an empty search. For every candidate
    the top-k features by mutual information are selected and ``clf`` is scored
    with 5-fold cross-validated accuracy. An accuracy-vs-k plot is written to
    ``feature_selection_plot.eps``.

    Args:
        X (pd.DataFrame): Features.
        y (pd.Series): Target variable.
        clf: Classifier to use for evaluation.
        max_features (int): Maximum number of features to consider.
        step (int): Step size for feature count.

    Returns:
        int: Optimal number of features (the candidate with the highest mean accuracy;
        the smallest such candidate on ties).

    Raises:
        ValueError: If ``step`` is not positive or no feature is available.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")
    upper = min(X.shape[1], max_features)
    if upper < 1:
        raise ValueError("At least one feature is required to search for an optimal feature count")

    logging.info("Finding optimal number of features")
    print("Finding optimal number of features")

    feature_counts = list(range(step, upper + 1, step))
    if not feature_counts:
        # Fewer usable features than one step: np.argmax on an empty score list
        # would raise, so fall back to evaluating all usable features.
        feature_counts = [upper]

    mean_scores = []
    std_scores = []

    # NOTE(review): features are selected on the full data before cross-validation,
    # so the held-out folds influence the selection and the scores are optimistic.
    for k in feature_counts:
        selector = SelectKBest(score_func=mutual_info_classif, k=k)
        X_selected = selector.fit_transform(X, y)
        scores = cross_val_score(clf, X_selected, y, cv=5, scoring='accuracy')
        mean_scores.append(scores.mean())
        std_scores.append(scores.std())
        logging.info(f"Features: {k}, Mean accuracy: {scores.mean():.4f}, Std: {scores.std():.4f}")
        print(f"Features: {k}, Mean accuracy: {scores.mean():.4f}, Std: {scores.std():.4f}")

    optimal_k = int(feature_counts[int(np.argmax(mean_scores))])

    # Plot the results
    plt.figure(figsize=(10, 6))
    plt.errorbar(feature_counts, mean_scores, yerr=std_scores, capsize=5)
    plt.xlabel('Feature Count', fontsize=12)
    plt.ylabel('Cross-validation Accuracy', fontsize=12)
    plt.axvline(x=optimal_k, color='r', linestyle='--', label=f'Optimal features: {optimal_k}')
    plt.legend()
    plt.grid(True)
    plt.savefig('feature_selection_plot.eps')
    plt.close()

    logging.info(f"Optimal number of features: {optimal_k}")
    print(f"Optimal number of features: {optimal_k}")
    return optimal_k

In [None]:
#@title Perform Feature Selection

def perform_feature_selection(X: pd.DataFrame, y: pd.Series, k: int) -> pd.DataFrame:
    """Keep the ``k`` features with the highest mutual information with ``y``.

    Args:
        X (pd.DataFrame): Features.
        y (pd.Series): Target variable.
        k (int): Number of features to keep; capped at the number of columns in ``X``.

    Returns:
        pd.DataFrame: The selected columns of ``X``, with their original names
        and the original row index.
    """
    logging.info(f"Performing feature selection. Selecting top {k} features")
    print(f"Performing feature selection. Selecting top {k} features")
    selector = SelectKBest(score_func=mutual_info_classif, k=min(k, X.shape[1]))
    X_new = selector.fit_transform(X, y)
    selected_features = X.columns[selector.get_support()]
    print("Feature selection complete")
    # Re-attach X's index: the selector returns a bare array, and a fresh 0..n-1
    # index would silently misalign the result with y for non-default indexes.
    return pd.DataFrame(X_new, columns=selected_features, index=X.index)

In [None]:
#@title Center Attack

def center_attack(X: pd.DataFrame, y: pd.Series, percent: float) -> pd.Series:
    """
    Perform a center attack by flipping labels of samples closest to class centers.

    For every class, the class centre is the mean feature vector of its members;
    the ``int(n_members * percent)`` members with the smallest Euclidean distance
    to that centre get their label replaced by ``1 - label``. The flip therefore
    assumes binary labels encoded as 0/1.

    Args:
        X (pd.DataFrame): Features; rows must be in the same order as ``y``.
        y (pd.Series): Target variable (0/1). Not modified.
        percent (float): Fraction of each class's labels to flip.

    Returns:
        pd.Series: Attacked labels (a copy of ``y`` with the flips applied).
    """
    if percent <= 0.0:
        return y.copy()

    logging.info(f"Performing center attack with {percent:.2%} label flipping")
    print(f"Performing center attack with {percent:.2%} label flipping")
    y_attacked = y.copy()
    label_values = y.to_numpy()

    for label in np.unique(label_values):
        # Positional mask: does not rely on X and y sharing index labels, and the
        # label value is never used as an array index (float labels such as
        # 0.0/1.0, or a single class 1, used to raise IndexError).
        member_mask = label_values == label
        members = X[member_mask]
        center = members.mean(axis=0)
        distances = np.linalg.norm(members - center, axis=1)

        indices = y.index[member_mask]
        num_to_flip = int(len(indices) * percent)
        flip_indices = indices[np.argsort(distances)[:num_to_flip]]
        y_attacked.loc[flip_indices] = 1 - y_attacked.loc[flip_indices]

    return y_attacked

In [None]:
#@title Random Attack

def random_attack(y: pd.Series, percent: float, random_state=None) -> pd.Series:
    """
    Perform a random attack by flipping random labels.

    ``int(len(y) * percent)`` distinct samples are drawn without replacement and
    their label is replaced by ``1 - label``, so labels are assumed to be 0/1.

    Args:
        y (pd.Series): Target variable (0/1). Not modified.
        percent (float): Fraction of labels to flip, between 0 and 1.
        random_state: ``None`` (default) draws from NumPy's global RNG, as before;
            an int seeds a private ``np.random.RandomState``; any object with a
            ``choice`` method (``RandomState``, ``Generator``) is used directly.
            Useful inside worker processes, where the global RNG is not seeded
            by the parent's ``np.random.seed``.

    Returns:
        pd.Series: Attacked labels.

    Raises:
        ValueError: If ``percent`` is outside ``[0, 1]``.
    """
    if not 0.0 <= percent <= 1.0:
        raise ValueError(f"percent must be between 0 and 1, got {percent!r}")

    if random_state is None:
        rng = np.random
    elif hasattr(random_state, 'choice'):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    logging.info(f"Performing random attack with {percent:.2%} label flipping")
    print(f"Performing random attack with {percent:.2%} label flipping")
    y_attacked = y.copy()
    num_to_flip = int(len(y) * percent)
    flip_indices = rng.choice(y.index, num_to_flip, replace=False)
    y_attacked.loc[flip_indices] = 1 - y_attacked.loc[flip_indices]
    return y_attacked

In [None]:
#@title Evaluate Model

def evaluate_model(clf, X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series) -> Dict[str, float]:
    """
    Evaluate an already-fitted model on the test set.

    Args:
        clf: Trained classifier.
        X_train (pd.DataFrame): Training features. Accepted for call-site symmetry;
            not used by the evaluation.
        y_train (pd.Series): Training labels. Accepted for call-site symmetry;
            not used by the evaluation.
        X_test (pd.DataFrame): Test features.
        y_test (pd.Series): Test labels.

    Returns:
        Dict[str, float]: ``accuracy``, ``f1``, ``precision``, ``recall`` and ``auc``
        computed on ``(X_test, y_test)``.
    """
    y_pred = clf.predict(X_test)

    # AUC needs a continuous score. Prefer the positive-class probability; fall
    # back to the decision margin for classifiers that do not expose
    # predict_proba (e.g. an SVC built without probability=True).
    if hasattr(clf, 'predict_proba'):
        y_score = clf.predict_proba(X_test)[:, 1]
    else:
        y_score = clf.decision_function(X_test)

    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, y_score)
    }

In [None]:
#@title Save Results to JSON

import json
import logging
from datetime import datetime

def save_results_to_json(results: Dict, noise_levels: List[float], dataset_name: str, output_dir: str = '/results') -> str:
    """Save experiment results to a JSON file with a timestamp in the file name.

    The file is written as ``<output_dir>/<dataset_name>_results_<YYYYmmdd_HHMMSS>.json``
    and holds the keys ``dataset_name``, ``noise_levels`` and ``results``.

    Args:
        results (Dict): Experiment results; must be JSON-serializable.
        noise_levels (List[float]): Noise levels the experiment was run with.
        dataset_name (str): Dataset identifier, stored in the file and used in its name.
        output_dir (str): Directory to write into; created if missing.

    Returns:
        str: Path of the file that was written.
    """
    # Generate a timestamp string
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Create the JSON structure
    json_results = {
        "dataset_name": dataset_name,
        "noise_levels": noise_levels,
        "results": results
    }

    # Make sure the target directory exists: this runs at the very end of a long
    # experiment, and a missing directory would otherwise discard all results.
    os.makedirs(output_dir, exist_ok=True)

    # Save the results to a file with the timestamp
    file_name = os.path.join(output_dir, f'{dataset_name}_results_{timestamp}.json')
    with open(file_name, 'w') as f:
        json.dump(json_results, f)

    # Log and print the save location
    logging.info(f"Results saved to {file_name}")
    print(f"Results saved to {file_name}")
    return file_name

In [None]:
#@title Load Results from JSON

def load_results_from_json(filename: str) -> Tuple[Dict, List[float], str]:
    """Read a results file and return ``(results, noise_levels, dataset_name)``.

    The file must be a JSON object containing the keys ``results``,
    ``noise_levels`` and ``dataset_name``.
    """
    with open(filename, 'r') as handle:
        payload = json.load(handle)
    results, noise_levels, dataset_name = (
        payload[key] for key in ("results", "noise_levels", "dataset_name")
    )
    return results, noise_levels, dataset_name

In [None]:
#@title Plot Results

def plot_results(results: Dict, noise_levels: List[float], dataset_name: str):
    """Plot the results of the experiment, one figure per classifier and metric.

    Each figure shows mean ± standard deviation of the metric over the repeats,
    against the noise level, with one curve for the random attack and one for
    the center attack. Figures are saved in the working directory as
    ``<dataset_name>_<clf_name>_<metric>_chart.eps``.

    Args:
        results (Dict): ``results[clf_name][attack_type][str(noise_level)]`` is a
            list of metric dicts, one per repeat.
        noise_levels (List[float]): Noise levels to plot on the x-axis.
        dataset_name (str): Prefix for the output file names.
    """
    metrics = ['accuracy', 'f1', 'precision', 'recall', 'auc']

    for clf_name in results:
        for metric in metrics:
            # tight_layout() below is what arranges the figure, so constrained
            # layout is not requested here as well: asking for both is
            # contradictory and tight_layout() overrides it anyway.
            fig, ax = plt.subplots(figsize=(5, 3.4))

            for attack_type in ['random', 'center']:
                runs_by_level = results[clf_name][attack_type]
                # Collect the per-repeat values once and derive both statistics from them.
                samples = [[r[metric] for r in runs_by_level[str(nl)]] for nl in noise_levels]
                mean_values = [np.mean(values) for values in samples]
                std_values = [np.std(values) for values in samples]

                ax.errorbar(noise_levels, mean_values, yerr=std_values, capsize=5,
                            label=f'{attack_type.capitalize()} Attack')

            ax.set_xlabel('Noise Level', fontsize=12)
            ax.set_ylabel(metric.capitalize(), fontsize=12)
            ax.set_title(f'{metric.capitalize()} vs Noise Level for {clf_name}', fontsize=14)
            ax.legend(fontsize=10)
            ax.grid(True)
            ax.tick_params(axis='both', which='major', labelsize=10)

            fig.tight_layout()
            fig.savefig(f'{dataset_name}_{clf_name}_{metric}_chart.eps', dpi=300, bbox_inches='tight')
            # Close this specific figure so figures do not accumulate across the loops.
            plt.close(fig)

In [None]:
#@title Plot Results Compact

def plot_results_compact(results: Dict, noise_levels: List[float], dataset_name: str):
    """Plot all five metrics of each classifier on a single 2x3 figure.

    Every panel shows mean ± standard deviation over the repeats against the noise
    level, with one curve per attack type; the unused sixth panel is removed. Each
    figure is saved in the working directory as ``<dataset_name>_<tag>_charts.eps``
    and ``.pdf``, where ``<tag>`` is a short name for the known classifiers and a
    sanitized form of the classifier name otherwise.

    Args:
        results (Dict): ``results[clf_name][attack_type][str(noise_level)]`` is a
            list of metric dicts, one per repeat.
        noise_levels (List[float]): Noise levels to plot on the x-axis.
        dataset_name (str): Prefix for the output file names.
    """
    metrics = ['accuracy', 'f1', 'precision', 'recall', 'auc']
    short_names = {
        'SVM (Linear)': 'svml',
        'SVM (RBF)': 'svmr',
        'Logistic Regression': 'lr',
        'Neural Network': 'nn',
    }

    for clf_name in results:
        # tight_layout() below arranges the panels; constrained layout is not
        # requested as well because the two mechanisms conflict.
        fig, axes = plt.subplots(2, 3, figsize=(10, 6))
        axes = axes.flatten()

        for ax, metric in zip(axes, metrics):
            for attack_type in ['random', 'center']:
                runs_by_level = results[clf_name][attack_type]
                samples = [[r[metric] for r in runs_by_level[str(nl)]] for nl in noise_levels]
                mean_values = [np.mean(values) for values in samples]
                std_values = [np.std(values) for values in samples]
                ax.errorbar(noise_levels, mean_values, yerr=std_values, capsize=5,
                            label=f'{attack_type.capitalize()} Attack')
            ax.set_xlabel('Noise Level', fontsize=12)
            ax.set_ylabel(metric.capitalize(), fontsize=12)
            ax.set_title(f'{metric.capitalize()} vs Noise Level', fontsize=14)
            ax.legend(fontsize=10)
            ax.grid(True)
            ax.tick_params(axis='both', which='major', labelsize=10)

        # Remove the panel(s) that have no metric assigned (the sixth one).
        for unused_ax in axes[len(metrics):]:
            fig.delaxes(unused_ax)

        fig.suptitle(f'Performance Metrics for {clf_name}', fontsize=16)
        fig.tight_layout()

        # Unknown classifiers used to map to an empty tag, so all of them wrote
        # (and overwrote) '<dataset>__charts.*'. Derive a tag from the name instead.
        file_clf_name = short_names.get(clf_name)
        if not file_clf_name:
            file_clf_name = '_'.join(
                ''.join(ch if ch.isalnum() else ' ' for ch in clf_name.lower()).split()
            )

        fig.savefig(f'{dataset_name}_{file_clf_name}_charts.eps', dpi=300, bbox_inches='tight')
        fig.savefig(f'{dataset_name}_{file_clf_name}_charts.pdf', dpi=300, bbox_inches='tight')
        plt.close(fig)

In [None]:
#@title Plot Results Compact Both

def plot_results_compact_both(results: Dict, results2: Dict, noise_levels: List[float], dataset_name: str, dataset_name2: str, output_dir: str = '/charts'):
    """Plot two datasets' results together, one figure per classifier and metric.

    Each figure has four curves (random/center attack for each dataset) showing
    mean ± standard deviation over the repeats against the noise level. Figures are
    saved as ``<output_dir>/<tag>_charts_<metric>.pdf`` and ``.eps``, where ``<tag>``
    is a short name for the known classifiers and a sanitized form of the classifier
    name otherwise.

    Args:
        results (Dict): Results of the first dataset;
            ``results[clf_name][attack_type][str(noise_level)]`` is a list of metric dicts.
        results2 (Dict): Results of the second dataset, same structure. Must contain
            every classifier present in ``results``.
        noise_levels (List[float]): Noise levels to plot; used for both datasets.
        dataset_name (str): Legend label for the first dataset.
        dataset_name2 (str): Legend label for the second dataset.
        output_dir (str): Directory to write the charts into; created if missing.
    """
    metrics = ['accuracy', 'f1', 'precision', 'recall', 'auc']
    short_names = {
        'SVM (Linear)': 'svml',
        'SVM (RBF)': 'svmr',
        'Logistic Regression': 'lr',
        'Neural Network': 'nn',
    }

    # savefig does not create directories; make sure the target exists up front.
    os.makedirs(output_dir, exist_ok=True)

    for clf_name in results:
        # Unknown classifiers used to map to an empty tag and overwrite each
        # other's files; derive a tag from the name instead.
        file_clf_name = short_names.get(clf_name)
        if not file_clf_name:
            file_clf_name = '_'.join(
                ''.join(ch if ch.isalnum() else ' ' for ch in clf_name.lower()).split()
            )

        for metric in metrics:
            fig, ax = plt.subplots(figsize=(6, 4))
            for attack_type in ['random', 'center']:
                # Same drawing order as before: first dataset, then second dataset.
                for label_name, source in ((dataset_name, results), (dataset_name2, results2)):
                    runs_by_level = source[clf_name][attack_type]
                    samples = [[r[metric] for r in runs_by_level[str(nl)]] for nl in noise_levels]
                    mean_values = [np.mean(values) for values in samples]
                    std_values = [np.std(values) for values in samples]
                    ax.errorbar(noise_levels, mean_values, yerr=std_values, capsize=5,
                                label=f'{attack_type.capitalize()} Attack {label_name}')

            ax.set_xlabel('Noise Level', fontsize=12)
            ax.set_ylabel(metric.capitalize(), fontsize=12)
            ax.set_title(f'{metric.capitalize()} vs Noise Level', fontsize=14)
            ax.legend(fontsize=10)
            ax.grid(True)
            ax.tick_params(axis='both', which='major', labelsize=10)

            fig.tight_layout()

            # Save each figure individually, in both formats.
            base_path = os.path.join(output_dir, f'{file_clf_name}_charts_{metric}')
            fig.savefig(f'{base_path}.pdf', dpi=300, bbox_inches='tight')
            fig.savefig(f'{base_path}.eps', dpi=300, bbox_inches='tight')
            plt.close(fig)

In [None]:
#@title Run Experiment

def run_experiment(X: pd.DataFrame, y: pd.Series, classifiers: list[dict], noise_levels: list[float], n_repeats: int = 5, dataset_name: str = 'riss'):
    """Run the main label-flipping experiment and save the results as JSON.

    For every classifier and every repeat: draw a stratified 80/20 split, tune the
    hyper-parameters on the clean training labels (when a ``param_grid`` is given),
    then, for each noise level, refit the tuned model on randomly attacked labels
    and on center-attacked labels and evaluate both on the clean test set.

    Args:
        X (pd.DataFrame): Features.
        y (pd.Series): Target variable (0/1).
        classifiers (list[dict]): One dict per classifier with keys ``'name'``,
            ``'clf'`` and optionally ``'param_grid'``.
        noise_levels (list[float]): Fractions of training labels to flip.
        n_repeats (int): Number of independent train/test splits per classifier.
        dataset_name (str): Passed to ``save_results_to_json``.

    Returns:
        dict: ``results[clf_name][attack_type][str(noise_level)]`` is the list of
        metric dicts (one per repeat). The same structure is written to disk.
    """
    results = {}
    print("Main experiment started")

    # Draw every split seed here, in the calling process, where np.random.seed()
    # from the notebook takes effect. Worker processes start with their own,
    # unseeded global RNG, so drawing seeds inside them is not reproducible.
    seed_table = np.random.randint(0, 1000, size=(len(classifiers), n_repeats))

    for clf_config, clf_seeds in zip(classifiers, seed_table):
        clf_name = clf_config['name']
        print(f"Performing attacks on {clf_name}")
        clf = clf_config['clf']
        param_grid = clf_config.get('param_grid', {})

        logging.info(f"Running experiment for {clf_name}")
        results[clf_name] = {'random': {str(nl): [] for nl in noise_levels}, 'center': {str(nl): [] for nl in noise_levels}}

        def run_iteration(seed):
            # Seed the global RNG of whichever process runs this iteration so that
            # random_attack (which draws from np.random) is reproducible as well.
            np.random.seed(seed)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed)
            if param_grid:
                grid_search = GridSearchCV(clf, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
                grid_search.fit(X_train, y_train)
                clf_tuned = grid_search.best_estimator_
            else:
                clf_tuned = clf.fit(X_train, y_train)

            iteration_results = {}
            for noise_level in noise_levels:
                # Refit on the poisoned labels, evaluate on the clean test set.
                y_random = random_attack(y_train, noise_level)
                clf_tuned.fit(X_train, y_random)
                iteration_results[('random', str(noise_level))] = evaluate_model(clf_tuned, X_train, y_random, X_test, y_test)

                y_center = center_attack(X_train, y_train, noise_level)
                clf_tuned.fit(X_train, y_center)
                iteration_results[('center', str(noise_level))] = evaluate_model(clf_tuned, X_train, y_center, X_test, y_test)

            return iteration_results

        all_results = Parallel(n_jobs=-1)(delayed(run_iteration)(int(seed)) for seed in clf_seeds)

        for iteration_result in all_results:
            for (attack_type, noise_level), eval_result in iteration_result.items():
                results[clf_name][attack_type][noise_level].append(eval_result)

    # Plots are produced separately from the saved JSON file.
    save_results_to_json(results, noise_levels, dataset_name)
    return results
    #plot_results_compact(results, noise_levels, dataset_name)

In [None]:
#@title Fast Run Experiment

import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from joblib import Parallel, delayed

# Assume random_attack, improved_center_attack_weighted, evaluate_model are defined elsewhere

def run_experiment_fast(X: pd.DataFrame, y: pd.Series, classifiers: list[dict], noise_levels: list[float], n_repeats: int = 5, dataset_name: str = 'riss'):
    """Run the experiment with deterministic splits and the weighted center attack.

    For every classifier and every repeat: draw a stratified 80/20 split seeded with
    ``42 + repeat_index``, tune the hyper-parameters once on the clean training
    labels (when a ``param_grid`` is given), then, for each noise level, refit the
    tuned model on the attacked labels and evaluate it on the clean test set.

    The model must be refit per noise level: ``evaluate_model`` only scores
    predictions on the test set and ignores the training labels it is handed, so
    evaluating the clean model would yield identical numbers for every noise level
    and attack type.

    Args:
        X (pd.DataFrame): Features.
        y (pd.Series): Target variable (0/1).
        classifiers (list[dict]): One dict per classifier with keys ``'name'``,
            ``'clf'`` and optionally ``'param_grid'``.
        noise_levels (list[float]): Fractions of training labels to flip.
        n_repeats (int): Number of train/test splits per classifier.
        dataset_name (str): Passed to ``save_results_to_json``.

    Returns:
        dict: ``results[clf_name][attack_type][str(noise_level)]`` is the list of
        metric dicts (one per repeat). The same structure is written to disk.
    """
    results = {}
    print("Main experiment started")
    for clf_config in classifiers:
        clf_name = clf_config['name']
        print(f"Performing attacks on {clf_name}")
        clf = clf_config['clf']
        param_grid = clf_config.get('param_grid', {})
        logging.info(f"Running experiment for {clf_name}")
        results[clf_name] = {'random': {str(nl): [] for nl in noise_levels}, 'center': {str(nl): [] for nl in noise_levels}}

        def run_iteration(repeat_index):
            # One fixed seed per repeat: used for the split and for the global RNG of
            # whichever process runs this iteration (random_attack draws from it;
            # presumably the weighted center attack does too — TODO confirm).
            seed = 42 + repeat_index
            np.random.seed(seed)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed)

            # Tune hyper-parameters once per repeat, on the clean labels.
            if param_grid:
                grid_search = GridSearchCV(clf, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
                grid_search.fit(X_train, y_train)
                clf_tuned = grid_search.best_estimator_
            else:
                clf_tuned = clf.fit(X_train, y_train)

            iteration_results = {}
            for noise_level in noise_levels:
                # Random attack: refit on the poisoned labels, then evaluate.
                y_random = random_attack(y_train, noise_level)
                clf_tuned.fit(X_train, y_random)
                iteration_results[('random', str(noise_level))] = evaluate_model(clf_tuned, X_train, y_random, X_test, y_test)

                # Center attack: refit on the poisoned labels, then evaluate.
                y_center = improved_center_attack_weighted(X_train, y_train, noise_level, center_method='median', distance_metric='cosine', global_percentage=True, random_selection=True, weighting_method='variance')
                clf_tuned.fit(X_train, y_center)
                iteration_results[('center', str(noise_level))] = evaluate_model(clf_tuned, X_train, y_center, X_test, y_test)

            return iteration_results

        # Pass the repeat index so every repeat gets its own deterministic seed.
        all_results = Parallel(n_jobs=-1)(delayed(run_iteration)(i) for i in range(n_repeats))

        for iteration_result in all_results:
            for (attack_type, noise_level), eval_result in iteration_result.items():
                results[clf_name][attack_type][noise_level].append(eval_result)

    # Plots are produced separately from the saved JSON file.
    save_results_to_json(results, noise_levels, dataset_name)
    return results

In [None]:
# Seed NumPy's global RNG so the steps in this cell are repeatable
np.random.seed(42)

# Dataset to load in this cell: the RISS dataset
dataset_choice = 'riss'

# Load and preprocess data (X: standardized features, y: labels)
X, y = load_and_preprocess_data(dataset_choice)

# Search for the feature count with the best cross-validated accuracy,
# using logistic regression as the evaluation classifier
base_clf = LogisticRegression(random_state=42)  # You can choose any classifier here
optimal_k = find_optimal_features(X, y, base_clf)

Loading data from RISS dataset
Data preprocessed. Shape: (1524, 30967)
Finding optimal number of features
Features: 50, Mean accuracy: 0.8825, Std: 0.0092
Features: 100, Mean accuracy: 0.9075, Std: 0.0147
Features: 150, Mean accuracy: 0.9265, Std: 0.0116
Features: 200, Mean accuracy: 0.9331, Std: 0.0092
Features: 250, Mean accuracy: 0.9331, Std: 0.0148
Features: 300, Mean accuracy: 0.9377, Std: 0.0116
Features: 350, Mean accuracy: 0.9613, Std: 0.0094
Features: 400, Mean accuracy: 0.9370, Std: 0.0148
Features: 450, Mean accuracy: 0.9239, Std: 0.0158
Features: 500, Mean accuracy: 0.9436, Std: 0.0113
Features: 550, Mean accuracy: 0.9442, Std: 0.0215
Features: 600, Mean accuracy: 0.9495, Std: 0.0061
Features: 650, Mean accuracy: 0.9462, Std: 0.0154
Features: 700, Mean accuracy: 0.9475, Std: 0.0154
Features: 750, Mean accuracy: 0.9449, Std: 0.0120
Features: 800, Mean accuracy: 0.9423, Std: 0.0152
Features: 850, Mean accuracy: 0.9449, Std: 0.0143
Features: 900, Mean accuracy: 0.9416, Std: 0.



Features: 1000, Mean accuracy: 0.9665, Std: 0.0124
Optimal number of features: 1000


In [None]:
#@title Main

# Seed NumPy's global RNG for reproducibility
np.random.seed(42)

# Dataset to run the experiment on ('pe' or 'riss')
dataset_choice = 'pe'

# Load and preprocess data (X: standardized features, y: labels)
X, y = load_and_preprocess_data(dataset_choice)

# Number of features to keep. The search is disabled here and the count is
# hard-coded: 800 for 'pe', 1000 otherwise. base_clf is only needed when the
# find_optimal_features call is re-enabled.
base_clf = LogisticRegression(random_state=42)  # You can choose any classifier here
optimal_k = 800 if dataset_choice == 'pe' else 1000 # find_optimal_features(X, y, base_clf)

# Keep the optimal_k features with the highest mutual information
X_selected = perform_feature_selection(X, y, k=optimal_k)

# Classifiers under test. Each entry has a display 'name', the estimator 'clf' and
# the 'param_grid' searched by GridSearchCV inside run_experiment.
# The SVCs use probability=True because evaluate_model calls predict_proba for the AUC.
classifiers = [
    # {'name': 'Naive Bayes', 'clf': GaussianNB()},
    {'name': 'SVM (Linear)', 'clf': SVC(kernel='linear', probability=True),
      'param_grid': {'C': [0.1, 1, 10]}},
    {'name': 'SVM (RBF)', 'clf': SVC(kernel='rbf', probability=True),
      'param_grid': {'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']}},
    {'name': 'Logistic Regression', 'clf': LogisticRegression(random_state=42),
      'param_grid': {'C': [0.1, 1, 10]}},
    {'name': 'Neural Network', 'clf': MLPClassifier(random_state=42),
      'param_grid': {'hidden_layer_sizes': [(50,), (100,), (50, 50)], 'alpha': [0.0001, 0.001, 0.01]}}
]

# Fractions of training labels to flip (0.0 is the clean baseline)
noise_levels = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3]

# Run the experiment with 5 repeats per classifier; results are saved as JSON
run_experiment(X_selected, y, classifiers, noise_levels, 5, dataset_choice)

Loading data from Ransomware_headers.csv
Data preprocessed. Shape: (2157, 1024)
Performing feature selection. Selecting top 800 features
Feature selection complete
Main experiment started
Performing attacks on SVM (Linear)
Performing attacks on SVM (RBF)
Performing attacks on Logistic Regression
Performing attacks on Neural Network
Results saved to /content/drive/MyDrive/Thesis/results/pe_results_20241224_074226.json


In [None]:
#@title Generate Plots from Saved Result

# Generate plots from previously saved results without re-running the experiment.
# The first file holds the 'pe' run, the second the 'riss' run.
results, noise_levels, dataset_name = load_results_from_json('/results/pe_results_20241224_074226.json')
results2, noise_levels2, dataset_name2 = load_results_from_json('/results/riss_results_20241224_063619.json')

# Combined charts for both datasets. The noise levels of the first file are used
# for both; the legend labels are given explicitly instead of the stored dataset names.
plot_results_compact_both(results, results2, noise_levels, 'PE', 'RISS')
# plot_results_compact(results, noise_levels, dataset_name)

