In [2]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import pandas as pd
import joblib

# Preprocessing data

In [4]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import pandas as pd
import joblib

def preprocess_and_save(input_file, output_file, encoder, scaler, categorical_columns, nrows=10000):
    """
    Preprocess a CSV file with already-fitted transformers and write the result to disk.

    The first and last columns of the input are carried through unchanged. Of the
    columns in between, those named in ``categorical_columns`` are passed to
    ``encoder.transform`` and every other column to ``scaler.transform``. The output
    column order is: first column, scaled columns, encoded columns, last column.

    Parameters:
        input_file (str): Path of the CSV file to read (UTF-8).
        output_file (str): Path of the CSV file to write (written without the index).
        encoder: Fitted encoder exposing ``transform`` and ``get_feature_names_out``.
            Its ``transform`` output is wrapped directly in a DataFrame, so it is
            assumed to be a dense 2-D array.
        scaler: Fitted scaler exposing ``transform``.
        categorical_columns (list[str]): Names of the columns to encode.
        nrows (int | None): Maximum number of data rows to read from ``input_file``.
            Defaults to 10000, the limit that was previously hard-coded; pass
            ``None`` to process the whole file.

    Returns:
        None. The result is written to ``output_file`` and a confirmation is printed.
    """
    # Load the dataset (optionally only its first `nrows` rows)
    df = pd.read_csv(input_file, encoding='utf-8', nrows=nrows)

    # Keep the first and last columns aside; they bypass encoding and scaling
    first_column = df.iloc[:, 0]
    last_column = df.iloc[:, -1]

    # Everything between the first and last column is a feature to transform
    data = df.iloc[:, 1:-1].copy()

    # Encode the categorical columns; reuse data.index so the later concat aligns row by row
    encoded_features = encoder.transform(data[categorical_columns])
    encoded_feature_names = encoder.get_feature_names_out(categorical_columns)
    encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names, index=data.index)

    # Scale every remaining feature column, preserving their original order
    non_categorical_columns = [col for col in data.columns if col not in categorical_columns]
    scaled_non_categorical = scaler.transform(data[non_categorical_columns])
    scaled_non_categorical_df = pd.DataFrame(scaled_non_categorical, columns=non_categorical_columns, index=data.index)

    # Reassemble: first column, scaled features, encoded features, last column
    final_data = pd.concat(
        [first_column, scaled_non_categorical_df, encoded_df, last_column],
        axis=1,
    )

    # Save the final data to a CSV file
    final_data.to_csv(output_file, index=False)

    print(f"Preprocessed data saved to {output_file}")


# Define file paths and categorical columns
# Raw inputs are read relative to the notebook's working directory.
normal_file = '../Normal.csv'
attack_file = '../Attack.csv'
# Preprocessed outputs are written next to the notebook.
normal_output = 'preprocessed_normal_data.csv'
attack_output = 'preprocessed_attack_data.csv'
# The fitted transformers are persisted under these names; the CUSUM cell later in
# the notebook loads files with the same names.
encoder_path = "onehot_encoder.pkl"
scaler_path = "minmax_scaler.pkl"
# Columns that get one-hot encoded; every other feature column is min-max scaled.
categorical_columns = ['MV101', 'P101', 'P102', 'MV201', 'P201',
                       'P202', 'P203', 'P204', 'P205', 'P206', 'MV301',
                       'MV302', 'MV303', 'MV304', 'P301', 'P302', 
                       'P401', 'P402', 'P403', 'P404', 'UV401', 'P501',
                       'P502', 'P601', 'P602', 'P603']

# Preprocess Normal.csv and train encoder and scaler
# The transformers are fitted on the whole of Normal.csv (no row limit on this read),
# whereas preprocess_and_save, as called below, reads at most the first 10000 rows.
df_normal = pd.read_csv(normal_file, encoding='utf-8')
# Drop the first and last columns: only the columns in between are used for fitting.
data = df_normal.iloc[:, 1:-1].copy()

# Train OneHotEncoder on categorical columns
# The category list is fixed to [0, 1, 2] for every categorical column rather than
# being inferred from the data.
categories = [list(range(3))] * len(categorical_columns) # builds the columns _0, _1 and _2
encoder = OneHotEncoder(sparse_output=False, drop=None, categories=categories)
encoder.fit(data[categorical_columns])

# Save the OneHotEncoder
joblib.dump(encoder, encoder_path)

# Train MinMaxScaler on non-categorical columns
# Column order follows `data`, i.e. the order in Normal.csv.
non_categorical_columns = [col for col in data.columns if col not in categorical_columns]
scaler = MinMaxScaler()
scaler.fit(data[non_categorical_columns])

# Save the MinMaxScaler
joblib.dump(scaler, scaler_path)

# Preprocess Normal.csv
# This re-reads the file from disk inside the function instead of reusing df_normal.
preprocess_and_save(normal_file, normal_output, encoder, scaler, categorical_columns)

# Preprocess Attack.csv using trained encoder and scaler
# NOTE(review): this call is commented out, yet the recorded cell output also reports
# preprocessed_attack_data.csv being saved. The output presumably comes from an
# earlier run with the call enabled; confirm and re-run the cell.
# preprocess_and_save(attack_file, attack_output, encoder, scaler, categorical_columns)


Preprocessed data saved to preprocessed_normal_data.csv
Preprocessed data saved to preprocessed_attack_data.csv


# CUSUM


In [24]:
# @title CUSUM
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import joblib
from sklearn.preprocessing import OneHotEncoder

def save_cusum_parameters(df, output_file, scaler_file="minmax_scaler.pkl", encoder_file="onehot_encoder.pkl", threshold_scale=3, categorical_columns=None, drift_scale=0.1):
    """
    Apply a pre-trained OneHotEncoder and MinMaxScaler to ``df`` and save per-feature
    CUSUM parameters (reference value, drift threshold, decision threshold) to a CSV.

    For every transformed feature the reference value is its mean, the drift
    threshold is ``drift_scale`` times its standard deviation and the decision
    threshold is ``threshold_scale`` times its standard deviation.

    Parameters:
        df (pandas.DataFrame): Raw data holding the columns the encoder and scaler
            are applied to. Extra columns are ignored when the scaler records the
            columns it was fitted on.
        output_file (str): Path to save the reference parameters as a CSV.
        scaler_file (str): Path to load the MinMaxScaler.
        encoder_file (str): Path to load the OneHotEncoder.
        threshold_scale (float): Scaling factor for the decision threshold based on standard deviation.
        categorical_columns (list[str] | None): Columns to one-hot encode. When
            omitted, the columns the encoder was fitted on (``feature_names_in_``)
            are used; if the encoder does not expose them, the previous default
            ``['P601', 'P602', 'P603']`` applies.
        drift_scale (float): Scaling factor for the drift threshold based on
            standard deviation. Defaults to 0.1, the value that was hard-coded.

    Returns:
        None. The parameters are written to ``output_file`` and a confirmation is printed.
    """
    # Load the pre-trained encoder and scaler.
    # joblib.load unpickles arbitrary Python objects: only load files you created or trust.
    encoder = joblib.load(encoder_file)
    scaler = joblib.load(scaler_file)

    # Identify categorical columns. Prefer what the encoder was actually fitted on so
    # this function cannot drift out of sync with the preprocessing step.
    if categorical_columns is None:
        fitted_categorical = getattr(encoder, "feature_names_in_", None)
        if fitted_categorical is not None:
            categorical_columns = list(fitted_categorical)
        else:
            categorical_columns = ['P601', 'P602', 'P603']
    else:
        categorical_columns = list(categorical_columns)

    # Identify numerical columns the same way; fall back to "everything that is not
    # categorical" when the scaler does not record its fitted column names.
    fitted_numerical = getattr(scaler, "feature_names_in_", None)
    if fitted_numerical is not None:
        numerical_columns = list(fitted_numerical)
    else:
        numerical_columns = [col for col in df.columns if col not in categorical_columns]

    # Apply One-Hot Encoding using the pre-trained encoder
    encoded_features = encoder.transform(df[categorical_columns])
    encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_columns))

    # Apply MinMax Scaling using the pre-trained scaler
    scaled_features = scaler.transform(df[numerical_columns])
    scaled_df = pd.DataFrame(scaled_features, columns=numerical_columns)

    # Combine encoded and scaled data (both frames carry a fresh default index, so rows line up)
    processed_df = pd.concat([encoded_df, scaled_df], axis=1)

    # Calculate parameters for each feature
    parameters = {
        "Feature": [],
        "ReferenceValue": [],
        "DriftThreshold": [],
        "DecisionThreshold": [],
    }

    for column_name in processed_df.columns:
        values = processed_df[column_name]
        spread = values.std()  # sample standard deviation, shared by both thresholds

        parameters["Feature"].append(column_name)
        parameters["ReferenceValue"].append(values.mean())
        parameters["DriftThreshold"].append(spread * drift_scale)
        parameters["DecisionThreshold"].append(spread * threshold_scale)

    # Save to CSV
    pd.DataFrame(parameters).to_csv(output_file, index=False)
    print(f"Parameters saved to {output_file}, using scaler from {scaler_file}, and encoder from {encoder_file}")


# Compute the CUSUM reference parameters and write them to reference_params.csv,
# loading the scaler and encoder from the same file names the preprocessing cell
# saves them under.
# NOTE(review): `df` is not assigned in any cell visible here, so this call relies
# on kernel state from elsewhere and may fail after Restart & Run All. Confirm
# which DataFrame is intended and load it explicitly before this call.
output_csv = "reference_params.csv"
scaler_file = "minmax_scaler.pkl"
encoder_file = "onehot_encoder.pkl"
save_cusum_parameters(df, output_csv, scaler_file, encoder_file)


Parameters saved to reference_params.csv, using scaler from minmax_scaler.pkl, and encoder from onehot_encoder.pkl
