In [3]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import pandas as pd
import joblib

# Preprocessing data

In [4]:
# Names of the columns that are treated as categorical features (they are
# one-hot encoded further down). Laid out one row per leading digit of the
# tag number; the overall order of the list is unchanged.
categorical_columns = [
    'MV101', 'P101', 'P102',
    'MV201', 'P201', 'P202', 'P203', 'P204', 'P205', 'P206',
    'MV301', 'MV302', 'MV303', 'MV304', 'P301', 'P302',
    'P401', 'P402', 'P403', 'P404', 'UV401',
    'P501', 'P502',
    'P601', 'P602', 'P603',
]

In [5]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import pandas as pd
import joblib

def preprocess_and_save(input_file, output_file, encoder, scaler, categorical_columns, nrows=10000):
    """
    Encode, scale and re-assemble a CSV file, then write the result to disk.

    The first and last columns of the input are passed through untouched.
    Of the columns in between, those named in ``categorical_columns`` are
    transformed with ``encoder`` and every remaining column with ``scaler``.
    Both transformers must already be fitted: this function only calls
    ``transform`` on them, it never fits.

    Column order of the written file: first input column, scaled
    non-categorical columns (in input order), encoded categorical columns
    (named by ``encoder.get_feature_names_out``), last input column.

    Parameters
    ----------
    input_file : str or path-like
        CSV file to read (decoded as UTF-8).
    output_file : str or path-like
        Destination CSV file; written without the index column.
    encoder : object
        Fitted transformer exposing ``transform(X)`` and
        ``get_feature_names_out(input_features)``.
    scaler : object
        Fitted transformer exposing ``transform(X)``.
    categorical_columns : list of str
        Names of the columns to hand to ``encoder``.
    nrows : int or None, default 10000
        Maximum number of data rows read from ``input_file``. The default
        keeps the historical behaviour of processing only the first 10000
        rows; pass ``None`` to process the whole file.

    Returns
    -------
    None
        The result is only written to ``output_file``; a confirmation line
        is printed.
    """
    # Load the dataset (optionally truncated to the first `nrows` rows)
    df = pd.read_csv(input_file, encoding='utf-8', nrows=nrows)

    # Keep the first and last columns aside; they are re-attached unchanged
    first_column = df.iloc[:, 0]
    last_column = df.iloc[:, -1]

    # Everything in between is the feature matrix to transform
    data = df.iloc[:, 1:-1].copy()

    # One-hot encode the categorical columns
    encoded_features = encoder.transform(data[categorical_columns])
    if hasattr(encoded_features, "toarray"):
        # An encoder configured for sparse output returns a sparse matrix,
        # which cannot be wrapped in a plain DataFrame; densify it first.
        encoded_features = encoded_features.toarray()
    encoded_feature_names = encoder.get_feature_names_out(categorical_columns)
    encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names, index=data.index)

    # Scale the remaining columns, preserving their order in the input file
    non_categorical_columns = [col for col in data.columns if col not in categorical_columns]
    scaled_non_categorical = scaler.transform(data[non_categorical_columns])
    scaled_non_categorical_df = pd.DataFrame(
        scaled_non_categorical, columns=non_categorical_columns, index=data.index
    )

    # Re-assemble: first column, scaled block, encoded block, last column.
    # All pieces share df's index, so the column-wise concat aligns row by row.
    final_data = pd.concat(
        [first_column, scaled_non_categorical_df, encoded_df, last_column], axis=1
    )

    # Save the final data to a CSV file
    final_data.to_csv(output_file, index=False)

    print(f"Preprocessed data saved to {output_file}")


# File locations: raw inputs, preprocessed outputs, and the pickles that hold
# the fitted transformers. (The categorical column list itself is defined in
# an earlier cell.)
normal_file = '../Normal.csv'
attack_file = '../Attack.csv'
normal_output = 'preprocessed_normal_data.csv'
attack_output = 'preprocessed_attack_data.csv'
encoder_path = "onehot_encoder.pkl"
scaler_path = "minmax_scaler.pkl"


# Fit the encoder and the scaler on the complete Normal.csv.
# As in preprocess_and_save, the first and last columns are left out of the
# feature matrix.
df_normal = pd.read_csv(normal_file, encoding='utf-8')
data = df_normal.iloc[:, 1:-1].copy()

# Train OneHotEncoder on categorical columns.
# The category list is fixed to [0, 1, 2] for every column, so the encoder
# always emits the three columns <name>_0, <name>_1 and <name>_2, whether or
# not each value occurs in the training data.
# NOTE(review): with the encoder's default unknown-value handling, a value
# outside {0, 1, 2} is expected to raise — confirm this is the intent.
categories = [list(range(3))] * len(categorical_columns)
encoder = OneHotEncoder(sparse_output=False, drop=None, categories=categories)
encoder.fit(data[categorical_columns])

# Save the OneHotEncoder
joblib.dump(encoder, encoder_path)

# Train MinMaxScaler on non-categorical columns (same selection rule and
# column order that preprocess_and_save uses when it calls transform)
non_categorical_columns = [col for col in data.columns if col not in categorical_columns]
scaler = MinMaxScaler()
scaler.fit(data[non_categorical_columns])

# Save the MinMaxScaler
joblib.dump(scaler, scaler_path)

# Preprocess Normal.csv
# NOTE(review): the transformers above are fitted on the whole file, but
# preprocess_and_save reads at most the first 10000 rows of its input, so the
# written file is a truncated sample — confirm this is intended.
preprocess_and_save(normal_file, normal_output, encoder, scaler, categorical_columns)

# Preprocess Attack.csv using trained encoder and scaler
# NOTE(review): this call is disabled, so `attack_output`
# ('preprocessed_attack_data.csv') is not produced by running this cell.
# preprocess_and_save(attack_file, attack_output, encoder, scaler, categorical_columns)


Preprocessed data saved to preprocessed_normal_data.csv


## Applying PCA

In [14]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import joblib

# Step 1: Load the preprocessed attack data.
# NOTE(review): 'preprocessed_attack_data.csv' is only written when the attack
# call of preprocess_and_save is executed; that call is currently commented
# out, so on a fresh run this file must already exist on disk.
attack = pd.read_csv('preprocessed_attack_data.csv', encoding='utf-8')

# Step 2: Select the feature columns.
# The first and last columns are carried through unchanged, so PCA is applied
# only to the columns in between (`iloc[:, 1:-1]`). Those columns must all be
# numeric.
data_features = attack.iloc[:, 1:-1]

# Step 3: Apply PCA, reducing the features to 50 components.
# NOTE(review): no random_state is set; depending on which solver
# svd_solver='auto' selects, results may differ slightly between runs —
# confirm whether bit-for-bit reproducibility is required.
pca = PCA(n_components=50)
pca_result = pca.fit_transform(data_features)

# Step 4: Wrap the PCA result in a DataFrame with columns PCA_1 ... PCA_n.
# Reusing attack's index guarantees the column-wise concat below aligns
# row by row.
pca_columns = [f'PCA_{i+1}' for i in range(pca_result.shape[1])]
pca_df = pd.DataFrame(pca_result, columns=pca_columns, index=attack.index)

# Step 5: Reattach the first and last columns (keeping them unchanged)
final_df = pd.concat([attack.iloc[:, [0]], pca_df, attack.iloc[:, [-1]]], axis=1)

# Step 6: Save the fitted PCA model for future use
joblib.dump(pca, 'pca_model_attack.pkl')

# Also store a copy of the scaler under 'scaler_model.pkl'.
# No scaler is fitted in this cell, so the MinMaxScaler persisted by the
# preprocessing step is loaded explicitly instead of relying on whatever
# `scaler` variable happens to be left in the kernel.
# joblib.load unpickles the file: only use it on files written by this notebook.
persisted_scaler = joblib.load('minmax_scaler.pkl')
joblib.dump(persisted_scaler, 'scaler_model.pkl')

# Save the resulting DataFrame to CSV so the output can be inspected/reused
final_df.to_csv('attack_data_with_pca.csv', index=False)

# Display the first rows of the result
print(final_df.head())


                 Timestamp     PCA_1     PCA_2     PCA_3     PCA_4     PCA_5  \
0   28/12/2015 10:00:00 AM -1.059829 -0.049552 -0.260320 -0.127696  0.001428   
1   28/12/2015 10:00:01 AM -1.060511 -0.049077 -0.263962 -0.128010  0.002074   
2   28/12/2015 10:00:02 AM -1.061154 -0.047027 -0.272267 -0.126532  0.003439   
3   28/12/2015 10:00:03 AM -1.062506 -0.045495 -0.280783 -0.127275  0.004903   
4   28/12/2015 10:00:04 AM -1.063629 -0.044246 -0.287349 -0.127848  0.005966   

      PCA_6     PCA_7     PCA_8     PCA_9  ...    PCA_42        PCA_43  \
0  0.007354  0.002574 -0.143380 -0.013842  ... -0.000017 -2.886347e-07   
1  0.007394  0.002471 -0.142775 -0.013821  ...  0.000241 -2.840494e-07   
2  0.007495  0.002204 -0.141969 -0.014063  ...  0.000193 -1.012426e-07   
3  0.007309  0.001933 -0.141023 -0.014065  ... -0.000016  8.246891e-09   
4  0.007190  0.001696 -0.139883 -0.014046  ... -0.000384  3.571765e-07   

         PCA_44        PCA_45        PCA_46        PCA_47        PCA_48  \