In [21]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import pandas as pd
import joblib

# Load the CSV and keep every column except the first and the last (by position).
df = (
    pd.read_csv('/content/Normal.csv', encoding='utf-8')
    .iloc[:, 1:-1]
)

In [23]:
# @title One Hot Encoding and Scaling - saving encoder and scaler
# Work on a copy so the frame loaded earlier is left untouched.
data = df.copy()

# --- Categorical features: one-hot encoding ---------------------------------
# Every categorical column is declared to take the values 0, 1 and 2, so the
# encoder emits one indicator column per (column, value) pair.
categorical_columns = ['P601', 'P602', 'P603']
categories = [[0, 1, 2]] * len(categorical_columns)

encoder = OneHotEncoder(categories=categories, drop=None, sparse_output=False)
encoded_features = encoder.fit_transform(data[categorical_columns])

# Wrap the encoded array in a DataFrame that shares the source index.
encoded_feature_names = encoder.get_feature_names_out(categorical_columns)
encoded_df = pd.DataFrame(
    encoded_features,
    index=data.index,
    columns=encoded_feature_names,
)

# Persist the fitted encoder so it can be reloaded later.
joblib.dump(encoder, "onehot_encoder.pkl")

# Swap the raw categorical columns for their indicator columns.
data = pd.concat(
    [data.drop(columns=categorical_columns), encoded_df],
    axis=1,
)

# --- Remaining features: min-max scaling ------------------------------------
# Everything that is not an indicator column gets scaled.
non_categorical_columns = data.columns[~data.columns.isin(encoded_feature_names)].tolist()

scaler = MinMaxScaler()
scaled_non_categorical = scaler.fit_transform(data[non_categorical_columns])

# Back to a DataFrame, again on the source index.
scaled_non_categorical_df = pd.DataFrame(
    scaled_non_categorical,
    index=data.index,
    columns=non_categorical_columns,
)

# Final layout: scaled columns first, indicator columns last.
final_data = pd.concat([scaled_non_categorical_df, encoded_df], axis=1)

# Persist the fitted scaler so it can be reloaded later.
joblib.dump(scaler, "minmax_scaler.pkl")

# Show the result
print("Encoded and Scaled Data:")
print(final_data)


Encoded and Scaled Data:
          FIT101    LIT101  MV101  P101  P102    AIT201   AIT202    AIT203  \
0       0.899895  0.160292    1.0   1.0   0.0  0.000000  0.00000  0.019710   
1       0.895111  0.159845    1.0   1.0   0.0  0.000000  0.00000  0.019710   
2       0.888694  0.159533    1.0   1.0   0.0  0.000000  0.00000  0.019710   
3       0.884611  0.158819    1.0   1.0   0.0  0.000000  0.00000  0.019710   
4       0.883327  0.158372    1.0   1.0   0.0  0.003408  0.00000  0.019710   
...          ...       ...    ...   ...   ...       ...      ...       ...   
496795  0.896278  0.457619    1.0   1.0   0.0  0.627270  0.25853  0.103740   
496796  0.892078  0.457530    1.0   1.0   0.0  0.627270  0.25853  0.103740   
496797  0.886944  0.457440    1.0   1.0   0.0  0.627270  0.25853  0.104221   
496798  0.884611  0.457485    1.0   1.0   0.0  0.627270  0.25853  0.104221   
496799  0.884144  0.457396    1.0   1.0   0.0  0.627270  0.25853  0.104221   

          FIT201  MV201  ...    FIT601

In [24]:
# @title CUSUM
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import joblib
from sklearn.preprocessing import OneHotEncoder

def save_cusum_parameters(df, output_file, scaler_file="minmax_scaler.pkl", encoder_file="onehot_encoder.pkl", threshold_scale=3, drift_scale=0.1, categorical_columns=None):
    """
    Compute per-feature CUSUM reference parameters from a DataFrame and save them as CSV.

    The categorical columns of ``df`` are one-hot encoded with a previously saved
    encoder, every other column is scaled with a previously saved scaler, and for
    each resulting feature one row is written to ``output_file`` with:

    * ``ReferenceValue``    - mean of the feature,
    * ``DriftThreshold``    - ``drift_scale`` times the feature's standard deviation,
    * ``DecisionThreshold`` - ``threshold_scale`` times the feature's standard deviation.

    The standard deviation is ``pandas.Series.std`` with its default settings.
    Rows are written in the order: encoded categorical features, then scaled features.

    Parameters:
        df (pd.DataFrame): Input data. Must contain every column in
            ``categorical_columns``; all remaining columns are passed to the scaler
            in the order they appear in ``df``.
        output_file (str): Path to save the reference parameters as a CSV.
        scaler_file (str): Path to load the fitted scaler from.
        encoder_file (str): Path to load the fitted encoder from.
        threshold_scale (float): Multiplier applied to the standard deviation to get
            the decision threshold.
        drift_scale (float): Multiplier applied to the standard deviation to get the
            drift threshold. Defaults to 0.1.
        categorical_columns (list[str] | None): Columns to one-hot encode. Defaults to
            ``['P601', 'P602', 'P603']`` when None.

    Returns:
        None. The parameters are written to ``output_file`` and a confirmation
        line is printed.
    """
    if categorical_columns is None:
        categorical_columns = ['P601', 'P602', 'P603']
    else:
        # Copy so list-based column selection works for any iterable of names.
        categorical_columns = list(categorical_columns)

    # Load the pre-trained encoder and scaler.
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only point these paths at artifacts you created or otherwise trust.
    encoder = joblib.load(encoder_file)
    scaler = joblib.load(scaler_file)

    # Everything that is not categorical is treated as numerical.
    # NOTE(review): assumes these columns match, in name and order, the columns
    # the scaler was fitted on - confirm against the fitting cell.
    numerical_columns = [col for col in df.columns if col not in categorical_columns]

    # Apply One-Hot Encoding using the pre-trained encoder
    encoded_df = pd.DataFrame(
        encoder.transform(df[categorical_columns]),
        columns=encoder.get_feature_names_out(categorical_columns),
    )

    # Apply scaling using the pre-trained scaler
    scaled_df = pd.DataFrame(
        scaler.transform(df[numerical_columns]),
        columns=numerical_columns,
    )

    # Both frames carry a fresh 0..n-1 index, so they line up row by row.
    processed_df = pd.concat([encoded_df, scaled_df], axis=1)

    # One record per feature; the standard deviation is computed once and reused.
    records = []
    for column_name in processed_df.columns:
        values = processed_df[column_name]
        spread = values.std()
        records.append({
            "Feature": column_name,
            "ReferenceValue": values.mean(),
            "DriftThreshold": spread * drift_scale,
            "DecisionThreshold": spread * threshold_scale,
        })

    # Save to CSV (explicit column list keeps the header stable even with no records)
    pd.DataFrame(
        records,
        columns=["Feature", "ReferenceValue", "DriftThreshold", "DecisionThreshold"],
    ).to_csv(output_file, index=False)
    print(f"Parameters saved to {output_file}, using scaler from {scaler_file}, and encoder from {encoder_file}")


# File names for the generated parameter table and the saved preprocessing artifacts.
output_csv, scaler_file, encoder_file = (
    "reference_params.csv",
    "minmax_scaler.pkl",
    "onehot_encoder.pkl",
)

# Compute the per-feature parameters from `df` and write them to `output_csv`.
save_cusum_parameters(
    df,
    output_file=output_csv,
    scaler_file=scaler_file,
    encoder_file=encoder_file,
)


Parameters saved to reference_params.csv, using scaler from minmax_scaler.pkl, and encoder from onehot_encoder.pkl
