In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, RobustScaler
import os as os
import skfuzzy as fuzz
import random
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import RobustScaler
from time import time

In [None]:
# Load the semicolon-separated input CSV, using the `timestamps_UTC` column as
# the index.
data = pd.read_csv(
    'ar41_for_ulb_comp.csv',
    sep=";",
    index_col="timestamps_UTC",
    parse_dates=True,  # ask pandas to parse the index as datetimes
)

In [None]:
# Columns used as model inputs by every detector in this notebook.
features = [
    # Paired *_PC1 / *_PC2 columns
    'RS_E_InAirTemp_PC1',
    'RS_E_InAirTemp_PC2',
    'RS_E_OilPress_PC1',
    'RS_E_OilPress_PC2',
    'RS_E_RPM_PC1',
    'RS_E_RPM_PC2',
    'RS_E_WatTemp_PC1',
    'RS_E_WatTemp_PC2',
    'RS_T_OilTemp_PC1',
    'RS_T_OilTemp_PC2',
    # Remaining (unpaired) columns
    'temperature',
    'precipitation',
    'windspeed_10m',
    'sum_pollen',
]

In [None]:
# Model inputs: only the selected feature columns of the full frame.
X = data[features]

# Robust-scaled version of the inputs (used by the fuzzy C-means and
# Isolation Forest cells).
Robustscaler = RobustScaler()
X_Robustscaled = Robustscaler.fit(X).transform(X)

# Standard-scaled version of the inputs (used by the K-means cell).
standardscaler = StandardScaler()
X_Standardscaled = standardscaler.fit(X).transform(X)

# K-means

In [None]:
# Number of clusters; adjust this value based on the Elbow curve.
k = 5

# Fit K-means on the standard-scaled inputs with a fixed seed.
kmeansStandard = KMeans(n_clusters=k, random_state=0).fit(X_Standardscaled)
kmeansStandard

In [None]:
# Distance from every sample to each fitted K-means centre.
distances = kmeansStandard.transform(X_Standardscaled)

# For each sample keep only the distance to its nearest centre.
min_distances = distances.min(axis=1)
min_distances_series = pd.Series(min_distances)

# Threshold at the 95th percentile of those distances, so roughly the
# farthest 5% of samples end up above it.
threshold = min_distances_series.quantile(q=0.95)

# Flag column: 1 when the sample lies strictly beyond the threshold, else 0.
is_far_from_centre = min_distances > threshold
data['kmean'] = is_far_from_centre.astype(int)

# Fuzzy C-means

In [None]:
# Fuzzy C-means settings
n_clusters = 4           # number of clusters requested
m = 2                    # fuzziness exponent handed to cmeans
error_threshold = 0.005  # handed to cmeans as its `error` stopping value

# cmeans is fed the transposed robust-scaled inputs.
data_T = X_Robustscaled.T

# Run the clustering. Only the centres (`cntr`), the membership matrix (`u`)
# and the final return value (`fpc`) are kept; the values in between are discarded.
cntr, u, *_, fpc = fuzz.cluster.cmeans(
    data_T,
    n_clusters,
    m,
    error=error_threshold,
    maxiter=1000,
    init=None,
    seed=3,
)
fpc

In [None]:
# Hard assignment: for every sample, the index of the cluster with the
# largest membership value.
membership_values = u.argmax(axis=0)

# Centres transposed so their layout matches `data_T`.
cntr_T = cntr.T

# Euclidean distance between each sample and the centre of its assigned cluster.
distances = np.linalg.norm(
    data_T - cntr_T[:, membership_values],
    axis=0,
)

# Hand-set cut-off: samples farther than this from their centre are outliers.
outlier_distance_threshold = 21

In [None]:
# Boolean mask of samples lying beyond the fuzzy C-means distance cut-off.
fuzzy_outlier_mask = distances > outlier_distance_threshold

# Rows flagged by the mask (taken before the flag column is added).
outliers = data[fuzzy_outlier_mask]

# Flag column: 1 for flagged rows, 0 otherwise.
data['fuzzy'] = fuzzy_outlier_mask.astype(int)

# Isolation forest

In [None]:
# Candidate hyper-parameters; each entry currently holds a single value.
param_grid = {
    "n_estimators": [500],
    "contamination": ["auto"],
}

# Random choice of one value per parameter. A dedicated seeded generator keeps
# the pick reproducible even if more candidates are added to the grid.
param_rng = random.Random(0)
params = {param: param_rng.choice(values) for param, values in param_grid.items()}

# random_state pins the forest so the outlier labels are identical on every
# run, in line with the seeded K-means and fuzzy C-means fits.
clf = IsolationForest(**params, n_jobs=-1, max_samples="auto", random_state=0)

# Fit on the robust-scaled inputs and time the combined fit + predict.
start_time = time()
result = clf.fit_predict(X_Robustscaled)
end_time = time()
print(f"Prediction time : {round(end_time - start_time, 2)} s")

# Store the raw labels returned by fit_predict (-1 = outlier, 1 = inlier)
# as a categorical column.
data["isolation_forest"] = result
data["isolation_forest"] = data["isolation_forest"].astype("category")

# Check similarity

In [None]:
# Re-encode the Isolation Forest labels as 0/1 flags like the other detectors:
# -1 (outlier) becomes 1 and 1 (inlier) becomes 0.
# The flags are rebuilt from the untouched `result` array instead of rewriting
# the column in place: an in-place 1 -> 0 then -1 -> 1 rewrite zeroes every flag
# when the cell is run a second time. Plain integers also keep the row-wise sum
# below well defined.
data['isolation_forest'] = (np.asarray(result) == -1).astype(int)

# Majority vote: `combined` is 1 when at least two of the three detectors
# (Isolation Forest, fuzzy C-means, K-means) flag the row, 0 otherwise.
data['combined'] = (data[['isolation_forest', 'fuzzy', 'kmean']].sum(axis=1) >= 2).astype(int)

# Save to CSV

In [None]:
# Write the full frame, including the detector flag columns, to a
# semicolon-separated CSV file.
data.to_csv(path_or_buf='ar41_for_ulb_merged.csv', sep=';')