In [24]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, RobustScaler
import os as os
import skfuzzy as fuzz
import random
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import RobustScaler

In [25]:
data = pd.read_csv('ar41_for_ulb_cleaned.csv', sep=";", parse_dates=True, index_col="timestamps_UTC")

In [26]:
features = ['RS_E_InAirTemp_PC1', 'RS_E_InAirTemp_PC2', 'RS_E_OilPress_PC1',
             'RS_E_OilPress_PC2', 'RS_E_RPM_PC1', 'RS_E_RPM_PC2', 'RS_E_WatTemp_PC1',
             'RS_E_WatTemp_PC2', 'RS_T_OilTemp_PC1', 'RS_T_OilTemp_PC2', 'temperature',
             'precipitation', 'windspeed_10m', 'sum_pollen']

In [27]:
# Extract the selected features from the DataFrame
X = data[features]
# Standardize (robust) the data
Robustscaler = RobustScaler() # Test with robust scaler
X_Robustscaled = Robustscaler.fit_transform(X)

# Standardize (standard) the data
standardscaler = StandardScaler()
X_Standardscaled = standardscaler.fit_transform(X)

# Kmean

In [28]:
k = 5  # Adjust this value based on the Elbow curve
kmeansStandard = KMeans(n_clusters=k, random_state=0)
kmeansStandard.fit(X_Standardscaled)

  super()._check_params_vs_input(X, default_n_init=10)


In [29]:
# Get distances from center
distances = kmeansStandard.transform(X_Standardscaled)

# Get distance from closest center
min_distances = distances.min(axis=1)

min_distances_series = pd.Series(min_distances)

# Take 5% quantile as threshold
threshold = min_distances_series.quantile(0.95)

data['kmean'] = (min_distances > threshold).astype(int)

# Fuzzy

In [30]:
# Parameters of Fuzzy C-means
n_clusters = 2
m = 2
error_threshold = 0.005
data_T = X_Robustscaled.T

# Execution of Fuzzy C-means
cntr, u, _, _, _, _, fpc = fuzz.cluster.cmeans(
    data_T, n_clusters, m, error=error_threshold, maxiter=1000, init=None, seed=3
)
fpc

0.9558406879651284

Compute the distance with threshold

In [31]:
membership_values = np.argmax(u, axis=0)
distances = np.linalg.norm(data_T - cntr.T[:, membership_values], axis=0)
outlier_distance_threshold = 21

In [32]:
outliers = data[distances > outlier_distance_threshold]
data['fuzzy'] = (distances > outlier_distance_threshold).astype(int)

In [33]:
print("Number of outliers :", outliers.shape[0])

Number of outliers : 828113


# Isolation forest

In [34]:
param_grid = {
    "n_estimators": [500],
    "contamination": ["auto"],
}

# Random choice
params = {param: random.choice(values) for param, values in param_grid.items()}

clf = IsolationForest(**params, n_jobs=-1, max_samples="auto")

result = clf.fit_predict(X_Robustscaled)

data["isolation_forest"] = result

data["isolation_forest"] = data["isolation_forest"].astype("category")

# Check similarity

In [35]:
# in col isolation_forest, replace 1 with 0 and -1 with 1
data['isolation_forest'] = data['isolation_forest'].replace(1, 0)
data['isolation_forest'] = data['isolation_forest'].replace(-1, 1)

# make a col "combined" with value 1 if 'isol' and 'fuzzy' are 1, 0 otherwise
data['combined'] = (data[['isolation_forest', 'fuzzy', 'kmean']].sum(axis=1) >= 2).astype(int)

# Save to CSV

In [36]:
data.to_csv('ar41_for_ulb_merged.csv', sep=';')

KeyboardInterrupt: 

Outliers CSV

In [None]:
# make a new df with only the outliers
outliers = data[data['combined'] == 1]
print(len(outliers))

outliers.drop('kmean', axis=1)
outliers.drop('fuzzy', axis=1)
outliers.drop('isolation_forest', axis=1)
outliers.drop('combined', axis=1)

outliers['reason'] = "Oultiers"
outliers['IDreason'] = 7

df_outliers_from_explo = pd.read_csv('ar41_for_ulb_all_outliers.csv', sep=";", parse_dates=True, index_col="timestamps_UTC")

df_outliers = pd.concat([outliers, df_outliers_from_explo])

# to csv
df_outliers.to_csv('ar41_for_ulb_all_outliers.csv', sep=';')

580651


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outliers['reason'] = "Outliers"
