In [43]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import skfuzzy as fuzz
import numpy as np

In [44]:
df = pd.read_csv('ar41_for_ulb_cleaned.csv', sep=';')

In [45]:
# take a random sample of 100 000 rows
sample_df = df.sample(n=100000, random_state=1)

In [46]:
sample_df.head()

Unnamed: 0.1,Unnamed: 0,mapped_veh_id,timestamps_UTC,lat,lon,RS_E_InAirTemp_PC1,RS_E_InAirTemp_PC2,RS_E_OilPress_PC1,RS_E_OilPress_PC2,RS_E_RPM_PC1,RS_E_RPM_PC2,RS_E_WatTemp_PC1,RS_E_WatTemp_PC2,RS_T_OilTemp_PC1,RS_T_OilTemp_PC2
11832710,12771493,186.0,2023-03-09 19:39:50,51.03902,5.273067,45.0,49.0,279.0,341.0,1941.0,1925.0,83.0,85.0,87.0,90.0
4743539,5118359,144.0,2023-09-04 15:27:59,50.863911,3.813591,42.0,45.0,362.0,296.0,1421.0,1430.0,90.0,85.0,93.0,85.0
6921457,7470181,116.0,2023-05-12 06:54:40,51.03786,3.698845,24.0,33.0,210.0,175.0,804.0,806.0,79.0,76.0,80.0,77.0
13446449,14512083,162.0,2023-03-15 10:35:12,50.851757,3.60193,18.0,40.0,420.0,486.0,1475.0,1597.0,76.0,81.0,81.0,84.0
3658596,3950315,104.0,2023-06-15 20:46:54,50.094344,4.527845,13.0,16.5,58.5,34.5,129.5,56.0,39.5,38.0,84.0,79.5


In [47]:
# keep only these features
features = sample_df[['RS_E_InAirTemp_PC1', 'RS_E_InAirTemp_PC2', 'RS_E_OilPress_PC1', 'RS_E_OilPress_PC2', 'RS_E_RPM_PC1', 'RS_E_RPM_PC2', 'RS_E_WatTemp_PC1', 'RS_E_WatTemp_PC2', 'RS_T_OilTemp_PC1', 'RS_T_OilTemp_PC2']]

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [48]:
diff_features = pd.DataFrame()
feature_names = ['RS_E_InAirTemp', 'RS_E_OilPress', 'RS_E_RPM', 'RS_E_WatTemp', 'RS_T_OilTemp']
for feature in feature_names:
    diff_features[feature] = features[feature + '_PC1'] - features[feature + '_PC2']

In [49]:
features = diff_features

In [50]:
# Parameters of Fuzzy C-means
n_clusters = 10
m = 2
error_threshold = 0.005

# Execution of Fuzzy C-means
centers, u, _, _, _, _, fpc = fuzz.cluster.cmeans(
    features.T, n_clusters, m, error=error_threshold, maxiter=1000, init=None
)

fpc

0.4144418079789295

In [51]:
cluster_indices = np.argmax(u, axis=0)

elements_par_cluster = np.bincount(cluster_indices)

for cluster_num, count in enumerate(elements_par_cluster):
    print(f"Cluster {cluster_num + 1}: {count} éléments")

Cluster 1: 11711 éléments
Cluster 2: 1279 éléments
Cluster 3: 1505 éléments
Cluster 4: 14876 éléments
Cluster 5: 5274 éléments
Cluster 6: 14354 éléments
Cluster 7: 16311 éléments
Cluster 8: 9774 éléments
Cluster 9: 16458 éléments
Cluster 10: 8458 éléments


In [52]:
features['cluster'] = cluster_indices

In [53]:
# Show average values for each cluster
cluster_means = features.groupby('cluster').mean()

print("Cluster Means:")
cluster_means

Cluster Means:


Unnamed: 0_level_0,RS_E_InAirTemp,RS_E_OilPress,RS_E_RPM,RS_E_WatTemp,RS_T_OilTemp
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.27744,-67.706676,-18.197323,1.686819,1.640315
1,-1.740735,-438.294058,-9.603362,-0.600456,-0.228929
2,-0.555947,412.949967,29.196489,0.228616,-0.966301
3,0.425861,21.361745,1.322491,0.26703,-0.501634
4,1.12038,-131.940769,-77.318787,11.447658,11.135195
5,-0.213481,-42.720588,0.743328,-0.036691,0.315893
6,-0.100433,-22.214695,1.540535,0.172482,0.273596
7,-0.137663,45.918128,7.400888,0.343317,-0.913582
8,-0.010163,0.408353,-0.017121,0.591566,-0.043345
9,-0.833073,94.985982,54.198964,-3.278647,-3.630117
