In [13]:
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras
import warnings
import tensorflow as tf
import seaborn as sns
import sklearn
import random
import math
import time
import os
import pywt  # Bibliothèque pour les ondelettes

from scipy.stats import kurtosis, skew
from numpy.fft import fft


from lime.lime_tabular import RecurrentTabularExplainer
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, r2_score 
from sklearn.model_selection import GroupKFold
from sklearn import preprocessing
from keras import backend as K
from sklearn.preprocessing import MinMaxScaler , StandardScaler
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Activation, GRU
from scipy import optimize
from tensorflow.keras import optimizers


from sp_modif.model_function import *
from sp_modif.methods import *
from sp_modif.data_prep import *
from sp_modif.evaluator import *
from sp_modif.SHAP import *
from sp_modif.L2X import *
from methods import *

%matplotlib inline
warnings.filterwarnings('ignore')

SEED = 0


def set_seed(seed=SEED):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    TensorFlow's global generator, and exports ``PYTHONHASHSEED``.

    :param seed: Integer seed to apply (defaults to the module-level ``SEED``).
    """
    # The argument is used (not the global SEED) so callers can actually pick
    # a different seed.
    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting
    # it here presumably only affects child processes - confirm if hash
    # determinism in this kernel matters.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


# Fix the seed once for the whole notebook.
set_seed(SEED)

In [14]:
def extract_temporal_features(signal):
    """Compute time-domain statistics of a signal.

    :param signal: Array-like of samples (list, NumPy array or pandas Series).
        Integer and boolean inputs are converted to float so that squaring
        cannot overflow a narrow integer dtype.
    :return: Dict with, in this order: mean, std, peak_to_peak, rms, mean_abs,
        max_abs, skewness, kurtosis, form_factor, crest_factor,
        impulse_factor, margin_factor, energy, then the central moments
        ``moment_5`` .. ``moment_11``.
    :raises ValueError: If the signal is empty.
    """
    signal = np.asarray(signal)
    if signal.size == 0:
        raise ValueError("Signal vide : impossible de calculer des caractéristiques temporelles.")
    if not np.issubdtype(signal.dtype, np.inexact):
        signal = signal.astype(float)

    abs_signal = np.abs(signal)
    squared = signal ** 2
    rms = np.sqrt(np.mean(squared))
    mean_abs = np.mean(abs_signal)
    max_abs = np.max(abs_signal)
    energy = np.sum(squared)
    # The ratio features below use the signed maximum, not max(|x|).
    peak = signal.max()
    mean_value = signal.mean()
    # Computed once; it is both the margin-factor denominator and its guard.
    mean_sqrt_abs = np.mean(np.sqrt(abs_signal))

    features = {
        'mean': mean_value,
        'std': signal.std(),
        'peak_to_peak': peak - signal.min(),
        'rms': rms,
        'mean_abs': mean_abs,
        'max_abs': max_abs,
        'skewness': skew(signal),
        'kurtosis': kurtosis(signal),
        'form_factor': rms / mean_abs if mean_abs != 0 else 0,
        'crest_factor': peak / rms if rms != 0 else 0,
        'impulse_factor': peak / mean_abs if mean_abs != 0 else 0,
        'margin_factor': peak / (mean_sqrt_abs ** 2) if mean_sqrt_abs != 0 else 0,
        'energy': energy
    }

    # Central moments of order 5 to 11.
    centered = signal - mean_value
    for order in range(5, 12):
        features[f'moment_{order}'] = np.mean(centered ** order)

    return features

def extract_frequency_features(signal, sampling_rate=25600):
    """Compute spectral statistics from the one-sided FFT magnitude of a signal.

    Only the first ``N // 2`` FFT bins are used. The ``*_freq`` statistics
    (mean, rms, variance, std, kurtosis, skewness) describe the magnitude
    values themselves, not the frequency axis; ``peak_freq`` is the frequency
    of the largest magnitude.

    :param signal: Array-like of samples.
    :param sampling_rate: Sampling rate used to build the frequency axis.
    :return: Dict of the spectral statistics followed by 25 keys
        ``energy_band_0`` .. ``energy_band_24`` holding the share of
        normalised power in equal-width bands between 0 and
        ``sampling_rate / 2``.
    :raises ValueError: If the signal is empty or has fewer than two samples.
    """
    signal = np.asarray(signal)
    if signal.size == 0:
        raise ValueError("Signal vide : impossible de calculer des caractéristiques fréquentielles.")

    n_samples = len(signal)
    half = n_samples // 2
    if half == 0:
        # A single sample leaves no one-sided bin to analyse.
        raise ValueError("Signal trop court : au moins deux échantillons sont nécessaires.")

    # One-sided magnitude spectrum and the matching frequency axis.
    magnitudes = np.abs(fft(signal)[:half])
    freq_bins = np.fft.fftfreq(n_samples, d=1 / sampling_rate)[:half]

    # Normalised power spectrum for the entropy and band energies. A silent
    # (all-zero) signal has no power to normalise: keep zeros instead of 0/0.
    power_spectrum = magnitudes ** 2
    total_power = np.sum(power_spectrum)
    if total_power > 0:
        power_spectrum = power_spectrum / total_power

    max_amplitude = np.max(magnitudes)
    mean_freq = np.mean(magnitudes)
    rms_freq = np.sqrt(np.mean(magnitudes ** 2))
    variance_freq = np.var(magnitudes)
    std_freq = np.std(magnitudes)
    kurtosis_freq = kurtosis(magnitudes)
    skewness_freq = skew(magnitudes)
    peak_freq = freq_bins[np.argmax(magnitudes)]
    form_factor_freq = rms_freq / mean_freq if mean_freq != 0 else 0
    crest_factor_freq = max_amplitude / rms_freq if rms_freq != 0 else 0
    # The small offset keeps log2 finite for empty bins.
    spectral_entropy = -np.sum(power_spectrum * np.log2(power_spectrum + 1e-10))

    # Split [0, sampling_rate / 2) into equal-width bands.
    num_bands = 25
    band_width = (sampling_rate / 2) / num_bands
    energy_bands = {}
    for band in range(num_bands):
        low = band * band_width
        high = (band + 1) * band_width
        in_band = (freq_bins >= low) & (freq_bins < high)
        energy_bands[f'energy_band_{band}'] = np.sum(power_spectrum[in_band])

    features = {
        'max_amplitude': max_amplitude,
        'mean_freq': mean_freq,
        'rms_freq': rms_freq,
        'variance_freq': variance_freq,
        'std_freq': std_freq,
        'kurtosis_freq': kurtosis_freq,
        'skewness_freq': skewness_freq,
        'peak_freq': peak_freq,
        'form_factor_freq': form_factor_freq,
        'crest_factor_freq': crest_factor_freq,
        'spectral_entropy': spectral_entropy
    }
    features.update(energy_bands)
    return features

def extract_derivative_features(signal):
    """Return the time-domain features of the signal's first difference."""
    first_sample = signal[0]
    # Prepending the first sample keeps the difference the same length as the
    # input; its leading element is therefore zero.
    deltas = np.diff(signal, prepend=first_sample)
    return extract_temporal_features(deltas)

def extract_integral_features(signal):
    """Return the time-domain features of the signal's running sum."""
    # The cumulative sum is the discrete integral, with no time-step scaling.
    running_total = np.cumsum(signal)
    return extract_temporal_features(running_total)

def extract_imf_features(signal, num_imfs=5):
    """Compute RMS, energy and entropy of wavelet coefficient bands.

    Despite the "IMF" naming, the bands come from a discrete wavelet
    decomposition (``db4``) at level ``num_imfs``, not from an empirical mode
    decomposition. Only the first ``num_imfs`` coefficient arrays returned by
    ``pywt.wavedec`` are used.
    NOTE(review): wavedec presumably returns ``num_imfs + 1`` arrays, so the
    last (finest detail) band would be left out - confirm this is intended.

    :param signal: 1-D array-like of samples.
    :param num_imfs: Decomposition level and number of bands to describe.
    :return: Dict with keys ``imf_rms_i``, ``imf_energy_i`` and
        ``imf_entropy_i`` for each band ``i``.
    """
    coeffs = pywt.wavedec(signal, wavelet='db4', level=num_imfs)
    features = {}
    for i, coeff in enumerate(coeffs[:num_imfs]):
        squared = coeff ** 2
        band_energy = np.sum(squared)
        if band_energy > 0:
            distribution = squared / band_energy
            # The small offset keeps log2 finite for zero coefficients.
            entropy = -np.sum(distribution * np.log2(distribution + 1e-10))
        else:
            # A band with no energy has no distribution: report zero entropy
            # rather than the NaN produced by 0/0.
            entropy = 0.0
        features[f'imf_rms_{i}'] = np.sqrt(np.mean(squared))
        features[f'imf_energy_{i}'] = band_energy
        features[f'imf_entropy_{i}'] = entropy
    return features

def extract_features(signal, sampling_rate=25600):
    """Gather every feature family for one signal into a single flat dict.

    Temporal, frequency and wavelet-band features keep their own key names.
    Derivative and integral features reuse the temporal names with a
    ``derivative_`` or ``integral_`` prefix.
    """
    combined = dict(extract_temporal_features(signal))
    combined.update(extract_frequency_features(signal, sampling_rate))

    for name, value in extract_derivative_features(signal).items():
        combined[f'derivative_{name}'] = value

    for name, value in extract_integral_features(signal).items():
        combined[f'integral_{name}'] = value

    combined.update(extract_imf_features(signal))
    return combined

In [15]:
def calculate_rul(data, time_col='time_seconds', bearing_col='bearing_id'):
    """
    Add Remaining Useful Life (RUL) and normalised RUL columns to ``data``.

    RUL is the gap between a sample's time and the latest time recorded for
    the same ``bearing_col`` value. ``RUL_norm`` divides RUL by the largest RUL
    in the whole frame (across all groups, not per group); when that maximum
    is not positive, ``RUL_norm`` is a copy of ``RUL``.

    The frame is modified in place and also returned.

    :param data: DataFrame containing the columns `time_col` and `bearing_col`
    :param time_col: Name of the time column (default `time_seconds`)
    :param bearing_col: Name of the group identifier column (default `bearing_id`)
    :return: The same DataFrame with two extra columns `RUL` and `RUL_norm`
    """
    # Step 1: end-of-life time of each group, broadcast back to every row.
    # This replaces a row-by-row apply that is very slow on millions of rows.
    end_of_life = data.groupby(bearing_col)[time_col].transform('max')

    # Step 2: RUL of every sample.
    data['RUL'] = end_of_life - data[time_col]

    # Step 3: normalise by the global maximum RUL.
    max_rul = data['RUL'].max()
    data['RUL_norm'] = data['RUL'] / max_rul if max_rul > 0 else data['RUL']

    return data

In [16]:
# Data loading
# Default sampling rate used by the windowing helpers when none is passed.
# NOTE(review): the loading logs in this notebook show 2560 samples per file
# spanning about 0.1 s (~25.6 kHz), and extract_frequency_features defaults to
# 25600 - confirm that 100 Hz is really intended here.
DEFAULT_SAMPLING_RATE = 100  # Sampling frequency in Hz

def load_vibration_data(data_folder, idx=None):
    """Load every ``acc_*.csv`` file of a folder into one DataFrame.

    Each file is read without a header, using the column names Hour, Minute,
    Second, Microsecond, Horizontal_Accel and Vertical_Accel. Two columns are
    added: ``bearing_id`` (``idx`` followed by the file number taken from the
    file name) and ``time_seconds`` (time of day in seconds).
    NOTE(review): ``bearing_id`` is therefore unique per file, not per
    physical bearing - confirm that downstream grouping expects this.

    :param data_folder: Folder containing the ``acc_*.csv`` files.
    :param idx: Optional prefix for ``bearing_id``; ``None`` means no prefix.
    :return: Concatenated DataFrame, or an empty DataFrame when the folder
        holds no matching file.
    """
    # Sorted so the row order does not depend on the file-system listing order.
    vibration_files = sorted(
        f for f in os.listdir(data_folder) if f.startswith('acc_') and f.endswith('.csv')
    )
    if not vibration_files:
        print("No vibration files found in the specified folder.")
        return pd.DataFrame()

    # The default idx=None used to crash on `None + str`.
    prefix = '' if idx is None else str(idx)
    column_names = ['Hour', 'Minute', 'Second', 'Microsecond', 'Horizontal_Accel', 'Vertical_Accel']
    vibration_data = []

    for file in vibration_files:
        df = pd.read_csv(os.path.join(data_folder, file), names=column_names)

        # File number: the text between the first underscore and the extension.
        file_id = file.split('_')[1].split('.')[0]
        df['bearing_id'] = prefix + file_id

        # Time of day in seconds for every sample.
        df['time_seconds'] = df['Hour'] * 3600 + df['Minute'] * 60 + df['Second'] + df['Microsecond'] * 1e-6
        vibration_data.append(df)

    result = pd.concat(vibration_data, ignore_index=True)
    print(f"Loaded data shape: {result.shape}")
    print(result.head())
    return result

# Calcul du temps total et du RUL normalisé
def calculate_total_time(vibration_data):
    """Return, per ``bearing_id``, the largest ``time_seconds`` value recorded."""
    per_bearing = vibration_data.groupby('bearing_id')
    last_timestamp = per_bearing['time_seconds'].max()
    return last_timestamp

def calculate_normalized_rul(vibration_data, total_time):
    """Add a ``RUL_norm`` column computed from per-bearing total times.

    The value is ``1 - (T - t) / T``, where ``T`` is the total time of the
    row's ``bearing_id`` and ``t`` is its ``time_seconds``. It therefore grows
    towards 1 as ``t`` approaches ``T``.
    NOTE(review): that is the elapsed fraction rather than a remaining
    fraction - confirm the intended direction.

    The frame is modified in place and also returned.

    :param vibration_data: DataFrame with `bearing_id` and `time_seconds` columns.
    :param total_time: Mapping (Series or dict) from bearing id to total time.
    :return: The same DataFrame with the `RUL_norm` column set.
    """
    # Look up every row's total time at once instead of a row-by-row apply.
    totals = vibration_data['bearing_id'].map(total_time)
    vibration_data['RUL_norm'] = 1 - (totals - vibration_data['time_seconds']) / totals
    return vibration_data

# Division en fenêtres
def split_into_windows(data, window_size, sampling_rate=DEFAULT_SAMPLING_RATE):
    """Cut ``data`` into consecutive, non-overlapping windows of equal length.

    Trailing samples that do not fill a whole window are dropped.

    :param data: Sliceable sequence of samples.
    :param window_size: Window duration, multiplied by ``sampling_rate`` to get
        the number of samples per window.
    :param sampling_rate: Samples per unit of ``window_size``.
    :return: List of slices of ``data``, each holding the same number of samples.
    :raises ValueError: If the window would hold no sample.
    """
    # The tiny offset absorbs float error before truncation: 0.29 * 100 is
    # 28.999999999999996 and would otherwise give 28 samples instead of 29.
    samples_per_window = int(window_size * sampling_rate + 1e-9)
    if samples_per_window <= 0:
        raise ValueError(
            f"window_size * sampling_rate must be at least 1 sample, got {window_size * sampling_rate}"
        )

    num_windows = len(data) // samples_per_window
    print(f"Data size: {len(data)}, Samples per window: {samples_per_window}, Number of windows: {num_windows}")
    if num_windows == 0:
        print("Warning: Not enough data to form a window.")
    return [
        data[start:start + samples_per_window]
        for start in range(0, num_windows * samples_per_window, samples_per_window)
    ]

# Pipeline principal
def main_pipeline(data_folder, idx, window_size=10, sampling_rate=DEFAULT_SAMPLING_RATE):
    """Load one bearing folder, label it with RUL and extract per-window features.

    Returns the per-window feature DataFrame, or an empty DataFrame when the
    folder yields no data.
    """
    raw_samples = load_vibration_data(data_folder, idx)

    # Nothing to process: report it and give the caller an empty frame.
    if raw_samples.empty:
        print("No data loaded. Check your data folder.")
        return pd.DataFrame()

    labelled_samples = calculate_rul(raw_samples)
    print(f"Data after RUL calculation: {labelled_samples.shape}")

    window_features = extract_features_per_window(labelled_samples, window_size, sampling_rate)
    print(f"Extracted features shape: {window_features.shape}")

    return window_features

def plot_signal_pronostia(df, signal_name, unit=None):
    """Plot one signal column against ``RUL_norm``, one line per ``Unit`` value.

    NOTE(review): this reads a ``Unit`` column, which the loading pipeline in
    this notebook does not create (it produces ``bearing_id``) - confirm the
    expected input frame.
    NOTE(review): the x axis is fixed to 2560..0 while ``RUL_norm`` appears to
    be normalised elsewhere in this notebook - confirm the limits.

    :param df: DataFrame with ``Unit``, ``RUL_norm`` and ``signal_name`` columns.
    :param signal_name: Name of the column to plot on the y axis.
    :param unit: Plot only this unit; ``None`` plots every unit.
    """
    plt.figure(figsize=(13, 5))

    # `is None` rather than truthiness, so that unit 0 is a valid selection.
    units = df['Unit'].unique() if unit is None else [unit]
    for current_unit in units:
        plt.plot('RUL_norm', signal_name, data=df[df['Unit'] == current_unit])

    plt.xlim(2560, 0)  # reversed x axis so the RUL counts down to zero
    plt.xticks(np.arange(0, 2560, 250))
    plt.ylabel(signal_name)
    plt.xlabel('Remaining Useful Life')
    plt.show()
def extract_time_frequency_features(signal, wavelet='db4', level=3):
    """Return the energy (sum of squares) of every wavelet coefficient array.

    Keys are ``wavelet_energy_level_<i>``, where ``i`` is the position of the
    array in the list returned by ``pywt.wavedec``.
    """
    decomposition = pywt.wavedec(signal, wavelet, level=level)
    return {
        f'wavelet_energy_level_{position}': np.sum(band ** 2)
        for position, band in enumerate(decomposition)
    }

def extract_features_per_window(data, window_size, sampling_rate=DEFAULT_SAMPLING_RATE):
    """Build one feature row per window of each ``bearing_id`` group.

    Only the horizontal acceleration channel is used. Each row holds the
    temporal features of the window (keys suffixed with ``_h``), the mean
    ``RUL`` of the samples in that window, the ``bearing_id`` and the
    ``window_index`` within the group.

    :param data: DataFrame with `bearing_id`, `Horizontal_Accel` and `RUL` columns.
    :param window_size: Window duration passed to ``split_into_windows``.
    :param sampling_rate: Sampling rate passed to ``split_into_windows``.
    :return: DataFrame with one row per window.
    """
    features_list = []

    for bearing_id, group in data.groupby('bearing_id'):
        print(f"Processing bearing: {bearing_id}, Data size: {len(group)}")
        horizontal_windows = split_into_windows(group['Horizontal_Accel'].values, window_size, sampling_rate)
        rul_values = group['RUL'].values

        for i, h_window in enumerate(horizontal_windows):
            features = {f"{k}_h": v for k, v in extract_temporal_features(h_window).items()}

            # Label the window with the RUL of exactly the samples it covers.
            # All windows share one length, so the start offset follows from
            # the window itself rather than from hard-coded 100 and 10.
            start = i * len(h_window)
            features['RUL'] = rul_values[start:start + len(h_window)].mean()

            # Context needed to trace a row back to its source window.
            features['bearing_id'] = bearing_id
            features['window_index'] = i

            features_list.append(features)

    return pd.DataFrame(features_list)


In [19]:
# Load the raw samples of one bearing folder for interactive inspection.
# NOTE(review): `data_folder` and `idx` are not assigned in any cell shown
# above this one; they presumably come from a loading loop run earlier in the
# kernel - confirm, since a fresh kernel would raise NameError here.
# NOTE(review): despite the variable name, only load_vibration_data is called,
# so no RUL column is added at this point.
vibration_data_with_rul = load_vibration_data(data_folder, idx=idx)

Loaded data shape: (7175680, 8)
   Hour  Minute  Second  Microsecond  Horizontal_Accel  Vertical_Accel  \
0     9      39      39      65664.0             0.552          -0.146   
1     9      39      39      65703.0             0.501          -0.480   
2     9      39      39      65742.0             0.138           0.435   
3     9      39      39      65781.0            -0.423           0.240   
4     9      39      39      65820.0            -0.802           0.020   

  bearing_id  time_seconds  
0   1_100001  34779.065664  
1   1_100001  34779.065703  
2   1_100001  34779.065742  
3   1_100001  34779.065781  
4   1_100001  34779.065820  


In [42]:
# Show the rows whose bearing_id is '1_100003' (one id is assigned per input file).
vibration_data_with_rul[vibration_data_with_rul.bearing_id=='1_100003']

Unnamed: 0,Hour,Minute,Second,Microsecond,Horizontal_Accel,Vertical_Accel,bearing_id,time_seconds
5120,9,39,59,65664.0,0.207,-0.686,1_100003,34799.065664
5121,9,39,59,65703.0,-0.099,0.147,1_100003,34799.065703
5122,9,39,59,65742.0,-0.151,-0.195,1_100003,34799.065742
5123,9,39,59,65781.0,-0.358,-0.584,1_100003,34799.065781
5124,9,39,59,65820.0,-0.424,-0.019,1_100003,34799.065820
...,...,...,...,...,...,...,...,...
7675,9,39,59,165470.0,-0.300,0.050,1_100003,34799.165470
7676,9,39,59,165510.0,-0.538,0.295,1_100003,34799.165510
7677,9,39,59,165550.0,-0.544,0.489,1_100003,34799.165550
7678,9,39,59,165580.0,-0.178,-0.341,1_100003,34799.165580


In [38]:
pip uninstall scipy

^C
Note: you may need to restart the kernel to use updated packages.


In [39]:
# NOTE(review): this installs an unpinned scipy. The tsfresh import in this
# notebook fails because scipy.signal no longer provides `cwt`, so a pinned
# scipy release that still ships it is presumably needed - confirm the version.
pip install scipy

Note: you may need to restart the kernel to use updated packages.


In [40]:
# NOTE(review): when this import succeeds it rebinds `extract_features`,
# hiding the function of the same name defined earlier in this notebook -
# consider importing it under an alias.
from tsfresh import extract_features

ImportError: cannot import name 'cwt' from 'scipy.signal' (c:\ProgramData\anaconda3\envs\tensorflow_env\lib\site-packages\scipy\signal\__init__.py)

In [17]:
from tqdm import tqdm
# Per-window feature frames, one per training bearing.
list_train_df = []
# Trial run on a single training bearing with 10-unit windows.
list_idx = ['1_1']
# list_idx = ['1_2']

for idx in tqdm(list_idx):
    # Each bearing lives in PRONOSTIA/Learning_set/Bearing<idx>.
    folder = 'PRONOSTIA/Learning_set/'
    bearing = 'Bearing' + idx
    base_dir = os.path.join(folder, bearing)
    data_folder = base_dir
    # NOTE: despite its name, this holds the feature frame returned by main_pipeline.
    vibration_data_with_rul = main_pipeline(data_folder, idx=idx,window_size= 10)
    # print(vibration_data_with_rul[['bearing_id', 'time_seconds', 'RUL_norm']].head())
    list_train_df.append(vibration_data_with_rul)

  0%|          | 0/1 [00:00<?, ?it/s]

Loaded data shape: (7175680, 8)
   Hour  Minute  Second  Microsecond  Horizontal_Accel  Vertical_Accel  \
0     9      39      39      65664.0             0.552          -0.146   
1     9      39      39      65703.0             0.501          -0.480   
2     9      39      39      65742.0             0.138           0.435   
3     9      39      39      65781.0            -0.423           0.240   
4     9      39      39      65820.0            -0.802           0.020   

  bearing_id  time_seconds  
0   1_100001  34779.065664  
1   1_100001  34779.065703  
2   1_100001  34779.065742  
3   1_100001  34779.065781  
4   1_100001  34779.065820  
Data after RUL calculation: (7175680, 10)


  0%|          | 0/1 [01:17<?, ?it/s]

Processing bearing: 1_100001, Data size: 2560
Data size: 2560, Samples per window: 1000, Number of windows: 2
Data size: 2560, Samples per window: 1000, Number of windows: 2





NameError: name 'total_duration' is not defined

In [8]:
# Run the main pipeline on the training bearings.
from tqdm import tqdm
# Per-window feature frames, one per training bearing.
list_train_df = []
list_idx = ['1_1', '1_2', '2_1', '2_2']
# list_idx = ['1_2']

for idx in tqdm(list_idx):
    # Each bearing lives in PRONOSTIA/Learning_set/Bearing<idx>.
    folder = 'PRONOSTIA/Learning_set/'
    bearing = 'Bearing' + idx
    base_dir = os.path.join(folder, bearing)
    data_folder = base_dir
    # NOTE: despite its name, this holds the feature frame returned by main_pipeline.
    vibration_data_with_rul = main_pipeline(data_folder, idx=idx,window_size= 0.1)
    # print(vibration_data_with_rul[['bearing_id', 'time_seconds', 'RUL_norm']].head())
    list_train_df.append(vibration_data_with_rul)
    
# idx = '1_2'
# folder = 'PRONOSTIA/Learning_set/'
# bearing = 'Bearing' + idx
# base_dir = os.path.join(folder, bearing)


# data_folder = base_dir
# vibration_data_with_rul = main_pipeline(data_folder, idx=idx,window_size= 0.1)
# print(vibration_data_with_rul[['bearing_id', 'time_seconds', 'RUL_norm']].head())

  0%|          | 0/4 [00:00<?, ?it/s]

Loaded data shape: (7175680, 8)
   Hour  Minute  Second  Microsecond  Horizontal_Accel  Vertical_Accel  \
0     9      39      39      65664.0             0.552          -0.146   
1     9      39      39      65703.0             0.501          -0.480   
2     9      39      39      65742.0             0.138           0.435   
3     9      39      39      65781.0            -0.423           0.240   
4     9      39      39      65820.0            -0.802           0.020   

  bearing_id  time_seconds  
0   1_100001  34779.065664  
1   1_100001  34779.065703  
2   1_100001  34779.065742  
3   1_100001  34779.065781  
4   1_100001  34779.065820  
Data after RUL calculation: (7175680, 10)
Processing bearing: 1_100001, Data size: 2560
Data size: 2560, Samples per window: 10, Number of windows: 256
Data size: 2560, Samples per window: 10, Number of windows: 256
Processing bearing: 1_100002, Data size: 2560
Data size: 2560, Samples per window: 10, Number of windows: 256
Data size: 2560, Sample

 25%|██▌       | 1/4 [10:42<32:06, 642.13s/it]

Extracted features shape: (717568, 23)
Loaded data shape: (2229760, 8)
   Hour  Minute  Second  Microsecond  Horizontal_Accel  Vertical_Accel  \
0     8      47       5     196910.0             0.050          -0.253   
1     8      47       5     196950.0             0.165          -0.140   
2     8      47       5     196990.0             0.125           0.542   
3     8      47       5     197030.0             0.157          -0.261   
4     8      47       5     197070.0             0.421           0.081   

  bearing_id  time_seconds  
0   1_200001   31625.19691  
1   1_200001   31625.19695  
2   1_200001   31625.19699  
3   1_200001   31625.19703  
4   1_200001   31625.19707  
Data after RUL calculation: (2229760, 10)
Processing bearing: 1_200001, Data size: 2560
Data size: 2560, Samples per window: 10, Number of windows: 256
Data size: 2560, Samples per window: 10, Number of windows: 256
Processing bearing: 1_200002, Data size: 2560
Data size: 2560, Samples per window: 10, Number 

 50%|█████     | 2/4 [13:58<12:39, 379.67s/it]

Extracted features shape: (222976, 23)
Loaded data shape: (2332160, 8)
   Hour  Minute  Second  Microsecond  Horizontal_Accel  Vertical_Accel  \
0     8      14      15     884410.0            -0.391           0.011   
1     8      14      15     884450.0             0.292           0.133   
2     8      14      15     884490.0             0.596           0.024   
3     8      14      15     884530.0             0.230           0.272   
4     8      14      15     884570.0            -0.225           0.272   

  bearing_id  time_seconds  
0   2_100001   29655.88441  
1   2_100001   29655.88445  
2   2_100001   29655.88449  
3   2_100001   29655.88453  
4   2_100001   29655.88457  
Data after RUL calculation: (2332160, 10)
Processing bearing: 2_100001, Data size: 2560
Data size: 2560, Samples per window: 10, Number of windows: 256
Data size: 2560, Samples per window: 10, Number of windows: 256
Processing bearing: 2_100002, Data size: 2560
Data size: 2560, Samples per window: 10, Number 

 75%|███████▌  | 3/4 [17:33<05:04, 304.79s/it]

Extracted features shape: (233216, 23)
Loaded data shape: (2040320, 8)
   Hour  Minute  Second  Microsecond  Horizontal_Accel  Vertical_Accel  \
0     7      40      33     540660.0             0.038           0.290   
1     7      40      33     540700.0             0.125          -0.104   
2     7      40      33     540740.0             0.035          -0.314   
3     7      40      33     540780.0            -0.092           0.200   
4     7      40      33     540820.0             0.033           0.211   

  bearing_id  time_seconds  
0   2_200001   27633.54066  
1   2_200001   27633.54070  
2   2_200001   27633.54074  
3   2_200001   27633.54078  
4   2_200001   27633.54082  
Data after RUL calculation: (2040320, 10)
Processing bearing: 2_200001, Data size: 2560
Data size: 2560, Samples per window: 10, Number of windows: 256
Data size: 2560, Samples per window: 10, Number of windows: 256
Processing bearing: 2_200002, Data size: 2560
Data size: 2560, Samples per window: 10, Number 

100%|██████████| 4/4 [20:37<00:00, 309.29s/it]

Extracted features shape: (204032, 23)





In [9]:
# print(vibration_data_with_rul[['bearing_id', 'time_seconds', 'RUL_norm']].head())
import pickle
# Output file for the list of training feature frames.
nom_fichier = "df_training_rul.pkl"

# Save the list to a file with pickle.
with open(nom_fichier, "wb") as fichier:
    pickle.dump(list_train_df, fichier)

print(f"Liste sauvegardée dans le fichier '{nom_fichier}'.")

Liste sauvegardée dans le fichier 'df_training_rul.pkl'.


In [10]:
# Run the main pipeline on the bearings used for validation.
from tqdm import tqdm
# Per-window feature frames, one per validation bearing.
list_validation_df = []
list_idx = ['1_3', '1_4', '2_3', '2_4']
for idx in tqdm(list_idx):
    # Validation bearings are read from PRONOSTIA/Test_set/Bearing<idx>.
    folder = 'PRONOSTIA/Test_set/'
    bearing = 'Bearing' + idx
    base_dir = os.path.join(folder, bearing)
    data_folder = base_dir
    # NOTE: despite its name, this holds the feature frame returned by main_pipeline.
    vibration_data_with_rul = main_pipeline(data_folder, idx=idx,window_size= 0.1)
    # print(vibration_data_with_rul[['bearing_id', 'time_seconds', 'RUL_norm']].head())
    list_validation_df.append(vibration_data_with_rul)

  0%|          | 0/4 [00:00<?, ?it/s]

Loaded data shape: (4613120, 8)
   Hour  Minute  Second  Microsecond  Horizontal_Accel  Vertical_Accel  \
0     8      33       1     378160.0             0.092           0.044   
1     8      33       1     378200.0            -0.025           0.432   
2     8      33       1     378240.0            -0.104           0.008   
3     8      33       1     378280.0             0.056          -0.264   
4     8      33       1     378320.0             0.074          -0.195   

  bearing_id  time_seconds  
0   1_300001   30781.37816  
1   1_300001   30781.37820  
2   1_300001   30781.37824  
3   1_300001   30781.37828  
4   1_300001   30781.37832  
Data after RUL calculation: (4613120, 10)
Processing bearing: 1_300001, Data size: 2560
Data size: 2560, Samples per window: 10, Number of windows: 256
Data size: 2560, Samples per window: 10, Number of windows: 256
Processing bearing: 1_300002, Data size: 2560
Data size: 2560, Samples per window: 10, Number of windows: 256
Data size: 2560, Sample

 25%|██▌       | 1/4 [06:50<20:31, 410.50s/it]

Extracted features shape: (461312, 23)
Loaded data shape: (2915840, 8)
   Hour  Minute  Second  Microsecond  Horizontal_Accel  Vertical_Accel  \
0     8       8       0     425040.0             0.065          -0.058   
1     8       8       0     425080.0             0.438           0.179   
2     8       8       0     425120.0            -0.079           0.646   
3     8       8       0     425160.0            -0.523          -0.411   
4     8       8       0     425200.0            -0.146          -0.387   

  bearing_id  time_seconds  
0   1_400001   29280.42504  
1   1_400001   29280.42508  
2   1_400001   29280.42512  
3   1_400001   29280.42516  
4   1_400001   29280.42520  
Data after RUL calculation: (2915840, 10)
Processing bearing: 1_400001, Data size: 2560
Data size: 2560, Samples per window: 10, Number of windows: 256
Data size: 2560, Samples per window: 10, Number of windows: 256
Processing bearing: 1_400002, Data size: 2560
Data size: 2560, Samples per window: 10, Number 

 50%|█████     | 2/4 [11:00<10:31, 315.96s/it]

Extracted features shape: (291584, 23)
Loaded data shape: (3077120, 8)
   Hour  Minute  Second  Microsecond  Horizontal_Accel  Vertical_Accel  \
0     8      39      57     571910.0             0.176          -0.133   
1     8      39      57     571950.0             0.126           0.064   
2     8      39      57     571990.0            -0.178           0.396   
3     8      39      57     572030.0            -0.341           0.126   
4     8      39      57     572070.0            -0.052          -0.243   

  bearing_id  time_seconds  
0   2_300001   31197.57191  
1   2_300001   31197.57195  
2   2_300001   31197.57199  
3   2_300001   31197.57203  
4   2_300001   31197.57207  
Data after RUL calculation: (3077120, 10)
Processing bearing: 2_300001, Data size: 2560
Data size: 2560, Samples per window: 10, Number of windows: 256
Data size: 2560, Samples per window: 10, Number of windows: 256
Processing bearing: 2_300002, Data size: 2560
Data size: 2560, Samples per window: 10, Number 

 75%|███████▌  | 3/4 [15:29<04:54, 294.58s/it]

Extracted features shape: (307712, 23)
Loaded data shape: (1566720, 8)
   Hour  Minute  Second  Microsecond  Horizontal_Accel  Vertical_Accel  \
0     8       3      36     462540.0            -0.088           0.128   
1     8       3      36     462580.0            -0.178           0.137   
2     8       3      36     462620.0            -0.323           0.177   
3     8       3      36     462660.0             0.183           0.032   
4     8       3      36     462700.0             0.562          -0.471   

  bearing_id  time_seconds  
0   2_400001   29016.46254  
1   2_400001   29016.46258  
2   2_400001   29016.46262  
3   2_400001   29016.46266  
4   2_400001   29016.46270  
Data after RUL calculation: (1566720, 10)
Processing bearing: 2_400001, Data size: 2560
Data size: 2560, Samples per window: 10, Number of windows: 256
Data size: 2560, Samples per window: 10, Number of windows: 256
Processing bearing: 2_400002, Data size: 2560
Data size: 2560, Samples per window: 10, Number 

100%|██████████| 4/4 [17:50<00:00, 267.62s/it]

Extracted features shape: (156672, 23)





In [11]:
# print(vibration_data_with_rul[['bearing_id', 'time_seconds', 'RUL_norm']].head())
import pickle
# Output file for the list of validation feature frames.
nom_fichier = "df_validation_rul.pkl"

# Save the list to a file with pickle.
with open(nom_fichier, "wb") as fichier:
    pickle.dump(list_validation_df, fichier)

print(f"Liste sauvegardée dans le fichier '{nom_fichier}'.")

Liste sauvegardée dans le fichier 'df_validation_rul.pkl'.


In [12]:
# Run the main pipeline on the bearings used for testing.
from tqdm import tqdm
# Per-window feature frames, one per test bearing.
list_test_df = []
list_idx = ['1_5', '1_6', '1_7', '2_5', '2_6', '2_7']
for idx in tqdm(list_idx):
    # Test bearings are read from PRONOSTIA/Test_set/Bearing<idx>.
    folder = 'PRONOSTIA/Test_set/'
    bearing = 'Bearing' + idx
    base_dir = os.path.join(folder, bearing)
    data_folder = base_dir
    # NOTE: despite its name, this holds the feature frame returned by main_pipeline.
    vibration_data_with_rul = main_pipeline(data_folder, idx=idx,window_size= 0.1)
    # print(vibration_data_with_rul[['bearing_id', 'time_seconds', 'RUL_norm']].head())
    list_test_df.append(vibration_data_with_rul)

  0%|          | 0/6 [00:00<?, ?it/s]

Loaded data shape: (5893120, 8)
   Hour  Minute  Second  Microsecond  Horizontal_Accel  Vertical_Accel  \
0     9      20      24     212540.0             0.211           0.281   
1     9      20      24     212580.0             0.268          -0.206   
2     9      20      24     212620.0             0.306          -0.444   
3     9      20      24     212660.0             0.226           0.111   
4     9      20      24     212700.0            -0.025           0.077   

  bearing_id  time_seconds  
0   1_500001   33624.21254  
1   1_500001   33624.21258  
2   1_500001   33624.21262  
3   1_500001   33624.21266  
4   1_500001   33624.21270  
Data after RUL calculation: (5893120, 10)
Processing bearing: 1_500001, Data size: 2560
Data size: 2560, Samples per window: 10, Number of windows: 256
Data size: 2560, Samples per window: 10, Number of windows: 256
Processing bearing: 1_500002, Data size: 2560
Data size: 2560, Samples per window: 10, Number of windows: 256
Data size: 2560, Sample

 17%|█▋        | 1/6 [07:57<39:47, 477.42s/it]

Extracted features shape: (589312, 23)
Loaded data shape: (5893120, 8)
   Hour  Minute  Second  Microsecond  Horizontal_Accel  Vertical_Accel  \
0     9       2      51     775040.0            -0.070          -0.354   
1     9       2      51     775080.0            -0.571          -0.036   
2     9       2      51     775120.0            -0.660           0.413   
3     9       2      51     775160.0            -0.498          -0.043   
4     9       2      51     775200.0            -0.749          -0.354   

  bearing_id  time_seconds  
0   1_600001   32571.77504  
1   1_600001   32571.77508  
2   1_600001   32571.77512  
3   1_600001   32571.77516  
4   1_600001   32571.77520  
Data after RUL calculation: (5893120, 10)
Processing bearing: 1_600001, Data size: 2560
Data size: 2560, Samples per window: 10, Number of windows: 256
Data size: 2560, Samples per window: 10, Number of windows: 256
Processing bearing: 1_600002, Data size: 2560
Data size: 2560, Samples per window: 10, Number 

 33%|███▎      | 2/6 [16:38<33:33, 503.30s/it]

Extracted features shape: (589312, 23)
Loaded data shape: (3845120, 8)
   Hour  Minute  Second  Microsecond  Horizontal_Accel  Vertical_Accel  \
0     8       2      37     962540.0             0.558           0.066   
1     8       2      37     962580.0             0.373          -0.333   
2     8       2      37     962620.0             0.558          -0.414   
3     8       2      37     962660.0             0.465           0.180   
4     8       2      37     962700.0             0.236           0.413   

  bearing_id  time_seconds  
0   1_700001   28957.96254  
1   1_700001   28957.96258  
2   1_700001   28957.96262  
3   1_700001   28957.96266  
4   1_700001   28957.96270  
Data after RUL calculation: (3845120, 10)
Processing bearing: 1_700001, Data size: 2560
Data size: 2560, Samples per window: 10, Number of windows: 256
Data size: 2560, Samples per window: 10, Number of windows: 256
Processing bearing: 1_700002, Data size: 2560
Data size: 2560, Samples per window: 10, Number 

 50%|█████     | 3/6 [22:42<21:58, 439.54s/it]

Extracted features shape: (384512, 23)
Loaded data shape: (5125120, 8)
   Hour  Minute  Second  Microsecond  Horizontal_Accel  Vertical_Accel  \
0     8      37      46     150040.0             0.441          -0.018   
1     8      37      46     150080.0             0.148           0.208   
2     8      37      46     150120.0            -0.042          -0.124   
3     8      37      46     150160.0            -0.489           0.032   
4     8      37      46     150200.0            -0.385          -0.373   

  bearing_id  time_seconds  
0   2_500001   31066.15004  
1   2_500001   31066.15008  
2   2_500001   31066.15012  
3   2_500001   31066.15016  
4   2_500001   31066.15020  
Data after RUL calculation: (5125120, 10)
Processing bearing: 2_500001, Data size: 2560
Data size: 2560, Samples per window: 10, Number of windows: 256
Data size: 2560, Samples per window: 10, Number of windows: 256
Processing bearing: 2_500002, Data size: 2560
Data size: 2560, Samples per window: 10, Number 

 67%|██████▋   | 4/6 [30:30<15:01, 450.62s/it]

Extracted features shape: (512512, 23)
Loaded data shape: (1464320, 8)
   Hour  Minute  Second  Microsecond  Horizontal_Accel  Vertical_Accel  \
0     8      52      11     618790.0             0.321           0.170   
1     8      52      11     618830.0            -0.030          -0.542   
2     8      52      11     618870.0            -0.382           0.027   
3     8      52      11     618910.0            -0.294           0.319   
4     8      52      11     618940.0            -0.033           0.485   

  bearing_id  time_seconds  
0   2_600001   31931.61879  
1   2_600001   31931.61883  
2   2_600001   31931.61887  
3   2_600001   31931.61891  
4   2_600001   31931.61894  
Data after RUL calculation: (1464320, 10)
Processing bearing: 2_600001, Data size: 2560
Data size: 2560, Samples per window: 10, Number of windows: 256
Data size: 2560, Samples per window: 10, Number of windows: 256
Processing bearing: 2_600002, Data size: 2560
Data size: 2560, Samples per window: 10, Number 

 83%|████████▎ | 5/6 [32:36<05:33, 333.71s/it]

Extracted features shape: (146432, 23)
Loaded data shape: (440320, 8)
   Hour  Minute  Second  Microsecond  Horizontal_Accel  Vertical_Accel  \
0     8      40      25     150040.0             0.613           0.188   
1     8      40      25     150080.0             0.680          -0.213   
2     8      40      25     150120.0             0.073          -0.186   
3     8      40      25     150160.0            -0.673           0.305   
4     8      40      25     150200.0            -0.602           0.303   

  bearing_id  time_seconds  
0   2_700001   31225.15004  
1   2_700001   31225.15008  
2   2_700001   31225.15012  
3   2_700001   31225.15016  
4   2_700001   31225.15020  
Data after RUL calculation: (440320, 10)
Processing bearing: 2_700001, Data size: 2560
Data size: 2560, Samples per window: 10, Number of windows: 256
Data size: 2560, Samples per window: 10, Number of windows: 256
Processing bearing: 2_700002, Data size: 2560
Data size: 2560, Samples per window: 10, Number of

100%|██████████| 6/6 [33:11<00:00, 331.97s/it]

Extracted features shape: (44032, 23)





In [20]:
# Persist list_test_df (one entry per processed test bearing) to disk with
# pickle so the feature extraction does not have to be re-run.
import pickle

nom_fichier = "df_test_rul.pkl"

# Binary mode is required by pickle; the context manager closes the file.
with open(nom_fichier, "wb") as pickle_out:
    pickle.dump(list_test_df, pickle_out)

print(f"Liste sauvegardée dans le fichier '{nom_fichier}'.")

Liste sauvegardée dans le fichier 'df_test_rul.pkl'.


In [18]:
# Sanity check: number of collected test results (one is appended per bearing id).
len(list_test_df)

6

In [12]:
# Stack every frame of list_train_df (built outside this section) into a single
# table; ignore_index=True discards the per-frame indices and renumbers rows 0..n-1.
df_merged = pd.concat(list_train_df, ignore_index=True)

In [51]:
# Feature matrix: everything in df_merged except the bearing/window identifiers
# and the RUL_norm column.
X = df_merged.drop(['bearing_id', 'window_index', 'RUL_norm'], axis='columns')
X.shape

(1377792, 222)

In [65]:
# Drop every row that contains at least one NaN. Note that X is rebound to the
# filtered frame; in the recorded run the shape is identical before and after.
X = X.dropna()

In [66]:
# Shape of the feature matrix after the NaN-row filter.
X.shape

(1377792, 222)

In [82]:
# Column-wise min-max scaling to [0, 1]: (x - min) / (max - min).
# A column whose max equals its min yields 0/0, i.e. NaN on every row.
X_normalized = X.sub(X.min()).div(X.max() - X.min())

In [83]:
# Summary statistics of the scaled features (each scaled column should show min 0 and max 1).
X_normalized.describe()

Unnamed: 0,mean_h,std_h,peak_to_peak_h,rms_h,mean_abs_h,max_abs_h,skewness_h,kurtosis_h,form_factor_h,crest_factor_h,...,integral_impulse_factor_v,integral_margin_factor_v,integral_energy_v,integral_moment_5_v,integral_moment_6_v,integral_moment_7_v,integral_moment_8_v,integral_moment_9_v,integral_moment_10_v,integral_moment_11_v
count,1377792.0,1377792.0,1377792.0,1377792.0,1377792.0,1377792.0,1377792.0,1377792.0,1377792.0,1377792.0,...,1377792.0,1377792.0,1377792.0,1377792.0,1377792.0,1377792.0,1377792.0,1377792.0,1377792.0,1377792.0
mean,0.5786804,0.01568022,0.01721883,0.01456413,0.0137105,0.01909916,0.4836694,0.1753639,0.1211885,0.5668876,...,0.2823309,0.1894631,8.687563e-05,0.6603333,4.903168e-06,0.6934527,3.799052e-06,0.7094422,3.088308e-06,0.7281253
std,0.009073555,0.01431249,0.01502734,0.01462755,0.01428179,0.0177984,0.1005046,0.09361923,0.05686452,0.1480223,...,0.1216195,0.08886501,0.002505242,0.0009940786,0.001668345,0.0008586503,0.001461911,0.0007934329,0.001299828,0.0007625892
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.5762881,0.008588209,0.009665214,0.007695561,0.007016533,0.01053725,0.4190518,0.1106646,0.08096772,0.4994644,...,0.1781856,0.115558,1.323548e-05,0.6603346,3.562353e-14,0.6934536,1.2637500000000001e-18,0.709443,4.3697880000000005e-23,0.728126
50%,0.5787203,0.01207222,0.01352381,0.01085125,0.01001618,0.01481866,0.4837953,0.1554677,0.1134721,0.5954458,...,0.3124066,0.202996,2.700599e-05,0.6603346,1.981079e-13,0.6934536,1.2573200000000002e-17,0.709443,7.809273e-22,0.728126
75%,0.5810894,0.01832323,0.02021703,0.01672232,0.015706,0.02215525,0.548339,0.2171258,0.152999,0.665111,...,0.3699189,0.2495518,5.755122e-05,0.6603346,1.159937e-12,0.6934536,1.332751e-16,0.709443,1.497767e-20,0.728126
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Remove every column that contains at least one NaN after scaling.
X_normalized = X_normalized.dropna(axis='columns')

In [88]:
# Variable clustering of the scaled features with VarClusHi.
from varclushi import VarClusHi

resclv = VarClusHi(
    X_normalized,
    maxeigval2=1,   # presumably the second-eigenvalue split threshold — confirm in varclushi docs
    maxclus=None,   # no explicit cap on the number of clusters is passed
)
# Runs the clustering; the call's return value is displayed as the cell output.
resclv.varclus()

<varclushi.varclushi.VarClusHi at 0x28b156f35b0>

In [90]:
# Per-variable clustering table: Cluster, Variable, RS_Own, RS_NC, RS_Ratio.
resclv.rsquare

Unnamed: 0,Cluster,Variable,RS_Own,RS_NC,RS_Ratio
0,0,std_v,0.927756,0.915376,0.853702
1,0,peak_to_peak_v,0.927472,0.880343,0.606131
2,0,rms_v,0.951252,0.896875,0.472707
3,0,mean_abs_v,0.920459,0.892391,0.739171
4,0,max_abs_v,0.938805,0.817758,0.335788
...,...,...,...,...,...
177,37,imf_entropy_4_tf_h,0.109717,0.006587,0.896185
178,37,imf_entropy_3_tf_h,0.541211,0.023706,0.469930
179,38,derivative_crest_factor_h,0.512603,0.527494,1.031516
180,38,derivative_skewness_v,0.512603,0.447526,0.882209


In [77]:
# Print the NaN count of every column that still contains NaN;
# an empty Series means no NaN is left in X_normalized.
print(X_normalized.isna().sum().loc[lambda nan_counts: nan_counts > 0])

energy_band_1_freq_h     1377792
energy_band_2_freq_h     1377792
energy_band_3_freq_h     1377792
energy_band_4_freq_h     1377792
energy_band_6_freq_h     1377792
energy_band_7_freq_h     1377792
energy_band_8_freq_h     1377792
energy_band_9_freq_h     1377792
energy_band_11_freq_h    1377792
energy_band_12_freq_h    1377792
energy_band_13_freq_h    1377792
energy_band_14_freq_h    1377792
energy_band_16_freq_h    1377792
energy_band_17_freq_h    1377792
energy_band_18_freq_h    1377792
energy_band_19_freq_h    1377792
energy_band_21_freq_h    1377792
energy_band_22_freq_h    1377792
energy_band_23_freq_h    1377792
energy_band_24_freq_h    1377792
energy_band_1_freq_v     1377792
energy_band_2_freq_v     1377792
energy_band_3_freq_v     1377792
energy_band_4_freq_v     1377792
energy_band_6_freq_v     1377792
energy_band_7_freq_v     1377792
energy_band_8_freq_v     1377792
energy_band_9_freq_v     1377792
energy_band_11_freq_v    1377792
energy_band_12_freq_v    1377792
energy_ban

In [None]:
# Column names recorded from the NaN check on X_normalized (they appear to be
# the columns that are NaN on every row after min-max scaling).
# They were previously pasted here as bare identifiers, which raises NameError
# when the cell is executed; storing them as strings in a list keeps the same
# information while making the cell runnable.
all_nan_columns = [
    "energy_band_1_freq_h",
    "energy_band_2_freq_h",
    "energy_band_3_freq_h",
    "energy_band_4_freq_h",
    "energy_band_6_freq_h",
    "energy_band_7_freq_h",
    "energy_band_8_freq_h",
    "energy_band_9_freq_h",
    "energy_band_11_freq_h",
    "energy_band_12_freq_h",
    "energy_band_13_freq_h",
    "energy_band_14_freq_h",
    "energy_band_16_freq_h",
    "energy_band_17_freq_h",
    "energy_band_18_freq_h",
    "energy_band_19_freq_h",
    "energy_band_21_freq_h",
    "energy_band_22_freq_h",
    "energy_band_23_freq_h",
    "energy_band_24_freq_h",
    "energy_band_1_freq_v",
    "energy_band_2_freq_v",
    "energy_band_3_freq_v",
    "energy_band_4_freq_v",
    "energy_band_6_freq_v",
    "energy_band_7_freq_v",
    "energy_band_8_freq_v",
    "energy_band_9_freq_v",
    "energy_band_11_freq_v",
    "energy_band_12_freq_v",
    "energy_band_13_freq_v",
    "energy_band_14_freq_v",
    "energy_band_16_freq_v",
    "energy_band_17_freq_v",
    "energy_band_18_freq_v",
    "energy_band_19_freq_v",
    "energy_band_21_freq_v",
    "energy_band_22_freq_v",
    "energy_band_23_freq_v",
    "energy_band_24_freq_v",
]