In [1]:
# Importing necessary libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler

def load_data_from_excel(data_dir, subset="FD001"):
    """Load the training, test, and RUL tables of one data subset from Excel files.

    Parameters
    ----------
    data_dir : str or path-like
        Directory containing ``train_<subset>.xlsx``, ``test_<subset>.xlsx``
        and ``RUL_<subset>.xlsx``.
    subset : str, default "FD001"
        Suffix identifying which set of files to read. The default keeps the
        original behaviour of reading the ``FD001`` files.

    Returns
    -------
    tuple
        ``(train_set, test_set, RUL_test_set)``: the three objects returned by
        ``pd.read_excel``, in that order.
    """
    # All three files share the "<data_dir>/<prefix>_<subset>.xlsx" layout,
    # so build the paths in one place instead of repeating the pattern.
    train_set, test_set, RUL_test_set = [
        pd.read_excel(f"{data_dir}/{prefix}_{subset}.xlsx")
        for prefix in ("train", "test", "RUL")
    ]

    return train_set, test_set, RUL_test_set

# Load the data.
# The directory can be overridden with the CMAPSS_DATA_DIR environment variable
# so the notebook is not tied to one machine; when the variable is unset or
# empty, the original local path is used.
data_dir = os.environ.get("CMAPSS_DATA_DIR") or r"C:\Users\Admin\Downloads\CMAPSSData"
train_set, test_set, RUL_test_set = load_data_from_excel(data_dir)

In [2]:
# Show the column labels of the training table
print(train_set.columns)


Index(['Unit number', 'Time (in cycles)', 'Operational setting 1',
       'Operational setting 2', 'Operational setting 3',
       'Sensor measurement 1', 'Sensor measurement 2', 'Sensor measurement 3',
       'Sensor measurement 4', 'Sensor measurement 5', 'Sensor measurement 6',
       'Sensor measurement 7', 'Sensor measurement 8', 'Sensor measurement 9',
       'Sensor measurement 10', 'Sensor measurement 11',
       'Sensor measurement 12', 'Sensor measurement 13',
       'Sensor measurement 14', 'Sensor measurement 15',
       'Sensor measurement 16', 'Sensor measurement 17',
       'Sensor measurement 18', 'Sensor measurement 19',
       'Sensor measurement 20', 'Sensor measurement 21'],
      dtype='object')


In [3]:
# Add Time to Event (TTE) variable
def add_tte(dataset):
    """Add a 'TTE' column to ``dataset`` and return the same object.

    For every row, 'TTE' is the largest 'Time (in cycles)' value found within
    the row's 'Unit number' group minus the row's own 'Time (in cycles)'.
    The column is written into ``dataset`` itself (no copy is made).
    """
    cycle_col = 'Time (in cycles)'
    # Broadcast each unit's maximum cycle back onto all of that unit's rows
    last_cycle_per_unit = dataset.groupby('Unit number')[cycle_col].transform('max')
    dataset['TTE'] = last_cycle_per_unit - dataset[cycle_col]
    return dataset

# add_tte writes the 'TTE' column into the frame it receives and returns that same frame
train_set = add_tte(train_set)




In [4]:
# Inspect the column labels, which now include 'TTE'
print(train_set.columns)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would append a stray "None" line after the summary.
train_set.info()


Index(['Unit number', 'Time (in cycles)', 'Operational setting 1',
       'Operational setting 2', 'Operational setting 3',
       'Sensor measurement 1', 'Sensor measurement 2', 'Sensor measurement 3',
       'Sensor measurement 4', 'Sensor measurement 5', 'Sensor measurement 6',
       'Sensor measurement 7', 'Sensor measurement 8', 'Sensor measurement 9',
       'Sensor measurement 10', 'Sensor measurement 11',
       'Sensor measurement 12', 'Sensor measurement 13',
       'Sensor measurement 14', 'Sensor measurement 15',
       'Sensor measurement 16', 'Sensor measurement 17',
       'Sensor measurement 18', 'Sensor measurement 19',
       'Sensor measurement 20', 'Sensor measurement 21', 'TTE'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20631 entries, 0 to 20630
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unit number            20631 non-null  int64  
 1

In [10]:
# Export the RUL table to 'RUL_test_set.csv' (relative path), leaving out the index column
RUL_test_set.to_csv('RUL_test_set.csv', index=False)


In [None]:
#################### EXPLORATION 

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Columns plotted against the target, grouped by kind:
# identifiers, operational settings, sensor measurements.
features = (
    ['Unit number', 'Time (in cycles)']
    + ['Operational setting 1', 'Operational setting 2', 'Operational setting 3']
    + [
        'Sensor measurement 1', 'Sensor measurement 2', 'Sensor measurement 3',
        'Sensor measurement 4', 'Sensor measurement 5', 'Sensor measurement 6',
        'Sensor measurement 7', 'Sensor measurement 8', 'Sensor measurement 9',
        'Sensor measurement 10', 'Sensor measurement 11', 'Sensor measurement 12',
        'Sensor measurement 13', 'Sensor measurement 14', 'Sensor measurement 15',
        'Sensor measurement 16', 'Sensor measurement 17', 'Sensor measurement 18',
        'Sensor measurement 19', 'Sensor measurement 20', 'Sensor measurement 21',
    ]
)

# Draw one scatterplot per feature against the target variable "TTE"
for feature in features:
    # scatterplot returns the Axes it drew on; label that Axes directly
    ax = sns.scatterplot(data=train_set, x=feature, y='TTE')
    ax.set(
        title=f'Scatterplot de {feature} par rapport à TTE',
        xlabel=feature,
        ylabel='TTE',
    )
    plt.show()


In [None]:
#################### EXPLORATION (2)

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt



# Explanatory variables and target
variables_explicatives = (
    ['Unit number', 'Time (in cycles)']
    + ['Operational setting 1', 'Operational setting 2', 'Operational setting 3']
    + [
        'Sensor measurement 1', 'Sensor measurement 2', 'Sensor measurement 3',
        'Sensor measurement 4', 'Sensor measurement 5', 'Sensor measurement 6',
        'Sensor measurement 7', 'Sensor measurement 8', 'Sensor measurement 9',
        'Sensor measurement 10', 'Sensor measurement 11', 'Sensor measurement 12',
        'Sensor measurement 13', 'Sensor measurement 14', 'Sensor measurement 15',
        'Sensor measurement 16', 'Sensor measurement 17', 'Sensor measurement 18',
        'Sensor measurement 19', 'Sensor measurement 20', 'Sensor measurement 21',
    ]
)
variable_cible = 'TTE'


def _plot_distribution(column):
    """Show the histogram (with a KDE curve) of one column of ``train_set``."""
    ax = sns.histplot(train_set[column], kde=True)
    ax.set(title=f'Distribution de {column}', xlabel=column, ylabel='Fréquence')
    plt.show()


# Plot the distribution of every explanatory variable, then of the target
for variable in variables_explicatives + [variable_cible]:
    _plot_distribution(variable)

# Analysis of the target variable's distribution.
# NOTE(review): the loop above already ends with the target, so this histogram
# is displayed a second time; kept to match the original output.
_plot_distribution(variable_cible)