In [1]:
import os
import h5py
import time
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pandas import DataFrame
from matplotlib import gridspec
from scipy.stats import zscore
%matplotlib inline

In [2]:
filename = 'data/N-CMAPSS_DS01-005.h5'

In [3]:
# time tracking
t = time.process_time()  

# loading data
with h5py.File(filename, 'r') as hdf:
        
        # development set
        W_dev = np.array(hdf.get('W_dev'))             # W
        X_s_dev = np.array(hdf.get('X_s_dev'))         # X_s
        X_v_dev = np.array(hdf.get('X_v_dev'))         # X_v
        T_dev = np.array(hdf.get('T_dev'))             # T
        Y_dev = np.array(hdf.get('Y_dev'))             # RUL  
        A_dev = np.array(hdf.get('A_dev'))             # Auxiliary

        # test set
        W_test = np.array(hdf.get('W_test'))           # W
        X_s_test = np.array(hdf.get('X_s_test'))       # X_s
        X_v_test = np.array(hdf.get('X_v_test'))       # X_v
        T_test = np.array(hdf.get('T_test'))           # T
        Y_test = np.array(hdf.get('Y_test'))           # RUL  
        A_test = np.array(hdf.get('A_test'))           # Auxiliary
        
        # varnams
        W_var = np.array(hdf.get('W_var'))
        X_s_var = np.array(hdf.get('X_s_var'))  
        X_v_var = np.array(hdf.get('X_v_var')) 
        T_var = np.array(hdf.get('T_var'))
        A_var = np.array(hdf.get('A_var'))
        
        # from np.array to list dtype U4/U5
        W_var = list(np.array(W_var, dtype='U20'))
        X_s_var = list(np.array(X_s_var, dtype='U20'))  
        X_v_var = list(np.array(X_v_var, dtype='U20')) 
        T_var = list(np.array(T_var, dtype='U20'))
        A_var = list(np.array(A_var, dtype='U20'))
                          
W = np.concatenate((W_dev, W_test), axis=0)  
X_s = np.concatenate((X_s_dev, X_s_test), axis=0)
X_v = np.concatenate((X_v_dev, X_v_test), axis=0)
T = np.concatenate((T_dev, T_test), axis=0)
Y = np.concatenate((Y_dev, Y_test), axis=0) 
A = np.concatenate((A_dev, A_test), axis=0) 
    
print('')
print("Operation time (min): " , (time.process_time()-t)/60)
print('')
print ("W shape: " + str(W.shape))
print ("X_s shape: " + str(X_s.shape))
print ("X_v shape: " + str(X_v.shape))
print ("T shape: " + str(T.shape))
print ("A shape: " + str(A.shape))


Operation time (min):  0.039322916666666666

W shape: (7641868, 4)
X_s shape: (7641868, 14)
X_v shape: (7641868, 14)
T shape: (7641868, 10)
A shape: (7641868, 4)


In [4]:
# creating the dataframes for training
df_A_train = DataFrame(data=A_dev, columns=A_var)             # auxiliary information
df_W_train = DataFrame(data=W_dev, columns=W_var)             # operative conditions 
df_X_s_train = DataFrame(data=X_s_dev, columns=X_s_var)       # sensor readings
df_X_v_train = DataFrame(data=X_v_dev, columns=X_v_var)       # virtual sensors
df_T_train = DataFrame(data=T_dev, columns=T_var)             # degradation
df_Y_train = DataFrame(data=Y_dev)                            # RUL

# creating the dataframes for testing
df_A_test = DataFrame(data=A_test, columns=A_var)             # auxiliary information
df_W_test = DataFrame(data=W_test, columns=W_var)             # operative conditions 
df_X_s_test = DataFrame(data=X_s_test, columns=X_s_var)       # sensor readings
df_X_v_test = DataFrame(data=X_v_test, columns=X_v_var)       # virtual sensors
df_T_test = DataFrame(data=T_test, columns=T_var)             # degradation
df_Y_test = DataFrame(data=Y_test)                            # RUL

# concatinated DataFrames 
df_train = pd.concat([df_W_train, df_X_s_train, df_X_v_train, df_T_train, df_Y_train, df_A_train], axis=1)
df_test = pd.concat([df_W_test, df_X_s_test, df_X_v_test, df_T_test, df_Y_test, df_A_test], axis=1)

In [5]:
print(f"The train set of {filename} has {df_train.shape[0]} rows.")
print(f"The test set of {filename} has {df_test.shape[0]} rows.")

print("\n")

print(f"The units in the train set of {filename} are {df_train.unit.unique().tolist()}.")
print(f"The units in the test set of {filename} are {df_test.unit.unique().tolist()}.")

The train set of data/N-CMAPSS_DS01-005.h5 has 4906636 rows.
The test set of data/N-CMAPSS_DS01-005.h5 has 2735232 rows.


The units in the train set of data/N-CMAPSS_DS01-005.h5 are [1.0, 2.0, 3.0, 4.0, 5.0, 6.0].
The units in the test set of data/N-CMAPSS_DS01-005.h5 are [7.0, 8.0, 9.0, 10.0].


In [6]:
df_train

Unnamed: 0,alt,Mach,TRA,T2,T24,T30,T48,T50,P15,P2,...,HPC_flow_mod,HPT_eff_mod,HPT_flow_mod,LPT_eff_mod,LPT_flow_mod,0,unit,cycle,Fc,hs
0,3013.0,0.376362,70.311996,522.314770,618.288596,1470.469798,1849.620676,1269.275585,19.432070,14.484611,...,0.0,-0.000604,0.0,0.0,0.0,99,1.0,1.0,1.0,1.0
1,3020.0,0.376866,70.311996,522.327145,618.296355,1470.415593,1849.519871,1269.177159,19.431385,14.484683,...,0.0,-0.000604,0.0,0.0,0.0,99,1.0,1.0,1.0,1.0
2,3025.0,0.377685,70.311996,522.371840,618.336514,1470.453853,1849.566139,1269.167353,19.435163,14.488224,...,0.0,-0.000604,0.0,0.0,0.0,99,1.0,1.0,1.0,1.0
3,3035.0,0.376992,70.399887,522.282418,618.302173,1470.650929,1850.195069,1269.518670,19.426003,14.477632,...,0.0,-0.000604,0.0,0.0,0.0,99,1.0,1.0,1.0,1.0
4,3043.0,0.377622,70.399887,522.300605,618.345228,1470.640421,1849.950988,1269.253972,19.427484,14.478114,...,0.0,-0.000604,0.0,0.0,0.0,99,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4906631,3030.0,0.201159,31.728291,511.974677,570.015066,1285.435213,1555.520405,1164.991647,16.109128,13.492444,...,0.0,-0.017792,0.0,0.0,0.0,0,6.0,94.0,2.0,0.0
4906632,3021.0,0.203238,31.728291,512.092657,570.111968,1285.592440,1555.936249,1165.280244,16.121885,13.504848,...,0.0,-0.017792,0.0,0.0,0.0,0,6.0,94.0,2.0,0.0
4906633,3015.0,0.202986,31.728291,512.103385,570.136953,1285.638575,1555.788971,1165.146642,16.124832,13.506797,...,0.0,-0.017792,0.0,0.0,0.0,0,6.0,94.0,2.0,0.0
4906634,3007.0,0.203301,31.728291,512.145375,570.176550,1285.711680,1555.959488,1165.285600,16.130621,13.512026,...,0.0,-0.017792,0.0,0.0,0.0,0,6.0,94.0,2.0,0.0


In [18]:
lijst = ["k", "i", "m"]

print(lijst + ["r", 'h'])

['k', 'i', 'm', 'r', 'h']


In [19]:
def preprocessing(df):

    # renaming RUL column
    col = list(df.columns)
    col[42]='RUL'
    df.columns = col

    # dropping columns 'P2' and 'T2'
    df = df.drop(columns = ["P2", "T2"])

    # setting rolling window size 
    window_size = 100

    # selecting columns for noise reduction (sensor measurements & virtual readings)
    to_reduce = ['TRA', 'T24', 'T30', 'T48', 'T50', 'P15', 'P21', 'P24',
       'Ps30', 'P40', 'P50', 'Nf', 'Nc', 'Wf', 'T40', 'P30', 'P45', 'W21',
       'W22', 'W25', 'W31', 'W32', 'W48', 'W50', 'SmFan', 'SmLPC', 'SmHPC',
       'phi', 'fan_eff_mod', 'fan_flow_mod', 'LPC_eff_mod', 'LPC_flow_mod',
       'HPC_eff_mod', 'HPC_flow_mod', 'HPT_eff_mod', 'HPT_flow_mod',
       'LPT_eff_mod', 'LPT_flow_mod']
    
    # part of dataframe that doesn't need noise reduction
    no_noise = df[['alt', 'Mach', 'RUL', 'unit', 'cycle', 'Fc', 'hs']]

    # calculate rolling average for each column base on window
    rolling_avg_df = df[to_reduce].rolling(window=window_size).mean()

    # fill in nan values with the original value 
    for col in to_reduce:
        # rolling_avg_df[col].fillna(df[col], inplace=True)
        rolling_avg_df.fillna({col: df[col]}, inplace=True)

    # concatenate both parts of dataframe back together 
    df_reduced = pd.concat([no_noise, rolling_avg_df], axis = 1)

    # calculating T30 - T24 & adding to dataframe
    df_reduced["dT_30_24"] = abs(df_reduced.T30 - df_reduced.T24)

    # calculating T50 - T40 & adding to dataframe 
    df_reduced["dT_50_40"] = abs(df_reduced.T50 - df_reduced.T40)

   # applying z-score normalization 
    df_reduced[to_reduce + ["dT_30_24", "dT_50_40"]] = df_reduced[to_reduce + ["dT_30_24", "dT_50_40"]].apply(zscore)

    return df_reduced


In [20]:
preprocessing(df_train)

Unnamed: 0,alt,Mach,RUL,unit,cycle,Fc,hs,TRA,T24,T30,...,LPC_eff_mod,LPC_flow_mod,HPC_eff_mod,HPC_flow_mod,HPT_eff_mod,HPT_flow_mod,LPT_eff_mod,LPT_flow_mod,dT_30_24,dT_50_40
0,3013.0,0.376362,99,1.0,1.0,1.0,1.0,0.535307,2.358783,2.093251,...,,,,,0.802273,,,,1.832631,1.269362
1,3020.0,0.376866,99,1.0,1.0,1.0,1.0,0.535307,2.359160,2.092440,...,,,,,0.802273,,,,1.831386,1.268980
2,3025.0,0.377685,99,1.0,1.0,1.0,1.0,0.535307,2.361108,2.093012,...,,,,,0.802273,,,,1.831347,1.269560
3,3035.0,0.376992,99,1.0,1.0,1.0,1.0,0.540149,2.359442,2.095964,...,,,,,0.802273,,,,1.836000,1.274116
4,3043.0,0.377622,99,1.0,1.0,1.0,1.0,0.540149,2.361530,2.095806,...,,,,,0.802273,,,,1.834923,1.273112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4906631,3030.0,0.201159,0,6.0,94.0,2.0,0.0,-1.716660,-0.156504,-0.899322,...,,,,,-3.360292,,,,-1.142484,-1.530206
4906632,3021.0,0.203238,0,6.0,94.0,2.0,0.0,-1.713513,-0.152766,-0.894109,...,,,,,-3.360292,,,,-1.137034,-1.525161
4906633,3015.0,0.202986,0,6.0,94.0,2.0,0.0,-1.710365,-0.148998,-0.888883,...,,,,,-3.360292,,,,-1.131580,-1.520112
4906634,3007.0,0.203301,0,6.0,94.0,2.0,0.0,-1.707217,-0.145184,-0.883637,...,,,,,-3.360292,,,,-1.126118,-1.515049
