# Baseline models

In [113]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from keras import Sequential
from keras.layers import LSTM, Convolution1D, GlobalAveragePooling1D, Dense, Dropout
import keras_tuner as kt

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import export_graphviz

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import RandomizedSearchCV

# Environment report: list the physical devices TensorFlow can see (CPU/GPU)
# and the installed TensorFlow version.
visible_devices = tf.config.list_physical_devices()
print(f"TensorFlow has access to the following devices:\n{visible_devices}")

tf_version = tf.__version__
print(f"TensorFlow version: {tf_version}")

TensorFlow has access to the following devices:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow version: 2.9.2


In [114]:
# Every input file of this notebook is space-separated.
CSV_OPTIONS = {'sep': ' '}

# Scaled sensor readings (train / test).
train_data_df = pd.read_csv('../data_analysis/fd001/fd001-scaled_train.csv', **CSV_OPTIONS)
test_data_df = pd.read_csv('../data_analysis/fd001/fd001-scaled_test.csv', **CSV_OPTIONS)

# Labels stored alongside the scaled data (train / test).
train_labels_df = pd.read_csv('../data_analysis/fd001/fd001-training_labels.csv', **CSV_OPTIONS)
test_labels_df = pd.read_csv('../data_analysis/fd001/fd001-testing_labels.csv', **CSV_OPTIONS)

# RUL_FD001.txt has no header row; only its first column is kept and it is named 'RUL'.
test_labels_at_break_df = (
    pd.read_csv('../TED/CMAPSSData/RUL_FD001.txt', header=None, **CSV_OPTIONS)[[0]]
    .rename(columns={0: 'RUL'})
)

In [115]:
# Upper bound applied to every label value.
RUL_CAP = 125

# One row per test engine (groupby(...).last()), engines kept in order of first appearance.
test_at_break_df = test_data_df.groupby(['ID'], sort=False).last().reset_index()

# Clip both label frames at RUL_CAP.
train_labels_df, test_labels_df = (
    labels.clip(upper=RUL_CAP) for labels in (train_labels_df, test_labels_df)
)

### Window extraction

In [116]:
def _label_frame_with_id(data_df, labels_df):
    """Return a private copy of the labels with the engine 'ID' as the LAST column."""
    labels = labels_df.to_frame() if isinstance(labels_df, pd.Series) else labels_df
    # drop()/assign() both return new objects, so the caller's labels are never mutated.
    return labels.drop(columns='ID', errors='ignore').assign(ID=data_df['ID'])


def _sliding_windows(data_arr, labels_arr, window_length):
    """Return (windows, labels) for every full window of one engine.

    The label of a window is the first label column at the window's last row.
    """
    windows, targets = [], []
    for start in range(data_arr.shape[0] - window_length + 1):
        stop = start + window_length
        windows.append(data_arr[start:stop, :])
        targets.append(labels_arr[stop - 1, 0])
    return windows, targets


def get_windows(data_df, labels_df, window_length, mode = 'train',
                val_fraction = 0.2, random_state = None):
    """Cut per-engine time series into fixed-length windows.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain an 'ID' column identifying the engine of every row.
        ALL columns (including 'ID') end up in the returned windows.
    labels_df : pandas.DataFrame or pandas.Series
        Labels index-aligned with ``data_df``. The first non-'ID' column is
        used as the target. The object is not modified.
    window_length : int
        Number of consecutive rows per window.
    mode : str
        'train' -> every sliding window of every engine, with whole engines
        held out for validation. Any other value -> only the last window of
        each engine (test behaviour).
    val_fraction : float
        Fraction of engines held out for validation in 'train' mode.
    random_state : int or None
        Seed for the engine split. ``None`` uses NumPy's global random state,
        so ``np.random.seed`` still controls the split.

    Returns
    -------
    'train' mode: ``(tr_data, tr_labels, val_data, val_labels)``;
    otherwise: ``(data, labels)``. Data arrays are
    ``(n_windows, window_length, n_columns)`` and label arrays are 1-D
    ``(n_windows,)`` in both modes.

    Raises
    ------
    ValueError
        In test mode, when an engine has fewer than ``window_length`` rows
        (its window would be shorter than the others).
    """
    labels = _label_frame_with_id(data_df, labels_df)

    data_groupby = data_df.groupby('ID', sort=False)
    labels_groupby = labels.groupby('ID', sort=False)

    # Use the IDs that are actually present (ascending) instead of assuming 1..N.
    engine_ids = np.sort(data_df['ID'].unique())

    if mode == 'train':
        n_engines = len(engine_ids)
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        # replace=False: sampling with replacement can draw the same engine twice,
        # which silently shrinks the validation set below the requested fraction.
        val_positions = set(
            rng.choice(n_engines, size=int(val_fraction * n_engines), replace=False).tolist()
        )

        tr_data_windows, tr_label_windows = [], []
        val_data_windows, val_label_windows = [], []

        for position, engine_id in enumerate(engine_ids):
            engine_windows, engine_targets = _sliding_windows(
                data_groupby.get_group(engine_id).to_numpy(),
                labels_groupby.get_group(engine_id).to_numpy(),
                window_length,
            )
            if position in val_positions:
                val_data_windows.extend(engine_windows)
                val_label_windows.extend(engine_targets)
            else:
                tr_data_windows.extend(engine_windows)
                tr_label_windows.extend(engine_targets)

        return (np.array(tr_data_windows), np.array(tr_label_windows),
                np.array(val_data_windows), np.array(val_label_windows))

    data_windows = []
    label_windows = []
    for engine_id in engine_ids:
        data_arr = data_groupby.get_group(engine_id).to_numpy()
        labels_arr = labels_groupby.get_group(engine_id).to_numpy()
        if data_arr.shape[0] < window_length:
            raise ValueError(
                'Engine ' + str(engine_id) + ' has only ' + str(data_arr.shape[0])
                + ' rows, fewer than window_length=' + str(window_length)
            )
        # Only the most recent window of each engine is evaluated.
        data_windows.append(data_arr[-window_length:, :])
        label_windows.append(labels_arr[-1, 0])

    return np.array(data_windows), np.array(label_windows)

### Split the training set into training and validation sets

In [117]:
# Attach each row's engine ID to the label frame (index-aligned, in place)
# so the labels can be grouped per engine in the next cell.
train_labels_df['ID'] = train_data_df['ID']

In [118]:
# Hold out 20% of the engines (whole trajectories, not individual rows) for validation.
# Grouping by the plain column name (not a one-element list) keeps scalar
# get_group(...) lookups valid on newer pandas versions.
train_groupby_df = train_data_df.groupby('ID', sort = False)
train_labels_groupby_df = train_labels_df.groupby('ID', sort = False)
n_engines = len(train_groupby_df)

# replace=False: sampling with replacement can pick the same engine twice,
# which silently makes the validation set smaller than 20%.
val_indices = np.random.choice(n_engines, size = int(0.2 * n_engines), replace = False)
val_index_set = set(val_indices.tolist())

# From here on train_labels_df is the bare RUL Series; the groupby object
# created above was built from the frame that still carries the ID column.
train_labels_df = train_labels_df['RUL']

val_arr = []
train_set_arr = []
val_labels_arr = []
train_set_labels_arr = []

for i in range(n_engines):
    engine_id = i + 1  # engine IDs are assumed to run 1..n_engines
    engine_data = train_groupby_df.get_group(engine_id)
    engine_labels = train_labels_groupby_df.get_group(engine_id)['RUL']
    if i in val_index_set:
        val_arr.append(engine_data)
        val_labels_arr.append(engine_labels)
    else:
        train_set_arr.append(engine_data)
        train_set_labels_arr.append(engine_labels)

# One concat per split instead of growing the frames inside a loop (quadratic copying).
val_set_df = pd.concat(val_arr)
val_labels_df = pd.concat(val_labels_arr)
train_set_df = pd.concat(train_set_arr)
train_set_labels_df = pd.concat(train_set_labels_arr)

train_set = train_set_df.values
val_set = val_set_df.values

# Labels as column vectors of shape (n_rows, 1).
train_set_labels = np.expand_dims(train_set_labels_df.values, axis = 1)
val_labels = np.expand_dims(val_labels_df.values, axis = 1)
train_labels = np.expand_dims(train_labels_df.values, axis = 1)

In [119]:
# Feature columns used by the models: everything after the first two columns
# of the training frame (the displayed result lists only SensorMeasure* columns).
ms_used = train_data_df.columns[2:]
ms_used

Index(['SensorMeasure2', 'SensorMeasure3', 'SensorMeasure4', 'SensorMeasure7',
       'SensorMeasure8', 'SensorMeasure9', 'SensorMeasure11',
       'SensorMeasure12', 'SensorMeasure13', 'SensorMeasure14',
       'SensorMeasure15', 'SensorMeasure17', 'SensorMeasure20',
       'SensorMeasure21'],
      dtype='object')

## Random Forest Regressor

In [None]:
# rf_param_grid = {
#     'bootstrap': [True, False], 
#     'max_depth': [6, 7, 8, 9, 10], 
#     'min_samples_leaf': [30, 35, 40, 45, 50],
#     'max_features': ['log2', 'sqrt'], 
#     'n_estimators': [100 * x for x in range(5, 11)],
#     }

# rf = RandomForestRegressor(random_state=42)
# rand_search_rf = RandomizedSearchCV(estimator = rf, param_distributions = rf_param_grid, cv = 3, n_jobs = 1, verbose = 3, return_train_score=True)
# rand_search_rf.fit(train_data_df[ms_used].values, train_labels_df.values.squeeze())
# rf_results = pd.DataFrame(rand_search_rf.cv_results_)

# predictions_rf = rand_search_rf.predict(test_at_break_df[ms_used].values).round()
# print(rand_search_rf.best_params_)
# rmse = np.sqrt(mean_squared_error(test_labels_at_break_df.values, predictions_rf))
# print("RMSE: " + str(rmse)) 

In [None]:
# Display the feature subset (ms_used columns) that the models below are trained on.
train_data_df[ms_used]

In [None]:
# Random forest baseline: fit on every training row (ms_used features),
# score on the one-row-per-engine test frame.
rf_settings = dict(n_estimators=100, max_features="sqrt", random_state=42, max_depth=8, min_samples_leaf=50)
best_rf = RandomForestRegressor(**rf_settings)

rf_train_features = train_data_df[ms_used].values
rf_train_targets = train_labels_df.values.squeeze()
best_rf.fit(rf_train_features, rf_train_targets)

# Predictions are rounded before scoring.
rf_test_features = test_at_break_df[ms_used].values
predictions_rf = best_rf.predict(rf_test_features).round()

rmse = np.sqrt(mean_squared_error(test_labels_at_break_df.values, predictions_rf))
print("RMSE: " + str(rmse))

## Gradient Boosting Regressor

In [None]:
# gb_params = {
#     'learning_rate': [0.001, 0.05, 0.1, 0.2, 0.3],
#     'n_estimators': [100 * x for x in range(5, 11)],
#     'subsample': [0.75, 0.85, 0.95, 1],
#     'min_samples_leaf': [30, 35, 40, 45, 50],
# }

# gb = GradientBoostingRegressor()
# rand_search_gb = RandomizedSearchCV(estimator = gb, param_distributions = gb_params, cv = 3, n_jobs = 1, verbose = 3, return_train_score=True)
# rand_search_gb.fit(train_data_df.values[:,1:], train_labels_df['RUL'].values.squeeze())

# predictions_gb = rand_search_gb.predict(test_at_break_df.values[:,1:]).round()
# print(rand_search_gb.best_params_)
# rmse = np.sqrt(mean_squared_error(test_labels_at_break_df.values, predictions_gb))
# print("RMSE: " + str(rmse)) 

In [None]:
# Gradient boosting baseline. This cell previously instantiated a
# RandomForestRegressor, so the "Gradient Boosting" result was a second random forest.
# NOTE(review): the hyperparameters are carried over unchanged from the previous
# version of this cell; all of them are valid for GradientBoostingRegressor, but
# they should be re-tuned with the commented-out randomized search above.
best_gb = GradientBoostingRegressor(random_state=42, n_estimators = 900, min_samples_leaf = 40, max_features = 'sqrt', max_depth = 10)
best_gb.fit(train_data_df[ms_used].values, train_labels_df.values.squeeze())

# Predictions are rounded before scoring against the one-label-per-engine test labels.
predictions_gb = best_gb.predict(test_at_break_df[ms_used].values).round()
rmse = np.sqrt(mean_squared_error(test_labels_at_break_df.values, predictions_gb))
print("RMSE: " + str(rmse))

## Support Vector Regressor

In [None]:
# svmr_params = {
#     'kernel': ['rbf', 'linear', 'poly'],
#     'C': [1, 2, 5, 10],
#     'epsilon': [0.1 * i for i in range(1, 6)]
# }

# svmr = SVR()
# rand_search_svmr = RandomizedSearchCV(estimator = svmr, param_distributions = svmr_params, cv = 3, n_jobs = 1, verbose = 3, return_train_score=True)
# rand_search_svmr.fit(train_data_df[ms_used].values, train_labels_df.values.squeeze())

# predictions_svmr = rand_search_svmr.predict(test_at_break_df[ms_used].values).round()
# print(rand_search_svmr.best_params_)
# rmse = np.sqrt(mean_squared_error(test_labels_at_break_df.values, predictions_svmr))
# print("RMSE: " + str(rmse)) 

In [120]:
# Support vector regression baseline: fit on every training row (ms_used features),
# score on the one-row-per-engine test frame.
svr_settings = {'C': 1, 'epsilon': 0.0}
best_svmr = SVR(**svr_settings)

svr_train_features = train_data_df[ms_used].values
svr_train_targets = train_labels_df.values.squeeze()
best_svmr.fit(svr_train_features, svr_train_targets)

# Predictions are rounded before scoring.
svr_test_features = test_at_break_df[ms_used].values
predictions_svmr = best_svmr.predict(svr_test_features).round()

rmse = np.sqrt(mean_squared_error(test_labels_at_break_df.values, predictions_svmr))
print("RMSE: " + str(rmse))

RMSE: 43.80096110302132


## Multilayer Perceptron (MLP) Neural Network

In [34]:
# Windowed inputs for the MLP: sliding windows for train/validation,
# the last window of each engine for test.
window_length = 20
mlp_tr_data, mlp_tr_labels, mlp_val_data, mlp_val_labels = get_windows(train_data_df, train_labels_df, window_length, mode='train')
# Use window_length (not a second literal 20) so train and test windows cannot drift apart.
mlp_test_data, mlp_test_labels = get_windows(test_data_df, test_labels_df, window_length, mode = 'test')

# Flatten each window to a single feature vector: one row per window.
mlp_tr_data = mlp_tr_data.reshape(mlp_tr_data.shape[0], -1)
mlp_val_data = mlp_val_data.reshape(mlp_val_data.shape[0], -1)
mlp_test_data = mlp_test_data.reshape(mlp_test_data.shape[0], -1)

# Insert a new axis at position 1 in every label array.
mlp_tr_labels = np.expand_dims(mlp_tr_labels, axis=1)
mlp_val_labels = np.expand_dims(mlp_val_labels, axis=1)
mlp_test_labels = np.expand_dims(mlp_test_labels, axis=1)

In [35]:
def mlp_model_builder(hp):
    """Build and compile a three-hidden-layer MLP regressor for the tuner.

    ``hp`` supplies the search space via ``hp.Int`` / ``hp.Choice``:
    the width of each hidden layer ('units1'..'units3', 32 to 128 in steps of 32),
    one dropout rate shared by all dropout layers, and the Adam learning rate.
    Returns the compiled ``Sequential`` model (mean-squared-error loss).

    NOTE(review): the input width is read from ``train_set_df[ms_used]``, i.e.
    it matches per-row inputs, not the flattened windows produced by
    ``get_windows`` -- confirm which input the search is meant to use.
    """
    # Register the hyperparameters in a fixed order: units, dropout, learning rate.
    hidden_units = [
        hp.Int(name, min_value=32, max_value=128, step=32)
        for name in ('units1', 'units2', 'units3')
    ]
    dropout_rate = hp.Choice('dropout_rate', values=[0.1, 0.2, 0.3, 0.4])
    learning_rate = hp.Choice('learning_rate', values=[0.001, 0.005, 0.01, 0.05])

    layers = []
    for position, units in enumerate(hidden_units):
        # Only the first layer declares the input width.
        first_layer_options = (
            {'input_dim': train_set_df[ms_used].values.shape[1]} if position == 0 else {}
        )
        layers.append(Dense(units = units, activation = 'relu', **first_layer_options))
        layers.append(Dropout(dropout_rate))
    # Single ReLU output unit.
    layers.append(Dense(1, activation = 'relu'))

    mlp_model = Sequential(layers)
    mlp_model.compile(
        optimizer=keras.optimizers.Adam(learning_rate = learning_rate),
        loss=keras.losses.MeanSquaredError(),
    )
    return mlp_model

# Bayesian search over mlp_model_builder's hyperparameters, minimising validation loss.
# Tuner state lives under baseline_models/mlp; the saved output of this cell shows
# an existing project in that directory being reloaded.
mlp_tuner = kt.BayesianOptimization(
    hypermodel=mlp_model_builder,
    objective='val_loss',
    max_trials=9,
    directory='baseline_models',
    project_name='mlp',
)

INFO:tensorflow:Reloading Oracle from existing project baseline_models/mlp/oracle.json
Metal device set to: Apple M1 Max

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

INFO:tensorflow:Reloading Tuner from baseline_models/mlp/tuner0.json


2022-07-29 00:27:37.054452: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-07-29 00:27:37.054582: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [None]:
# Hyperparameter search and final fit on the flattened-window inputs.
# NOTE(review): mlp_model_builder sizes its input layer from train_set_df[ms_used];
# confirm that this matches the width of mlp_tr_data before running this cell.
mlp_window_fit_options = dict(
    epochs=100,
    validation_data=(mlp_val_data, mlp_val_labels),
    batch_size=256,
)

mlp_tuner.search(mlp_tr_data, mlp_tr_labels, **mlp_window_fit_options)
best_mlp_hps = mlp_tuner.get_best_hyperparameters(num_trials=1)[0]

# Rebuild a fresh model with the best hyperparameters and train it with the same settings.
best_mlp_model = mlp_tuner.hypermodel.build(best_mlp_hps)
mlp_history = best_mlp_model.fit(mlp_tr_data, mlp_tr_labels, **mlp_window_fit_options)

In [36]:
# Hyperparameter search and final fit on per-row inputs (ms_used columns only).
# This rebinds best_mlp_hps, best_mlp_model and mlp_history.
mlp_row_inputs = train_set_df[ms_used].values
mlp_row_targets = train_set_labels.squeeze()
mlp_row_fit_options = dict(
    epochs=100,
    validation_data=(val_set_df[ms_used].values, val_labels.squeeze()),
    batch_size=256,
)

mlp_tuner.search(mlp_row_inputs, mlp_row_targets, **mlp_row_fit_options)
best_mlp_hps = mlp_tuner.get_best_hyperparameters(num_trials=1)[0]

# Rebuild a fresh model with the best hyperparameters and train it with the same settings.
best_mlp_model = mlp_tuner.hypermodel.build(best_mlp_hps)
mlp_history = best_mlp_model.fit(mlp_row_inputs, mlp_row_targets, **mlp_row_fit_options)


Search: Running Trial #5

Value             |Best Value So Far |Hyperparameter
128               |128               |units1
32                |32                |units2
128               |32                |units3
0.1               |0.1               |dropout_rate
0.001             |0.001             |learning_rate

Epoch 1/100


2022-07-29 00:27:52.768364: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


ValueError: in user code:

    File "/Users/henry/.virtualenvs/tf-m1/lib/python3.9/site-packages/keras/engine/training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "/Users/henry/.virtualenvs/tf-m1/lib/python3.9/site-packages/keras/engine/training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/henry/.virtualenvs/tf-m1/lib/python3.9/site-packages/keras/engine/training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "/Users/henry/.virtualenvs/tf-m1/lib/python3.9/site-packages/keras/engine/training.py", line 889, in train_step
        y_pred = self(x, training=True)
    File "/Users/henry/.virtualenvs/tf-m1/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/henry/.virtualenvs/tf-m1/lib/python3.9/site-packages/keras/engine/input_spec.py", line 264, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 360), found shape=(None, 16)


In [None]:
# PLOT TRAIN AND VALIDATION LOSS
def plot_loss(fit_history):
    """Plot training and validation loss per epoch.

    ``fit_history`` must expose a ``history`` mapping with 'loss' and
    'val_loss' sequences (the object returned by ``model.fit`` in this
    notebook). Epochs are numbered from 1 on the x axis.
    """
    curves = fit_history.history
    plt.figure(figsize=(13,5))
    for key, legend_label in (('loss', 'train'), ('val_loss', 'validate')):
        values = curves[key]
        plt.plot(range(1, len(values) + 1), values, label=legend_label)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

# Loss curves of the final MLP fit.
plot_loss(mlp_history)

In [None]:
# TESTING FUNCTION
def testing(actual, pred, mode = 'Test'):
    """Print the RMSE and R2 score of ``pred`` against ``actual``.

    ``mode`` is the split name that prefixes the printed line
    (e.g. 'Train' or 'Test'). Nothing is returned.
    """
    rmse = np.sqrt(mean_squared_error(actual, pred))
    r2 = r2_score(actual, pred)
    print(mode + f' set RMSE: {rmse!s}, R2: {r2!s}')

In [None]:
# Score the MLP on the flattened-window inputs: training windows first.
train_full_pred = best_mlp_model.predict(mlp_tr_data)
testing(mlp_tr_labels, train_full_pred, 'Train')

# Then the test windows; no mode is passed, so testing() prints its default 'Test'.
test_at_break_pred = best_mlp_model.predict(mlp_test_data)
testing(mlp_test_labels, test_at_break_pred)

In [None]:
# Score the MLP on per-row inputs (ms_used columns): every training row first.
# NOTE(review): this rebinds train_full_pred / test_at_break_pred, which the
# windowed evaluation cell also assigns.
train_full_pred = best_mlp_model.predict(train_data_df[ms_used].values)
testing(train_labels_df.values.squeeze(), train_full_pred, 'Train')

# Then the one-row-per-engine test frame against the RUL_FD001 labels.
test_at_break_pred = best_mlp_model.predict(test_at_break_df[ms_used].values)
testing(test_labels_at_break_df.values.squeeze(), test_at_break_pred)

## Convolutional Neural Network

In [None]:
# Windowed inputs for the sequence models: sliding windows for train/validation,
# the last window of each engine for test. Windows keep their 3-D shape.
window_length = 20
cnn_tr_data, cnn_tr_labels, cnn_val_data, cnn_val_labels = get_windows(train_data_df, train_labels_df, window_length, mode='train')
# Use window_length (not a second literal 20) so train and test windows cannot drift apart.
cnn_test_data, cnn_test_labels = get_windows(test_data_df, test_labels_df, window_length, mode = 'test')

# Insert a new axis at position 1 in every label array.
cnn_tr_labels = np.expand_dims(cnn_tr_labels, axis=1)
cnn_val_labels = np.expand_dims(cnn_val_labels, axis=1)
cnn_test_labels = np.expand_dims(cnn_test_labels, axis=1)

### Model

In [None]:
# 1-D CNN regressor: three convolutions (kernel size 3), global average pooling
# over time, then a single ReLU output unit.
cnn_model = Sequential()
# NOTE(review): this first convolution has no activation, unlike the next two -- confirm intended.
cnn_model.add(Convolution1D(256, 3, input_shape = (window_length, cnn_tr_data.shape[2])))
cnn_model.add(Convolution1D(128, 3, activation = 'relu'))
cnn_model.add(Convolution1D(64, 3, activation = 'relu'))
cnn_model.add(GlobalAveragePooling1D(data_format = 'channels_last', keepdims = False))
cnn_model.add(Dense(1, activation = 'relu'))

# A single compile is enough: no training happens between the save and the load below.
cnn_model.compile(loss='mean_squared_error', optimizer='adam')

# Snapshot the initial weights under a CNN-specific file name. The previous name,
# 'simple_lstm_weights.h5', is the file the LSTM cell writes its own weights to.
cnn_model.save_weights('simple_cnn_weights.h5')
cnn_model.load_weights('simple_cnn_weights.h5')

history = cnn_model.fit(cnn_tr_data, cnn_tr_labels,
                        validation_data=(cnn_val_data, cnn_val_labels),
                        epochs=50,
                        batch_size=128)

In [None]:
# TESTING FUNCTION
def evaluate(actual, pred, mode = 'test'):
    """Print the RMSE and R2 score of ``pred`` against ``actual``.

    ``mode`` is the split name that prefixes the printed line
    (e.g. 'train' or 'test'). Nothing is returned.
    """
    rmse = np.sqrt(mean_squared_error(actual, pred))
    r2 = r2_score(actual, pred)
    print(mode + f' set RMSE: {rmse!s}, R2: {r2!s}')

In [None]:
# TESTING
train_cnn_pred = cnn_model.predict(cnn_tr_data)
evaluate(cnn_tr_labels, train_cnn_pred, 'train')

test_cnn_pred = cnn_model.predict(cnn_test_data)
evaluate(cnn_test_labels, test_cnn_pred)

## LSTM Neural Network

In [None]:
# Single-layer LSTM regressor trained on the same windows as the CNN (cnn_* arrays).
lstm_model = Sequential([
    LSTM(32, activation='tanh', input_shape=(window_length, cnn_tr_data.shape[2])),
    Dense(1),
])

lstm_compile_options = dict(loss='mean_squared_error', optimizer='adam')
lstm_weights_path = 'simple_lstm_weights.h5'

# Compile and write the untrained weights to disk ...
lstm_model.compile(**lstm_compile_options)
lstm_model.save_weights(lstm_weights_path)

# ... then compile again and read the same weights back before training.
lstm_model.compile(**lstm_compile_options)
lstm_model.load_weights(lstm_weights_path)

history = lstm_model.fit(
    cnn_tr_data,
    cnn_tr_labels,
    validation_data=(cnn_val_data, cnn_val_labels),
    epochs=50,
    batch_size=128,
)

In [None]:
# PLOT LOSS HISTORY
def plot_loss(fit_history):
    """Plot training and validation loss per epoch.

    ``fit_history`` must expose a ``history`` mapping with 'loss' and
    'val_loss' sequences (the object returned by ``model.fit`` in this
    notebook). Epochs are numbered from 1 on the x axis.

    NOTE(review): an identical ``plot_loss`` is defined earlier in this notebook.
    """
    curves = fit_history.history
    plt.figure(figsize=(13,5))
    for key, legend_label in (('loss', 'train'), ('val_loss', 'validate')):
        values = curves[key]
        plt.plot(range(1, len(values) + 1), values, label=legend_label)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

# Loss curves of the most recent fit stored in `history` (the LSTM fit in the cell above).
plot_loss(history)

In [None]:
# TESTING FUNCTION
def evaluate(actual, pred, mode = 'test'):
    """Print the RMSE and R2 score of ``pred`` against ``actual``.

    ``mode`` is the split name that prefixes the printed line
    (e.g. 'train' or 'test'). Nothing is returned.

    NOTE(review): an identical ``evaluate`` is defined earlier in this notebook.
    """
    rmse = np.sqrt(mean_squared_error(actual, pred))
    r2 = r2_score(actual, pred)
    print(mode + f' set RMSE: {rmse!s}, R2: {r2!s}')

In [None]:
# TESTING
# Score the LSTM on the training windows first.
# NOTE(review): the *_cnn_pred names hold LSTM predictions here and overwrite
# the CNN predictions assigned in the CNN evaluation cell.
train_cnn_pred = lstm_model.predict(cnn_tr_data)
evaluate(cnn_tr_labels, train_cnn_pred, 'train')

# Then on the test windows; no mode is passed, so evaluate() prints its default 'test'.
test_cnn_pred = lstm_model.predict(cnn_test_data)
evaluate(cnn_test_labels, test_cnn_pred)