In [1]:
# import dependencies
import datetime
import os
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump, load

from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Seed NumPy's global random number generator once, up front.
SEED = 23
np.random.seed(SEED)

In [2]:
# Read the training set from disk into a DataFrame.
train_csv_path = 'input/train_unscaled.csv'
df = pd.read_csv(train_csv_path)

In [3]:
# Sensor subset used as model input (selection based on Li2018).
sensor_ids = (2, 3, 4, 7, 8, 9, 11, 12, 13, 14, 15, 17, 20, 21)
sensor_sel = ['s%d' % sensor_id for sensor_id in sensor_ids]

# Pull features and target out of the DataFrame as plain NumPy arrays.
train_labels = df['RUL'].values
train_data = df[sensor_sel].values

In [4]:
# Scorers shared by the grid-search and cross-validation cells, keyed by metric name.
# Both are built with make_scorer's defaults, so they report the error as a plain
# positive number ("greater is better" is assumed by the scorer).
# NOTE(review): because of that default, maximising either of these picks the
# candidate with the LARGEST error - fine for reporting, unsuitable for selection.
mse_scorer = make_scorer(mean_squared_error)
rmse_scorer = make_scorer(mean_squared_error, squared=False)

scoring = {
    'mse': mse_scorer,
    'rmse': rmse_scorer,
}

In [5]:
# Building blocks of the pipeline: a scaler, a PCA and a regressor.
sc = StandardScaler()
pca = PCA(n_components=.95)
decisiontree = DecisionTreeRegressor()

# The pipeline chains three steps:
# 1) 'preprocessing' - scale the data
# 2) 'pca'           - transform the data with PCA
# 3) 'regressor'     - train a regressor on the result
# The parameter grids further down replace the 'regressor' step with their own estimator.
pipeline_steps = [
    ('preprocessing', sc),
    ('pca', pca),
    ('regressor', decisiontree),
]
pipe = Pipeline(steps=pipeline_steps)

In [6]:
# Candidate hyper-parameter values for the DecisionTree grid search.
max_depth = [4, 6, 8, 12, 16]
min_samples_leaf = [6, 10, 14]

# Keys use the '<step>__<parameter>' form so they address the pipeline's 'regressor' step.
tree_search_space = dict(
    regressor=[DecisionTreeRegressor()],
    regressor__max_depth=max_depth,
    regressor__min_samples_leaf=min_samples_leaf,
)
param_grid_tree = [tree_search_space]

In [7]:
# GridSearchCV refits on the candidate with the HIGHEST mean score of the `refit`
# metric. An error metric therefore has to be supplied negated; with a plain
# positive MSE scorer the search would select the worst candidate.
# The metric keys stay 'mse'/'rmse', so cv_results_ keeps its column names
# (mean_test_mse, mean_test_rmse, ...); their values and best_score_ are now negative.
clf_tree = GridSearchCV(
    pipe,
    param_grid_tree,
    scoring={'mse': 'neg_mean_squared_error',
             'rmse': 'neg_root_mean_squared_error'},
    cv=5,
    n_jobs=5,
    refit='mse',
)

In [8]:
# Run the DecisionTree grid search and report how long it took (wall clock).
start_time = time.time()
clf_tree.fit(train_data, train_labels)
gs_time = time.time() - start_time

elapsed = datetime.timedelta(seconds=round(gs_time, 0))
print("Elapsed Computing Time for Gridsearch (DecisionTree):", str(elapsed))

Elapsed Computing Time for Gridsearch (DecisionTree): 0:00:03


In [9]:
# Candidate hyper-parameter values for the neural-network grid search.
# Every architecture is listed exactly once: a repeated entry is evaluated as a
# separate candidate by GridSearchCV and only adds fits without adding information.
hidden_layer_sizes = [(8,8,8),(8,8),(8,16),(8,32),(16,8),(16,16),(16,32),(32,16),(32,32),
                      (32,32,32),(16,8,8),(8,16,8),(8,8,16),(16,16,8),(8,16,16),(8,16,32),(8,32,32),
                      (16,32,16),(16,32,32),(16,16,32),(500,500),(100,100,100),(100,100),(100,)]
activation = ['tanh','relu']
alpha = [0.0001, 0.001, 0.01, 0.1]
max_iter = [250]

# create a dictionary of all the parameter options
# (keys use the '<step>__<parameter>' form to address the pipeline's 'regressor' step)
param_grid_nn = [
    {'regressor': [MLPRegressor()],
     'regressor__hidden_layer_sizes': hidden_layer_sizes,
     'regressor__activation': activation,
     'regressor__alpha': alpha,
     'regressor__max_iter': max_iter,
    }]

In [10]:
# GridSearchCV refits on the candidate with the HIGHEST mean score of the `refit`
# metric. An error metric therefore has to be supplied negated; with a plain
# positive MSE scorer the search would select the worst candidate.
# The metric keys stay 'mse'/'rmse', so cv_results_ keeps its column names
# (mean_test_mse, mean_test_rmse, ...); their values and best_score_ are now negative.
clf_nn = GridSearchCV(
    pipe,
    param_grid_nn,
    scoring={'mse': 'neg_mean_squared_error',
             'rmse': 'neg_root_mean_squared_error'},
    cv=5,
    n_jobs=5,
    refit='mse',
)

In [None]:
# Run the neural-network grid search and report how long it took (wall clock).
start_time = time.time()
clf_nn.fit(train_data, train_labels)
gs_time = time.time() - start_time

elapsed = datetime.timedelta(seconds=round(gs_time, 0))
print("Elapsed Computing Time for Gridsearch(NN):", str(elapsed))

In [None]:
# Candidate hyper-parameter values for the SVM (SVR) grid search.
c = [0.001, 0.01, 0.1, 1, 10, 100]
gamma = [0.001, 0.01, 0.1, 1, 'scale']

# Keys use the '<step>__<parameter>' form so they address the pipeline's 'regressor' step.
svm_search_space = dict(
    regressor=[SVR()],
    regressor__C=c,
    regressor__gamma=gamma,
)
param_grid_svm = [svm_search_space]

In [None]:
# GridSearchCV refits on the candidate with the HIGHEST mean score of the `refit`
# metric. An error metric therefore has to be supplied negated; with a plain
# positive MSE scorer the search would select the worst candidate.
# The metric keys stay 'mse'/'rmse', so cv_results_ keeps its column names
# (mean_test_mse, mean_test_rmse, ...); their values and best_score_ are now negative.
clf_svm = GridSearchCV(
    pipe,
    param_grid_svm,
    scoring={'mse': 'neg_mean_squared_error',
             'rmse': 'neg_root_mean_squared_error'},
    cv=5,
    n_jobs=5,
    refit='mse',
)

In [None]:
# Run the SVM grid search and report how long it took (wall clock).
start_time = time.time()
clf_svm.fit(train_data, train_labels)
gs_time = time.time() - start_time

elapsed = datetime.timedelta(seconds=round(gs_time, 0))
print("Elapsed Computing Time for Gridsearch (SVM):", str(elapsed))

In [None]:
# view the best parameters of each grid search together with the RMSE of that candidate
def _best_cv_rmse(search):
    """Return the mean cross-validated RMSE of the candidate the search refitted on.

    `best_score_` holds the value of the refit metric ('mse'), not the RMSE, so the
    RMSE is looked up in `cv_results_` at `best_index_` instead.
    """
    # abs(): gives the error magnitude whether the 'rmse' scorer reports it as-is or negated
    return abs(search.cv_results_['mean_test_rmse'][search.best_index_])


print("DecisionTree")
print("RMSE Cross-Validation Score with best Estimator: {:.2f}".format(_best_cv_rmse(clf_tree)))
print('Best max_depth:', clf_tree.best_params_['regressor__max_depth'])
print('Best min_samples_leaf', clf_tree.best_params_['regressor__min_samples_leaf'])
print("------------------------------------------------------------")
print("SVM")
print("RMSE Cross-Validation Score with best Estimator: {:.2f}".format(_best_cv_rmse(clf_svm)))
print('Best C:', clf_svm.best_params_['regressor__C'])
print('Best gamma', clf_svm.best_params_['regressor__gamma'])
print("------------------------------------------------------------")
print("NN")
print("RMSE Cross-Validation Score with best Estimator: {:.2f}".format(_best_cv_rmse(clf_nn)))
print('Best hidden_layer_sizes:', clf_nn.best_params_['regressor__hidden_layer_sizes'])
print('Best activation', clf_nn.best_params_['regressor__activation'])
print('Best alpha', clf_nn.best_params_['regressor__alpha'])
print('Best max_iter', clf_nn.best_params_['regressor__max_iter'])

In [None]:
# 5-fold cross-validation of the tuned DecisionTree pipeline, with wall-clock timing.
start_time = time.time()
cv_result_tree = cross_validate(
    clf_tree.best_estimator_,
    train_data,
    train_labels,
    scoring=scoring,
    cv=5,
    n_jobs=5,
)
gs_time = time.time() - start_time

elapsed = datetime.timedelta(seconds=round(gs_time, 0))
print("Elapsed Computing Time for CV (DecisionTree):", str(elapsed))

In [None]:
# 5-fold cross-validation of the tuned SVM pipeline, with wall-clock timing.
start_time = time.time()
cv_result_svm = cross_validate(
    clf_svm.best_estimator_,
    train_data,
    train_labels,
    scoring=scoring,
    cv=5,
    n_jobs=5,
)
gs_time = time.time() - start_time

elapsed = datetime.timedelta(seconds=round(gs_time, 0))
print("Elapsed Computing Time for CV (SVM):", str(elapsed))

In [None]:
# 5-fold cross-validation of the tuned neural-network pipeline, with wall-clock timing.
start_time = time.time()
cv_result_nn = cross_validate(
    clf_nn.best_estimator_,
    train_data,
    train_labels,
    scoring=scoring,
    cv=5,
    n_jobs=5,
)
gs_time = time.time() - start_time

elapsed = datetime.timedelta(seconds=round(gs_time, 0))
print("Elapsed Computing Time for CV (NN):", str(elapsed))

In [None]:
# Dump the raw cross_validate result of each tuned model (tree, SVM, NN - in that order).
for cv_result in (cv_result_tree, cv_result_svm, cv_result_nn):
    print(cv_result)

In [None]:
# Summary of the tuned models' cross-validation: mean fit time per fold,
# mean RMSE over the folds, and the pipelines that produced the numbers.
tuned_results = (cv_result_tree, cv_result_svm, cv_result_nn)
fit_tree, fit_svm, fit_nn = (res['fit_time'].mean() for res in tuned_results)
rmse_tree, rmse_svm, rmse_nn = (res['test_rmse'].mean() for res in tuned_results)
rule = "\n------------------------------------------------------------"

print("- 5-Fold Cross-Validation Results -")
print(rule)
print("\n-Mean-Fit-Time per Fold (Sec)-")
print("\nDecisiontTree:{:.2f}".format(fit_tree))
print("\nSVM:{:.2f}".format(fit_svm))
print("\nNN:{:.2f}".format(fit_nn))
print(rule)
print("\n-Mean-RMSE-")
print("\nDecisiontTree:{:.2f}".format(rmse_tree))
print("\nSVM:{:.2f}".format(rmse_svm))
print("\nNN:{:.2f}".format(rmse_nn))
print(rule)
print("\n- Used Pipelines -\n")
for search in (clf_tree, clf_svm, clf_nn):
    print(search.best_estimator_)

In [None]:
# Baseline: 5-fold cross-validation of a default DecisionTreeRegressor on the
# raw training arrays (no pipeline), with wall-clock timing.
start_time = time.time()
cv_result_tree_vanilla = cross_validate(
    DecisionTreeRegressor(),
    train_data,
    train_labels,
    scoring=scoring,
    cv=5,
    n_jobs=5,
)
gs_time = time.time() - start_time

elapsed = datetime.timedelta(seconds=round(gs_time, 0))
print("Elapsed Computing Time for CV (DecisionTree):", str(elapsed))

In [None]:
# Baseline: 5-fold cross-validation of a default SVR on the raw training
# arrays (no pipeline), with wall-clock timing.
start_time = time.time()
cv_result_svm_vanilla = cross_validate(
    SVR(),
    train_data,
    train_labels,
    scoring=scoring,
    cv=5,
    n_jobs=5,
)
gs_time = time.time() - start_time

elapsed = datetime.timedelta(seconds=round(gs_time, 0))
print("Elapsed Computing Time for CV (SVM):", str(elapsed))

In [None]:
# Baseline: 5-fold cross-validation of a default MLPRegressor on the raw
# training arrays (no pipeline), with wall-clock timing.
start_time = time.time()
cv_result_nn_vanilla = cross_validate(
    MLPRegressor(),
    train_data,
    train_labels,
    scoring=scoring,
    cv=5,
    n_jobs=5,
)
gs_time = time.time() - start_time

elapsed = datetime.timedelta(seconds=round(gs_time, 0))
print("Elapsed Computing Time for CV (NN):", str(elapsed))

In [None]:
# Summary of the baseline ("vanilla") cross-validation: mean fit time per fold and
# mean RMSE over the folds for the default estimators.
print("- 5-Fold Cross-Validation Results -")
print("\n------------------------------------------------------------")
print("\n-Mean-Fit-Time per Fold (Sec)-")
print("\nDecisiontTree:{:.2f}".format(cv_result_tree_vanilla['fit_time'].mean()))
print("\nSVM:{:.2f}".format(cv_result_svm_vanilla['fit_time'].mean()))
print("\nNN:{:.2f}".format(cv_result_nn_vanilla['fit_time'].mean()))
print("\n------------------------------------------------------------")
print("\n-Mean-RMSE-")
print("\nDecisiontTree:{:.2f}".format(cv_result_tree_vanilla['test_rmse'].mean()))
print("\nSVM:{:.2f}".format(cv_result_svm_vanilla['test_rmse'].mean()))
print("\nNN:{:.2f}".format(cv_result_nn_vanilla['test_rmse'].mean()))
print("\n------------------------------------------------------------")
# The baseline results were produced by default-constructed estimators without any
# pipeline, so those are what gets listed here - not the tuned best_estimator_ pipelines.
print("\n- Used Estimators (default parameters, no pipeline) -\n")
print(DecisionTreeRegressor())
print(SVR())
print(MLPRegressor())

In [None]:
# Load the train/test files used for the final evaluation.
# NOTE(review): the '_ss' suffix presumably means the files are already standard-scaled - confirm.
df_train_eval = pd.read_csv('input/train_ss.csv')
df_test_eval = pd.read_csv('input/test_ss.csv')

# Targets of both sets.
ev_train_labels = df_train_eval['RUL'].values
ev_test_labels = df_test_eval['RUL'].values

# PCA is fitted on the training sensors only; the same fitted projection is
# then applied to the test sensors.
pca2 = PCA(.95)
ev_train_data = pca2.fit_transform(df_train_eval[sensor_sel].values)
ev_test_data = pca2.transform(df_test_eval[sensor_sel].values)

In [None]:
# train and save individual models
# Make sure the output directory exists before the (slow) fits run, so the
# results are not lost to a missing-folder error when they are written.
os.makedirs('./models', exist_ok=True)

# clone() returns an unfitted copy carrying the tuned hyper-parameters. Fitting the
# regressor object taken straight from best_estimator_ would re-fit it in place and
# leave the tuned pipeline with a final step trained on different features.
tree_eval = clone(clf_tree.best_estimator_.get_params()['regressor'])
tree_eval.fit(ev_train_data, ev_train_labels)
dump(tree_eval, './models/FD001_tree_pipe.joblib')

svm_eval = clone(clf_svm.best_estimator_.get_params()['regressor'])
svm_eval.fit(ev_train_data, ev_train_labels)
dump(svm_eval, './models/FD001_svm_pipe.joblib')

nn_eval = clone(clf_nn.best_estimator_.get_params()['regressor'])
nn_eval.fit(ev_train_data, ev_train_labels)
dump(nn_eval, './models/FD001_nn_pipe.joblib')

In [None]:
# Predict the test set with each model and print how long the predict call took
# (seconds, wall clock).
t0 = time.time()
tree_preds = tree_eval.predict(ev_test_data)
tree_predict_seconds = time.time() - t0
print("DecisionTree Prediction Time:", tree_predict_seconds)

t0 = time.time()
svm_preds = svm_eval.predict(ev_test_data)
svm_predict_seconds = time.time() - t0
print("SVM Prediction Time:", svm_predict_seconds)

t0 = time.time()
nn_preds = nn_eval.predict(ev_test_data)
nn_predict_seconds = time.time() - t0
print("NN Prediction Time:", nn_predict_seconds)

In [None]:
# Test-set RMSE of each model. Arguments are passed in the documented
# (y_true, y_pred) order: the value is the same either way for RMSE, but the
# order matters as soon as a non-symmetric metric is substituted.
print('Tree RMSE: %.2f'
      % mean_squared_error(ev_test_labels, tree_preds, squared=False))
print('SVM RMSE: %.2f'
      % mean_squared_error(ev_test_labels, svm_preds, squared=False))
print('NN RMSE: %.2f'
      % mean_squared_error(ev_test_labels, nn_preds, squared=False))