In [10]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm, tree



Using TensorFlow backend.


In [2]:
# Load the feature-selected FD001 train/test sets and the RUL targets for the test set.
# All three files are read with the same parser options.
CSV_OPTIONS = dict(sep=',', header=0, engine='python')

train = pd.read_csv('../../data/feature_selected_train_FD001.csv', **CSV_OPTIONS)
test = pd.read_csv('../../data/feature_selected_test_FD001.csv', **CSV_OPTIONS)
y_test = pd.read_csv('../../data/RUL_FD001.csv', **CSV_OPTIONS)

# Column groups are positional: the first two columns are the index columns,
# the third is the setting column, and every remaining column is treated as a sensor.
all_columns = train.columns
index_names = all_columns[[0, 1]]
setting_names = all_columns[[2]]
# Index.drop keeps the remaining labels in their original file order.
sensor_names = all_columns.drop(index_names.append(setting_names))
scale_columns = sensor_names
# Scaled sensor columns plus the second index column (union may reorder the labels).
keep_columns = scale_columns.union(index_names[[1]])

In [3]:
def scale_data(train, test, columns):
    """Min-max scale the selected columns of a train/test pair.

    The scaler is fitted on ``train[columns]`` only and the same fitted
    transform is then applied to ``test[columns]``, so no statistics from the
    test frame are used for fitting.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Input frames; neither is modified (both are copied first).
    columns : list-like of column labels
        Columns to scale. They must exist in both frames.

    Returns
    -------
    (scaled_train, scaled_test) : tuple of pandas.DataFrame
        Copies of the inputs with ``columns`` replaced by their scaled values.
        All other columns and the original row index are left untouched.
    """
    sc = MinMaxScaler()
    scaled_train = train.copy()
    scaled_test = test.copy()
    # Assign the scaled ndarray directly. Wrapping it in a fresh DataFrame would
    # give it a 0..n-1 index, and the assignment would then align on index
    # labels -- producing NaNs or mis-assigned rows whenever the input frame
    # does not carry a default RangeIndex (filtered, shuffled, re-indexed data).
    scaled_train[columns] = sc.fit_transform(scaled_train[columns])
    scaled_test[columns] = sc.transform(scaled_test[columns])
    return scaled_train, scaled_test

def add_remaining_useful_life(df):
    """Return a copy of ``df`` with an added ``RUL`` column.

    For every row, ``RUL`` is the largest ``time_cycles`` value observed for
    that row's ``unit_no`` minus the row's own ``time_cycles``. The input
    frame must contain the columns ``unit_no`` and ``time_cycles`` and is not
    modified.
    """
    # Largest cycle count per unit, as a one-column frame indexed by unit_no.
    last_cycle_per_unit = (
        df.groupby(by="unit_no")["time_cycles"]
        .max()
        .to_frame(name='max_cycle')
    )

    # Attach each unit's maximum to all of its rows.
    with_max = df.merge(last_cycle_per_unit, left_on='unit_no', right_index=True)
    with_max["RUL"] = with_max["max_cycle"] - with_max["time_cycles"]

    # The helper column is only needed to compute RUL.
    return with_max.drop("max_cycle", axis=1)
def plot_loss(history):
    """Plot training and validation loss per epoch.

    ``history`` must expose a ``history`` mapping with the keys ``'loss'`` and
    ``'val_loss'``, each holding one value per epoch. Epochs are numbered
    starting at 1 on the x axis.
    """
    curves = history.history
    plt.figure(figsize=(13,5))
    for key, legend_label in (('loss', 'train'), ('val_loss', 'validate')):
        values = curves[key]
        epochs = range(1, len(values) + 1)
        plt.plot(epochs, values, label=legend_label)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

def evaluate(y_true, y_hat, label='test'):
    """Print and return the RMSE and R2 score of a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_hat : array-like
        Predicted target values.
    label : str, default 'test'
        Name of the data split, used only in the printed message.

    Returns
    -------
    (rmse, r2) : tuple
        Root of the mean squared error and the R2 score, in that order.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_hat))
    r2 = r2_score(y_true, y_hat)
    print('{} set RMSE:{}, R2:{}'.format(label, rmse, r2))
    return rmse, r2
    
def plot_predictions(y_true, y_predicted):
    """Draw true and predicted values as two lines on one figure.

    Both inputs are passed straight to ``plt.plot`` and are therefore plotted
    against their own index / position.
    """
    plt.figure(figsize=(13,5))
    for series, legend_label in ((y_true, 'true'), (y_predicted, 'predicted')):
        plt.plot(series, label=legend_label)
    plt.xlabel('Predictions')
    plt.ylabel('Predicted Values')
    plt.legend()
    plt.show()

In [4]:
# Training set: derive RUL for every row, then split it off as the target.
train_with_rul = add_remaining_useful_life(train)
y_train = train_with_rul['RUL']
x_train = train_with_rul.drop(columns='RUL')

# Test set: collapse each unit to a single row holding its last recorded values.
# presumably this lines up one-to-one with the rows of y_test -- confirm.
last_per_unit = test.groupby(by="unit_no").last()
x_test = last_per_unit.reset_index()

# Fit the scaler on the training features and apply it to both frames.
x_train_scaled, x_test_scaled = scale_data(x_train, x_test, scale_columns)

In [8]:
# Candidate regressors to compare, as (display name, unfitted estimator) pairs.
models = [
          ('Linear Regression', LinearRegression()),  
          ('SVM', svm.SVR(kernel = 'poly', gamma = 0.0001, C = 10, epsilon= 0.5)),
          ('Random Forest', RandomForestRegressor(n_estimators=100, max_features="sqrt", random_state=0,
                           min_samples_leaf=11, min_impurity_decrease=0.0, 
                           max_depth=15, ccp_alpha=0.125))
         ]
results = []  # one [rmse_train, r2_train, rmse_test, r2_test] row per model
names = []  # model display names, in the same order as `results`
dfs = []  # NOTE(review): never used in this cell -- remove if no later cell needs it
# NOTE(review): each model is fitted on every column of x_train_scaled (index and
# setting columns included); `keep_columns` from the loading cell is never
# applied here -- confirm that this is intended.
for name, model in models:
    
    print(name)
    # Fit on the training data and score the fit on that same data.
    clf = model.fit(x_train_scaled, y_train)
    y_hat_train = clf.predict(x_train_scaled)
    # The second value returned by evaluate() is the R2 score, despite the name.
    rmse_train, variance_train = evaluate(y_train, y_hat_train, 'train')
    
    # Score on the held-out test rows; evaluate() labels this split 'test' by default.
    y_hat_test = clf.predict(x_test_scaled)
    rmse_test, variance_test = evaluate(y_test, y_hat_test)
    
    # Order must match the column names passed to pd.DataFrame below.
    metrics = [rmse_train, variance_train, rmse_test, variance_test]
    results.append(metrics)
    
    names.append(name)
    
# One row per model; the model name is appended as the last column.
result_df = pd.DataFrame(results, columns = ['RMSE Train', 'R2 Train', 'RMSE Test', 'R2 Test'])
result_df['Model'] = names

Linear Regression
train set RMSE:39.59310521746384, R2:0.669583736222076
test set RMSE:31.588573847431114, R2:0.4221694575464129
SVM
train set RMSE:38.951113872679606, R2:0.6802120822951863
test set RMSE:27.744628683468143, R2:0.5542428792634235
Random Forest
train set RMSE:25.44657812868412, R2:0.8635162119566338
test set RMSE:25.381584304690122, R2:0.6269406666165807


In [9]:
# Display the per-model metrics table (head() shows at most the first five rows).
result_df.head()

Unnamed: 0,RMSE Train,R2 Train,RMSE Test,R2 Test,Model
0,39.593105,0.669584,31.588574,0.422169,Linear Regression
1,38.951114,0.680212,27.744629,0.554243,SVM
2,25.446578,0.863516,25.381584,0.626941,Random Forest
