In [1]:
''' Data Preparation, Label, Normalization'''

import pandas as pd
import numpy as np
import pickle
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score


def printScores(y_pred, y_true):
    """Print the confusion matrix, precision and recall for a set of predictions.

    Note the argument order: predictions first, ground truth second, which is
    the reverse of the scikit-learn metric functions called here.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_true : array-like
        Ground-truth labels.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print()
    print(confusion_matrix(y_true, y_pred))
    print( 'precision = ', precision_score(y_true, y_pred), ', recall = ', recall_score(y_true, y_pred))


# Input files, in order: training trajectories, test trajectories, and the
# ground-truth file used further down to compute the test-set RUL.
dataFiles = ['upload/train_FD001.txt', 'upload/test_FD001.txt', 'upload/RUL_FD001.txt']
# id, cycle, three operational settings, then sensors s1..s21 (26 names)
dataColumns = (['id', 'cycle']
               + ['setting%d' % k for k in range(1, 4)]
               + ['s%d' % k for k in range(1, 22)])


def _read_trajectories(path):
    """Read a space-separated trajectory file and name its 26 data columns.

    Parsing with sep=" " yields two surplus columns at positions 26 and 27
    (presumably caused by trailing spaces in the file -- verify against the
    data); they are dropped before `dataColumns` is attached.
    """
    raw = pd.read_csv(path, sep=" ", header=None)
    table = raw.drop(raw.columns[[26, 27]], axis=1)
    table.columns = dataColumns
    return table


# read data
train_df = _read_trajectories(dataFiles[0])
test_df = _read_trajectories(dataFiles[1])

# One value per row; the row position (1-based) is used as the engine id.
rul_raw = pd.read_csv(dataFiles[2], sep=" ", header=None)
rul_df = rul_raw.drop(rul_raw.columns[[1]], axis=1)
rul_df.columns = ['more']
rul_df['id'] = rul_df.index + 1

# train set, calculate RUL
# RUL = (last recorded cycle of the engine) - (current cycle)
train_df = train_df.sort_values(['id', 'cycle'])
rul = train_df.groupby('id')['cycle'].max().reset_index()
rul.columns = ['id', 'max']
train_df = train_df.merge(rul, on=['id'], how='left')
train_df['RUL'] = train_df['max'] - train_df['cycle']
train_df = train_df.drop('max', axis=1)

# test set, use ground truth to calculate RUL
# 'more' is added to the last recorded test cycle to obtain the engine's
# final cycle ('max'); RUL is then computed exactly as for the train set.
test_df = test_df.sort_values(['id', 'cycle'])
rul = test_df.groupby('id')['cycle'].max().reset_index()
rul.columns = ['id', 'max']
# Join on the engine id instead of adding the two frames by row position:
# a positional add silently pairs the wrong engines whenever the ids in the
# test file are not exactly 1..N in the same order as the ground-truth file.
rul_df = rul_df.merge(rul, on=['id'], how='left')
rul_df['max'] = rul_df['max'] + rul_df['more']
rul_df = rul_df.drop('more', axis=1)
test_df = test_df.merge(rul_df, on=['id'], how='left')
test_df['RUL'] = test_df['max'] - test_df['cycle']
test_df = test_df.drop('max', axis=1)
# A missing RUL would otherwise be labelled 0 without any warning later on.
assert test_df['RUL'].notna().all(), 'test engines without a ground-truth RUL entry'

# label data
# label1 is 1 when RUL is at most w1 cycles, otherwise 0
w1 = 30
train_df['label1'] = (train_df['RUL'] <= w1).astype(int)
test_df['label1'] = (test_df['RUL'] <= w1).astype(int)

# normalize train data
# 'cycle_norm' is a copy of 'cycle' that gets scaled; 'cycle' itself stays raw.
train_df['cycle_norm'] = train_df['cycle']
cols_normalize = train_df.columns.difference(['id', 'cycle', 'RUL', 'label1'])   # feature columns
min_max_scaler = preprocessing.MinMaxScaler()


def _scaled_frame(frame, values):
    """Wrap the scaled `values` in a DataFrame that shares `frame`'s index."""
    return pd.DataFrame(values, columns=cols_normalize, index=frame.index)


def _join_scaled(frame, scaled):
    """Join the columns of `frame` that were not scaled with the `scaled` ones."""
    untouched = frame.columns.difference(cols_normalize)
    return frame[untouched].join(scaled)


# The scaler is fitted on the training features only and then persisted.
norm_train_df = _scaled_frame(train_df, min_max_scaler.fit_transform(train_df[cols_normalize]))
with open('min_max_scaler.pickle', 'wb') as f:
    pickle.dump(min_max_scaler, f)
join_df = _join_scaled(train_df, norm_train_df)
train_df = join_df.reindex(columns=train_df.columns)   # back to the original column order

# normalize test data
# Only transform() is used here, i.e. the train-set minima/maxima are reused.
test_df['cycle_norm'] = test_df['cycle']
norm_test_df = _scaled_frame(test_df, min_max_scaler.transform(test_df[cols_normalize]))
test_join_df = _join_scaled(test_df, norm_test_df)
test_df = test_join_df.reindex(columns=test_df.columns).reset_index(drop=True)

# describe data and use only some columns
def describe():
    """Print shapes, label balance and zero-variance columns of the prepared data.

    Reads the module-level `train_df` and `test_df`; nothing is returned.
    A column is reported as unchanging when its standard deviation in
    `train_df.describe()` is exactly 0.
    """
    for title, frame in (('train set', train_df), ('test set', test_df)):
        print(title, frame.shape)
    print('check distribution \n', train_df['label1'].value_counts())
    std_by_col = train_df.describe().loc['std']
    unchanging_cols = std_by_col.index[std_by_col == 0].tolist()
    print('unchanging cols', unchanging_cols)
    # recorded result for this data set:
    # ['setting3', 's1', 's5', 's10', 's16', 's18', 's19']

print('Describe data:')
describe()

# Candidate features minus the columns that describe() reports as unchanging.
feature_cols = [
    name
    for name in ['cycle_norm', 'setting1', 'setting2', 'setting3'] + ['s%d' % k for k in range(1, 22)]
    if name not in ('setting3', 's1', 's5', 's10', 's16', 's18', 's19')
]

# identifiers and targets first, then the retained features
cols = ['id', 'cycle', 'RUL', 'label1'] + feature_cols
train_df = train_df.loc[:, cols]
test_df = test_df.loc[:, cols]

Describe data:
train set (20631, 29)
test set (13096, 29)
check distribution 
 0    17531
1     3100
Name: label1, dtype: int64
unchanging cols ['setting3', 's1', 's5', 's10', 's16', 's18', 's19']


In [2]:
# preview the first five rows of the prepared training frame
train_df.head()

Unnamed: 0,id,cycle,RUL,label1,cycle_norm,setting1,setting2,s2,s3,s4,...,s8,s9,s11,s12,s13,s14,s15,s17,s20,s21
0,1,1,191,0,0.0,0.45977,0.166667,0.183735,0.406802,0.309757,...,0.242424,0.109755,0.369048,0.633262,0.205882,0.199608,0.363986,0.333333,0.713178,0.724662
1,1,2,190,0,0.00277,0.609195,0.25,0.283133,0.453019,0.352633,...,0.212121,0.100242,0.380952,0.765458,0.279412,0.162813,0.411312,0.333333,0.666667,0.731014
2,1,3,189,0,0.00554,0.252874,0.75,0.343373,0.369523,0.370527,...,0.272727,0.140043,0.25,0.795309,0.220588,0.171793,0.357445,0.166667,0.627907,0.621375
3,1,4,188,0,0.00831,0.54023,0.5,0.343373,0.256159,0.331195,...,0.318182,0.124518,0.166667,0.889126,0.294118,0.174889,0.166603,0.333333,0.573643,0.662386
4,1,5,187,0,0.01108,0.390805,0.333333,0.349398,0.257467,0.404625,...,0.242424,0.14996,0.255952,0.746269,0.235294,0.174734,0.402078,0.416667,0.589147,0.704502


In [3]:
# preview the first five rows of the prepared test frame
test_df.head()

Unnamed: 0,id,cycle,RUL,label1,cycle_norm,setting1,setting2,s2,s3,s4,...,s8,s9,s11,s12,s13,s14,s15,s17,s20,s21
0,1,1,142,0,0.0,0.632184,0.75,0.545181,0.310661,0.269413,...,0.212121,0.127614,0.208333,0.646055,0.220588,0.13216,0.308965,0.333333,0.55814,0.661834
1,1,2,141,0,0.00277,0.344828,0.25,0.150602,0.379551,0.222316,...,0.166667,0.146684,0.386905,0.739872,0.264706,0.204768,0.213159,0.416667,0.682171,0.686827
2,1,3,140,0,0.00554,0.517241,0.583333,0.376506,0.346632,0.322248,...,0.227273,0.158081,0.386905,0.69936,0.220588,0.15564,0.458638,0.416667,0.728682,0.721348
3,1,4,139,0,0.00831,0.741379,0.5,0.370482,0.285154,0.408001,...,0.19697,0.105717,0.255952,0.573561,0.25,0.17009,0.257022,0.25,0.666667,0.66211
4,1,5,138,0,0.01108,0.58046,0.5,0.391566,0.352082,0.332039,...,0.166667,0.102396,0.27381,0.73774,0.220588,0.152751,0.300885,0.166667,0.658915,0.716377


In [4]:
''' Traditional feature engineering '''

import pandas as pd
import numpy as np

lag_window = 5
# Rows removed from the start of every training engine ("cut head").
# NOTE(review): the extra 40 on top of the window length is not explained
# anywhere in the notebook -- confirm the intent before changing it.
head_skip = lag_window + 40
# Column layout of train_df / test_df: id, cycle, RUL, label1, then features.
label_col = 3
first_feature_col = 4
lag_cols = [s for s in feature_cols if s not in ['cycle_norm','setting1','setting2','setting3']]


def _add_lag_features(frame):
    """Return `frame` plus rolling mean ('MA<col>') and std ('STD<col>') columns.

    The rolling window of `lag_window` rows is evaluated inside each engine
    ('id'), so a window never contains rows of the preceding engine. The
    first `lag_window - 1` rows of every engine therefore hold NaN in the new
    columns.
    """
    per_engine = frame.groupby('id', sort=False)[lag_cols]
    rolling_mean = per_engine.transform(lambda g: g.rolling(window=lag_window).mean())
    rolling_std = per_engine.transform(lambda g: g.rolling(window=lag_window).std())
    rolling_mean.columns = ['MA' + s for s in lag_cols]
    rolling_std.columns = ['STD' + s for s in lag_cols]
    return pd.concat([frame, rolling_mean, rolling_std], axis=1, join='inner')


# build lagging features - train data set
df_train = _add_lag_features(train_df)
# the positional slicing below relies on this layout
assert df_train.columns[label_col] == 'label1'

# cut head by id, due to lagging transformation
train_array = np.concatenate(
    [engine.values[head_skip:, :] for _, engine in df_train.groupby('id', sort=False)]
).astype(np.float32)

# build train data matrix
train_X = train_array[:, first_feature_col:]
train_y = train_array[:, label_col]

# split train data set into train and validation sub sets
# The last 20% of the rows become the validation set. An explicit split index
# is used because slicing with -val_count would return every row when
# val_count is 0.
total_count = train_array.shape[0]
val_count = int(total_count * 0.2)
split_at = total_count - val_count

val_X, val_y = train_X[split_at:, :], train_y[split_at:]
train_X, train_y = train_X[:split_at, :], train_y[:split_at]

# build test data matrix
df_test = _add_lag_features(test_df)
# select last row of every engine
# (an engine with fewer than lag_window rows yields NaN lag features here)
test_array = np.concatenate(
    [engine.values[-1:, :] for _, engine in df_test.groupby('id', sort=False)]
).astype(np.float32)
# build the matrix
test_X = test_array[:, first_feature_col:]
test_y = test_array[:, label_col]

In [5]:
# preview the training frame with the lag features appended
df_train.head()

Unnamed: 0,id,cycle,RUL,label1,cycle_norm,setting1,setting2,s2,s3,s4,...,STDs8,STDs9,STDs11,STDs12,STDs13,STDs14,STDs15,STDs17,STDs20,STDs21
0,1,1,191,0,0.0,0.45977,0.166667,0.183735,0.406802,0.309757,...,,,,,,,,,,
1,1,2,190,0,0.00277,0.609195,0.25,0.283133,0.453019,0.352633,...,,,,,,,,,,
2,1,3,189,0,0.00554,0.252874,0.75,0.343373,0.369523,0.370527,...,,,,,,,,,,
3,1,4,188,0,0.00831,0.54023,0.5,0.343373,0.256159,0.331195,...,,,,,,,,,,
4,1,5,187,0,0.01108,0.390805,0.333333,0.349398,0.257467,0.404625,...,0.040087,0.020584,0.089918,0.092233,0.038065,0.013682,0.099856,0.091287,0.05707,0.046256


In [6]:
# first five rows of the training feature matrix
train_X[:5, :]

array([[0.12465374, 0.51724136, 0.5833333 , 0.36746988, 0.37606278,
        0.32106686, 1.        , 0.6296296 , 0.24242425, 0.17342727,
        0.20833333, 0.7100213 , 0.29411766, 0.14846733, 0.28010774,
        0.41666666, 0.6124031 , 0.66321456, 0.31325302, 0.34283847,
        0.33021608, 1.        , 0.6476651 , 0.23939393, 0.1560262 ,
        0.31666666, 0.74925375, 0.25588235, 0.15158427, 0.3260485 ,
        0.36666667, 0.63100773, 0.68017125, 0.07225777, 0.07573704,
        0.04291154, 0.        , 0.02937209, 0.03455077, 0.02504378,
        0.07160197, 0.07947402, 0.05157552, 0.00922145, 0.0319951 ,
        0.04564355, 0.07954712, 0.07981122],
       [0.12742382, 0.5       , 0.9166667 , 0.30120483, 0.20231088,
        0.34841323, 1.        , 0.7294686 , 0.24242425, 0.13600467,
        0.22023809, 0.67164177, 0.25      , 0.1355661 , 0.27856869,
        0.33333334, 0.65891474, 0.81897265, 0.3186747 , 0.32391542,
        0.33531398, 1.        , 0.66183573, 0.23333333, 0.15152113,
   

In [7]:
'''
Traditional models
'''
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression


def _fit_score_save(model, pickle_path):
    """Fit `model`, print its scores on every split, pickle it, and return it.

    The model is fitted on (train_X, train_y); printScores is then called for
    the train, validation and test splits in that order, and the fitted model
    is written to `pickle_path`.
    """
    model.fit(train_X, train_y)
    for split_X, split_y in ((train_X, train_y), (val_X, val_y), (test_X, test_y)):
        printScores(model.predict(split_X), split_y)
    with open(pickle_path, 'wb') as f:
        pickle.dump(model, f)
    return model


# GBM
model_GBM = _fit_score_save(
    GradientBoostingClassifier(random_state=42, verbose=1),
    'model_GBM.pickle')

# Logistic Regression
# solver='liblinear' is spelled out because the L1 penalty is rejected by
# 'lbfgs', which is the default solver from scikit-learn 0.22 onwards;
# liblinear was the default before that. random_state pins the data
# shuffling liblinear performs so repeated runs give the same model.
model_Linear = _fit_score_save(
    LogisticRegression(C=1, penalty='l1', solver='liblinear', tol=0.0001,
                       max_iter=1000, verbose=1, random_state=42),
    'model_Linear.pickle')

      Iter       Train Loss   Remaining Time 
         1           0.8488            3.77s
         2           0.7525            3.74s
         3           0.6756            3.68s
         4           0.6147            3.66s
         5           0.5660            3.61s
         6           0.5228            3.58s
         7           0.4870            3.56s
         8           0.4550            3.52s
         9           0.4281            3.48s
        10           0.4041            3.46s
        20           0.2710            3.09s
        30           0.2189            2.68s
        40           0.1946            2.28s
        50           0.1809            1.87s
        60           0.1695            1.49s
        70           0.1603            1.11s
        80           0.1536            0.74s
        90           0.1467            0.37s
       100           0.1411            0.00s

[[10256   107]
 [  226  2316]]
precision =  0.9558398679323153 , recall =  0.911093627065303

[[26