In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATASETS
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE CELL.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote
from urllib.error import HTTPError
from zipfile import ZipFile

CHUNK_SIZE = 40960 
DATASET_MAPPING = 'cmapass:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F29902%2F38112%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20201002%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20201002T065525Z%26X-Goog-Expires%3D259199%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Da75e303609bec5a34dd829bad0418a195ddac6faf3ced7e5a8ffad2aef791ed0b20b5909f9f9d9a0f3acd17175568cd7fa2fadd87e144a8d451c60417b997fe16d8de2547cbf72818ab0fd9bdeec2c3d919af94a8a8b2d9c9284e8142535df8a3866e87c0852f5e0bc436fd13c1b8d68fe6b62f638cb201439527953fdd116234f2402e237374a860cae3384c9f5176f517d47c07a7168a0c56d12b6ccd906fc8c0fbcc419d0e1411d53b1ae916349dd827f6a660958989a02d4568c133f99e72c74d89f4bff0402a9b8d3dccf374762f5606b273f99bacdf41ebb727f95407b20875f71ba3d2a05ed112ef26dd8720f42bf48e0d2c6ed1ec1397f44c4599d9c'
KAGGLE_INPUT_PATH='/home/kaggle/input'
KAGGLE_INPUT_SYMLINK='/kaggle'

!mkdir -p -- $KAGGLE_INPUT_PATH
!chmod 777 $KAGGLE_INPUT_PATH
!ln -sfn $KAGGLE_INPUT_PATH ../
!mkdir -p -- $KAGGLE_INPUT_SYMLINK
!ln -sfn $KAGGLE_INPUT_PATH $KAGGLE_INPUT_SYMLINK

for dataset_mapping in DATASET_MAPPING.split(','):
    directory, download_url_encoded = dataset_mapping.split(':')
    download_url = unquote(download_url_encoded)
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as zipfileres, NamedTemporaryFile() as tfile:
            total_length = zipfileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes zipped')
            dl = 0
            data = zipfileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = zipfileres.read(CHUNK_SIZE)
            print(f'\nUnzipping {directory}')
            with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue
print('Dataset import complete.')


Failed to load (likely expired) https://storage.googleapis.com/kaggle-data-sets/29902/38112/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20201002%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20201002T065525Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=a75e303609bec5a34dd829bad0418a195ddac6faf3ced7e5a8ffad2aef791ed0b20b5909f9f9d9a0f3acd17175568cd7fa2fadd87e144a8d451c60417b997fe16d8de2547cbf72818ab0fd9bdeec2c3d919af94a8a8b2d9c9284e8142535df8a3866e87c0852f5e0bc436fd13c1b8d68fe6b62f638cb201439527953fdd116234f2402e237374a860cae3384c9f5176f517d47c07a7168a0c56d12b6ccd906fc8c0fbcc419d0e1411d53b1ae916349dd827f6a660958989a02d4568c133f99e72c74d89f4bff0402a9b8d3dccf374762f5606b273f99bacdf41ebb727f95407b20875f71ba3d2a05ed112ef26dd8720f42bf48e0d2c6ed1ec1397f44c4599d9c to path /home/kaggle/input/cmapass
Dataset import complete.


In [2]:
# load necessary packages and view available data
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import keras
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline
sns.set()
print(os.listdir("../input/cmapass"))
# Setting seed for reproducability
np.random.seed(1234)  
PYTHONHASHSEED = 0

['test_FD003.txt', 'RUL_FD001.txt', 'RUL_FD002.txt', 'RUL_FD003.txt', 'train_FD003.txt', 'train_FD004.txt', 'test_FD004.txt', 'test_FD002.txt', 'train_FD001.txt', 'train_FD002.txt', 'RUL_FD004.txt', 'test_FD001.txt']


In [3]:
# the files did not contain headers. Here we create labels based on documentation
target_var = ['Target_Remaining_Useful_Life']
index_columns_names =  ["UnitNumber","Cycle"]
op_settings_columns = ["Op_Setting_"+str(i) for i in range(1,4)]
sensor_columns =["Sensor_"+str(i) for i in range(1,22)]
column_names = index_columns_names + op_settings_columns + sensor_columns
print(column_names)

['UnitNumber', 'Cycle', 'Op_Setting_1', 'Op_Setting_2', 'Op_Setting_3', 'Sensor_1', 'Sensor_2', 'Sensor_3', 'Sensor_4', 'Sensor_5', 'Sensor_6', 'Sensor_7', 'Sensor_8', 'Sensor_9', 'Sensor_10', 'Sensor_11', 'Sensor_12', 'Sensor_13', 'Sensor_14', 'Sensor_15', 'Sensor_16', 'Sensor_17', 'Sensor_18', 'Sensor_19', 'Sensor_20', 'Sensor_21']


In [4]:
# load data
train= pd.read_csv('../input/cmapass/train_FD001.txt', sep=" ", header=None)
test = pd.read_csv('../input/cmapass/test_FD001.txt', sep=" ", header=None)
print("train shape: ", train.shape, "test shape: ", test.shape)
# drop pesky NULL columns
train.drop(train.columns[[26, 27]], axis=1, inplace=True)
test.drop(test.columns[[26, 27]], axis=1, inplace=True)
# name columns
train.columns = column_names
test.columns = column_names
# train[train['UnitNumber'] == 1].head(5)
test[test['UnitNumber'] == 1].head(5)

train shape:  (20631, 28) test shape:  (13096, 28)


Unnamed: 0,UnitNumber,Cycle,Op_Setting_1,Op_Setting_2,Op_Setting_3,Sensor_1,Sensor_2,Sensor_3,Sensor_4,Sensor_5,...,Sensor_12,Sensor_13,Sensor_14,Sensor_15,Sensor_16,Sensor_17,Sensor_18,Sensor_19,Sensor_20,Sensor_21
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,521.97,2388.03,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166
3,1,4,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,...,521.38,2388.05,8132.9,8.3917,0.03,391,2388,100.0,39.0,23.3737
4,1,5,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,...,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.413


In [5]:
# this section calculates Remaining Useful Life (RUL) in T-minus notation for the training data
# find the last cycle per unit number
max_cycle = train.groupby('UnitNumber')['Cycle'].max().reset_index()
max_cycle.columns = ['UnitNumber', 'MaxOfCycle']
# merge the max cycle back into the original frame
train_merged = train.merge(max_cycle, left_on='UnitNumber', right_on='UnitNumber', how='inner')
# calculate RUL for each row
Target_Remaining_Useful_Life = train_merged["MaxOfCycle"] - train_merged["Cycle"]
train_with_target = train_merged["Target_Remaining_Useful_Life"] = Target_Remaining_Useful_Life
# remove unnecessary column
train_with_target = train_merged.drop("MaxOfCycle", axis=1)
train_with_target[train_with_target['UnitNumber'] == 1].head(5)

Unnamed: 0,UnitNumber,Cycle,Op_Setting_1,Op_Setting_2,Op_Setting_3,Sensor_1,Sensor_2,Sensor_3,Sensor_4,Sensor_5,...,Sensor_13,Sensor_14,Sensor_15,Sensor_16,Sensor_17,Sensor_18,Sensor_19,Sensor_20,Sensor_21,Target_Remaining_Useful_Life
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


In [7]:
# now it's time to clear out target leakage
print(train_with_target.shape)
leakage_to_drop = ['UnitNumber', 'Cycle', 'Op_Setting_1', 'Op_Setting_2', 'Op_Setting_3']  
train_no_leakage = train_with_target.drop(leakage_to_drop, axis = 1)
test_no_leakage = test.drop(leakage_to_drop, axis = 1)
print(train_no_leakage.shape)

#minmax normalization in [0, 1]
scaler = MinMaxScaler()
train_norm = scaler.fit_transform(train_no_leakage)
test_norm = scaler.fit_transform(test_no_leakage)

# set up features and target variable 
y = train_no_leakage['Target_Remaining_Useful_Life']
X = train_no_leakage.drop(['Target_Remaining_Useful_Life'], axis = 1)

(20631, 27)
(20631, 22)


In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

batch = 32
lstm_input_layer = 21
lstm_hidden_layer = 50
lstm_output_layer = 1
n_epoches = 50

In [59]:
#slice along window
window_size = 12
def prepare_seq_dataset(input_data, ws):
    X, y = [], []
    n = len(input_data)
    for i in range(n-ws):
        X.append(input_data[i:i+ws, :-1])
        y.append(input_data[i+ws, -1])
    return torch.tensor(X).float(), torch.tensor(y).float()


class DataSet:
    def __init__(self, X, t):
        self.X = X # 入力
        self.t = t # 出力

    def __len__(self):
        return len(self.X) # データ数(10)を返す

    def __getitem__(self, index):
        # index番目の入出力ペアを返す
        return self.X[index], self.t[index]
    
X_seq, y_seq = prepare_seq_dataset(train_norm, window_size)
data = DataSet(X_seq, y_seq)
print(data.X.dtype)
dataloader = torch.utils.data.DataLoader(data, batch_size= batch)


torch.float32


In [60]:
class LSTMRegressor(nn.Module):
    def __init__(self, lstm_input_layer, lstm_hidden_layer, lstm_output_layer, batch):
        super().__init__()
        self.input_layer = lstm_input_layer
        self.hidden_layer = lstm_hidden_layer
        self.output_layer = lstm_output_layer
        self.batch = batch
        self.lstm = nn.LSTM(self.input_layer, self.hidden_layer, batch_first=True)
        #output size is
        # seq_len, batch, num_directions * hidden_size
        self.linear = nn.Linear(self.hidden_layer, self.output_layer)
    
    def init_hidden(self, batch_size):
        hidden_state = torch.zeros(1, batch_size, self.hidden_layer)
        cell_state = torch.zeros(1, batch_size, self.hidden_layer)
        self.hidden = (hidden_state, cell_state)
        
        
    def forward(self, x):
        batch = x.size(0)
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        ret = self.linear(self.hidden[0][0].view(batch, -1))
        return ret
        
        

In [None]:
#training
model = LSTMRegressor(lstm_input_layer, lstm_hidden_layer, lstm_output_layer, batch)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=.01)
for epoch in range(n_epoches):
    for i, data in enumerate(dataloader):
        x_batch, y_batch = data
#         print(x_batch.shape, x_batch.dtype)
        optimizer.zero_grad()
        model.init_hidden(len(x_batch))
        #calc 
        y_pred = model(x_batch)
        loss = loss_fn(y_pred, y_batch)
        # update
        loss.backward()
        optimizer.step()
    if epoch%10 == 0:
        print('epoch: {0:3}: loss: {1:10.8f}'.format(epoch, loss.item()))

epoch:   0: loss: 0.02165283
epoch:  10: loss: 0.01038879
