In [None]:
# Data wrangling imports
import pandas as pd
import numpy as np
import scipy
import scipy.stats as stats
from tqdm import tqdm

# PyTorch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from torchvision import transforms

In [None]:
# Training array; it is later indexed as [:, 0] (values fed to create_X) and
# [:, 1] (target) by LANLDataset. train_mean / train_std are the constants
# create_X uses to standardise every window.
train_raw = np.load("./data/train_raw.npy")
train_mean = 4.51945
train_std = 10.735717
print(len(train_raw))

# NOTE(review): the last two rows are dropped here; the reason is not visible
# in this notebook -- confirm they are meant to be excluded.
trainval_df = pd.read_pickle('./data/train_features.pkl')
trainval_df = trainval_df[:-2]

test_df = pd.read_pickle('./data/test_features.pkl')
test_segments = np.array([seg for seg in test_df['segment'].to_numpy()])
test_labels = test_df['target']
# Pair each segment with its label in an explicit (n, 2) object array.
# Building this with a plain np.array([...]) call is a ragged construction
# (an array next to a scalar), which recent NumPy versions refuse unless
# dtype=object is requested. Iterating with zip also avoids label-based
# test_labels[i] lookups, which only work for a default integer index.
test_dat = np.empty((len(test_labels), 2), dtype=object)
for row, (segment, label) in enumerate(zip(test_segments, test_labels)):
    test_dat[row, 0] = segment
    test_dat[row, 1] = label
print(len(test_dat))

In [None]:
# Helper function for the data loader. Extracts mean, standard deviation etc per time step.
# Can easily be extended. Expects a two dimensional array.
def extract_features(z):
    """Summarise every row of a two dimensional array with 13 statistics.

    The returned array has one row per row of ``z`` and the columns, in order:
    mean, min, max, std, skew, kurtosis, max of |z|, min of |z|, std of |z|,
    followed by the 1%, 5%, 95% and 55% quantiles.
    """
    magnitude = np.abs(z)
    quantile_columns = [np.quantile(z, q, axis=1) for q in (0.01, 0.05, 0.95, 0.55)]
    columns = (
        z.mean(axis=1),
        z.min(axis=1),
        z.max(axis=1),
        z.std(axis=1),
        stats.skew(z, axis=1),
        stats.kurtosis(z, axis=1),
        magnitude.max(axis=1),
        magnitude.min(axis=1),
        magnitude.std(axis=1),
        *quantile_columns,
    )
    # Indexing np.c_ with the tuple is the same as listing the columns inline.
    return np.c_[columns]
    
# Splits the values of "x" into n_steps pieces (by default 150 pieces of length 1000 each,
# so n_steps * step_length should equal 150'000, the length of "x").
# From each piece, a set of features is extracted. This results in a feature matrix
# of dimension (n_steps time steps x features).
def create_X(x, n_steps=150, step_length=1000, other_lens=(10, 100)):
    """Turn a flat sequence into a (n_steps x features) matrix.

    ``x`` is reshaped into ``n_steps`` rows, standardised with the module-level
    ``train_mean`` / ``train_std`` and passed to ``extract_features``.

    Note: ``step_length`` and ``other_lens`` are accepted but not read by the
    body; the row length is inferred from ``len(x) / n_steps``.
    """
    windows = x.reshape(n_steps, -1)
    standardised = (windows - train_mean) / train_std
    return extract_features(standardised)

# Query "create_X" to figure out the number of features
first_segment = trainval_df["segment"][0]
n_features = create_X(first_segment).shape[1]
print("Our RNN is based on %i features" % n_features)

In [None]:
class LANLDataset(data.Dataset):
    """In-memory dataset of feature sequences (built by ``create_X``) and targets.

    Parameters
    ----------
    datas :
        phase "train": a 2-D array; column 0 of a sampled window is passed to
        ``create_X`` and column 1 of the window's last row becomes the target.
        any other phase: a sized iterable whose items are indexed as
        ``item[0]`` to obtain the sequence passed to ``create_X``.
    n_seqs :
        Number of windows randomly sampled in phase "train". Ignored in the
        other phases, where there is exactly one sample per item of ``datas``.
    n_steps, step_length :
        A training window covers ``n_steps * step_length`` consecutive rows.
    n_features :
        Number of columns ``create_X`` returns; a mismatch makes the copy into
        ``self.samples`` fail. (The default of 12 is kept for compatibility;
        callers in this notebook pass the value explicitly.)
    phase :
        "train" samples random windows; anything else treats ``datas`` as a
        list of ready-made segments and stores the placeholder target -999.

    Raises
    ------
    ValueError
        In phase "train" when ``datas`` is not longer than one window.
    """

    def __init__(self, datas, n_seqs=5000 ,n_steps=150, step_length=1000, n_features=12, phase="train"):
        window = n_steps * step_length
        if phase != "train":
            # One sample per provided segment. Sizing the tensors from n_seqs
            # here would leave uninitialised rows (or overflow) whenever the
            # number of segments differs from n_seqs.
            n_seqs = len(datas)
        elif window >= len(datas):
            raise ValueError(
                "datas has %d rows but a window needs more than %d" % (len(datas), window)
            )

        self.samples = torch.Tensor(n_seqs, n_steps, n_features)
        self.targets = torch.Tensor(n_seqs)
        if phase == "train":
            # Sample n_seqs window end positions from the whole data
            sampled_indices = sorted(np.random.randint(window, high=len(datas), size=n_seqs))
            for i, idx in enumerate(sampled_indices):
                chunk = datas[idx - window : idx]
                self.samples[i] = torch.tensor(
                    create_X(chunk[:,0], n_steps=n_steps, step_length=step_length)
                )
                # Target is column 1 of the last row of the window.
                self.targets[i] = float(chunk[-1, 1])
        else:
            for i, chunk in enumerate(datas):
                self.samples[i] = torch.tensor(
                    create_X(chunk[0], n_steps=n_steps, step_length=step_length)
                )
                # Placeholder: targets are not used outside the train phase.
                self.targets[i] = -999
        print(self.samples.shape)

    def __getitem__(self, index):
        """Return the (sample, target) pair stored at ``index``."""
        return self.samples[index], self.targets[index]

    def __len__(self):
        """Number of stored samples."""
        return len(self.targets)

    def train_val_split(self, train_ratio, val_ratio):
        """Randomly split the dataset into train and validation subsets.

        Raises ``Exception`` when the two ratios do not sum to one. The sum is
        compared with a small tolerance, because exact float equality rejects
        valid pairs such as 0.7 / 0.3 (0.7 + 0.3 == 0.9999999999999999).
        """
        if abs(train_ratio + val_ratio - 1) > 1e-9:
            raise Exception('Ratios should sum to one.')
        train_length = int(train_ratio * len(self))
        val_length = len(self) - train_length
        splits = [train_length, val_length]
        return data.random_split(self, splits)

    
batch_size = 32

# Initialize data loaders. The feature count comes from the "n_features"
# value queried from create_X above, so it cannot drift from extract_features.
trainval_data = LANLDataset(train_raw, n_seqs=5000, n_steps=150, step_length=1000, n_features=n_features)
train_data, val_data = trainval_data.train_val_split(0.8, 0.2)
test_data = LANLDataset(test_dat, n_steps=150, step_length=1000, n_features=n_features, phase="test")

datasets = {
    "train": train_data, 
    "val": val_data,
    "test": test_data
}

dl_params = {
    "batch_size": batch_size,
    "shuffle": False,
    "num_workers": 16
}
# Only the training loader is reshuffled every epoch. Validation and test keep
# their order; the test order in particular must match the rows of test_df
# when predictions are paired with seg_ids.
dataloaders = {
    phase: data.DataLoader(dataset, **{**dl_params, "shuffle": phase == "train"})
    for phase, dataset in datasets.items()
}

In [None]:
class LANLModel(nn.Module):
    """LSTM followed by a linear layer applied to the last time step.

    Parameters
    ----------
    device : device the module and its initial hidden state are placed on.
    input_dim : number of features per time step.
    hidden_dim : size of the LSTM hidden state.
    output_dim : size of the linear output.
    batch_size : default batch size used by ``init_hidden()`` when none is given.
    num_layers : number of stacked LSTM layers.
    """

    def __init__(self, device, input_dim=1, hidden_dim=64, output_dim=1, batch_size=64, num_layers=1):
        super(LANLModel, self).__init__()
        self.device = device
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.rnn = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers, batch_first=True)
        self.fc = nn.Linear(self.hidden_dim, self.output_dim)
        self.to(self.device)

    def init_hidden(self, batch_size=None):
        """Return zero (h0, c0) states for ``batch_size`` sequences.

        ``batch_size`` defaults to ``self.batch_size`` so existing calls
        without arguments behave as before.
        """
        if batch_size is None:
            batch_size = self.batch_size
        shape = (self.num_layers, batch_size, self.hidden_dim)
        return (
            torch.zeros(*shape).to(self.device),
            torch.zeros(*shape).to(self.device)
        )

    def forward(self, x):
        """Map ``x`` of shape (batch, steps, input_dim) to (batch, output_dim)."""
        # Size the initial state from the actual batch. A state fixed to
        # self.batch_size makes the LSTM raise on any batch of another size
        # (e.g. the last, shorter batch of a loader).
        rnn_out, _ = self.rnn(x, self.init_hidden(x.size(0)))
        out = self.fc(rnn_out[:,-1,:])
        return out


model_params = {
    "batch_size": batch_size,
    # Taken from the value queried from create_X so the model input always
    # matches the number of extracted features.
    "input_dim": n_features,
    "hidden_dim": 48,
    "num_layers": 1
}

# Fall back to the CPU when no CUDA device is available instead of failing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LANLModel(device, **model_params)
criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.005)
print(len(dataloaders["train"]))

def train():
    """Train the module-level ``model`` for 30 epochs on ``dataloaders["train"]``.

    Uses the module-level ``criterion``, ``optimizer`` and ``device`` and
    prints the mean batch loss after every epoch.
    """
    model.train(mode=True)
    loader = dataloaders["train"]
    last_idx = len(loader) - 1
    for epoch in range(30):
        print("Epoch " + str(epoch))
        train_loss = 0
        n_batches = 0
        for idx, (samples, targets) in enumerate(loader):
            # The final batch is skipped, as before: it may be shorter than
            # the batch size the model was configured with.
            if idx == last_idx:
                continue
            samples, targets = samples.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(samples)
            # The model returns (batch, 1) while targets are (batch,). Without
            # dropping the trailing axis L1Loss broadcasts the pair to
            # (batch, batch) and averages over every prediction/target
            # combination instead of the matching pairs.
            loss = criterion(outputs.float().squeeze(-1), targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            n_batches += 1
        # Average over the batches actually used; guard the single-batch case
        # where nothing was processed.
        train_loss /= max(n_batches, 1)
        print(train_loss)

# Fit the model, then persist its learned parameters to disk.
train()
trained_weights = model.state_dict()
torch.save(trained_weights, "./model.pth")

In [None]:
from datetime import datetime

def LANL_test(model, dataloaders, submission_path='./submissions'):
    """Predict on ``dataloaders['test']`` and write a timestamped submission CSV.

    Predictions are paired with the ``seg_id`` column of the module-level
    ``test_df`` by position, so the test loader must iterate in the same order
    as the rows of ``test_df`` (i.e. it must not shuffle).

    Parameters
    ----------
    model : module mapping a feature batch to one prediction per sample.
    dataloaders : dict with a 'test' loader yielding (features, target) pairs.
    submission_path : directory the CSV is written to; created if missing.

    Returns
    -------
    pandas.DataFrame with the columns 'seg_id' and 'time_to_failure'.
    """
    import os

    predictions = {'seg_id': [], 'time_to_failure': []}
    all_seg_ids = test_df['seg_id'].values
    offset = 0  # number of samples consumed so far, NOT the batch index
    model.eval()
    with torch.no_grad():
        for features, _ in dataloaders['test']:
            features = features.to(device)
            batch_len = features.shape[0]
            batch_ids = all_seg_ids[offset:offset + batch_len]
            # Drop the trailing output axis: (batch, 1) -> (batch,)
            batch_preds = model(features).squeeze(-1).tolist()
            # zip stops at the shorter input, so samples that have no row in
            # test_df (no seg_id to report them under) are left out.
            for seg, pred in zip(batch_ids, batch_preds):
                predictions['seg_id'].append(seg)
                predictions['time_to_failure'].append(pred)
            offset += batch_len
    df = pd.DataFrame.from_dict(predictions)
    datetime_str = datetime.now().strftime("%Y%m%d-%H%M%S")
    os.makedirs(submission_path, exist_ok=True)
    df.to_csv(submission_path + '/submission_' + datetime_str + '.csv', index=False)
    return df

# Predict on the test loader and write the submission CSV; the returned
# DataFrame is the last expression of the cell.
LANL_test(model, dataloaders)