In [66]:
import os, sys
sys.path.append(os.path.dirname(os.path.abspath(os.getcwd())))

import pickle
from src.configuration.constants import INTERIM_DATA_DIRECTORY, PROCESSED_DATA_DIRECTORY

from mlprimitives.custom.timeseries_preprocessing import time_segments_aggregate
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from mlprimitives.custom.timeseries_preprocessing import rolling_window_sequences
from orion.primitives.timeseries_preprocessing import slice_array_by_dims
from mlprimitives import load_primitive

from orion import Orion

In [13]:
# Read the interim MSL dataset (a pickled mapping of splits) back from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at files generated by a trusted pipeline.
msl_filepath = os.path.join(INTERIM_DATA_DIRECTORY, 'MSL.pickle')

with open(msl_filepath, mode='rb') as pickle_file:
    msl_dataset = pickle.load(pickle_file)

In [114]:
# Name of the single signal this notebook processes. Kept in one place so
# switching to another signal means editing exactly one line.
SIGNAL_NAME = 'M-1'

train = msl_dataset['train']
test = msl_dataset['test']
anomaly = msl_dataset['anomaly']

# Filter each split once and reuse the filtered frame for features and labels.
train_signal = train[train['signal'] == SIGNAL_NAME]
test_signal = test[test['signal'] == SIGNAL_NAME]

# Features are taken positionally: every column from position 4 onward.
# NOTE(review): this depends on the column order of the interim frames --
# confirm the first four columns are the non-feature ones.
X_train = train_signal.iloc[:, 4:]
y_train = train_signal['anomaly'].values

X_test = test_signal.iloc[:, 4:]
y_test = test_signal['anomaly'].values

# Keep only the anomaly records that belong to the selected signal.
anomaly = anomaly[anomaly['signal'] == SIGNAL_NAME]

X_train.shape

(2209, 56)

In [115]:
# Build an equally spaced time series from the test features by aggregating
# values over a fixed interval (here: the mean over intervals of 1, keyed on
# the "index" column).
# Open question: do we actually need to bin the data into set intervals?
params = dict(
    time_column="index",
    interval=1,
    method="mean",
)
primitive = load_primitive(
    "mlprimitives.custom.timeseries_preprocessing.time_segments_aggregate",
    arguments=params,
)
X, index = primitive.produce(X=X_test)

X.shape, index.shape

((2277, 55), (2277,))

In [116]:
# Imputation transformer: fills in missing values in X.
# X is handed to the primitive through `arguments`, and fit() is then called
# without arguments.
# NOTE(review): the imputer is fitted on the test data itself -- confirm this
# is consistent with how the training split is preprocessed.
params = dict(X=X)
primitive = load_primitive('sklearn.impute.SimpleImputer', arguments=params)
primitive.fit()
X = primitive.produce(X=X)

X.shape

(2277, 55)

In [117]:
# Scale every feature into the range [-1, 1].
# X is handed to the primitive through `arguments`, and fit() is then called
# without arguments.
# NOTE(review): the scaler is fitted on the test data itself -- confirm the
# resulting scaling is consistent with the one used for the training split.
params = dict(feature_range=[-1, 1], X=X)
primitive = load_primitive('sklearn.preprocessing.MinMaxScaler', arguments=params)
primitive.fit()
X = primitive.produce(X=X)

X.shape

(2277, 55)

In [118]:
# Cut the time series into sub-sequences with a rolling window:
# windows of 100 steps, advanced 1 step at a time, with a target of size 1
# taken from column 0.
params = dict(
    target_column=0,
    window_size=100,
    target_size=1,
    step_size=1,
)
primitive = load_primitive(
    'mlprimitives.custom.timeseries_preprocessing.rolling_window_sequences',
    arguments=params,
)
X, y, index, target_index = primitive.produce(X=X, index=index)


# The target (of length `target_size`) is the next interval the model is
# trying to predict; `index` holds the start of each window.
X.shape, y.shape, index.shape, target_index.shape

((2177, 100, 55), (2177, 1), (2177,), (2177,))

In [119]:
# Target: slice X at index 0 along axis 2.
# This rebinds `y`, replacing whatever value it held before this cell.
params = dict(target_index=0, axis=2)
primitive = load_primitive(
    'orion.primitives.timeseries_preprocessing.slice_array_by_dims',
    arguments=params,
)
y = primitive.produce(X=X)

# The sequence we are trying to predict is the first column of X.
X.shape, y.shape

((2177, 100, 55), (2177, 100, 1))

In [120]:
# Collect the processed test arrays in a single dictionary.
output = dict(
    X_test=X,
    y_test=y,
    index_test=index,
)

In [121]:
# Persist the processed test set as a pickle in the processed-data directory.
msl_output_filepath = os.path.join(PROCESSED_DATA_DIRECTORY, 'MSL_test.pickle')

# Create the target directory first so the write does not fail with
# FileNotFoundError when the directory has not been created yet.
msl_output_directory = os.path.dirname(msl_output_filepath)
if msl_output_directory:
    os.makedirs(msl_output_directory, exist_ok=True)

with open(msl_output_filepath, 'wb') as f:
    pickle.dump(output, f)