## Connect to Drive

In [None]:
# Mount Google Drive inside the Colab runtime and move into the data folder.
# NOTE(review): 'Path-to-Data' looks like a placeholder for the real folder name - confirm.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My Drive/Path-to-Data/

## Import libraries

In [None]:
# Fix randomness and hide warnings
# One seed value is reused for every random number generator seeded in this notebook
seed = 42

import os
# Environment variables are set before TensorFlow / matplotlib are imported in the cells below
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up; setting it here presumably
# only affects child processes, not the already-running kernel - confirm
os.environ['PYTHONHASHSEED'] = str(seed)
os.environ['MPLCONFIGDIR'] = os.getcwd()+'/configs/'

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# FutureWarning is a subclass of Warning, so this broader filter also covers the line above
warnings.simplefilter(action='ignore', category=Warning)

import numpy as np
np.random.seed(seed)

# logging is used in the TensorFlow set-up cell to lower the logger level
import logging

import random
random.seed(seed)

In [None]:
# Import tensorflow
import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl
# Reduce TensorFlow's own log output: autograph verbosity 0, loggers at ERROR level
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# Seed TensorFlow's random generators (TF2 API and the compat.v1 API) with the notebook seed
tf.random.set_seed(seed)
tf.compat.v1.set_random_seed(seed)
print(tf.__version__)

In [None]:
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
plt.rc('font', size=16)
from sklearn.preprocessing import MinMaxScaler

## Load and process data

In [None]:
# Load the three arrays of the dataset from the current working directory.
# From their use in later cells: training_data holds one time series per row,
# valid_periods holds a (start, end) pair per row, categories holds one label per row.
training_data = np.load('training_data.npy')
valid_periods = np.load('valid_periods.npy')
categories = np.load('categories.npy')

print("Training dataset shape: ", training_data.shape)
print("Valid periods shape: ", valid_periods.shape)
print("Categories shape:", categories.shape)

In [None]:
# Show the raw series values and the dtype they were stored with
print(training_data)
print(training_data.dtype)

In [None]:
# Convert the series to float32 and confirm the new dtype
training_data = training_data.astype(np.float32)
print(training_data.dtype)

In [None]:
# Show the valid-period pairs and the dtype they were stored with
print(valid_periods)
print(valid_periods.dtype)

In [None]:
# Convert the period boundaries to int32 (they are used as slice indices later) and confirm the dtype
valid_periods = valid_periods.astype(np.int32)
print(valid_periods.dtype)

In [None]:
# Show the category label of every series and the dtype of the label array
print(categories)
print(categories.dtype)

In [None]:
# Get the count of samples for each category
category_labels, count = np.unique(categories, return_counts=True)

# Print the list of category labels
print("Category labels: ", category_labels)

# Print the count of samples for each category
# (the loop variable has its own name so that the `count` array is not overwritten)
for cat, n_samples in zip(category_labels, count):
    print(f"Category: {cat}, Number of samples: {n_samples}")

# Count, for each category, the series whose valid period (end - start)
# spans at least `min_valid_length` timestamps
min_valid_length = 220

# One counter per label actually present in the data, so no label can raise a KeyError
dic = {str(label): 0 for label in category_labels}

for period, category in zip(valid_periods, categories):
    if period[1] - period[0] >= min_valid_length:
        dic[str(category)] += 1
print(dic)

In [None]:
# Inspect the timeseries' lengths
# Length of each series = end - start of its valid period
series_lengths = valid_periods[:, 1] - valid_periods[:, 0]

# argmin / argmax return the first row that reaches the extreme value
idx_min = int(np.argmin(series_lengths))
idx_max = int(np.argmax(series_lengths))
min_length = series_lengths[idx_min]
max_length = series_lengths[idx_max]

# Named `total_length` rather than `sum`, so the builtin `sum` stays usable in later cells;
# accumulated in int64 to stay clear of int32 overflow
total_length = series_lengths.sum(dtype=np.int64)

print(f'Minimum length: {min_length}, row index: {idx_min}')
print(f'Maximum length: {max_length}, row index: {idx_max}')
print(f'Average length: {total_length/training_data.shape[0]}')

In [None]:
idx_set = []
idx = 0
# Labels that still need a representative series. A copy is used so that
# `category_labels` keeps its content and this cell can be run more than once.
remaining_labels = list(category_labels)

# Get the one row index (time series index) for each available category
# `and` is required here: `&` binds tighter than `!=` and `<`, which would
# leave the bound on idx unchecked
while remaining_labels and idx < training_data.shape[0]:
  category = categories[idx][0]
  if category in remaining_labels:
    idx_set.append(idx)
    remaining_labels.remove(category)
  idx += 1

# Plot one time series per category, removing the padding
for i in idx_set:
  start = valid_periods[i][0]
  end = valid_periods[i][1]
  x_values = np.arange(end - start)

  plt.figure(figsize=(11, 4))
  plt.plot(x_values, training_data[i][start:end])
  plt.title(f'Time Series from Category {categories[i][0]}')
  plt.xlabel('Timestamp')
  plt.ylabel('Values')
  plt.show()

In [None]:
# Set the size of the test set
# Fraction of all the time series (rows of training_data) reserved for the test set
test_set_percentage = 0.1
test_size = int(test_set_percentage * training_data.shape[0])

# Compute the number of samples for each category to be moved inside the test set
# (the test size is divided equally among the distinct labels; int() drops the fractional part)
test_samples_per_cat = int(test_size / len(np.unique(categories)))

print("Test size: ", test_size)
print("Samples per category: ",test_samples_per_cat)

In [None]:
def get_samples_per_cat(categories, dataset, samples_per_cat, time_series_length,
                        periods=None, f_category_len=30):
  """Split the dataset into a training part and a per-category test part.

  For every distinct category the first rows carrying that label are moved to
  the test set: `f_category_len` rows for category 'F', `samples_per_cat` rows
  for every other category. When a category has fewer rows than requested, all
  of its rows are moved and the test set is simply shorter (no placeholder rows).

  Args:
    categories: label of each row of `dataset`.
    dataset: array with one time series per row.
    samples_per_cat: number of rows to move to the test set for each category
      other than 'F'.
    time_series_length: kept for backward compatibility; the row length is
      taken from `dataset` itself.
    periods: (start, end) pair of each row of `dataset`. When None, the
      notebook-level `valid_periods` array is used.
    f_category_len: number of rows to move to the test set for category 'F'.

  Returns:
    Tuple (training_set, training_set_categories, training_set_indices,
    test_set, test_set_categories, test_set_indices). The test arrays are
    ordered by category (sorted labels), then by original row order.
  """
  if periods is None:
    # Fall back on the array defined at notebook level
    periods = valid_periods

  categories = np.asarray(categories)
  dataset = np.asarray(dataset)
  periods = np.asarray(periods)

  # Collect the row indices to move, category by category
  rows_to_move = []
  for c in np.unique(categories):
    quota = f_category_len if c == 'F' else samples_per_cat
    matching_rows = np.flatnonzero(categories == c)
    rows_to_move.extend(matching_rows[:max(quota, 0)].tolist())
  print(len(rows_to_move))

  # Explicit integer dtype so that an empty selection is still a valid index array
  rows_to_move = np.asarray(rows_to_move, dtype=np.intp)

  test_set = dataset[rows_to_move]
  test_set_categories = categories[rows_to_move]
  test_set_indices = periods[rows_to_move]

  # np.delete returns new arrays, so the inputs are left untouched
  training_set = np.delete(dataset, rows_to_move, axis=0)
  training_set_categories = np.delete(categories, rows_to_move, axis=0)
  training_set_indices = np.delete(periods, rows_to_move, axis=0)

  return training_set, training_set_categories, training_set_indices, test_set, test_set_categories, test_set_indices

In [None]:
# Split series, labels and valid periods into a training part and a test part,
# then print the shape of each of the six resulting arrays
training_set, training_set_categories, training_set_indices, test_set, test_set_categories, test_set_indices = get_samples_per_cat(categories, training_data, test_samples_per_cat, training_data.shape[1])
print(training_set.shape)
print(training_set_categories.shape)
print(training_set_indices.shape)
print(test_set.shape)
print(test_set_categories.shape)
print(test_set_indices.shape)

In [None]:
# Parameters of the sliding-window extraction done by build_sequences
window = 200  # number of past values in each input sequence
stride = 5  # step between the starts of two consecutive windows
telescope = 18  # number of values following the window, used as the target

In [None]:
def build_sequences(dataset,dataset_valid_idx,dataset_categories,window,stride,telescope):
  """Cut every time series into (input window, target) samples.

  For each row only the slice between its (start, end) pair is used. A series
  shorter than window + telescope is left-padded with zeros up to that length.
  A window of `window` values slides over the series with step `stride`; the
  `telescope` values right after the window are its target. If the last
  regular step leaves some final values uncovered, one extra sample aligned
  with the end of the series is added.

  Args:
    dataset: one time series per row.
    dataset_valid_idx: (start, end) pair for each row of `dataset`.
    dataset_categories: label for each row of `dataset`.
    window: number of values in each input sequence.
    stride: step between the starts of consecutive windows.
    telescope: number of values in each target sequence.

  Returns:
    Tuple of three NumPy arrays: input windows, targets, and the category of
    the series each sample was taken from.
  """
  sample_len = window + telescope
  inputs, targets, labels = [], [], []

  for ts_idx, series in enumerate(dataset):
    bounds = dataset_valid_idx[ts_idx]
    start, end = bounds[0], bounds[1]
    valid_part = series[start:end]

    # Number of zeros needed in front to reach one full sample (<= 0: none)
    missing = sample_len - (end - start)
    if missing > 0:
      ts_temp = np.concatenate((np.zeros(missing, dtype='float32'), valid_part))
    else:
      ts_temp = np.array(valid_part)

    n = len(ts_temp)

    # Regular samples, every `stride` positions
    offset = 0
    while offset + sample_len <= n:
      split = offset + window
      inputs.append(ts_temp[offset:split])
      targets.append(ts_temp[split:offset + sample_len])
      labels.append(dataset_categories[ts_idx])
      offset += stride

    # The last regular sample stopped before the end of the series:
    # add one more sample whose target is made of the final values
    if offset + sample_len > n > offset + sample_len - stride:
      inputs.append(ts_temp[n - sample_len:n - telescope])
      targets.append(ts_temp[n - telescope:n])
      labels.append(dataset_categories[ts_idx])

  return np.array(inputs), np.array(targets), np.array(labels)

In [None]:
# Cut the series of both splits into (window, telescope) samples
X_train, y_train, train_categories = build_sequences(
    training_set, training_set_indices, training_set_categories,
    window, stride, telescope)
X_test, y_test, test_categories = build_sequences(
    test_set, test_set_indices, test_set_categories,
    window, stride, telescope)

# Number of generated samples per category: training split first, then test split
for split_categories in (train_categories, test_categories):
  dic = dict.fromkeys('ABCDEF', 0)
  for c in split_categories:
    dic[c] += 1
  print(dic)

print(X_train.shape, y_train.shape, train_categories.shape)
print(X_test.shape, y_test.shape, test_categories.shape)

In [None]:
# Append a trailing axis of size 1 to the training inputs and targets,
# keeping the second axis (sequence length) unchanged
X_train = X_train.reshape(-1, X_train.shape[1], 1)
y_train = y_train.reshape(-1, y_train.shape[1], 1)

print(X_train.shape, y_train.shape,)

## Build model


In [None]:
# Shapes of a single sample (first axis, the sample index, dropped) and training settings
input_shape = X_train.shape[1:]
output_shape = y_train.shape[1:]
batch_size = 128
epochs = 200  # upper bound; the EarlyStopping callback used in model.fit can stop training earlier

print(input_shape)
print(output_shape)


In [None]:
def build_model(input_shape, output_shape):
  """Build and compile the stacked-LSTM forecasting model.

  Args:
    input_shape: shape of one input sample, given to the Input layer.
    output_shape: shape of one target sample; only its first entry is used,
      as the number of units of the final Dense layer.

  Returns:
    The compiled Keras model (mean squared error loss, Adam optimizer with
    learning rate 1e-3).
  """
  inputs = tfkl.Input(shape=input_shape, name='input_layer')

  # The first LSTM returns the whole sequence so that the second one can read it
  sequence_features = tfkl.LSTM(128, return_sequences=True, name='lstm1')(inputs)
  # The second LSTM returns only its last output; dropout=0.4 is set on this layer
  last_state = tfkl.LSTM(128, name='lstm2', dropout=0.4)(sequence_features)

  # One output unit per value to forecast
  outputs = tfkl.Dense(output_shape[0], name='output_layer')(last_state)

  model = tf.keras.Model(inputs=inputs, outputs=outputs, name='model')
  model.compile(
      loss=tf.keras.losses.MeanSquaredError(),
      optimizer=tf.keras.optimizers.Adam(1e-3),
  )
  return model

In [None]:
# Create the model, print its layer summary and draw its graph with tensor shapes
model = build_model(input_shape, output_shape)
model.summary()
tfk.utils.plot_model(model, expand_nested=True, show_shapes=True)

## Train model

In [None]:
# Train the model
# Only the `.history` dict (per-epoch metric values) of the object returned by fit is kept.
# Callbacks, both watching the validation loss:
#  - EarlyStopping: stop after 12 epochs without an improvement of at least 0.0001
#    and restore the weights of the best epoch
#  - ReduceLROnPlateau: multiply the learning rate by 0.1 after 10 epochs without
#    improvement, never going below 1e-5
history = model.fit(
    x = X_train,
    y = y_train,
    batch_size = batch_size,
    epochs = epochs,
    validation_split=.2,
    callbacks = [
        tfk.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=12, min_delta= 0.0001 ,restore_best_weights=True),
        tfk.callbacks.ReduceLROnPlateau(monitor='val_loss', mode='min', patience=10, factor=0.1, min_lr=1e-5)
    ]
).history

In [None]:
# Epoch (0-based index) with the lowest validation loss
best_epoch = np.argmin(history['val_loss'])

# Training and validation loss per epoch, with a dashed line on the best epoch
plt.figure(figsize=(17,4))
plt.plot(history['loss'], label='Training loss', alpha=.8, color='#ff7f0e')
plt.plot(history['val_loss'], label='Validation loss', alpha=.9, color='#5a9aa5')
plt.axvline(x=best_epoch, label='Best epoch', alpha=.3, ls='--', color='#5a9aa5')
plt.title('Mean Squared Error')
plt.legend()
plt.grid(alpha=.3)
plt.show()

# Learning rate per epoch
# NOTE(review): the 'lr' key is presumably written to the history by ReduceLROnPlateau;
# its name may differ across Keras versions - confirm
plt.figure(figsize=(18,3))
plt.plot(history['lr'], label='Learning Rate', alpha=.8, color='#ff7f0e')
plt.axvline(x=best_epoch, label='Best epoch', alpha=.3, ls='--', color='#5a9aa5')
plt.legend()
plt.grid(alpha=.3)
plt.show()

In [None]:
#Predict the test set using the model
# NOTE(review): unlike X_train, X_test is passed without the reshape that adds a
# trailing axis of size 1 - confirm the model accepts this input shape
predictions = model.predict(X_test, verbose=0)

#Print the shape of the predictions
print(f"Predictions shape: {predictions.shape}")

#Calculate and print Mean Squared Error (MSE)
# Targets and predictions are flattened, so the error is one value over all the forecast points
mean_squared_error = tfk.metrics.mean_squared_error(y_test.flatten(), predictions.flatten()).numpy()
print(f"Mean Squared Error: {mean_squared_error}")

#Calculate and print Mean Absolute Error (MAE)
mean_absolute_error = tfk.metrics.mean_absolute_error(y_test.flatten(), predictions.flatten()).numpy()
print(f"Mean Absolute Error: {mean_absolute_error}")

In [None]:
# Sanity check: report when the predictions are a NumPy array
if isinstance(predictions, np.ndarray):
    print("Variable is a NumPy array")

## Save model

In [None]:
# Save the trained model under the name 'SubmissionModel' in the current working directory
model.save('SubmissionModel')

## Load model