## Basic imports

In [1]:
import pandas as pd
import numpy as np
import io
import time
import json
import sagemaker.amazon.common as smac
import os
import boto3
import re
import sagemaker
from os.path import join
from sklearn.model_selection import train_test_split

## Set up the config

In [2]:
# Execution role reported by the SageMaker SDK for the current environment.
ROLE = sagemaker.get_execution_role()
# AWS region of the default boto3 session.
REGION = boto3.Session().region_name

# S3 bucket for saving code and model artifacts.
# Feel free to specify a different bucket and prefix.

BUCKET = 'virtual-regatta-ml'
PREFIX = 'non_linear_keras_sine' # place to upload training files within the bucket
# Sub-folder below PREFIX: the upload helper writes to
# PREFIX/TEST_NAME/train/... and PREFIX/TEST_NAME/validation/...
TEST_NAME = 'default'

## Load the data

In [3]:
data = pd.read_csv('../all-in-one.csv')

# Encode the attack angle (given in degrees) as cosine/sine input features.
# These are inputs, not the target: the target is 'sin_boat_angle'.
# The NumPy ufuncs operate on the whole column at once, so no row-wise
# `.apply(lambda ...)` is needed.
attack_angle_rad = np.deg2rad(data['angle_of_attack'])
data['cos_attack_angle'] = np.cos(attack_angle_rad)
data['sin_attack_angle'] = np.sin(attack_angle_rad)
# The raw angle is replaced by its cos/sin encoding.
# NOTE(review): 'cos_boat_angle' is presumably dropped because it is the
# companion of the target and would leak it -- confirm.
data = data.drop(columns=['cos_boat_angle','angle_of_attack'])

# The column to predict has to be in last position
df_reordered = data[['boat_speed','cos_attack_angle', 'sin_attack_angle', 'wind_speed','cos_target_angle', 'sin_target_angle', 'sin_boat_angle']]

print(df_reordered.head(2), '\n\n\n\n')
df_reordered.describe()

   boat_speed  cos_attack_angle  sin_attack_angle  wind_speed  \
0     7.73236         -0.558392         -0.829577     9.43467   
1     7.73236         -0.558392         -0.829577     9.43467   

   cos_target_angle  sin_target_angle  sin_boat_angle  
0         -0.554636         -0.832093       -0.867056  
1         -0.564295         -0.825573       -0.867056   






Unnamed: 0,boat_speed,cos_attack_angle,sin_attack_angle,wind_speed,cos_target_angle,sin_target_angle,sin_boat_angle
count,3690.0,3690.0,3690.0,3690.0,3690.0,3690.0,3690.0
mean,6.461686,-0.165279,0.082917,8.597956,-0.211291,-0.054109,-0.062435
std,2.755074,0.565311,0.804048,5.374964,0.632321,0.743543,0.753753
min,0.0,-0.999884,-1.0,2.0,-1.0,-0.999996,-0.999999
25%,4.436315,-0.681583,-0.735986,4.68949,-0.814589,-0.784158,-0.766044
50%,5.79767,-0.146813,0.529605,6.638665,-0.308068,-0.266936,-0.300109
75%,7.90296,0.234136,0.819644,10.036542,0.21358,0.891168,0.870233
max,14.2913,0.999766,1.0,21.07713,0.999749,0.99999,1.0


## Create train-val-test split

In [4]:
def _split_train_val_test(X_data, y_data):
    """Split the arrays into train / validation / test parts (about 70/20/10)."""
    train_X, holdout_X, train_y, holdout_y = train_test_split(
        X_data, y_data, train_size=0.7, random_state=42)
    # 66% of the 30% hold-out becomes validation; the remainder is the test set.
    val_X, test_X, val_y, test_y = train_test_split(
        holdout_X, holdout_y, train_size=0.66, random_state=41)
    return train_X, train_y, val_X, val_y, test_X, test_y


def _upload_dense_tensor(bucket, key, features, labels):
    """Serialize features/labels as float32 with `smac` and upload them to `key`."""
    buffer = io.BytesIO()
    smac.write_numpy_to_dense_tensor(
        buffer, features.astype('float32'), labels.astype('float32'))
    buffer.seek(0)  # rewind so the upload starts at the first byte
    bucket.Object(key).upload_fileobj(buffer)


def save_train_and_validation_data_to_s3(data) :
    """Split `data` and upload the train and validation parts to S3.

    The column 'sin_boat_angle' is the target; every other column is a
    feature. The train and validation splits are written to
    ``PREFIX/TEST_NAME/train/non_linear_train.data`` and
    ``PREFIX/TEST_NAME/validation/non_linear_validation.data`` in BUCKET.
    The test split is not uploaded.

    Args:
        data: DataFrame containing the feature columns and 'sin_boat_angle'.

    Returns:
        Tuple ``(train_X, train_y, val_X, val_y, test_X, test_y)`` of NumPy
        arrays.
    """
    target_col = 'sin_boat_angle'
    y_data = data[target_col].to_numpy()
    X_data = data.drop(columns=[target_col]).to_numpy()

    train_X, train_y, val_X, val_y, test_X, test_y = _split_train_val_test(X_data, y_data)

    # One bucket handle serves both uploads.
    bucket = boto3.Session().resource('s3').Bucket(BUCKET)
    # S3 keys always use '/', so join explicitly instead of using the
    # OS-dependent os.path.join.
    train_key = '/'.join([PREFIX, TEST_NAME, 'train', 'non_linear_train.data'])
    validation_key = '/'.join([PREFIX, TEST_NAME, 'validation', 'non_linear_validation.data'])
    _upload_dense_tensor(bucket, train_key, train_X, train_y)
    _upload_dense_tensor(bucket, validation_key, val_X, val_y)

    return train_X, train_y, val_X, val_y, test_X, test_y

## Create a non-linear regression model using keras

In [5]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout


# Build the model
def wide_keras_model(size_1, size_2):
    """Return a compiled two-hidden-layer regression network.

    Args:
        size_1: number of units in the first hidden layer ('hidden_1').
        size_2: number of units in the second hidden layer ('hidden_2').

    Returns:
        A Sequential model compiled with MSE loss, an MAE metric and the
        Adam optimizer.
    """
    # Settings shared by both hidden layers.
    hidden_kwargs = dict(kernel_initializer='normal', activation='relu')
    network = Sequential([
        Dense(size_1, name='hidden_1', **hidden_kwargs),
        Dropout(0.5, seed=42),
        Dense(size_2, name='hidden_2', **hidden_kwargs),
        Dropout(0.5, seed=42),
        # Single linear unit: the regression output.
        Dense(1, kernel_initializer='normal', activation='linear'),
    ])

    network.compile(
        loss="mean_squared_error",
        metrics=[tf.keras.metrics.MeanAbsoluteError()],
        optimizer='adam',
    )
    return network

In [6]:
# Split the data; the helper also uploads the train/validation parts to S3.
train_X, train_y, val_X, val_y, test_X, test_y = save_train_and_validation_data_to_s3(df_reordered)

# Add a trailing axis so each target array becomes a column.
train_y = train_y[:, np.newaxis]
val_y = val_y[:, np.newaxis]
test_y = test_y[:, np.newaxis]

print(f'Number of examples in training data: {len(train_y)}')
print(f'Number of examples in validation data: {len(val_y)}')
print(f'Number of examples in test data: {len(test_y)}')

NoCredentialsError: Unable to locate credentials

### Use a manual grid search for best architecture using Validation data and EarlyStopping

In [None]:
# Candidate (first, second) hidden-layer widths to compare.
hidden_layer_sizes = [(32, 16), (64, 32), (128, 64), (256, 128), (512, 256)]
models = []   # trained model for each candidate, in the same order
scores = []   # validation MSE for each candidate, in the same order

# Stop once val_loss has not improved for 15 epochs and roll back to the
# best weights seen.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=15, restore_best_weights=True, verbose=0)

for size_1, size_2 in hidden_layer_sizes:
    model = wide_keras_model(size_1, size_2)
    models.append(model)
    model.fit(
        train_X, train_y,
        epochs=200, batch_size=32, verbose=0,
        validation_data=(val_X, val_y),
        callbacks=[callback])

    # evaluate() returns [loss (MSE), MAE] for this model.
    training_score = model.evaluate(train_X, train_y, verbose=0)
    val_score = model.evaluate(val_X, val_y, verbose=0)
    scores.append(val_score[0])

    print(f'Scores for hidden layer sizes {size_1} and {size_2}:')
    print(f'training MAE:{training_score[1]:.4f}, MSE:{training_score[0]:.4f}')
    print(f'validation MAE:{val_score[1]:.4f}, MSE:{val_score[0]:.4f}')
    print('\n')

In [None]:
# The candidate with the lowest validation MSE wins; the held-out test set
# is evaluated only for that model.
best_idx = np.argmin(scores)
best_sizes, best_model = hidden_layer_sizes[best_idx], models[best_idx]

# test_score is [MSE, MAE], matching the loss/metric order used at compile time.
test_score = best_model.evaluate(test_X, test_y, verbose=0)

print(f'Best performing model is of sizes {best_sizes} for hidden layers')
print(f'Best model MAE on the test set: {test_score[1]:.4f}')
print(f'Best model MSE on the test set: {test_score[0]:.4f}')

In [None]:
from matplotlib import pyplot as plt

# Learning curves of the selected model: per-epoch training and validation
# loss. The loss is the mean squared error the model was compiled with.
loss_history = best_model.history.history

fig, ax = plt.subplots()
ax.plot(loss_history['loss'], label='train')
ax.plot(loss_history['val_loss'], label='validation')
# Label the figure so it can be read on its own.
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss (MSE)')
ax.set_title('Training vs. validation loss of the best model')
ax.legend()
plt.show()

### Sanity check using a different library

In [None]:
from sklearn.metrics import mean_squared_error
from scipy.stats import gaussian_kde

# Recompute MAE/MSE from raw predictions, independently of Keras' evaluate().
test_pred = best_model.predict(test_X)
absolute_error = np.abs(test_pred - test_y).squeeze()
print(f'TEST MAE: {np.mean(absolute_error):.4f}')
mse = mean_squared_error(test_y, test_pred)
print(f"Test MSE: {round(mse,4)}, \n")

# Rank test rows by descending absolute error. The error is negated so an
# ascending sort puts the largest error first; ties fall back to row index.
abs_error = sorted((-e, i) for i, e in enumerate(absolute_error))
# Each entry reads "error: (prediction, ground truth)".
highest_errors = [
    f'{round(-e,3)}: ({test_pred[i][0]:.3f}, {round(test_y[i][0], 3)})'
    for e, i in abs_error
]
print(f'Top highest absolute value errors on test data: \n {highest_errors[:10]} \n')
print(f'Top lowest absolute value errors on test data: \n {highest_errors[-10:]}')

In [None]:
# Kernel density estimate of the absolute test errors.
density = gaussian_kde(absolute_error)
# The curve is evaluated on [0, 1] only; errors above 1 are not shown.
xs = np.linspace(0, 1, 50)

fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(xs, density(xs))
# Request denser tick marks on both axes.
ax.locator_params(axis='y', nbins=20)
ax.locator_params(axis='x', nbins=30)
ax.set_title('Density plot of absolute error')
plt.show()

In [None]:
# NOTE(review): this always raises AssertionError. Presumably it is a
# deliberate guard so that "Run All" stops here and the cells below are
# only run by hand -- TODO confirm.
assert False

## Further experiments

Try a one-layer Keras model as an alternative.

In [None]:
# Build the model
def wider_keras_model(size):
    """Return a compiled regression network with a single hidden layer.

    Args:
        size: number of units in the hidden layer ('hidden_1').

    Returns:
        A Sequential model compiled with MSE loss, an MAE metric and the
        Adam optimizer.
    """
    model = Sequential()
    # Use the `size` argument. The previous code read the global `size_1`
    # left over from the two-layer grid search, so every candidate got the
    # same width (or a NameError on a fresh kernel).
    model.add(Dense(size, kernel_initializer='normal', activation='relu', name='hidden_1'))
    model.add(Dropout(0.5, seed=42))
    # Single linear unit: the regression output.
    model.add(Dense(1, kernel_initializer='normal', activation='linear'))

    # Compile the model
    model.compile(loss="mean_squared_error",
                  metrics=[tf.keras.metrics.MeanAbsoluteError()],
                  optimizer='adam')
    return model

In [None]:
# Candidate widths for the single hidden layer.
# Note: this rebinds hidden_layer_sizes / models / scores from the two-layer search.
hidden_layer_sizes = [8, 16, 32, 64, 128, 512]
models = []   # trained model for each candidate, in the same order
scores = []   # validation MSE for each candidate, in the same order

# Stop once val_loss has not improved for 15 epochs and roll back to the
# best weights seen.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=15, restore_best_weights=True, verbose=0)

for size in hidden_layer_sizes:
    model = wider_keras_model(size)
    models.append(model)
    model.fit(
        train_X, train_y,
        epochs=200, batch_size=32, verbose=0,
        validation_data=(val_X, val_y),
        callbacks=[callback])

    # evaluate() returns [loss (MSE), MAE] for this model.
    training_score = model.evaluate(train_X, train_y, verbose=0)
    val_score = model.evaluate(val_X, val_y, verbose=0)
    scores.append(val_score[0])

    print(f'Scores for hidden layer size {size}:')
    print(f'training MAE:{training_score[1]:.4f}, MSE:{training_score[0]:.4f}')
    print(f'validation MAE:{val_score[1]:.4f}, MSE:{val_score[0]:.4f}')
    print('\n')