In [7]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


In [10]:
import importlib
import ensemble
importlib.reload(ensemble)

<module 'ensemble' from '/home/156/ob2720/cyclones/ensemble/ensemble.py'>

In [3]:
# Use the feature vectors Jack generated from one of the fusion model's intermediary layers

def get_arrays(partition):
    import numpy as np
    features = np.load(f"/g/data/x77/jm0124/feature_vectors/feature-array-{partition}.npy")
    import json
    with open(f"/g/data/x77/jm0124/feature_vectors/{partition}_feature_labels.json", "r") as f:
        labels_dict = json.load(f)
    ids = list(labels_dict.keys())
    labels = np.array([[y[1]-y for y in labels_dict[id_]['label'][:2]] for id_ in ids])
    return features, labels

# each row is the feature vector for a time interval
train_feature_vectors, train_label_vectors = get_arrays("train")
print(train_feature_vectors.shape, train_label_vectors.shape)

valid_feature_vectors, valid_label_vectors = get_arrays("val")
print(valid_feature_vectors.shape, valid_label_vectors.shape)

(166686, 4111) (166686, 6)
(21122, 4111) (21122, 6)


In [4]:
# train all of the models in model_classes on the training data
def train(model_class):
    model = model_class()
    print("\n\nTraining model " + model.NAME)
    model.train(train_feature_vectors, valid_feature_vectors, train_label_vectors, valid_label_vectors, verbose=True)
    print("Saving model " + model.NAME + " which had validation mean km error of " + str(model.mean_km))
    model.save(model.NAME + "-real-data")
# for model_class in ensemble.model_classes: train_model(model_class)

In [11]:
train(ensemble.ANNModel)



Training model ANN
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 256)               1052672   
                                                                 
 batch_normalization_4 (Batc  (None, 256)              1024      
 hNormalization)                                                 
                                                                 
 dropout_4 (Dropout)         (None, 256)               0         
                                                                 
 dense_7 (Dense)             (None, 1024)              263168    
                                                                 
 batch_normalization_5 (Batc  (None, 1024)             4096      
 hNormalization)                                                 
                                                                 
 dropout_5 (Dropout)         (Non

In [8]:
train(ensemble.AdaBoostModel)



Training model ADA
Fitting 5 folds for each of 2 candidates, totalling 10 fits


KeyboardInterrupt: 

In [None]:
train(ensemble.KNNModel)



Training model KNN
Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [150]:
train(ensemble.RandomForestModel)



Training model RF
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

fit models
fit best model
validation mean km error: 186.2
Saving model RF which had validation mean km error of 186.20463279667783


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.0s finished


In [None]:
train(ensemble.XGBModel)

In [153]:
# Save/load models, check they work after loading from disk
models = [model_class.load(model_class.NAME + "-real-data") for model_class in ensemble.model_classes]

In [154]:
for model in models:
    print(f"{model.NAME:<8} val loss: {model.mean_km:.2f}")

Min possible loss: 145.56
ANN      val loss: 186.40
ADA      val loss: 182.16
KNN      val loss: 184.31
RF       val loss: 186.20
XGB      val loss: 203.14


In [164]:
# Ensemble these models together and see how the ensemble loss compares
preds = np.array([model.predict(valid_feature_vectors) for model in models])
print([model.NAME for model in models])
print([preds[i].shape for i in range(len(preds))])
preds.shape

['ANN', 'ADA', 'KNN', 'RF', 'XGB']
[(50, 2), (50, 2), (50, 2), (50, 2), (50, 2)]


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.0s finished


(5, 50, 2)

In [165]:
ensemble_preds = np.swapaxes(np.swapaxes(preds, 0, 2), 0, 1).mean(axis=2)
haversine_loss(ensemble_preds, valid_label_vectors).mean()

180.51402616749732