In [None]:
# Last Used/Edited: 4/24/2023

In [None]:
# PURPOSE: 
# Train a model using the generated images

In [None]:
# Setup
import numpy as np
import tensorflow as tf
from tensorflow import keras
import os
import pandas as pd
import glob
from typing import *
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from tensorflow.keras.optimizers import Adam
import datetime as dt
from tensorflow.keras import regularizers
#Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from tensorflow.keras import datasets,models,layers
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Conv2D,  MaxPool2D, Flatten, GlobalAveragePooling2D,  BatchNormalization, Layer, Add
from keras.models import Sequential
from keras.models import Model
import tensorflow as tf

import keras_tuner
from keras_tuner import Hyperband, GridSearch
# Tune model training
# https://keras.io/guides/keras_tuner/getting_started/

# It is generally not needed to tune the number of epochs because a built-in callback is 
# passed to model.fit() to save the model at its best epoch evaluated by the validation_data.

# Fix every random source for reproducibility.
# tf.random.set_seed alone leaves Python's `random` module and NumPy unseeded;
# tf.keras.utils.set_random_seed seeds `random`, NumPy and TensorFlow together.
seed = 2
tf.keras.utils.set_random_seed(seed)

In [None]:
#SET HERE
# Run configuration. `horizon` and `image_type` select the image folder
# TRAIN/<horizon>/<image_type>/ and are embedded in the names of the saved
# models and result files. Uncomment exactly one alternative to switch.
horizon = 'SCALP'
# horizon='POSITION'
image_type = 'GAF'
# image_type = 'GAF_AGG'
# image_type = 'CS'
# image_type = 'TI'

# Training settings: maximum epochs per fit, generator batch size, number of
# time-series folds, and number of samples held out for validation in each fold.
EPOCHS = 50
BATCH_SIZE = 64
splits = 3
val_size = 150

#Time-series split
# Every validation fold lies strictly after its training rows.
tscv = TimeSeriesSplit(n_splits=splits, test_size=val_size)

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
# Preview one generated image (POSITION horizon, aggregated GAF encoding) as a
# sanity check. Other encodings of the same sample live under 'GAF' and 'CS'
# instead of 'GAF_AGG'.
# The path is built relative to the project root (the parent of the working
# directory, the same convention used for IMAGES_PATH) rather than a
# user-specific absolute path, so the cell runs on any checkout.
# NOTE(review): assumes the notebook is run from a direct sub-folder of the
# project root that contains TRAIN/ — confirm.
sample_image_path = os.path.join(
    os.path.dirname(os.getcwd()),
    'TRAIN', 'POSITION', 'GAF_AGG', 'LONG', '2001_06_24_00_00_00.png')
img = mpimg.imread(sample_image_path)
imgplot = plt.imshow(img)
# get_size() reports (rows, columns) of the displayed image.
print("size of image {}".format(imgplot.get_size()))
plt.show()

In [None]:
def image_paths_labels(path):
    """Collect the PNG files under ``path``/LONG and ``path``/SHORT into one table.

    Each file name (without directories or extension) is parsed as a timestamp
    of the form ``YYYY_MM_DD_HH_MM_SS``; the timestamp is used only to order the
    rows and is dropped from the result.

    Args:
        path: Folder that contains the ``LONG`` and ``SHORT`` sub-folders.

    Returns:
        DataFrame with columns ``Images`` (file path as returned by glob) and
        ``Labels`` (the sub-folder name), sorted by timestamp, with a fresh
        0..n-1 index.
    """
    per_label_frames = []
    for label in ('LONG', 'SHORT'):
        image_files = glob.glob(path + '/{}/*.png'.format(label))  # Get path to images

        stamps = []
        for image_file in image_files:
            # Keep only the text after the last path separator ('/' or '\\').
            file_name = image_file.replace('\\', '/').rsplit('/', 1)[-1]
            stamps.append(file_name.split('.')[0].replace('_', '-'))

        frame = pd.DataFrame({
            'Images': image_files,
            'Labels': [label] * len(image_files),
            'Dates': stamps,
        })
        frame['Dates'] = pd.to_datetime(frame['Dates'], format='%Y-%m-%d-%H-%M-%S')
        per_label_frames.append(frame)

    combined = pd.concat(per_label_frames)
    ordered = combined.sort_values(by='Dates')
    return ordered.drop(columns='Dates').reset_index(drop=True)


def get_model_name(k, horizon, image_type):
    """Return the checkpoint file name for fold ``k``: ``model_<horizon>_<image_type>_<k>.h5``."""
    stem = '_'.join(['model', horizon, image_type, str(k)])
    return stem + '.h5'

In [None]:
# Project root: the parent of the current working directory.
PATH = os.path.dirname(os.getcwd())
# Folder holding the LONG/ and SHORT/ image sub-folders for the selected
# horizon and image type.
IMAGES_PATH = os.path.join(PATH, 'TRAIN/{}/{}'.format(horizon, image_type))
# Table of image paths and labels, ordered chronologically by image_paths_labels.
X = image_paths_labels(IMAGES_PATH)

In [None]:
# Chronological hold-out: X is already ordered by date, so the first 80% of the
# rows become the training/cross-validation pool and the remaining 20% the
# final test set. `split_ratio` is the row position of the boundary.
split_ratio = int(np.floor(len(X) * 0.8))
train, test = X.iloc[:split_ratio], X.iloc[split_ratio:]

In [None]:
# Path of the last test row (column 0 is 'Images'): a quick check of where the
# hold-out period ends.
test.iloc[-1,0]

In [None]:
# Class balance of the test set. The positional 0 is the `normalize` argument,
# so raw counts (not proportions) are shown.
test['Labels'].value_counts(0)

In [None]:
# Rescale images by 1./255
# Both generators only multiply pixel values by 1/255; no augmentation is applied.
# NOTE(review): ResNet50V2's own `preprocess_input` scales inputs to [-1, 1];
# here the pretrained base receives 1/255-rescaled images instead — confirm this is intended.
train_validate_datagen = ImageDataGenerator(rescale=1/255)#, validation_split=0.3) # set validation split
test_datagen = ImageDataGenerator(rescale=1/255)
# Target size every image is resized to when loaded by the generators.
IMG_SIZE = (255, 255)

In [None]:
def myModel(hp, tg, IMG_SIZE):
    """Build an uncompiled binary classifier on top of a frozen ResNet50V2 base.

    Architecture: ImageNet-pretrained ResNet50V2 (no top, frozen, run in
    inference mode) -> global average pooling -> dropout -> Dense(1).
    The final Dense layer has no activation, so the model outputs a raw logit.

    Args:
        hp: keras_tuner ``HyperParameters``; this function registers
            ``regularization`` and ``drop_out_rate`` on it.
        tg: Training generator. No longer read: the model is built purely
            symbolically, so no sample batch is needed. The parameter is kept
            so existing callers that pass it keep working.
        IMG_SIZE: ``(height, width)`` of the input images; three channels are
            appended to form the model input shape.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping images to one logit each.
    """
    IMG_SHAPE = IMG_SIZE + (3,)

    # Pretrained convolutional base used as a fixed feature extractor.
    base_model = tf.keras.applications.resnet_v2.ResNet50V2(input_shape=IMG_SHAPE,
                                                            include_top=False,
                                                            weights='imagenet')

    # Freeze the base before compiling so its weights are not updated while the
    # classification head is trained.
    base_model.trainable = False

    # Classification head: a single regularized linear unit. No activation is
    # applied because the output is treated as a logit
    # (positive -> class 1, negative -> class 0).
    regularization = hp.Choice('regularization', values=[1e-4, 1e-5])
    prediction_layer = tf.keras.layers.Dense(units=1,
                                             kernel_regularizer=regularizers.L1L2(l1=regularization, l2=regularization),
                                             bias_regularizer=regularizers.L2(regularization),
                                             activity_regularizer=regularizers.L2(regularization))

    # The input shape follows IMG_SIZE (it used to be hard-coded to 255x255,
    # which silently ignored the argument).
    inputs = tf.keras.Input(shape=IMG_SHAPE)
    # training=False keeps the base's BatchNormalization layers in inference mode.
    x = base_model(inputs, training=False)
    # Collapse the spatial feature map to one feature vector per image.
    x = tf.keras.layers.GlobalAveragePooling2D()(x)

    # NOTE(review): this range includes 1.0, a rate that would drop every
    # feature; consider capping the search below 1.
    drop_out_rate = hp.Float("drop_out_rate", min_value=0, max_value=1, step=0.2)
    x = tf.keras.layers.Dropout(drop_out_rate)(x)
    outputs = prediction_layer(x)
    model = tf.keras.Model(inputs, outputs)
    return model

def build_model(hp):
    """Hypermodel entry point for keras_tuner: build and compile one candidate.

    Reads the notebook-level ``train_generator`` and ``IMG_SIZE`` and forwards
    them to ``myModel``; registers ``learning_rate`` and ``thresholds`` on
    ``hp`` in addition to the hyperparameters registered by ``myModel``.

    Args:
        hp: keras_tuner ``HyperParameters`` for the current trial.

    Returns:
        A compiled ``tf.keras.Model`` whose metrics are named ``accuracy``,
        ``binary_crossentropy`` and ``auc`` (after the loss).
    """
    model = myModel(hp, train_generator, IMG_SIZE)

    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    threshold = hp.Choice('thresholds', values=[0.4, 0.5, 0.6, 0.7])

    # The model outputs logits, so the probability threshold has to be moved to
    # logit space before it is compared with the predictions (0.5 -> 0.0).
    # The plain 'accuracy' string would instead cut the raw logits at 0.5 and
    # leave the `thresholds` hyperparameter without any effect.
    logit_threshold = float(np.log(threshold / (1.0 - threshold)))

    # Two classes with a linear output: binary cross-entropy with
    # from_logits=True. The accuracy metric keeps the name 'accuracy' so that
    # 'val_accuracy' monitors and result look-ups keep working.
    model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=hp_learning_rate),
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=logit_threshold),
                           tf.keras.metrics.BinaryCrossentropy(from_logits=True),
                           tf.keras.metrics.AUC(from_logits=True)])

    return model

In [None]:
### COMBINE & EDIT
# Walk-forward cross-validation. For every time-series fold:
#   1. grid-search the hyperparameters with keras_tuner,
#   2. retrain the best configuration while checkpointing the best epoch,
#   3. reload that checkpoint and record its validation accuracy and loss.
# The per-fold results are written to results_<horizon>_<image_type>.csv.

VALIDATION_ACCURACY = []
VALIDATION_LOSS = []
BEST_HYPS = []
save_dir = PATH + '/saved_models/{}/'.format(horizon)
# Make sure the checkpoint folder exists before any fold writes to it.
os.makedirs(save_dir, exist_ok=True)
TIMESTAMP = dt.datetime.now().strftime("%Y%m%d%H%M%S")


for i, (train_index, val_index) in enumerate(tscv.split(train)):
    print(f"Fold {i}:")

    # TimeSeriesSplit grows the training window by val_size rows per fold;
    # dropping the first i*val_size rows keeps its length constant, i.e. a
    # sliding rather than an expanding window.
    train_index = train_index[i*val_size:]
    df_train = train.iloc[train_index]
    df_val = train.iloc[val_index]

    # shuffle=False keeps the chronological order of the rows in both generators.
    train_generator = train_validate_datagen.flow_from_dataframe(
        dataframe=df_train,
        directory=IMAGES_PATH,
        target_size=IMG_SIZE,
        x_col='Images',
        y_col='Labels',
        batch_size=BATCH_SIZE,
        shuffle=False,
        class_mode='binary',
    )

    validation_generator = train_validate_datagen.flow_from_dataframe(
        dataframe=df_val,
        directory=IMAGES_PATH,
        target_size=IMG_SIZE,
        x_col='Images',
        y_col='Labels',
        batch_size=BATCH_SIZE,
        shuffle=False,
        class_mode='binary',
    )

    print("train datagenerator : {}".format(train_generator.n))
    print("validation datagenerator : {}".format(validation_generator.n))
    # len(generator) is the number of batches including the final partial one.
    # Floor division (n // batch_size) silently dropped the remainder -- e.g.
    # 22 of 150 validation samples at batch size 64 -- and gave 0 steps
    # whenever a fold was smaller than one batch.
    steps_per_epoch = len(train_generator)
    validation_steps = len(validation_generator)

    # Checkpoint file for this fold; only the epoch with the best validation
    # accuracy is kept.
    model_path = save_dir + get_model_name(i, horizon, image_type)
    checkpoint = tf.keras.callbacks.ModelCheckpoint(model_path,
                                                    monitor='val_accuracy', verbose=1,
                                                    save_best_only=True, mode='max')

    # Stop once validation accuracy has not improved by at least 0.001 for
    # 3 epochs (monitoring starts after a 3-epoch warm-up) and restore the
    # best weights seen.
    stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3,  min_delta=0.001, verbose=1, restore_best_weights=True, start_from_epoch=3)

    # https://keras.io/api/keras_tuner/tuners/base_tuner/#tuner-class
    # NOTE(review): the hyperparameter grid appears to hold more combinations
    # than max_trials=30, so only part of it would be visited — confirm this
    # cap is intended.
    tuner = keras_tuner.GridSearch(
        hypermodel = build_model,
        objective="val_accuracy",
        seed=5,
        max_trials=30,
        overwrite=True,
        directory="model_{}_{}".format(horizon, image_type),
        project_name="tune_hypermodel_{}".format(i),)

    tuner.search(train_generator,
                 epochs=EPOCHS,
                 steps_per_epoch=steps_per_epoch,
                 validation_data=validation_generator,
                 validation_steps=validation_steps,
                 callbacks=[stop_early])

    # Get the optimal hyperparameters
    best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
    BEST_HYPS.append(best_hps.values)
    print(f"""
    The hyperparameter search is complete. The optimal regularization is {best_hps.get('regularization')},the optimal learning rate for the optimizer
    is {best_hps.get('learning_rate')}, the optimal drop out rate is {best_hps.get('drop_out_rate')}, and the best threshold is {best_hps.get('thresholds')}.
    """)

    # Rebuild the model with the optimal hyperparameters and train it for up to
    # EPOCHS epochs (early stopping may end the run sooner).
    model = tuner.hypermodel.build(best_hps)
    history = model.fit(train_generator,
                        epochs=EPOCHS,
                        steps_per_epoch=steps_per_epoch,
                        validation_data=validation_generator,
                        validation_steps=validation_steps,
                        callbacks=[checkpoint, stop_early])

    # Load the best checkpoint of this fold and evaluate it on the validation fold.
    model.load_weights(model_path)
    scores = model.evaluate(validation_generator)
    print("{0}s: {1:.2f}%".format(model.metrics_names[1], scores[1]*100))
    results = dict(zip(model.metrics_names, scores))
    VALIDATION_ACCURACY.append(results['accuracy'])
    VALIDATION_LOSS.append(results['loss'])

    # Release the Keras global state before the next fold to limit memory growth.
    tf.keras.backend.clear_session()


# One row per fold: validation accuracy, validation loss and the chosen
# hyperparameters.
dict_results = {'VALIDATION_ACCURACY': VALIDATION_ACCURACY,
                'VALIDATION_LOSS': VALIDATION_LOSS,
                'BEST_HYPS': BEST_HYPS}
df = pd.DataFrame(dict_results)
df.to_csv('results_{}_{}.csv'.format(horizon, image_type))


In [None]:
# learning curves

# Training vs. validation accuracy and loss per epoch for the current
# `history` object, i.e. the most recent model.fit run with the pretrained
# base model used as a fixed feature extractor.

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

fig, (ax_acc, ax_loss) = plt.subplots(2, 1, figsize=(8, 8))

# Top panel: accuracy, with the upper axis limit pinned to 1.
ax_acc.plot(acc, label='Training Accuracy')
ax_acc.plot(val_acc, label='Validation Accuracy')
ax_acc.legend(loc='lower right')
ax_acc.set_ylabel('Accuracy')
ax_acc.set_ylim([min(ax_acc.get_ylim()), 1])
ax_acc.set_title('Training and Validation Accuracy')

# Bottom panel: loss, on a fixed [0, 1] axis.
ax_loss.plot(loss, label='Training Loss')
ax_loss.plot(val_loss, label='Validation Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_ylabel('Cross Entropy')
ax_loss.set_ylim([0, 1.0])
ax_loss.set_title('Training and Validation Loss')
ax_loss.set_xlabel('epoch')
plt.show()


In [None]:
# Reload the per-fold results from the CSV so the cells below can run without
# retraining. BEST_HYPS comes back as text, because it is read from a CSV column.
result_columns = ['VALIDATION_ACCURACY', 'VALIDATION_LOSS', 'BEST_HYPS']
results_path = './results_{}_{}.csv'.format(horizon, image_type)
results_df = pd.read_csv(results_path)[result_columns]

VALIDATION_ACCURACY = list(results_df['VALIDATION_ACCURACY'])
VALIDATION_LOSS = list(results_df['VALIDATION_LOSS'])
BEST_HYPS = list(results_df['BEST_HYPS'])


In [None]:
# Load saved model
# Pick the fold with the highest validation accuracy, load its checkpoint and
# evaluate it once on the held-out test set.

# Position of the first fold that reached the maximum validation accuracy.
best_model_index = VALIDATION_ACCURACY.index(max(VALIDATION_ACCURACY))
# load and evaluate a saved model
from numpy import loadtxt
from tensorflow.keras.models import load_model


# load model
# https://keras.io/guides/serialization_and_saving/
# model = load_model(save_dir + 'model_{}_{}{}.h5'.format(horizon, image_type, best_model_index), custom_objects={'mda': mda})
# The file name follows the pattern produced by get_model_name for that fold.
model = load_model(save_dir + 'model_{}_{}_{}.h5'.format(horizon, image_type, best_model_index))

# summarize model.
model.summary()

# Test images in their original (chronological) order; no batch_size is given,
# so the generator's default batch size applies.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test,
    x_col='Images',
    y_col='Labels',
    directory=IMAGES_PATH,
    target_size=IMG_SIZE,
    class_mode='binary',
    shuffle=False,
)



# evaluate the model
# `score` holds the loss followed by the compiled metrics, in the order of
# model.metrics_names.
score = model.evaluate(test_generator, verbose=0)


In [None]:
# Hyperparameters recorded for the fold selected as best.
BEST_HYPS[best_model_index]

In [None]:
# Results
# Print the loss (position 0) and the three compiled metrics of the test
# evaluation, each multiplied by 100.
for metric_position in range(4):
    print("{}: {}%".format(model.metrics_names[metric_position], score[metric_position]*100))

In [None]:
### Making predictions with the model
# Get the predicted values for the test set:
# test_generator.filenames #LONG = 0, SHORT = 1
# NOTE(review): the model built in this notebook ends in a Dense layer without
# activation, so these values are presumably raw logits rather than
# probabilities — confirm before thresholding.

y_pred = model.predict(test_generator)

In [None]:
# Side-by-side view: true class index (first column) and model output (second column).
print(np.column_stack((test_generator.labels, y_pred)))

In [None]:
# Same pairing as a DataFrame: 'true' is the generator's class index,
# 'predicted' is the model output for that image.
true_pred = pd.DataFrame(np.column_stack((test_generator.labels, y_pred)), columns=["true", "predicted"])
true_pred.head()

In [None]:
# Number of test samples whose class index is 1.
test_generator.labels.count(1)

In [None]:
# Distinct predicted values: a quick check that the model does not emit one
# constant output for every image.
set(true_pred['predicted'])

In [None]:
# Persist the true/predicted pairs, without the index column, under ../data/.
true_pred.to_csv('../data/true_pred_2dCNN_{}_{}.csv'.format(horizon, image_type), index=False)

In [None]:
# Restrict the inspection below to the last 15000 training rows.
# split_half = int(len(train)*0.6)
# train_test = train[split_half:]
train_test = train[-15000:]
len(train_test)

In [None]:
# Inspection only (no training): print the date range and class counts of
# every fold for an alternative split configuration.
# NOTE: this cell rebinds the notebook-level names `splits`, `val_size`,
# `tscv`, `train_generator` and `validation_generator`; re-running the
# training cell afterwards would pick up these values.
# # how many labels for each class?
# splits = 3
# val_size = 150

#check class
# train_generator.class_indices

splits = 10
max_train_size=5000
val_size = 1000

# max_train_size caps the number of training rows used in each fold.
tscv = TimeSeriesSplit(n_splits=splits, max_train_size=max_train_size, test_size=val_size)

# for i, (train_index, val_index) in enumerate(tscv.split(train)):
for i, (train_index, val_index) in enumerate(tscv.split(train_test)):
    print(f"Fold {i}:")
    
#     train_index = train_index[i*val_size:]
        
    # Column 0 is the image path; its last 30 characters are printed as the
    # fold's start/end markers.
    print(f" Train: start datetime={train_test.iloc[train_index[0], 0][-30:]}")
    print(f" Train: end datetime={train_test.iloc[train_index[-1], 0][-30:]}")
    print(f" Validation start datetime={train_test.iloc[val_index[0], 0][-30:]}")
    print(f" Validation end datetime={train_test.iloc[val_index[-1], 0][-30:]}")

#     print(train_index)
#     print(len(train_index))
#     print(val_index)
#     print(len(val_index))
    
#     df_train = train.iloc[train_index]
    df_train = train_test.iloc[train_index]
    df_val = train_test.iloc[val_index]

    # No batch_size is passed here, so the generators use their default.
    train_generator = train_validate_datagen.flow_from_dataframe(
        dataframe=df_train,
        directory=IMAGES_PATH,
        target_size=IMG_SIZE,
        x_col='Images',
        y_col='Labels',
        shuffle=False,
        class_mode='binary',
    )
    
    validation_generator = train_validate_datagen.flow_from_dataframe(
        dataframe=df_val,
        directory=IMAGES_PATH,
        target_size=IMG_SIZE,
        x_col='Images',
        y_col='Labels',
        shuffle=False,
        class_mode='binary',
    )
    
    # Assumes class index 0 = LONG and 1 = SHORT — verify with
    # train_generator.class_indices.
    print(f'train LONG :{train_generator.labels.count(0)}')
    print(f'train SHORT :{train_generator.labels.count(1)}')
    print(f'validation LONG :{validation_generator.labels.count(0)}')
    print(f'validation SHORT: {validation_generator.labels.count(1)}')

In [None]:
# Class index of every sample in the most recently created training generator.
train_generator.classes

In [None]:
# Label mode the generator was created with ('binary' in the cells of this notebook).
train_generator.class_mode

In [None]:
# Mapping from label name to the integer class index used by the generator.
train_generator.class_indices