In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

# Sentinel handed to tf.data (num_parallel_calls / prefetch) in the input pipelines below.
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
def get_hardware_strategy():
    """Detect a TPU and return the matching ``tf.distribute`` strategy.

    When a TPU cluster can be resolved, this connects to it, initializes the
    TPU system, builds a ``TPUStrategy`` and switches the global Keras
    mixed-precision policy to ``mixed_bfloat16``. Otherwise the default
    TensorFlow strategy is used.

    Returns:
        tuple: ``(tpu, strategy)`` where ``tpu`` is the ``TPUClusterResolver``
        (``None`` when no TPU was found) and ``strategy`` is the distribution
        strategy to build and train models under.
    """
    try:
        # TPU detection. No parameters necessary if TPU_NAME environment variable is
        # set: this is always the case on Kaggle.
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        # The resolver raises ValueError when no TPU can be located.
        tpu = None

    if tpu is not None:
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        # tf.distribute.experimental.TPUStrategy is deprecated; use the stable symbol.
        strategy = tf.distribute.TPUStrategy(tpu)
        policy = tf.keras.mixed_precision.Policy('mixed_bfloat16')
        tf.keras.mixed_precision.set_global_policy(policy)
    else:
        # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
        strategy = tf.distribute.get_strategy()

    print("REPLICAS: ", strategy.num_replicas_in_sync)
    return tpu, strategy

tpu, strategy = get_hardware_strategy()

# Configuration
# Number of training epochs passed to model.fit.
EPOCHS = 3
# Batch size used by the tf.data pipelines: 2048 scaled by the number of replicas in sync.
BATCH_SIZE = 2048 * strategy.num_replicas_in_sync
# Learning rate for the Adam optimizer
LR = 0.001
# Verbosity of model.fit
VERBOSE = 1
# Length of the "features" vector in each TFRecord example (also the model input width).
FEATURES = 300
# AUTO (the tf.data autotune sentinel) is defined once in the imports cell, so it is
# not re-assigned here.

## Training

In [None]:
# Parsing schema for one serialized example: a scalar float "target" and a
# float "features" vector of length FEATURES.
feature_dict = {
    "target": tf.io.FixedLenFeature(shape=[], dtype=tf.float32),
    "features": tf.io.FixedLenFeature(shape=[FEATURES], dtype=tf.float32),
}

def read_tfrecord(example):
    """Parse one serialized example with ``feature_dict``.

    Args:
        example: a single serialized record, as yielded by ``TFRecordDataset``.

    Returns:
        tuple: ``(X, y)`` -- the parsed "features" entry and the parsed "target" entry.
    """
    parsed = tf.io.parse_single_example(example, feature_dict)
    return parsed["features"], parsed["target"]

In [None]:
# Training input pipeline: shuffle the shard files, interleave their records,
# parse each record, shuffle at record level, then batch and prefetch.
train_filenames = tf.io.gfile.glob("data/tfrecords/train/fold0/" + "*.tfrec")
if not train_filenames:
    # Fail early with a readable message; an empty file list would otherwise only
    # surface later as a less obvious tf.data error.
    raise FileNotFoundError("No training TFRecord files match data/tfrecords/train/fold0/*.tfrec")

train_filenames_ds = tf.data.Dataset.from_tensor_slices(train_filenames)
# Buffer covers every file name, so the shard order is fully reshuffled each epoch.
train_filenames_ds = train_filenames_ds.shuffle(len(train_filenames), reshuffle_each_iteration=True)
# TFRecordDataset is passed directly as the map function; no lambda wrapper is needed.
train_dataset = train_filenames_ds.interleave(tf.data.TFRecordDataset,
                                              cycle_length=5,
                                              num_parallel_calls=AUTO)
train_dataset = train_dataset.map(read_tfrecord, num_parallel_calls=AUTO)
train_dataset = train_dataset.shuffle(100000, reshuffle_each_iteration=True)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(AUTO)

In [None]:
# Validation input pipeline: interleave the shard records, parse, batch and prefetch.
valid_filenames = tf.io.gfile.glob("data/tfrecords/validation/fold0/" + "*.tfrec")
if not valid_filenames:
    # Fail early with a readable message instead of an opaque tf.data error later on.
    raise FileNotFoundError("No validation TFRecord files match data/tfrecords/validation/fold0/*.tfrec")

# No shuffling here: the validation loss does not depend on record order, and a
# fixed shard order keeps evaluation repeatable.
valid_filenames_ds = tf.data.Dataset.from_tensor_slices(valid_filenames)
# TFRecordDataset is passed directly as the map function; no lambda wrapper is needed.
valid_dataset = valid_filenames_ds.interleave(tf.data.TFRecordDataset,
                                              cycle_length=5,
                                              num_parallel_calls=AUTO)
valid_dataset = valid_dataset.map(read_tfrecord, num_parallel_calls=AUTO)
valid_dataset = valid_dataset.batch(BATCH_SIZE).prefetch(AUTO)

In [None]:
# Debug snippet (kept commented out): read one training shard and parse a single raw example with feature_dict
# ds = tf.data.TFRecordDataset('data/tfrecords/train/fold0/0.tfrec')
# iterator = iter(ds)
# raw_example = next(iterator)
# example = tf.io.parse_single_example(raw_example, feature_dict)

In [None]:
def build_model(shape, hidden_units=(768, 384, 192), dropout_rate=0.35):
    """Build and compile the fully connected regression network.

    The network is a stack of (Dropout -> Dense/ReLU) blocks followed by a
    single linear output unit, compiled with Adam (learning rate ``LR``) and
    mean squared error. Everything is created inside ``strategy.scope()`` so
    the variables are placed according to the active distribution strategy.

    Args:
        shape: number of input features (int) or an input shape sequence
            without the batch dimension.
        hidden_units: width of each hidden Dense layer, in order.
        dropout_rate: dropout rate applied in front of every hidden Dense layer.

    Returns:
        The compiled ``tf.keras`` model.
    """
    # Accept a bare feature count as well as an explicit shape sequence.
    input_shape = (shape,) if isinstance(shape, int) else tuple(shape)

    def fc_block(x, units):
        # Dropout first, then the ReLU-activated dense layer.
        x = tf.keras.layers.Dropout(dropout_rate)(x)
        return tf.keras.layers.Dense(units, activation='relu')(x)

    with strategy.scope():
        inp = tf.keras.layers.Input(shape=input_shape)
        x = inp
        for units in hidden_units:
            x = fc_block(x, units)
        # Keep the regression head in float32: when a mixed-precision policy is
        # active (the TPU branch sets mixed_bfloat16), the predictions and the MSE
        # loss would otherwise be computed in reduced precision. This is a no-op
        # under the default float32 policy.
        output = tf.keras.layers.Dense(1, activation='linear', dtype='float32')(x)
        model = tf.keras.models.Model(inputs=[inp], outputs=[output])
        opt = tf.keras.optimizers.Adam(learning_rate=LR)
        model.compile(
            optimizer=opt,
            loss=[tf.keras.losses.MeanSquaredError()],
        )
    return model

In [None]:
# Instantiate the network for FEATURES inputs and print its layer summary.
model = build_model(shape=FEATURES)
model.summary()

In [None]:
index = 0
# Write the model to disk during training, keeping only the best version seen so far.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=f"data/keras_models/model_{index}",
    save_best_only=True,
)
# Train on the training pipeline and evaluate on the validation pipeline every epoch.
history = model.fit(
    x=train_dataset,
    validation_data=valid_dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint],
    verbose=VERBOSE,
)

## Validation

In [None]:
index = 0
# NOTE(security): unpickling can execute arbitrary code; only load pickles produced by this project.
valid_df = pd.read_pickle("data/tfrecords/validation/fold0/validation.pkl")
# Every column that is not an identifier or the label is treated as a model input.
features = [col for col in valid_df.columns if col not in ['row_id', 'time_id', 'investment_id', 'target']]
# Fail fast with a readable message if the frame does not carry exactly the number
# of inputs the model was built for.
assert len(features) == FEATURES, f"expected {FEATURES} feature columns, found {len(features)}"
x_val = valid_df[features]
# Reload the checkpoint written during training rather than using the in-memory model.
model = tf.keras.models.load_model(f"data/keras_models/model_{index}")
# Flatten the (n, 1) predictions to a float32 vector aligned with valid_df's rows.
val_pred = model.predict(x_val, batch_size = BATCH_SIZE).astype(np.float32).reshape(-1)
valid_df['prediction'] = val_pred

In [None]:
def pearson_coef(data):
    """Return the Pearson correlation between the 'target' and 'prediction' columns.

    Only the two columns involved are correlated. Building the full correlation
    matrix of ``data`` would cost time quadratic in the number of columns and
    would fail on frames that contain non-numeric columns.

    Args:
        data: DataFrame with numeric 'target' and 'prediction' columns.

    Returns:
        The correlation coefficient as a float (NaN when it is undefined).
    """
    return data['target'].corr(data['prediction'])

def comp_metric(valid_df):
    """Return the mean, over time_id groups, of the target/prediction Pearson correlation.

    Args:
        valid_df: DataFrame with 'time_id', 'target' and 'prediction' columns.

    Returns:
        The average of the per-time_id values returned by ``pearson_coef``.
    """
    # Hand pearson_coef only the two columns it reads, so the grouping key and
    # every other column are left out of each group frame.
    per_time_id = valid_df.groupby(['time_id'])[['target', 'prediction']].apply(pearson_coef)
    return np.mean(per_time_id)

In [None]:
# Mean per-time_id Pearson correlation between 'target' and 'prediction' on the validation frame.
comp_metric(valid_df)