# Set-up

## Import Libraries

In [None]:
## General Libraries
import json
import pandas as pd
import numpy as np
import os, sys, re, io
import tensorflow as tf
import itertools, collections
import pickle
import graphviz
import pydot
import spacy
from time import time
import tensorflow_hub as hub
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
from csv import reader
## Tensorflow Models
from tensorflow.keras import backend as K
from tensorflow.keras import layers
from tensorflow.python.keras.layers import Lambda, Dense, Flatten
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.backend import sparse_categorical_crossentropy
from tensorflow_addons.utils.types import Number
from typeguard import typechecked
from typing import Optional
## Evaluation libraries
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score, confusion_matrix

In [None]:
## Additional packages for USE
#@title Load the Universal Sentence Encoder's TF Hub module
from absl import logging
import pysbd
from pysbd.utils import PySBDFactory
import seaborn as sns
from sklearn import preprocessing

from datetime import datetime

## Set Paths

In [None]:
## Constants and Paths

# Directory that is put at the front of sys.path at the end of this cell.
data_path = './'
# Constants
DATASET_DIR = './data/'
GLOVE_DIR = './glove.6B/'
SAVE_DIR = './'

# Extend the executable search path with a machine-specific site-packages
# directory (hard-coded; adjust when running on another machine).
os.environ["PATH"] += os.pathsep + \
    '/home/hardy_mike/anaconda3/lib/python3.7/site-packages'
checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
# `datetime` here is the class (imported with `from datetime import datetime`),
# so the constructor is `datetime.now()`, not `datetime.datetime.now()`.
now = datetime.now()  # current date and time

# make sure that the paths are accessible within the notebook
sys.path.insert(0, data_path)

## Load Data 

In [None]:
# Keep the data directory importable from inside the notebook.
sys.path.insert(0, data_path)

# Essay table (tab separated, Latin-1 encoded), re-indexed by essay id.
training_file = os.path.join(DATASET_DIR, 'training_set_rel3.tsv')
X = pd.read_csv(training_file, sep='\t', encoding='ISO-8859-1')
X.index = X.essay_id

# Per-prompt CSV files. Prompt 1 names its id column "EssayID"; prompts 2-6
# use "Essay ID".
prompt_files = {
    number: os.path.join(DATASET_DIR, 'Prompt-{}.csv'.format(number))
    for number in range(1, 7)
}
p1 = pd.read_csv(prompt_files[1], index_col="EssayID")
p2 = pd.read_csv(prompt_files[2], index_col="Essay ID")
p3 = pd.read_csv(prompt_files[3], index_col="Essay ID")
p4 = pd.read_csv(prompt_files[4], index_col="Essay ID")
p5 = pd.read_csv(prompt_files[5], index_col="Essay ID")
p6 = pd.read_csv(prompt_files[6], index_col="Essay ID")

### Load USE Embeddings (if already processed)

In [None]:
# Load the cached sentence embeddings written by the embedding cell.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this notebook.
with open('embeds_large.pkl', 'rb') as embeds_file:
    embeds = pickle.load(embeds_file)
X['embeds'] = embeds

# Sentence Parsing and Embedding

In [None]:
# Re-read the essay table from disk and key it by essay id.
essay_table_path = os.path.join(DATASET_DIR, 'training_set_rel3.tsv')
X = pd.read_csv(essay_table_path, encoding='ISO-8859-1', sep='\t')
X.index = X.essay_id

## Create Universal Sentence Encodings

In [None]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"  #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
print("module %s loaded" % module_url)


def embed(input):
    """Run the loaded TF Hub module on `input` and return its output.

    NOTE: this reads the global `model`, which the training cells further
    down rebind to Keras models; re-run this cell before embedding again.
    """
    return model(input)


# Blank English pipeline whose only component is the pySBD sentence splitter.
nlp = spacy.blank('en')
nlp.add_pipe(PySBDFactory(nlp))
# One encoder call per essay, fed the list of that essay's sentence strings.
# (Previously `.text` was taken on the whole sentence list, which raises
# AttributeError.)
essay_sentences = [
    embed([sentence.text for sentence in nlp(X.essay[i]).sents])
    for i in X.essay_id
]
X['essay_sentences'] = essay_sentences

In [None]:
# Embed the first `curr` essays (all of them by default): one encoder call
# per essay on the list of its sentence strings.
curr = len(X)
essay_subset = X[:curr]
USE_embeds = np.array([
    embed([s.text for s in nlp(essay_subset.essay[i]).sents])
    for i in essay_subset.essay_id
])

## Save USE embeddings for computation efficiency during experimentation
with open('embeds_large.pkl', 'wb') as embeds_file:
    pickle.dump(USE_embeds, embeds_file)

## Pad Sentence Embeddings

In [None]:
# Number of sentence embeddings available for each essay.
sentlength = [len(e) for e in embeds]
# One extra leading slot (index 0) is reserved in front of the sentences,
# which are written at positions 1..len(e) below.
max_len = max(sentlength) + 1
MAX_LEN = max_len

AVG_LEN = np.mean(sentlength)
padded_shape = (len(X), max_len, 512)
embeddings = np.zeros(padded_shape)  # zero-padded sentence embeddings
masks = np.zeros(padded_shape)  # 1 on used positions, 0 on padding
mul_masks = np.ones(padded_shape)  # complement of `masks`

for i, e in enumerate(embeds):
    # Mark only the reserved leading slot. Assigning to the whole row
    # (`embeddings[i] = 1`, `masks[i] = 1`) would fill the padding with ones
    # and leave every mask constant, defeating the zero-based masking used
    # by the models.
    embeddings[i][0] = 1
    masks[i][0] = 1
    mul_masks[i][0] = 0
    for j, s in enumerate(e):
        embeddings[i][j + 1] = s
        masks[i][j + 1] = 1
        mul_masks[i][j + 1] = 0

masks_bool = tf.cast(masks, dtype=tf.bool)

# Global Constants and Functions

## Constants and Reference Values

In [None]:
# Untouched copy of the full essay table; the training cells rebind `X`.
X0 = X.copy(deep=True)

# Cross-validation set-up.
FOLDS = 5
cv = KFold(n_splits=FOLDS, shuffle=True)

# Model / bookkeeping constants.
EMBED_DIM = 512
min_num_classes = 4
curr = 512
count = 1
avg_class_est = 11
set_reduction = 1
pieces = set_reduction

# Result accumulators shared by the training cells.
ytests = []
ypreds = []
kappa_table = []
kappas = []
result_history = []

# Per-essay-set reference values, keyed by essay set number.
selected_essay_sets = [1, 2, 3, 4, 5, 6, 7, 8]
avg_len_dict = {
    1: 350, 2: 350,
    3: 150, 4: 150, 5: 150, 6: 150,
    7: 250,
    8: 650,
}
set_size_dict = {
    1: 1785, 2: 1800, 3: 1726, 4: 1772,
    5: 1805, 6: 1800, 7: 1730, 8: 918,
}
max_set_size = max(set_size_dict.values())
# Rounded ratio of the longest average length (650) to each set's average.
avg_ratio_dict = {
    essay_set: np.round(650 / avg_len)
    for essay_set, avg_len in avg_len_dict.items()
}
num_heads = {1: 8, 2: 4, 3: 4, 4: 4, 5: 4, 6: 4, 7: 8, 8: 16}

## Global Functions

### Custom Ordinal Loss Function

In [None]:
class WeightedKappaLoss(tf.keras.losses.Loss):
    """Weighted Cohen's kappa loss with optional label smoothing.

    Adapted from tensorflow addons kappa_loss.py. Targets are sparse integer
    class ids; predictions are per-class probabilities.

    Args:
        num_classes: number of ordinal classes.
        weightage: "linear" or "quadratic" disagreement weighting.
        name: name of the loss.
        epsilon: constant added to the returned ratio.
        dtype: dtype (or dtype name) used for internal computation.
        reduction: Keras loss reduction mode.
        label_smoothing: smoothing factor in [0, 1).

    Raises:
        ValueError: if `weightage` is unknown or `label_smoothing` is outside
            [0, 1).
    """
    def __init__(
            self,
            num_classes: int,
            weightage: Optional[str] = "quadratic",
            name: Optional[str] = "cohen_kappa_loss",
            epsilon: Optional[Number] = 1e-6,
            dtype: Optional[tf.DType] = tf.float32,
            reduction: str = tf.keras.losses.Reduction.NONE,
            label_smoothing=0,
    ):

        super().__init__(name=name, reduction=reduction)
        if weightage not in ("linear", "quadratic"):
            raise ValueError("Unknown kappa weighting type.")
        if label_smoothing >= 1 or label_smoothing < 0:
            raise ValueError("Label smoothing must be from 0 to 1.")
        self.weightage = weightage
        self.num_classes = num_classes
        self.epsilon = epsilon
        self.dtype = dtype
        # Keep the raw value so get_config can round-trip it.
        self.label_smoothing = label_smoothing
        label_vec = tf.range(num_classes, dtype=dtype)
        self.row_label_vec = tf.reshape(label_vec, [1, num_classes])
        self.col_label_vec = tf.reshape(label_vec, [num_classes, 1])
        col_mat = tf.tile(self.col_label_vec, [1, num_classes])
        row_mat = tf.tile(self.row_label_vec, [num_classes, 1])
        self.smooth = tf.cast(label_smoothing, dtype=self.dtype)
        # Pairwise disagreement penalty between class i and class j.
        if weightage == "linear":
            self.weight_mat = tf.abs(col_mat - row_mat)
        else:
            self.weight_mat = (col_mat - row_mat)**2

    def call(self, y_true, y_pred):
        """Return the weighted-kappa ratio (plus epsilon) for one batch."""
        # The [:, 0, :] slice drops axis 1 after one-hot encoding.
        # assumes y_true arrives with shape (batch, 1) - TODO confirm
        y_true = tf.one_hot(tf.cast(y_true, dtype=tf.int32),
                            depth=self.num_classes)[:, 0, :]
        # Cast before smoothing so both operands share `self.dtype`.
        y_true = tf.cast(y_true, dtype=self.dtype)
        # Spread the smoothing mass uniformly over all classes so each row
        # still sums to 1 (dividing by 2 is only right for two classes).
        y_true = y_true * (1 - self.smooth) + self.smooth / self.num_classes
        batch_size = tf.shape(y_true)[0]
        # Expected label value per example, tiled across the class axis.
        cat_labels = tf.matmul(y_true, self.col_label_vec)
        cat_label_mat = tf.tile(cat_labels, [1, self.num_classes])
        row_label_mat = tf.tile(self.row_label_vec, [batch_size, 1])
        if self.weightage == "linear":
            weight = tf.abs(cat_label_mat - row_label_mat)
        else:
            weight = (cat_label_mat - row_label_mat)**2
        # Observed weighted disagreement.
        numerator = tf.reduce_sum(weight * y_pred)
        # Weighted disagreement from the marginal label / prediction totals.
        label_dist = tf.reduce_sum(y_true, axis=0, keepdims=True)
        pred_dist = tf.reduce_sum(y_pred, axis=0, keepdims=True)
        w_pred_dist = tf.matmul(self.weight_mat, pred_dist, transpose_b=True)
        denominator = tf.reduce_sum(tf.matmul(label_dist, w_pred_dist))
        denominator /= tf.cast(batch_size, dtype=self.dtype)
        loss = tf.math.divide_no_nan(numerator, denominator)
        return loss + self.epsilon

    def get_config(self):
        """Return the constructor arguments needed to rebuild this loss."""
        config = {
            "num_classes": self.num_classes,
            "weightage": self.weightage,
            "epsilon": self.epsilon,
            # Store the dtype by name so the config stays serializable.
            "dtype": tf.as_dtype(self.dtype).name,
            "label_smoothing": self.label_smoothing,
        }
        base_config = super().get_config()
        return {**base_config, **config}


### Multi-head Attention for Transformers
Code adapted from https://www.tensorflow.org/tutorials/text/transformer

In [None]:
def create_padding_mask(seq):
    """Return a float32 mask that is 1.0 where `seq` equals 0, else 0.0.

    A new axis is inserted at position 1 so the mask can broadcast against
    the attention logits.
    """
    is_zero = tf.math.equal(seq, 0)
    zero_mask = tf.cast(is_zero, tf.float32)
    return zero_mask[:, tf.newaxis, :]


#     return seq[:, tf.newaxis, tf.newaxis, :]


def scaled_dot_product_attention(q, k, v, mask):
    """Compute softmax(q k^T / sqrt(depth_k)) v.

    Args:
        q: queries, shape (..., seq_len_q, depth).
        k: keys, shape (..., seq_len_k, depth).
        v: values, shape (..., seq_len_k, depth_v).
        mask: optional tensor; when given, it is multiplied with its own
            transpose and positions where that product is 0 receive a
            -1e9 penalty on their logits.

    Returns:
        Tuple (output, attention_weights).
    """
    # (..., seq_len_q, seq_len_k)
    raw_scores = tf.matmul(q, k, transpose_b=True)

    # Scale by the square root of the key depth.
    key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = raw_scores / tf.math.sqrt(key_depth)

    if mask is not None:
        # (..., seq_len_q, seq_len_k)
        pairwise_mask = tf.matmul(mask, mask, transpose_b=True)
        penalty = create_padding_mask(pairwise_mask) * -1e9
        scaled_attention_logits = scaled_attention_logits + penalty

    # Normalise over the key axis so each query's weights sum to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

    # (..., seq_len_q, depth_v)
    return tf.matmul(attention_weights, v), attention_weights


class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention layer.

    Args:
        d_model: width of the query/key/value projections and of the output.
        num_heads: number of attention heads; must divide `d_model`.
        **kwargs: forwarded to `tf.keras.layers.Layer` (e.g. `name`).
    """
    def __init__(self, d_model, num_heads, **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0, \
            "d_model must be divisible by num_heads"

        # Width of each individual head.
        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(
            d_model, kernel_initializer=tf.keras.initializers.he_uniform())

    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Attend from `q` over `k`/`v`.

        Returns:
            Tuple (output, attention_weights); output has shape
            (batch_size, seq_len_q, d_model).
        """
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)

        q = self.split_heads(
            q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(
            k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(
            v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(
            scaled_attention,
            perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        # Merge the heads back into a single d_model-wide axis.
        concat_attention = tf.reshape(
            scaled_attention,
            (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        output = self.dense(
            concat_attention)  # (batch_size, seq_len_q, d_model)

        return output, attention_weights

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(MultiHeadAttention, self).get_config()
        config.update({"d_model": self.d_model, "num_heads": self.num_heads})
        return config


def point_wise_feed_forward_network(d_model, dff):
    """Build a two-layer network: Dense(dff, relu) followed by Dense(d_model)."""
    # (batch_size, seq_len, dff)
    expand = tf.keras.layers.Dense(dff, activation='relu')
    # (batch_size, seq_len, d_model)
    project = tf.keras.layers.Dense(d_model)
    return tf.keras.Sequential([expand, project])


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with warm-up/decay or a sinusoidal variant.

    Args:
        d_model: model width; the warm-up schedule scales by rsqrt(d_model).
        warmup_steps: number of warm-up steps.
        factor: multiplier applied to the warm-up schedule.
        num_classes: stored on the instance; not used by the schedule itself.
        is_sine: when True, return 0.005 + 0.004 * sin(0.1 * (step - 3.14))
            instead of the warm-up schedule.
    """
    def __init__(self,
                 d_model,
                 warmup_steps=4000,
                 factor=1,
                 num_classes=10,
                 is_sine=False):
        super(CustomSchedule, self).__init__()

        # Keep the raw value for get_config; the tensor copy is used in math.
        self._d_model_value = d_model
        self.d_model = tf.cast(d_model, tf.float32)
        self.factor = factor
        self.is_sine = is_sine
        self.num_classes = num_classes
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # Optimizers may pass an integer step; rsqrt and sin need a float.
        step = tf.cast(step, tf.float32)
        if self.is_sine:
            return 0.005 + 0.004 * tf.math.sin(0.1 * (step - 3.14))
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(
            arg1, arg2) * (self.factor)

    def get_config(self):
        """Return the constructor arguments for serialization."""
        return {
            "d_model": self._d_model_value,
            "warmup_steps": self.warmup_steps,
            "factor": self.factor,
            "num_classes": self.num_classes,
            "is_sine": self.is_sine,
        }

# LSTM Model

In [None]:
# Per essay set: carve out a held-out test split, then run K-fold training of
# a masked LSTM classifier and score each fold on the test split with
# quadratic weighted kappa.
histories = []

for ess_set in selected_essay_sets:
    count = 1
    ess_history = []
    results = []
    class_results = []
    regress_results = []
    comb_results = []
    ## Create the new subsets of data
    in_set = X0.essay_set == ess_set
    current_span = round(len(X0[in_set]) // pieces)

    X = X0[in_set][:current_span]
    embs = embeddings[in_set][:current_span]
    msks = masks[in_set][:current_span]
    mms = mul_masks[in_set][:current_span]
    # Per-position mask: max over the embedding axis.
    sent_masks = np.max(msks, axis=-1).astype(dtype=np.float32)

    set_size = len(X)
    test_size = set_size // 6

    # Random split: first sixth is the test set, the rest is for K-fold.
    indices = np.random.permutation(np.arange(set_size))
    dev_indices, test_indices = indices[test_size:], indices[:test_size]

    ## Get the defining characteristics of the new data
    classes = list(set(X.domain1_score))

    max_class = max(classes)
    min_class = min(classes)
    num_classes = max_class - min_class + 1
    # Shift scores so labels start at 0.
    labels = X.domain1_score - min_class

    min_label = min(labels)
    max_label = max(labels)

    # Hyper-parameters derived from the number of classes / essay set.
    BATCH_SIZES = int(2**(np.ceil(np.log2(num_classes + 1)) + 2))
    CURR_ESS_LEN = int(avg_len_dict[ess_set])
    CURR_LEN_RATIO = avg_ratio_dict[ess_set]
    SET_SIZE_RATIO = max_set_size / set_size_dict[ess_set]
    HIDDEN_SIZE = int(2**np.ceil(np.log2(num_classes * 10)))
    LENGTH_RATIO = int(EMBED_DIM // (2 * avg_ratio_dict[ess_set]))
    EPOCHS = int(100 * np.sqrt(num_classes) + 50) * 6
    PATIENCE = int(min((num_classes) * 10, 285))
    DROPOUT = min(0.65 + 0.01 * np.sqrt(num_classes), 0.75)
    LEARNING_RATE = 0.0001 * num_classes + 0.01 * DROPOUT
    LOSS_WEIGHT = 0.90 / (1 + np.exp(0.5 *
                                     (num_classes - avg_class_est))) + 0.00075
    NORM_CLIP = 2
    NUM_HEADS = int(num_heads[ess_set])

    ## Get transformed labels for regression
    ys = np.array(labels)
    ys_regression = ys / num_classes

    ## Separate a test set
    X_test_df = X.iloc[test_indices]
    Xt = embs[test_indices]
    Mt = msks[test_indices]
    sMt = sent_masks[test_indices]
    yt = ys[test_indices]
    yt_regression = ys_regression[test_indices]

    ## Reset the dev data
    X = X.iloc[dev_indices]
    embs = embs[dev_indices]
    msks = msks[dev_indices]
    sMs = sent_masks[dev_indices]
    ys = ys[dev_indices]
    ys_regression = ys_regression[dev_indices]

    for traincv, testcv in cv.split(X):
        print("=" * 50 + "Essay " + str(ess_set) + "=" * 50)
        print("\n--------Fold {}--------\n".format(count))

        print("current number of classes = {}".format(num_classes))
        ## These are for 5-fold training sets
        Xn = embs[traincv]
        Mn = tf.cast(msks[traincv], dtype=tf.float32)
        sMn = sMs[traincv]
        yn = ys[traincv]
        yn_regression = ys_regression[traincv]

        ## These are for 5-fold dev sets
        Xv = embs[testcv]
        Mv = tf.cast(msks[testcv], dtype=tf.float32)
        sMv = sMs[testcv]
        yv = ys[testcv]
        yv_regression = ys_regression[testcv]
        model = tf.keras.models.Sequential([
            # Input shape follows the padded data instead of a fixed (86, 512).
            layers.Masking(mask_value=0., input_shape=(MAX_LEN, EMBED_DIM)),
            layers.BatchNormalization(),
            layers.LSTM(num_classes * 2,
                        dropout=0.60,
                        recurrent_dropout=0.10,
                        activation='relu',
                        kernel_initializer=tf.keras.initializers.he_normal()),
            layers.LayerNormalization(),
            layers.Dropout(0.2),
            layers.Dense(num_classes,
                         activation='relu',
                         bias_initializer=tf.keras.initializers.he_uniform(),
                         kernel_initializer=tf.keras.initializers.he_normal()),
            layers.Dropout(0.1),
            layers.BatchNormalization(),
            layers.Dense(num_classes, activation='softmax')
        ])

        loss = [
            WeightedKappaLoss(num_classes=num_classes),
        ]
        callback = tf.keras.callbacks.EarlyStopping(monitor='loss',
                                                    patience=PATIENCE,
                                                    min_delta=0.05,
                                                    restore_best_weights=True)
        optimizer = tf.keras.optimizers.Adam()
        model.compile(
            loss=loss,
            optimizer=optimizer,
            metrics=[
                'acc',
                tf.keras.metrics.SparseCategoricalCrossentropy(), 'mae'
            ],
        )

        # Use the batch size computed for this essay set (BATCH_SIZES);
        # `batch_sizes` is not defined in this cell.
        history = model.fit(Xn,
                            yn,
                            validation_data=(Xv, yv),
                            batch_size=BATCH_SIZES,
                            epochs=EPOCHS,
                            callbacks=[callback])
        ess_history.append(history)

        model.summary()

        y_pred = model.predict(Xt)
        ypreds.append(y_pred)

        # The arg-max class index already is the zero-based label. Looking it
        # up in `labels` (a Series keyed by essay id) would be wrong.
        y_pred = np.argmax(y_pred, axis=1).astype(np.int32)

        result = cohen_kappa_score(yt, y_pred, weights='quadratic')
        print(confusion_matrix(yt, y_pred))
        print("\n--------Fold {} KAPPA--------\n".format(count))
        print(result)

        model.evaluate(Xt, yt, batch_size=BATCH_SIZES)
        print("Kappa Score: {}".format(result))
        results.append(result)
        if count == FOLDS:
            print("5__FOLD__KAPPA___SCORE___AVG: {}".format(np.mean(results)))

        count += 1

    # `histories` is the list created at the top of this cell.
    histories.append(ess_history)
    result_history.append(results)

# Multi-head Attention

In [None]:
# Per essay set: carve out a held-out test split, then run K-fold training of
# a two-block multi-head-attention + BiLSTM model with a joint
# classification / regression objective, scoring each fold on the test split.
histories_mha_combined_objective_final = []

# regression = True

FOLDS = 5
cv = KFold(n_splits=FOLDS, shuffle=True)
selected_essay_sets = [1, 2, 3, 4, 5, 6, 7, 8]
# selected_essay_sets = [3,8]
num_classes = 0

for ess_set in selected_essay_sets:
    count = 1
    ess_history = []
    results = []
    class_results = []
    regress_results = []
    comb_results = []
    # Create the new subsets of data
    in_set = X0.essay_set == ess_set
    current_span = round(len(X0[in_set]) // 2.75)

    X = X0[in_set][:current_span]
    embs = embeddings[in_set][:current_span]
    msks = masks[in_set][:current_span]
    mms = mul_masks[in_set][:current_span]
    # Per-position mask: max over the embedding axis.
    sent_masks = np.max(msks, axis=-1).astype(dtype=np.float32)

    set_size = len(X)
    test_size = set_size // 6

    # Random split: first sixth is the test set, the rest is for K-fold.
    indices = np.random.permutation(np.arange(set_size))
    dev_indices, test_indices = indices[test_size:], indices[:test_size]

    # Get the defining characteristics of the new data
    classes = list(set(X.domain1_score))

    max_class = max(classes)
    min_class = min(classes)
    num_classes = max_class - min_class + 1
    # Shift scores so labels start at 0.
    labels = X.domain1_score - min_class

    min_label = min(labels)
    max_label = max(labels)
    # Constant for the whole essay set, so computed once here.
    batch_sizes = int(2**np.ceil(np.log2(num_classes + 1)))

    # Get transformed labels for regression
    ys = np.array(labels)
    ys_regression = ys / num_classes

    # Separate a test set
    X_test_df = X.iloc[test_indices]
    Xt = embs[test_indices]
    Mt = msks[test_indices]
    sMt = sent_masks[test_indices]
    yt = ys[test_indices]
    yt_regression = ys_regression[test_indices]

    # Reset the dev data
    X = X.iloc[dev_indices]
    embs = embs[dev_indices]
    msks = msks[dev_indices]
    sMs = sent_masks[dev_indices]
    ys = ys[dev_indices]
    ys_regression = ys_regression[dev_indices]

    for traincv, testcv in cv.split(X):
        print("=" * 50 + "Essay " + str(ess_set) + "=" * 50)
        print("\n--------Fold {}--------\n".format(count))

        print("current number of classes = {}".format(num_classes))
        # These are for 5-fold training sets
        Xn = embs[traincv]
        Mn = tf.cast(msks[traincv], dtype=tf.float32)
        sMn = sMs[traincv]
        yn = ys[traincv]
        yn_regression = ys_regression[traincv]

        # These are for 5-fold dev sets
        Xv = embs[testcv]
        Mv = tf.cast(msks[testcv], dtype=tf.float32)
        sMv = sMs[testcv]
        yv = ys[testcv]
        yv_regression = ys_regression[testcv]

        # Three inputs: embeddings, element-wise masks, per-position masks.
        Input_embeds = tf.keras.Input(shape=(
            MAX_LEN,
            EMBED_DIM,
        ))
        Input_masks = tf.keras.Input(shape=(
            MAX_LEN,
            EMBED_DIM,
        ))
        Input_sentence_masks = tf.keras.Input(shape=(MAX_LEN, ))

        inputs = [Input_embeds, Input_masks, Input_sentence_masks]

        # Separate projections for query / value / key, each multiplied by
        # the element-wise mask.
        q_input = layers.Dense(EMBED_DIM, name="Query_Input")(inputs[0])
        q_input = layers.Multiply()([q_input, inputs[1]])
        v_input = layers.Dense(EMBED_DIM, name="Value_Input")(inputs[0])
        v_input = layers.Multiply()([v_input, inputs[1]])
        k_input = layers.Dense(EMBED_DIM, name="Key_Input")(inputs[0])
        k_input = layers.Multiply()([k_input, inputs[1]])

        # First attention block with residual connection and layer norm.
        mha_output1, attention_weights = MultiHeadAttention(d_model=EMBED_DIM,
                                                            num_heads=8)(
                                                                v=v_input,
                                                                k=k_input,
                                                                q=q_input,
                                                                mask=None)
        mha_masked = layers.Multiply(name="MHA_Masking_1")(
            [mha_output1, inputs[1]])
        mha_masked = layers.Dropout(0.25)(mha_masked)
        layer_norm_output = layers.LayerNormalization(
            epsilon=1e-6, name="MHA_Norm_1")(mha_masked + mha_output1)
        layer_norm_output = layers.Reshape(
            target_shape=(MAX_LEN, EMBED_DIM))(layer_norm_output)

        # Second attention block: queries come from the first block's output.
        mha_output2, attention_weights = MultiHeadAttention(
            d_model=EMBED_DIM, num_heads=8)(v=v_input,
                                            k=k_input,
                                            q=layer_norm_output,
                                            mask=None)
        mha_output2 = layers.Dropout(0.25)(mha_output2)
        layer_norm_output2 = layers.LayerNormalization(
            epsilon=1e-6, name="MHA_Norm_2")(mha_output2 + layer_norm_output)

        # Position-wise feed-forward block with residual connection.
        ffn_output = layers.Dense(EMBED_DIM, activation='relu',
                                  name="DFF_1")(layer_norm_output2)
        ffn_output = layers.Dense(EMBED_DIM, name="DFF_2")(ffn_output)
        ffn_output = layers.Dropout(0.1)(ffn_output)
        layer_norm_output3 = layers.LayerNormalization(
            epsilon=1e-6, name="DFF_Norm")(ffn_output + layer_norm_output2)
        layer_norm_output3 = layers.Reshape(
            target_shape=(MAX_LEN, EMBED_DIM))(layer_norm_output3)
        # Multiply by the mask again, then let Masking skip all-zero steps.
        layer_norm_output3 = layers.Multiply()([layer_norm_output3, inputs[1]])
        layer_norm_output3 = layers.Masking(name="DFF_Mask")(
            layer_norm_output3, )
        final_hidden = layers.Bidirectional(
            layers.LSTM(EMBED_DIM // 2,
                        dropout=0.40,
                        recurrent_dropout=0,
                        activation='tanh',
                        use_bias=True,
                        unroll=False,
                        recurrent_activation='sigmoid'),
            name="BiLSTM")(inputs=layer_norm_output3)

        # Two heads: class probabilities and a score squashed into (0, 1).
        outputs_class = layers.Dense(num_classes,
                                     activation='softmax',
                                     name="Classification")(final_hidden)

        outputs_regression = layers.Dense(1,
                                          activation='sigmoid',
                                          name="Regression")(final_hidden)

        final_outputs = [outputs_class, outputs_regression]

        model = tf.keras.models.Model(inputs=inputs, outputs=final_outputs)

        loss = [
            WeightedKappaLoss(num_classes=num_classes),
            tf.keras.losses.MeanSquaredError(),
        ]

        learning_rate = 0.001

        optimizer = tf.keras.optimizers.Adamax(learning_rate=learning_rate)
        # Weight of the classification loss; decreases as num_classes grows
        # past avg_class_est.
        loss_p = 0.95 / (1 + np.exp(num_classes - avg_class_est)) + 0.005
        model.compile(loss=loss,
                      optimizer=optimizer,
                      metrics=['acc', 'mae'],
                      loss_weights=[loss_p, (1 - loss_p) * num_classes])

        callback = [
            tf.keras.callbacks.EarlyStopping(monitor='Classification_loss',
                                             patience=(num_classes + 10) * 2,
                                             min_delta=0.005,
                                             restore_best_weights=True),
            tf.keras.callbacks.EarlyStopping(monitor='Regression_loss',
                                             patience=(num_classes + 10) * 2,
                                             min_delta=0.0005,
                                             restore_best_weights=True),
        ]

        # `callback` is already a list, so pass it directly rather than
        # wrapping it in another list.
        history = model.fit([Xn, Mn, sMn], [yn, yn_regression],
                            validation_data=([Xv, Mv,
                                              sMv], [yv, yv_regression]),
                            batch_size=512,
                            epochs=100 + num_classes * 10,
                            callbacks=callback)
        ess_history.append(history)

        model.summary()

        y_pred = model.predict([Xt, Mt, sMt])

        y_pred_class = y_pred[0]
        y_pred_regression = y_pred[1]

        # Map the (0, 1) regression output back onto the label scale.
        y_pred_regression = np.round(y_pred_regression * num_classes)

        y_pred_class = np.argmax(y_pred_class, axis=1)

        # Blend both heads using the same weight as the classification loss.
        y_comb = np.round(
            np.add((y_pred_class * loss_p),
                   (y_pred[1] * num_classes * (1 - loss_p))[:, 0]))

        print(confusion_matrix(yt, y_pred_class))
        print("\n--------Fold {} Classification KAPPA--------\n".format(count))
        class_result = cohen_kappa_score(yt, y_pred_class, weights='quadratic')
        print(class_result)

        print(confusion_matrix(yt, y_pred_regression))
        print("\n--------Fold {} Regression KAPPA--------\n".format(count))
        regress_result = cohen_kappa_score(yt,
                                           y_pred_regression,
                                           weights='quadratic')
        print(regress_result)

        print(confusion_matrix(yt, y_comb))
        print("\n--------Fold {} Combined KAPPA--------\n".format(count))
        comb_result = cohen_kappa_score(yt, y_comb, weights='quadratic')
        print(comb_result)

        class_results.append(class_result)
        regress_results.append(regress_result)
        comb_results.append(comb_result)
        # Record the combined score as this fold's result so result_history
        # does not receive an empty list for this experiment.
        # NOTE(review): confirm the combined kappa is the one to track here.
        results.append(comb_result)
        if count == FOLDS:
            print("{}__FOLD_CLASS_KAPPA___SCORE___MAX: {}".format(
                FOLDS, np.max(class_results)))
            print("{}__FOLD_REGRESS_KAPPA___SCORE___MAX: {}".format(
                FOLDS, np.max(regress_results)))
            print("{}__FOLD_COMB_KAPPA___SCORE___MAX: {}".format(
                FOLDS, np.max(comb_results)))
        count += 1

    histories_mha_combined_objective_final.append(ess_history)
    result_history.append(results)

# Multi-head Multi-layer Attention

In [None]:
histories_mha_combined_objective_final4 = []

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with a warm-up mode and an optional sine mode.

    Default mode (``is_sine=False``):
        ``rsqrt(d_model) * min(rsqrt(step), step * warmup_steps**-1.5) * factor``

    Sine mode (``is_sine=True``):
        ``0.005 + 0.004 * sin(0.1 * (step - 3.14))``

    Args:
        d_model: Model width; stored as a float32 tensor in ``self.d_model``.
        warmup_steps: Step count used in the ``step * warmup_steps**-1.5`` term.
        factor: Constant multiplier applied to the default-mode rate.
        num_classes: Stored on the instance; not used by the schedule itself.
        is_sine: Select the sine mode instead of the warm-up mode.
    """

    def __init__(self,
                 d_model,
                 warmup_steps=4000,
                 factor=1,
                 num_classes=10,
                 is_sine=False):
        super(CustomSchedule, self).__init__()

        # Keep the caller's raw value so get_config() can return it unchanged.
        self._d_model_raw = d_model
        self.d_model = tf.cast(d_model, tf.float32)
        self.factor = factor
        self.is_sine = is_sine
        self.num_classes = num_classes
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for optimizer step ``step``."""
        # Optimizers may pass their integer iteration counter; rsqrt and the
        # float arithmetic below require a floating-point tensor.
        step = tf.cast(step, tf.float32)

        if self.is_sine:
            return 0.005 + 0.004 * tf.math.sin(0.1 * (step - 3.14))

        decay_term = tf.math.rsqrt(step)
        warmup_term = step * (self.warmup_steps**-1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(
            decay_term, warmup_term) * (self.factor)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {
            "d_model": self._d_model_raw,
            "warmup_steps": self.warmup_steps,
            "factor": self.factor,
            "num_classes": self.num_classes,
            "is_sine": self.is_sine,
        }


# Number of cross-validation folds; also used for the per-fold plot columns
# and to decide when the per-essay-set summary scores are printed.
FOLDS = 5
# Shuffled K-fold splitter. No random_state is given, so the fold assignment
# differs from run to run.
cv = KFold(n_splits=FOLDS, shuffle=True)
# Essay sets to train on. Sets 1-8 exist in the tables below; only set 8 is
# currently selected.
selected_essay_sets = [8]

# Placeholder; recomputed from the score range of each essay set in the loop.
num_classes = 0

# Divisor applied to each essay set's row count: only the first
# len(set) // pieces rows of a set are used.
pieces = 1.5

# Per-essay-set lookup tables, keyed by essay set id (1..8).
# presumably the average essay length of each set -- TODO confirm the unit
avg_len_dict = dict(
    zip(range(1, 9), (350, 350, 150, 150, 150, 150, 250, 650)))
# presumably the number of essays in each set -- TODO confirm
set_size_dict = dict(
    zip(range(1, 9), (1785, 1800, 1726, 1772, 1805, 1800, 1730, 918)))
max_set_size = max(set_size_dict.values())

# Rounded ratio of 650 (the largest entry of avg_len_dict) to each set's entry.
avg_ratio_dict = {
    ess_id: np.round(650 / avg_len)
    for ess_id, avg_len in avg_len_dict.items()
}

# Number of attention heads used for each essay set.
num_heads = dict(zip(range(1, 9), (8, 4, 4, 4, 4, 4, 8, 16)))

def _plot_fold_losses(fold_histories, loss_key, title, filename):
    """Plot train/validation curves of one loss for every fold and save them.

    Args:
        fold_histories: List of `model.fit` results, one per fold.
        loss_key: Key into each result's `.history` dict for the training
            curve; the validation curve is read from `'val_' + loss_key`.
        title: Title passed to `plt.title`.
        filename: Path the figure is written to.
    """
    fig, axes = plt.subplots(nrows=1,
                             ncols=len(fold_histories),
                             figsize=[15, 3],
                             sharey='row',
                             squeeze=False)
    for fold_ax, fold_history in zip(axes[0], fold_histories):
        fold_ax.plot(fold_history.history[loss_key])
        fold_ax.plot(fold_history.history['val_' + loss_key])
    plt.title(title)
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    # Save before showing: once plt.show() has released the figure,
    # a later savefig can write an empty image.
    fig.savefig(filename)
    plt.show()


# Train and evaluate the multi-layer multi-head-attention model, one essay set
# at a time, with K-fold cross-validation on the dev split and evaluation of
# every fold's model on a held-out test split.
for ess_set in selected_essay_sets:
    count = 1  # 1-based fold counter used in the printed reports
    ess_history = []
    results = []
    class_results = []
    regress_results = []
    comb_results = []

    # Create the new subsets of data: keep only the first len // pieces rows
    # belonging to this essay set.
    in_set = X0.essay_set == ess_set
    current_span = round(len(X0[in_set]) // pieces)

    X = X0[in_set][:current_span]
    embs = embeddings[in_set][:current_span]
    msks = masks[in_set][:current_span]
    mms = mul_masks[in_set][:current_span]
    # Collapse the last axis of the mask with a max to get one value per
    # position along the remaining axes.
    sent_masks = np.max(msks, axis=-1).astype(dtype=np.float32)

    set_size = len(X)
    test_size = set_size // 6

    # Random split: the first `test_size` shuffled indices form the held-out
    # test set, the remainder is the dev set used for K-fold training.
    indices = np.random.permutation(np.arange(set_size))
    dev_indices, test_indices = indices[test_size:], indices[:test_size]

    # Get the defining characteristics of the new data
    classes = list(set(X.domain1_score))

    max_class = max(classes)
    min_class = min(classes)
    num_classes = max_class - min_class + 1
    labels = X.domain1_score - min_class  # shift scores so they start at 0

    min_label = min(labels)
    max_label = max(labels)

    # Hyper-parameters derived from the number of score classes and the
    # per-essay-set lookup tables.
    BATCH_SIZES = int(2**(np.ceil(np.log2(num_classes + 1)) + 2))
    CURR_ESS_LEN = int(avg_len_dict[ess_set])
    CURR_LEN_RATIO = avg_ratio_dict[ess_set]
    SET_SIZE_RATIO = max_set_size / set_size_dict[ess_set]
    HIDDEN_SIZE = int(2**np.ceil(np.log2(num_classes * 10)))
    LENGTH_RATIO = int(EMBED_DIM // (2 * avg_ratio_dict[ess_set]))
    EPOCHS = int(100 * np.sqrt(num_classes) + 50) * 6
    PATIENCE = int(min((num_classes) * 10, 285))
    DROPOUT = min(0.65 + 0.01 * np.sqrt(num_classes), 0.75)
    # NOTE: LEARNING_RATE is only reported below; the optimizer uses
    # CustomSchedule instead.
    LEARNING_RATE = 0.0001 * num_classes + 0.01 * DROPOUT
    # Weight of the classification objective; decreases as num_classes grows
    # past avg_class_est (logistic curve).
    LOSS_WEIGHT = 0.90 / (1 + np.exp(0.5 *
                                     (num_classes - avg_class_est))) + 0.00075
    NORM_CLIP = 2
    NUM_HEADS = int(num_heads[ess_set])

    # Get transformed labels for regression (labels scaled by num_classes)
    ys = np.array(labels)
    ys_regression = ys / num_classes

    # Separate a test set
    X_test_df = X.iloc[test_indices]
    Xt = embs[test_indices]
    Mt = msks[test_indices]
    sMt = sent_masks[test_indices]
    yt = ys[test_indices]
    yt_regression = ys_regression[test_indices]

    # Reset the dev data
    X = X.iloc[dev_indices]
    embs = embs[dev_indices]
    msks = msks[dev_indices]
    sMs = sent_masks[dev_indices]
    ys = ys[dev_indices]
    ys_regression = ys_regression[dev_indices]

    for traincv, testcv in cv.split(X):
        print("=" * 50 + "Essay " + str(ess_set) + "=" * 50)
        print("\n--------Fold {}--------\n".format(count))

        print("current number of classes = {}".format(num_classes))
        # These are for 5-fold training sets
        Xn = embs[traincv]
        Mn = tf.cast(msks[traincv], dtype=tf.float32)
        sMn = sMs[traincv]
        yn = ys[traincv]
        yn_regression = ys_regression[traincv]

        # These are for 5-fold dev sets
        Xv = embs[testcv]
        Mv = tf.cast(msks[testcv], dtype=tf.float32)
        sMv = sMs[testcv]
        yv = ys[testcv]
        yv_regression = ys_regression[testcv]

        ### =======  Define Inputs ======= ###
        Input_embeds = tf.keras.Input(shape=(
            MAX_LEN,
            EMBED_DIM,
        ))
        Input_masks = tf.keras.Input(shape=(
            MAX_LEN,
            EMBED_DIM,
        ))
        Input_sentence_masks = tf.keras.Input(shape=(MAX_LEN, ))
        inputs = [Input_embeds, Input_masks, Input_sentence_masks]

        ### =======  Define Query, Key, and Value for Transformer ======= ###
        q_input = layers.Dense(EMBED_DIM, name="Query_Input")(inputs[0])
        v_input = layers.Dense(EMBED_DIM, name="Value_Input")(inputs[0])
        k_input = layers.Dense(EMBED_DIM, name="Key_Input")(inputs[0])

        ### =======  Masked MultiHead Attention ======= ###
        # NOTE(review): the embeddings tensor (inputs[0]) is passed as `mask`
        # here and in the attention calls below, while Input_masks and
        # Input_sentence_masks are not consumed anywhere in this graph --
        # confirm this is what MultiHeadAttention expects.
        mha_output1, attention_weights = MultiHeadAttention(
            d_model=EMBED_DIM, num_heads=NUM_HEADS)(v=v_input,
                                                    k=k_input,
                                                    q=q_input,
                                                    mask=inputs[0])

        mha_masked = layers.Dropout(DROPOUT)(mha_output1)
        # Used as the final representation only when the deeper stack below
        # is skipped (num_classes <= 1).
        forLabel = layers.Masking(name="DFF_Mask2")(mha_masked, )

        if num_classes > 1:
            ### =======  Add and Norm ======= ###
            layer_norm_output = layers.LayerNormalization(
                epsilon=1e-6, name="MHA_Norm_1")(q_input + mha_output1)

            ### =======  Feed Forward Network ======= ###
            # NOTE(review): `ffn_output` and `layer_norm_output2` built in the
            # next two statements are both reassigned further down before
            # anything reads them, so "ffn_2" and this first "MHA_Norm_2" do
            # not feed the model outputs. The q/k/v projections below read
            # `layer_norm_output` -- confirm whether `layer_norm_output2`
            # was intended.
            ffn_output = layers.Dense(
                EMBED_DIM,
                activation='relu',
                name="ffn_2",
                kernel_regularizer=tf.keras.regularizers.l2(0.001))(
                    layer_norm_output)

            ### =======  Add and Norm ======= ###
            layer_norm_output2 = layers.LayerNormalization(
                epsilon=1e-6,
                name="MHA_Norm_2")(ffn_output + layer_norm_output)
            q_ffn_output = layers.Dense(
                EMBED_DIM,
                name="q_weight_2",
                kernel_regularizer=tf.keras.regularizers.l2(0.001))(
                    layer_norm_output)

            v_ffn_output = layers.Dense(
                EMBED_DIM,
                name="v_weight_2",
                kernel_regularizer=tf.keras.regularizers.l2(0.001))(
                    layer_norm_output)
            k_ffn_output = layers.Dense(
                EMBED_DIM,
                name="k_weight_2",
                kernel_regularizer=tf.keras.regularizers.l2(0.001))(
                    layer_norm_output)

            ### =======  MultiHead Attention ======= ###
            mha_output2, attention_weights = MultiHeadAttention(
                d_model=EMBED_DIM, num_heads=NUM_HEADS)(v=v_ffn_output,
                                                        k=k_ffn_output,
                                                        q=q_ffn_output,
                                                        mask=inputs[0])
            mha_output2 = layers.Dropout(DROPOUT)(mha_output2)

            ### =======  Add and Norm ======= ###
            layer_norm_output2 = layers.LayerNormalization(
                epsilon=1e-6,
                name="MHA_Norm_2")(mha_output2 + layer_norm_output)

            ### =======  Feed Forward Network ======= ###
            ffn_output = layers.Dense(
                EMBED_DIM,
                activation='relu',
                name="DFF_1",
                kernel_regularizer=tf.keras.regularizers.l2(0.001))(
                    layer_norm_output2)
            ffn_output = layers.Dropout(DROPOUT)(ffn_output)
            q_ffn = layers.Dense(EMBED_DIM,
                                 name="Q_DFF_2",
                                 kernel_regularizer=tf.keras.regularizers.l2(
                                     0.0001))(ffn_output)
            q_ffn = layers.LayerNormalization(
                epsilon=1e-6,
                name="Qffn_normalization")(q_ffn + layer_norm_output2)

            ### =======  MultiHead Attention PART DEUX ======= ###
            # Reuses the key/value projections of the second attention layer;
            # only the query comes from the new feed-forward output.
            mha_output3, attention_weights = MultiHeadAttention(
                d_model=EMBED_DIM, num_heads=NUM_HEADS)(v=v_ffn_output,
                                                        k=k_ffn_output,
                                                        q=q_ffn,
                                                        mask=inputs[0])
            mha_output3 = layers.Dropout(DROPOUT)(mha_output3)

            ### =======  Add and Norm ======= ###
            layer_norm_output3 = layers.LayerNormalization(
                epsilon=1e-6, name="MHA_Norm_3")(mha_output3 + q_ffn)

            ### =======  Feed Forward Network ======= ###
            ffn_output = layers.Dense(
                EMBED_DIM,
                activation='relu',
                name="DFF_B",
                kernel_regularizer=tf.keras.regularizers.l2(0.001))(
                    layer_norm_output3)
            ffn_output = layers.Dropout(DROPOUT)(ffn_output)
            q_ffn = layers.Dense(EMBED_DIM,
                                 name="Q_DFF_B",
                                 kernel_regularizer=tf.keras.regularizers.l2(
                                     0.0001))(ffn_output)
            q_ffn = layers.LayerNormalization(
                epsilon=1e-6,
                name="MHA_normalization_2")(q_ffn + layer_norm_output3)

            forLabel = q_ffn

        # Both output heads read only the first position of the sequence.
        cls_token = forLabel[:, 0, :]
        outputs_combined_class = layers.Dense(num_classes,
                                              activation='softmax',
                                              name="Combined_Class")(cls_token)
        outputs_combined_reg = layers.Dense(1,
                                            activation='sigmoid',
                                            name="Combined_Regress")(cls_token)

        ### =======  Output ======= ###
        final_outputs = [outputs_combined_class, outputs_combined_reg]

        ### =======  Define Model Parameters and Compile ======= ###
        model = tf.keras.models.Model(inputs=inputs, outputs=final_outputs)

        # One loss per model output, in the order of `final_outputs`
        # (classification head, regression head).
        loss = [
            WeightedKappaLoss(num_classes=num_classes, label_smoothing=0),
            tf.keras.losses.MeanSquaredError(),
        ]

        learning_rate = CustomSchedule(d_model=EMBED_DIM,
                                       warmup_steps=750,
                                       is_sine=False)
        optimizer = tf.keras.optimizers.Adamax(learning_rate=learning_rate)

        model.compile(
            loss=loss,
            optimizer=optimizer,
            metrics=['acc', 'mse'],
            loss_weights=[LOSS_WEIGHT, 2 * num_classes * (1 - LOSS_WEIGHT)],
        )

        # Two independent early-stopping monitors, one per output head.
        callback = [
            tf.keras.callbacks.EarlyStopping(monitor='val_Combined_Class_loss',
                                             patience=PATIENCE * 2,
                                             min_delta=0.004,
                                             restore_best_weights=True),
            tf.keras.callbacks.EarlyStopping(
                monitor='val_Combined_Regress_loss',
                patience=PATIENCE * 4,
                min_delta=0.00005,
                restore_best_weights=True),
        ]
        history = model.fit(
            [Xn, Mn, sMn],
            [yn, yn_regression],
            validation_data=([Xv, Mv, sMv], [yv, yv_regression]),
            batch_size=512,  # fixed; BATCH_SIZES is only reported below
            epochs=EPOCHS,
            callbacks=callback)

        ### =======  Record History and Summaries ======= ###
        ess_history.append(history)
        model.summary()

        print("BATCH_SIZES = {}".format(BATCH_SIZES))
        print("HIDDEN_SIZE = {}".format(HIDDEN_SIZE))
        print("TOTAL EPOCHS =  {}".format(EPOCHS))
        print("PATIENCE = {}".format(PATIENCE))
        print("DROPOUT = {}".format(DROPOUT))
        print("LEARNING_RATE =  {}".format(LEARNING_RATE))
        print("percent classification = {} ".format(LOSS_WEIGHT))
        print("NUM_HEADS = {}".format(NUM_HEADS))

        ### =======  Evaluate on Held-out Test Set ======= ###
        y_pred = model.predict([Xt, Mt, sMt])
        # Classification head: most probable class index.
        y_pred_class = np.argmax(y_pred[0], axis=1)
        # Regression head: undo the 1/num_classes scaling and round to a label.
        y_pred_regression = np.round(y_pred[1] * num_classes)

        print(confusion_matrix(yt, y_pred_class))
        print("\n--------Fold {} Classification KAPPA--------\n".format(count))
        class_result = cohen_kappa_score(yt, y_pred_class, weights='quadratic')
        print(class_result)

        print(confusion_matrix(yt, y_pred_regression))
        print("\n--------Fold {} Regression KAPPA--------\n".format(count))
        regress_result = cohen_kappa_score(yt,
                                           y_pred_regression,
                                           weights='quadratic')
        print(regress_result)

        # Blend of both heads, weighted like the training objective.
        y_comb = np.round(
            np.add((y_pred_class * LOSS_WEIGHT),
                   (y_pred[1] * num_classes * (1 - LOSS_WEIGHT))[:, 0]))

        print(confusion_matrix(yt, y_comb))
        print("\n--------Fold {} Combined KAPPA--------\n".format(count))
        comb_result = cohen_kappa_score(yt, y_comb, weights='quadratic')
        print(comb_result)

        class_results.append(class_result)
        regress_results.append(regress_result)
        comb_results.append(comb_result)
        if count == FOLDS:
            print("{}__FOLD_CLASS_KAPPA___SCORE___MAX: {}".format(
                FOLDS, np.max(class_results)))
            print("{}__FOLD_REGRESS_KAPPA___SCORE___MAX: {}".format(
                FOLDS, np.max(regress_results)))
            print("{}__FOLD_COMB_KAPPA___SCORE___MAX: {}".format(
                FOLDS, np.max(comb_results)))

        count += 1

    histories_mha_combined_objective_final4.append(ess_history)
    result_history.append(results)

    # Plot the curves of the essay set that was just trained. Reading them
    # from `ess_history` keeps each plot tied to the current essay set rather
    # than to a fixed position in the accumulated history list.
    _plot_fold_losses(
        ess_history, 'Combined_Regress_loss',
        'model regress loss essay {}'.format(ess_set),
        'mha2_combined_regress_features_8_6_attn_model_length2_essay_{}.png'.
        format(ess_set))
    _plot_fold_losses(
        ess_history, 'Combined_Class_loss',
        'model class loss essay {}'.format(ess_set),
        'mha2_combined_class_features_8_6_attn_model_length2_essay_{}.png'.
        format(ess_set))

# Multi-head Attention with BLSTM processing

In [None]:
# Start a fresh history container for this section. This rebinds the same name
# used by the multi-layer attention section, so any histories collected there
# are no longer reachable through it.
histories_mha_combined_objective_final4 = []

for ess_set in selected_essay_sets:
    count = 1
    essay_set_counter = 0
    ess_history = []
    results = []
    class_results = []
    regress_results = []
    comb_results = []
    # Create the new subsets of data
    current_span = round(len(X0[X0.essay_set == ess_set]) // pieces)

    X = X0[X0.essay_set == ess_set][:current_span]
    embs = embeddings[X0.essay_set == ess_set][:current_span]
    msks = masks[X0.essay_set == ess_set][:current_span]
    mms = mul_masks[X0.essay_set == ess_set][:current_span]
    sent_masks = 1 - tf.math.reduce_max(msks, axis=-1)
    sent_masks = sent_masks.numpy()
    sent_masks = tf.math.reduce_max(msks,
                                    axis=-1).numpy().astype(dtype=np.float32)
    sent_masks = np.max(msks, axis=-1).astype(dtype=np.float32)

    set_size = len(X)
    test_size = set_size // 6

    indices = np.random.permutation(np.arange(set_size))
    dev_indices, test_indices = indices[test_size:], indices[:test_size]

    # Get the defining characteristics of the new data
    classes = list(set(X.domain1_score))
    max_class = max(classes)
    min_class = min(classes)
    num_classes = max_class - min_class + 1
    labels = X.domain1_score - min_class

    min_label = min(labels)
    max_label = max(labels)

    BATCH_SIZES = int(2**(np.ceil(np.log2(num_classes + 1)) + 2))
    CURR_ESS_LEN = int(avg_len_dict[ess_set])
    CURR_LEN_RATIO = avg_ratio_dict[ess_set]
    SET_SIZE_RATIO = max_set_size / set_size_dict[ess_set]
    HIDDEN_SIZE = int(2**np.ceil(np.log2(num_classes * 10)))
    LENGTH_RATIO = int(EMBED_DIM // (2 * avg_ratio_dict[ess_set]))
    EPOCHS = int(100 * np.sqrt(num_classes) +
                 50) * 6  # min((150 + num_classes**2) // 2,
    PATIENCE = int(min((num_classes) * 10, 285))
    DROPOUT = min(0.65 + 0.01 * np.sqrt(num_classes), 0.85)
    LEARNING_RATE = 0.0001 * num_classes + 0.01 * DROPOUT
    LOSS_WEIGHT = 0.90 / (1 + np.exp(0.5 *
                                     (num_classes - avg_class_est))) + 0.00075
    NORM_CLIP = 2
    NUM_HEADS = int(num_heads[ess_set])

    # Get transformed labels for regression
    ys = np.array(labels)
    ys_regression = ys / num_classes

    # Separate a test set
    X_test_df = X.iloc[test_indices]
    Xt = embs[test_indices]
    Mt = msks[test_indices]
    sMt = sent_masks[test_indices]
    yt = ys[test_indices]
    yt_regression = ys_regression[test_indices]

    # Reset the dev data
    X = X.iloc[dev_indices]
    embs = embs[dev_indices]
    msks = msks[dev_indices]
    sMs = sent_masks[dev_indices]
    ys = ys[dev_indices]
    ys_regression = ys_regression[dev_indices]

    for traincv, testcv in cv.split(X):
        print("=" * 50 + "Essay " + str(ess_set) + "=" * 50)
        print("\n--------Fold {}--------\n".format(count))

        print("current number of classes = {}".format(num_classes))
        # These are for 5-fold training sets
        Xn = embs[traincv]
        Mn = tf.cast(msks[traincv], dtype=tf.float32)  # , keepdims = True
        sMn = sMs[traincv]
        yn = ys[traincv]
        yn_regression = ys_regression[traincv]

        # These are for 5-fold dev sets
        Xv = embs[testcv]
        Mv = tf.cast(msks[testcv], dtype=tf.float32)  # , keepdims = True
        sMv = sMs[testcv]
        yv = ys[testcv]
        yv_regression = ys_regression[testcv]

        ### =======  Define Inputs ======= ###
        Input_embeds = tf.keras.Input(shape=(
            MAX_LEN,
            EMBED_DIM,
        ))
        Input_masks = tf.keras.Input(shape=(
            MAX_LEN,
            EMBED_DIM,
        ))
        Input_sentence_masks = tf.keras.Input(shape=(MAX_LEN, ))
        inputs = [Input_embeds, Input_masks, Input_sentence_masks]

        ### =======  Define Query, Key, and Value for Transformer ======= ###
        q_input = layers.Dense(EMBED_DIM, name="Query_Input")(inputs[0])

        q_input = layers.Multiply()([q_input, inputs[1]])

        v_input = layers.Dense(EMBED_DIM, name="Value_Input")(inputs[0])

        v_input = layers.Multiply()([v_input, inputs[1]])

        if num_classes > 8:
            k_input = layers.Dense(EMBED_DIM, name="Key_Input")(inputs[0])

            k_input = layers.Multiply()([k_input, inputs[1]])

        else:
            k_input = v_input

        ### =======  Masked MultiHead Attention ======= ###
        mha_output1, attention_weights = MultiHeadAttention(
            d_model=EMBED_DIM, num_heads=NUM_HEADS)(v=v_input,
                                                    k=k_input,
                                                    q=q_input,
                                                    mask=None)

        mha_masked = layers.Multiply(name="MHA_Masking_1")(
            [mha_output1, inputs[1]])
        mha_masked = layers.Dropout(DROPOUT)(mha_masked)
        forLSTM = layers.Masking(name="DFF_Mask2")(mha_masked, )

        if num_classes > 5:
            ### =======  Add and Norm ======= ###
            layer_norm_output = layers.LayerNormalization(
                epsilon=1e-6, name="MHA_Norm_1")(mha_masked + mha_output1)

            ### =======  Masked MultiHead Attention ======= ###
            mha_output2, attention_weights = MultiHeadAttention(
                d_model=EMBED_DIM, num_heads=NUM_HEADS)(v=v_input,
                                                        k=k_input,
                                                        q=layer_norm_output,
                                                        mask=None)
            mha_output2 = layers.Multiply()([mha_output2, inputs[1]])
            mha_output2 = layers.Dropout(DROPOUT)(mha_output2)

            ### =======  Add and Norm ======= ###
            layer_norm_output2 = layers.LayerNormalization(
                epsilon=1e-6,
                name="MHA_Norm_2")(mha_output2 + layer_norm_output)

            ### =======  Feed Forward Network ======= ###
            ffn_output = layers.Dense(
                EMBED_DIM,
                activation='relu',
                name="DFF_1",
                kernel_regularizer=tf.keras.regularizers.l2(0.001))(
                    layer_norm_output2)
            ffn_output = layers.Dropout(DROPOUT)(
                ffn_output)  # 0.2 * SET_SIZE_RATIO
            q_ffn = layers.Dense(EMBED_DIM,
                                 name="Q_DFF_2",
                                 kernel_regularizer=tf.keras.regularizers.l2(
                                     0.0001))(ffn_output)
            q_ffn = layers.LayerNormalization(epsilon=1e-6,
                                              name="Qffn_normalization")(q_ffn)
            q_ffn = layers.Multiply()([q_ffn, inputs[1]])
            forLSTM = layers.Masking(name="DFF_Mask2")(q_ffn, )

            if num_classes > 100:
                ### =======  Feed Forward Network for Second Layer ======= ###
                DROPOUT_2 = 0.5
                v_ffn = layers.Dense(
                    EMBED_DIM,
                    name="V_DFF_2",
                    kernel_regularizer=tf.keras.regularizers.l2(0.0001))(
                        ffn_output)
                v_ffn = layers.Multiply()([v_ffn, inputs[1]])
                v_ffn = layers.LayerNormalization(
                    epsilon=1e-6, name="Vffn_normalization")(v_ffn)
                v_ffn = layers.Multiply()([v_ffn, inputs[1]])
                k_ffn = layers.Dense(
                    EMBED_DIM,
                    name="K_DFF_2",
                    kernel_regularizer=tf.keras.regularizers.l2(0.0001))(
                        ffn_output)
                k_ffn = layers.Multiply()([k_ffn, inputs[1]])
                k_ffn = layers.LayerNormalization(
                    epsilon=1e-6, name="Kffn_normalization")(k_ffn)
                k_ffn = layers.Multiply()([k_ffn, inputs[1]])
                #         ffn_output= layers.Multiply()([ffn_output, inputs[1]])
                ffn_output = layers.Dropout(DROPOUT)(ffn_output)

                ### =======  Add and Norm ======= ###
                layer_norm_output3 = layers.LayerNormalization(
                    epsilon=1e-6,
                    name="DFF_Norm")(ffn_output + layer_norm_output2)

                ### =======  Masked MultiHead Attention ======= ###
                mha_output3, attention_weights = MultiHeadAttention(
                    d_model=EMBED_DIM, num_heads=NUM_HEADS)(v=v_input,
                                                            k=k_input,
                                                            q=q_ffn,
                                                            mask=None)
                mha_masked3 = layers.Multiply(name="MHA_Masking_3")(
                    [mha_output3, inputs[1]])
                mha_masked3 = layers.Dropout(DROPOUT)(mha_masked3)

                ### =======  Add and Norm ======= ###
                layer_norm_output4 = layers.LayerNormalization(
                    epsilon=1e-6, name="MHA_Norm_3")(mha_masked3 + mha_output3)

                ### =======  Masked MultiHead Attention ======= ###
                mha_output4, attention_weights = MultiHeadAttention(
                    d_model=EMBED_DIM,
                    num_heads=NUM_HEADS)(v=v_input,
                                         k=k_input,
                                         q=layer_norm_output4,
                                         mask=None)
                mha_output4 = layers.Multiply()([mha_output4, inputs[1]])
                mha_output4 = layers.Dropout(DROPOUT)(mha_output4)

                ### =======  Add and Norm ======= ###
                layer_norm_output5 = layers.LayerNormalization(
                    epsilon=1e-6,
                    name="MHA_Norm_5")(mha_output4 + layer_norm_output4)

                ### =======  Feed Forward Network ======= ###
                ffn2_output = layers.Dense(
                    EMBED_DIM,
                    activation='relu',
                    name="DFF_4",
                    kernel_regularizer=tf.keras.regularizers.l2(0.001))(
                        layer_norm_output5)
                ffn2_output = layers.Dropout(DROPOUT)(
                    ffn2_output)  # 0.2 * SET_SIZE_RATIO
                ffn2_output = layers.Dense(
                    EMBED_DIM,
                    name="DFF_5",
                    kernel_regularizer=tf.keras.regularizers.l2(0.0001))(
                        ffn2_output)
                ffn2_output = layers.Dropout(DROPOUT)(ffn2_output)

                ### =======  Add and Norm ======= ###
                layer_norm_output6 = layers.LayerNormalization(
                    epsilon=1e-6,
                    name="DFF_Norm2")(ffn2_output + layer_norm_output5)
                layer_norm_output6 = layers.Multiply()(
                    [layer_norm_output6, inputs[1]])
                forLSTM = layers.Masking(name="DFF_Mask2")(
                    layer_norm_output6, )

        ### =======  Bidirectional LSTM  ======= ###
        LSTM3 = layers.Bidirectional(
            layers.LSTM(
                EMBED_DIM // 2,  # EMBED_DIM//2,
                dropout=DROPOUT,
                recurrent_dropout=0,
                activation='tanh',
                use_bias=True,
                unroll=False,
                recurrent_activation='sigmoid'),
            name="BLSTM")(inputs=forLSTM)
        LSTM3 = layers.LayerNormalization(epsilon=1e-6,
                                          name="LSTM_Norm_3")(LSTM3)

        LSTM4 = layers.LSTM(
            EMBED_DIM,  # EMBED_DIM//2,
            dropout=DROPOUT,
            recurrent_dropout=0,
            activation='tanh',
            use_bias=True,
            unroll=False,
            recurrent_activation='sigmoid')(inputs=inputs[0])
        LSTM4 = layers.Dense(
            HIDDEN_SIZE // 2,
            activation='relu',
            name="DFF_small_sets",
            kernel_regularizer=tf.keras.regularizers.l2(0.0001))(LSTM4)
        LSTM4 = layers.LayerNormalization(epsilon=1e-6,
                                          name="LSTM_Norm_small")(LSTM4)

        ### =======  Feed Forward Network ======= ###
        final_hidden = layers.Dense(
            EMBED_DIM,
            activation='relu',
            name="DFF_last",
            kernel_regularizer=tf.keras.regularizers.l2(0.0001))(LSTM3)
        final_hidden = layers.LayerNormalization(
            epsilon=1e-6, name="DFF_normalize")(final_hidden)
        final_hidden = layers.Dropout(DROPOUT)(final_hidden)

        ### =======  Define Multiple Outputs ======= ###
        outputs_class = layers.Dense(num_classes,
                                     activation='relu',
                                     name="Classification")(final_hidden)
        outputs_regression = layers.Dense(1,
                                          activation='relu',
                                          name="Regression")(final_hidden)

        outputs_combined = layers.Concatenate()(
            [outputs_class, outputs_regression, LSTM3])

        if avg_len_dict[ess_set] < 200:
            outputs_combined = layers.Concatenate()(
                [outputs_class, outputs_regression])
            outputs_combined = layers.Dense(
                HIDDEN_SIZE, activation='relu',
                name="Hidden_Class_small")(outputs_combined)
            outputs_combined = layers.Dropout(DROPOUT * 1.3)(outputs_combined)
            outputs_combined = layers.Concatenate()([outputs_combined, LSTM3])
            outputs_combined = layers.Dense(
                HIDDEN_SIZE, activation='relu',
                name="Hidden_Class_small2")(outputs_combined)
            outputs_combined = layers.Dropout(DROPOUT * 1.3)(outputs_combined)
            outputs_combined = layers.Dense(
                HIDDEN_SIZE // 2,
                activation='relu',
                name="Hidden_Class_small3")(outputs_combined)
            outputs_combined = layers.Dropout(DROPOUT * 1.3)(outputs_combined)

        elif avg_len_dict[ess_set] < 400:
            outputs_combined = layers.Concatenate()(
                [outputs_class, outputs_regression, LSTM3])
            outputs_combined = layers.Dense(
                HIDDEN_SIZE // 2, activation='relu',
                name="Hidden_Class_med1")(outputs_combined)
            outputs_combined = layers.Dropout(DROPOUT * 1.25)(outputs_combined)
            outputs_combined = layers.Dense(
                HIDDEN_SIZE // 4, activation='relu',
                name="Hidden_Class_med2")(outputs_combined)
            outputs_combined = layers.Dropout(DROPOUT * 1.25)(outputs_combined)
        else:
            outputs_combined = layers.Concatenate()(
                [outputs_class, outputs_regression, LSTM3])
            outputs_combined = layers.Dropout(DROPOUT * 1.25)(outputs_combined)

        outputs_combined_reg = layers.Dense(
            1, activation='sigmoid', name="Combined_Regress")(outputs_combined)
        outputs_combined_class = layers.Dense(
            HIDDEN_SIZE, activation='relu',
            name="Final_hidden_Class")(outputs_combined)

        outputs_combined_class = layers.Dense(
            num_classes, activation='softmax',
            name="Combined_Class")(outputs_combined_class)

        ### =======  Output ======= ###
        final_outputs = [outputs_combined_class, outputs_combined_reg]

        ### =======  Define Model Parameters and Compile ======= ###
        model = tf.keras.models.Model(inputs=inputs, outputs=final_outputs)

        # Exactly one loss per model output, in the same order as
        # `final_outputs` ([Combined_Class, Combined_Regress]).  The original
        # list carried a third, unmatched WeightedKappaLoss entry: depending
        # on the Keras version it is either silently ignored or rejected.
        loss = [
            WeightedKappaLoss(num_classes=num_classes, label_smoothing=0),
            tf.keras.losses.MeanSquaredError(),
        ]

        # The optimizer is driven by the warm-up schedule only; the constant
        # LEARNING_RATE was assigned here and immediately overwritten, so
        # that dead assignment has been dropped.
        learning_rate = CustomSchedule(d_model=EMBED_DIM,
                                       warmup_steps=750,
                                       is_sine=False)
        optimizer = tf.keras.optimizers.Adamax(learning_rate=learning_rate)

        # loss_weights pairs up with `loss`: LOSS_WEIGHT for the class loss,
        # num_classes * (1 - LOSS_WEIGHT) for the regression MSE.
        model.compile(
            loss=loss,
            optimizer=optimizer,
            metrics=['acc', 'mse'],
            loss_weights=[LOSS_WEIGHT, num_classes * (1 - LOSS_WEIGHT)],
        )

        ### =======  Train Model ======= ###

        # Two independent early-stopping monitors: training halts as soon as
        # either one runs out of patience.  The class-loss monitor is the
        # stricter one (shorter patience, larger min_delta); the regression
        # monitor waits six times longer.
        callback = [
            tf.keras.callbacks.EarlyStopping(monitor='val_Combined_Class_loss',
                                             patience=PATIENCE,
                                             min_delta=0.01,
                                             restore_best_weights=True),
            tf.keras.callbacks.EarlyStopping(
                monitor='val_Combined_Regress_loss',
                patience=PATIENCE * 6,
                min_delta=0.0001,
                restore_best_weights=True),
        ]
        # `callback` is already a list, so it is passed as-is; wrapping it in
        # another list (callbacks=[callback]) hands Keras a nested list,
        # which only works on versions that flatten the argument.
        history = model.fit(
            [Xn, Mn, sMn],
            [yn, yn_regression],
            validation_data=([Xv, Mv, sMv], [yv, yv_regression]),
            batch_size=512,  # hard-coded; BATCH_SIZES is not used here
            epochs=EPOCHS,
            callbacks=callback)

        ### =======  Record History and Summaries ======= ###
        # Keep this fold's training history and print the architecture.
        ess_history.append(history)
        model.summary()

        # Echo the hyper-parameter constants for this run, one per line.
        # NOTE(review): model.fit above is called with batch_size=512, so the
        # BATCH_SIZES value printed here may not be the batch size actually
        # used -- confirm which one is intended.
        hyperparameter_report = (
            ("BATCH_SIZES = {}", BATCH_SIZES),
            ("HIDDEN_SIZE = {}", HIDDEN_SIZE),
            ("TOTAL EPOCHS =  {}", EPOCHS),
            ("PATIENCE = {}", PATIENCE),
            ("DROPOUT = {}", DROPOUT),
            ("LEARNING_RATE =  {}", LEARNING_RATE),
            ("percent classification = {} ", LOSS_WEIGHT),
            ("NUM_HEADS = {}", NUM_HEADS),
        )
        for template, value in hyperparameter_report:
            print(template.format(value))

        ### =======  Evaluate on Held-out Test Set ======= ###
        # y_pred[0] is the softmax class output, y_pred[1] the sigmoid
        # regression output (same order as the model's outputs).
        y_pred = model.predict([Xt, Mt, sMt])
        class_probs, regress_raw = y_pred[0], y_pred[1]

        # Hard class labels from the softmax head.
        y_pred_class_c = np.argmax(class_probs, axis=1)
        y_pred_class = np.argmax(class_probs, axis=1)

        # Regression score scaled by the number of classes and rounded to
        # the nearest integer label.
        # NOTE(review): a sigmoid value close to 1 rounds to `num_classes`,
        # which may lie outside the label range -- confirm how the
        # regression targets were scaled.
        y_pred_regression = np.round(regress_raw * num_classes)

        # Blend of both heads: LOSS_WEIGHT on the class label and
        # (1 - LOSS_WEIGHT) on the rescaled regression score, then rounded.
        y_comb = np.round(
            np.argmax(class_probs, axis=1) * LOSS_WEIGHT +
            regress_raw[:, 0] * num_classes * (1 - LOSS_WEIGHT))

        # For each prediction variant: confusion matrix, banner, then the
        # quadratic-weighted Cohen's kappa against the true labels.
        fold_reports = (
            ("\n--------Fold {} Classification KAPPA--------\n", y_pred_class),
            ("\n--------Fold {} Regression KAPPA--------\n",
             y_pred_regression),
            ("\n--------Fold {} Combined KAPPA--------\n", y_comb),
        )
        fold_kappas = []
        for banner, predicted in fold_reports:
            print(confusion_matrix(yt, predicted))
            print(banner.format(count))
            kappa = cohen_kappa_score(yt, predicted, weights='quadratic')
            print(kappa)
            fold_kappas.append(kappa)
        class_result, regress_result, comb_result = fold_kappas

        class_results.append(class_result)
        regress_results.append(regress_result)
        comb_results.append(comb_result)

        # When the fold counter reaches FOLDS, report the best kappa seen
        # so far for each variant.
        if count == FOLDS:
            best_score_report = (
                ("{}__FOLD_CLASS_KAPPA___SCORE___MAX: {}", class_results),
                ("{}__FOLD_REGRESS_KAPPA___SCORE___MAX: {}", regress_results),
                ("{}__FOLD_COMB_KAPPA___SCORE___MAX: {}", comb_results),
            )
            for template, scores in best_score_report:
                print(template.format(FOLDS, np.max(scores)))

        count += 1

    # Store this essay set's per-fold histories and results.
    # NOTE(review): `results` is not assigned anywhere in the visible part
    # of the loop -- confirm it is defined earlier.
    histories_mha_combined_objective_final4.append(ess_history)
    result_history.append(results)

    # One figure per output head, one subplot per fold, each showing the
    # training loss and the validation loss of that head.
    for loss_key, label in (('Combined_Regress_loss', 'regress'),
                            ('Combined_Class_loss', 'class')):
        # squeeze=False keeps `ax` a 2-D array even when FOLDS == 1, so the
        # per-fold indexing below never hits a bare Axes object.
        f, ax = plt.subplots(nrows=1,
                             ncols=FOLDS,
                             figsize=[15, 3],
                             sharey='row',
                             squeeze=False)
        for fd in range(FOLDS):
            fold_history = histories_mha_combined_objective_final4[
                essay_set_counter][fd].history
            ax[0, fd].plot(fold_history[loss_key])
            ax[0, fd].plot(fold_history['val_' + loss_key])
        # These pyplot calls act on the current axes only (the last subplot
        # created), as in the original code.
        plt.title('model {} loss essay {}'.format(label, ess_set))
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        # Save BEFORE showing: once plt.show() has displayed the figure it
        # is typically closed, and a later plt.savefig() writes a blank
        # image.
        f.savefig('mha2_combined_{}_features_8_6_capacity_essay_{}.png'.format(
            label, ess_set))
        plt.show()

    essay_set_counter += 1