In [None]:
import tensorflow as tf
tf.random.set_seed(1234)
AUTO = tf.data.experimental.AUTOTUNE
!pip install tensorflow-datasets==1.2.0
import tensorflow_datasets as tfds
import re
import sys
from time import time
import numpy as np 
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    paths = (os.path.join(folder, name) for name in names)
    for path in paths:
        print(path)

# Read Data

In [None]:
# JSON source: only the first 50,000 rows are kept.
d1 = pd.read_json('/kaggle/input/covid-chitchat/9L_dataset.json').head(50000)
# CSV sources, loaded in full.
dt = pd.read_csv('/kaggle/input/emphetic-dialog-fb/emotion-emotion_69k.csv')
d = pd.read_csv('/kaggle/input/exemplary-empathy-2490/emotion_train.csv')

# Concatenate Data

In [None]:
# Stack the question-like and answer-like columns of the three sources in the
# same order, and keep them side by side in one frame. Dropping missing values
# on the paired frame removes a whole (prompt, response) row at once; dropping
# them on each column separately would shift every later answer against its
# question as soon as one side had a missing value the other did not.
pairs = pd.DataFrame({
    'prompt': pd.concat([dt['Situation'], d['seeker_post'], d1['question']], ignore_index=True),
    'response': pd.concat([dt['labels'], d['response_post'], d1['answer']], ignore_index=True),
}).dropna()

concat_q = pairs['prompt']
concat_a = pairs['response']
prompt = concat_q.tolist()
response = concat_a.tolist()

# Both lists come from the same rows, so they must be equally long.
assert len(prompt) == len(response), "prompt/response pairs are misaligned"

print(len(prompt))
print(len(response))

# Hyperparameters

In [None]:
# --- Data pipeline ---------------------------------------------------------
# max_len: longest tokenised sequence (start/end ids included) that is kept,
#          and the length every sequence is padded to.
# max_sample: sample cap (not referenced by the cells shown in this notebook).
max_len, max_sample = 60, 117125
# batch_size: examples per batch; buffer_size: shuffle buffer size.
batch_size, buffer_size = 64, 70000

# --- Model -----------------------------------------------------------------
# Number of stacked encoder/decoder layers and of attention heads.
number_of_layer, number_of_head = 2, 8
# d_model: embedding / attention width; unit: feed-forward hidden width.
d_model, unit = 512, 128
# Dropout rate hyperparameter.
dropout = 0.1

# Data Preprocessing

In [None]:
def text_preprocess(s):
    """Normalise one sentence for tokenisation.

    The text is lower-cased and trimmed, each of ``? . ! ,`` is surrounded by
    spaces so it becomes a separate word, and every run of characters other
    than letters and those four punctuation marks is replaced by one space.

    Args:
        s: The raw sentence (a ``str``).

    Returns:
        The cleaned sentence without leading or trailing whitespace.
    """
    # Applied in order; each entry is (pattern, replacement).
    substitutions = (
        (r"([?.!,])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!,]+", " "),
    )
    cleaned = s.lower().strip()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Run the same normalisation over both sides of every pair.
prompt = list(map(text_preprocess, prompt))
response = list(map(text_preprocess, response))

# Tokenize Prompts and Responses

In [None]:
# Learn a subword vocabulary from all prompts and responses together.
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    prompt + response,
    target_vocab_size=8000,
)

# The two ids directly after the learned vocabulary mark sequence start and end,
# so the model's vocabulary is two entries larger than the tokenizer's.
s_token = [tokenizer.vocab_size]
e_token = [tokenizer.vocab_size + 1]
vocab_size = tokenizer.vocab_size + 2

t_prompt, t_response = [], []
for question, answer in zip(prompt, response):
    question_ids = s_token + tokenizer.encode(question) + e_token
    answer_ids = s_token + tokenizer.encode(answer) + e_token
    # A pair is skipped as soon as either side is longer than max_len ids.
    if len(question_ids) > max_len or len(answer_ids) > max_len:
        continue
    t_prompt.append(question_ids)
    t_response.append(answer_ids)

# Right-pad every kept sequence with zeros up to max_len.
prompt = tf.keras.preprocessing.sequence.pad_sequences(
    t_prompt, maxlen=max_len, padding='post')
response = tf.keras.preprocessing.sequence.pad_sequences(
    t_response, maxlen=max_len, padding='post')

# Create Train and Validation Data

In [None]:
# Each example feeds the prompt to the encoder. The decoder input is the
# response without its last position and the target is the response without its
# first position, i.e. the target is the decoder input shifted left by one.
data = tf.data.Dataset.from_tensor_slices((
    {'inputs': prompt, 'dec_inputs': response[:, :-1]},
    {'outputs': response[:, 1:]},
))
data = data.cache()
# The order must be frozen BEFORE splitting: by default `shuffle` draws a new
# order on every pass, so `take`/`skip` would select different examples each
# epoch and validation examples would leak into training.
data = data.shuffle(buffer_size, seed=1234, reshuffle_each_iteration=False)
data = data.batch(batch_size)

# Sizes are counted in batches; 80% of the batches go to training.
dataset_size = len(data)
train_size = int(0.8 * dataset_size)

# Only the training batches are reshuffled between epochs. `max(..., 1)` keeps
# the shuffle buffer valid when the training split is empty.
train_dataset = (
    data.take(train_size)
    .shuffle(max(train_size, 1))
    .prefetch(tf.data.experimental.AUTOTUNE)
)
val_dataset = data.skip(train_size).prefetch(tf.data.experimental.AUTOTUNE)

# Multi Head Attention

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention.

    The layer is called with a dict holding the keys ``'query'``, ``'key'``,
    ``'value'`` and ``'mask'``. Query, key and value are each projected to
    ``d_model`` features, split into ``num_heads`` heads of ``depth`` features,
    attended per head, merged again and passed through a final dense layer.

    Args:
        d_model: Width of the projections and of the output; must be divisible
            by ``num_heads``.
        num_heads: Number of attention heads.
        name: Layer name.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``trainable``,
            ``dtype``), so a config produced by ``get_config`` can be fed back
            to the constructor.
    """

    def __init__(self, d_model, num_heads, name="multi_head_attention", **kwargs):
        super(MultiHeadAttention, self).__init__(name=name, **kwargs)
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % self.num_heads == 0, "d_model must be divisible by num_heads"
        # Number of features handled by each head.
        self.depth = d_model // self.num_heads

        self.query_dense = tf.keras.layers.Dense(units=d_model)
        self.key_dense = tf.keras.layers.Dense(units=d_model)
        self.value_dense = tf.keras.layers.Dense(units=d_model)
        self.dense = tf.keras.layers.Dense(units=d_model)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super(MultiHeadAttention,self).get_config()
        config.update({ 'num_heads':self.num_heads,'d_model':self.d_model,})
        return config

    def split_heads(self, inputs, batch_size):
        """Reshape ``inputs`` to (batch_size, num_heads, -1, depth)."""
        # Plain TF ops are used here: building a new Keras Lambda layer on
        # every call adds nothing inside a layer's own `call`.
        inputs = tf.reshape(inputs, shape=(batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(inputs, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply attention.

        Args:
            inputs: Dict with ``'query'``, ``'key'``, ``'value'`` and
                ``'mask'``. Wherever the mask is 1, -1e9 is added to the
                attention logit, so that position gets (almost) no weight;
                ``None`` disables masking.

        Returns:
            Tensor reshaped to (batch, -1, d_model) after the output projection.
        """
        query, key, value, mask = inputs['query'], inputs['key'], inputs['value'], inputs['mask']
        batch_size = tf.shape(query)[0]

        # Linear projections, then one slice of `depth` features per head.
        query = self.split_heads(self.query_dense(query), batch_size)
        key = self.split_heads(self.key_dense(key), batch_size)
        value = self.split_heads(self.value_dense(value), batch_size)

        # Scaled dot-product attention: scale by sqrt of the per-head depth.
        matmul_qk = tf.matmul(query, key, transpose_b=True)
        depth = tf.cast(tf.shape(key)[-1], tf.float32)
        logits = matmul_qk / tf.math.sqrt(depth)
        if mask is not None:
            logits += (mask * -1e9)
        attention_weights = tf.nn.softmax(logits, axis=-1)
        scaled_attention = tf.matmul(attention_weights, value)

        # Undo the head split: move the head axis back next to the feature
        # axis, then merge the two into d_model features.
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))

        outputs = self.dense(concat_attention)
        return outputs

# Positional Encoding

In [None]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Adds a fixed sinusoidal position signal to its input.

    A table with ``position`` rows and ``d_model`` columns is computed once in
    the constructor. In every row the sine terms come first and the cosine
    terms second (they are concatenated, not interleaved).

    Args:
        position: Number of positions to precompute.
        d_model: Feature width of the encoding.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``), so a
            config produced by ``get_config`` can be fed back to the
            constructor.
    """

    def __init__(self, position, d_model, **kwargs):
        super(PositionalEncoding, self).__init__(**kwargs)
        # Kept on the instance because get_config reports them; without these
        # two attributes get_config raises AttributeError.
        self.position = position
        self.d_model = d_model
        self.pos_encoding = self.positional_encoding(position, d_model)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super(PositionalEncoding, self).get_config()
        config.update({'position': self.position,'d_model': self.d_model,})
        return config

    def get_angles(self, position, i, d_model):
        """Return ``position / 10000 ** (2 * (i // 2) / d_model)``."""
        angles = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32))
        return position * angles

    def positional_encoding(self, position, d_model):
        """Build the (1, position, d_model) float32 encoding table."""
        # A column of positions times a row of feature indices gives one angle
        # per (position, feature) pair.
        angle_rads = self.get_angles(position=tf.range(position, dtype=tf.float32)[:, tf.newaxis],i=tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],d_model=d_model)
        # Sine of the even-indexed columns, cosine of the odd-indexed columns.
        sines = tf.math.sin(angle_rads[:, 0::2])
        cosines = tf.math.cos(angle_rads[:, 1::2])
        pos_encoding = tf.concat([sines, cosines], axis=-1)
        # Leading axis of size 1 so the table broadcasts over the batch.
        pos_encoding = pos_encoding[tf.newaxis, ...]
        return tf.cast(pos_encoding, tf.float32)

    def call(self, inputs):
        """Add the encoding for the first ``tf.shape(inputs)[1]`` positions."""
        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]

# Encoder Blocks

In [None]:
def encoder_layer(units, d_model, num_heads, dropout, name="encoder_layer"):
    """Build one encoder block as a Keras functional model.

    The block is self-attention followed by a two-layer feed-forward network;
    each part is followed by dropout, a residual addition and layer
    normalisation.

    Args:
        units: Width of the hidden feed-forward layer.
        d_model: Feature width of the block's input and output.
        num_heads: Number of attention heads.
        dropout: Rate used by both dropout layers.
        name: Name of the returned model.

    Returns:
        A ``tf.keras.Model`` taking ``[inputs, padding_mask]``.
    """
    layers = tf.keras.layers
    block_in = tf.keras.Input(shape=(None, d_model), name="inputs")
    mask_in = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

    # Self-attention: query, key and value are all the block input.
    self_attn = MultiHeadAttention(d_model, num_heads, name="attention")(
        {'query': block_in, 'key': block_in, 'value': block_in, 'mask': mask_in})
    self_attn = layers.Dropout(rate=dropout)(self_attn)
    residual = layers.add([block_in, self_attn])
    attn_out = layers.LayerNormalization(epsilon=1e-6)(residual)

    # Position-wise feed-forward network: expand to `units`, project back.
    ffn = layers.Dense(units=units, activation='relu')(attn_out)
    ffn = layers.Dense(units=d_model)(ffn)
    ffn = layers.Dropout(rate=dropout)(ffn)
    residual = layers.add([attn_out, ffn])
    block_out = layers.LayerNormalization(epsilon=1e-6)(residual)

    return tf.keras.Model(inputs=[block_in, mask_in], outputs=block_out, name=name)




def encoder(vocab_size,num_layers, units,d_model,num_heads,dropout, name="encoder"):
    """Build the encoder stack as a Keras functional model.

    Token ids are embedded, scaled by ``sqrt(d_model)``, given positional
    encodings and dropout, and then passed through ``num_layers`` encoder
    blocks that all receive the same padding mask.

    Args:
        vocab_size: Size of the embedding table; also the number of positions
            precomputed by the positional encoding.
        num_layers: Number of encoder blocks.
        units: Feed-forward hidden width of each block.
        d_model: Embedding and block width.
        num_heads: Attention heads per block.
        dropout: Dropout rate.
        name: Name of the returned model.

    Returns:
        A ``tf.keras.Model`` taking ``[inputs, padding_mask]``.
    """
    token_ids = tf.keras.Input(shape=(None,), name="inputs")
    mask_in = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

    embedded = tf.keras.layers.Embedding(vocab_size, d_model)(token_ids)
    scale = tf.keras.layers.Lambda(lambda d_model: tf.math.sqrt(tf.cast(d_model, tf.float32)))(d_model)
    embedded = embedded * scale
    embedded = PositionalEncoding(vocab_size, d_model)(embedded)

    hidden = tf.keras.layers.Dropout(rate=dropout)(embedded)

    for layer_index in range(num_layers):
        block = encoder_layer(
            units=units,
            d_model=d_model,
            num_heads=num_heads,
            dropout=dropout,
            name="encoder_layer_{}".format(layer_index),
        )
        hidden = block([hidden, mask_in])

    return tf.keras.Model(inputs=[token_ids, mask_in], outputs=hidden, name=name)

# Decoder Blocks

In [None]:
def decoder_layer(units, d_model, num_heads, dropout, name="decoder_layer"):
    """Build one decoder block as a Keras functional model.

    The block is masked self-attention, then attention over the encoder
    outputs, then a two-layer feed-forward network. Each part ends with a
    residual addition and layer normalisation; dropout is applied after the
    second attention and after the feed-forward network.

    Args:
        units: Width of the hidden feed-forward layer.
        d_model: Feature width of the block's input and output.
        num_heads: Number of attention heads.
        dropout: Rate used by the dropout layers.
        name: Name of the returned model.

    Returns:
        A ``tf.keras.Model`` taking
        ``[inputs, encoder_outputs, look_ahead_mask, padding_mask]``.
    """
    layers = tf.keras.layers
    block_in = tf.keras.Input(shape=(None, d_model), name="inputs")
    memory = tf.keras.Input(shape=(None, d_model), name="encoder_outputs")
    causal_mask = tf.keras.Input(shape=(1, None, None), name="look_ahead_mask")
    memory_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

    # Self-attention over the decoder input, using the look-ahead mask.
    self_attn = MultiHeadAttention(d_model, num_heads, name="attention_1")(
        {'query': block_in, 'key': block_in, 'value': block_in, 'mask': causal_mask})
    residual = layers.add([self_attn, block_in])
    self_attn = layers.LayerNormalization(epsilon=1e-6)(residual)

    # Attention from the decoder state (query) to the encoder outputs.
    cross_attn = MultiHeadAttention(d_model, num_heads, name="attention_2")(
        {'query': self_attn, 'key': memory, 'value': memory, 'mask': memory_mask})
    cross_attn = layers.Dropout(rate=dropout)(cross_attn)
    residual = layers.add([cross_attn, self_attn])
    cross_attn = layers.LayerNormalization(epsilon=1e-6)(residual)

    # Position-wise feed-forward network: expand to `units`, project back.
    ffn = layers.Dense(units=units, activation='relu')(cross_attn)
    ffn = layers.Dense(units=d_model)(ffn)
    ffn = layers.Dropout(rate=dropout)(ffn)
    residual = layers.add([ffn, cross_attn])
    block_out = layers.LayerNormalization(epsilon=1e-6)(residual)

    return tf.keras.Model(
        inputs=[block_in, memory, causal_mask, memory_mask],
        outputs=block_out,
        name=name,
    )




def decoder(vocab_size, num_layers, units,d_model,num_heads,dropout,name='decoder'):
    """Build the decoder stack as a Keras functional model.

    Token ids are embedded, scaled by ``sqrt(d_model)``, given positional
    encodings and dropout, and then passed through ``num_layers`` decoder
    blocks that all receive the same encoder outputs and masks.

    Args:
        vocab_size: Size of the embedding table; also the number of positions
            precomputed by the positional encoding.
        num_layers: Number of decoder blocks.
        units: Feed-forward hidden width of each block.
        d_model: Embedding and block width.
        num_heads: Attention heads per block.
        dropout: Dropout rate.
        name: Name of the returned model.

    Returns:
        A ``tf.keras.Model`` taking
        ``[inputs, encoder_outputs, look_ahead_mask, padding_mask]``.
    """
    token_ids = tf.keras.Input(shape=(None,), name='inputs')
    memory = tf.keras.Input(shape=(None, d_model), name='encoder_outputs')
    causal_mask = tf.keras.Input(shape=(1, None, None), name='look_ahead_mask')
    memory_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

    embedded = tf.keras.layers.Embedding(vocab_size, d_model)(token_ids)
    scale = tf.keras.layers.Lambda(lambda d_model: tf.math.sqrt(tf.cast(d_model, tf.float32)))(d_model)
    embedded = embedded * scale
    embedded = PositionalEncoding(vocab_size, d_model)(embedded)

    hidden = tf.keras.layers.Dropout(rate=dropout)(embedded)

    for layer_index in range(num_layers):
        block = decoder_layer(
            units=units,
            d_model=d_model,
            num_heads=num_heads,
            dropout=dropout,
            name='decoder_layer_{}'.format(layer_index),
        )
        hidden = block([hidden, memory, causal_mask, memory_mask])

    return tf.keras.Model(
        inputs=[token_ids, memory, causal_mask, memory_mask],
        outputs=hidden,
        name=name,
    )

# Masking

In [None]:
class PaddingMaskLayer(tf.keras.layers.Layer):
    """Marks the positions of ``x`` that are equal to 0.

    The result is float32, holds 1.0 where ``x == 0`` and 0.0 elsewhere, and
    has two axes of size 1 inserted after the first axis.
    """

    def call(self, x):
        is_padding = tf.math.equal(x, 0)
        as_float = tf.cast(is_padding, tf.float32)
        return as_float[:, tf.newaxis, tf.newaxis, :]
    
    
class LookAheadMaskLayer(tf.keras.layers.Layer):
    """Combines a look-ahead mask with the padding mask of ``x``.

    The look-ahead part is 1 strictly above the diagonal of a
    (seq_len, seq_len) matrix and 0 elsewhere. The element-wise maximum with
    the padding mask gives 1 wherever either mask is 1.
    """

    def call(self, x):
        length = tf.shape(x)[1]
        # band_part(..., -1, 0) keeps the lower triangle including the
        # diagonal; one minus that leaves only the entries above the diagonal.
        lower_triangle = tf.linalg.band_part(tf.ones((length, length)), -1, 0)
        future_positions = 1 - lower_triangle
        padded_positions = PaddingMaskLayer()(x)
        return tf.maximum(future_positions, padded_positions)

# Transformer

In [None]:
def transformer(vocab_size, num_layers,units, d_model,num_heads,dropout, name="transformer"):
    """Build the full encoder-decoder model.

    The model takes two id sequences, ``inputs`` (encoder side) and
    ``dec_inputs`` (decoder side), derives all masks from them and returns one
    score per vocabulary entry for every decoder position. No softmax is
    applied to the scores.

    Args:
        vocab_size: Vocabulary size used for embeddings and the output layer.
        num_layers: Number of blocks in the encoder and in the decoder.
        units: Feed-forward hidden width of each block.
        d_model: Embedding and block width.
        num_heads: Attention heads per block.
        dropout: Dropout rate.
        name: Name of the returned model.

    Returns:
        A ``tf.keras.Model`` taking ``[inputs, dec_inputs]``.
    """
    enc_ids = tf.keras.Input(shape=(None,), name="inputs")
    dec_ids = tf.keras.Input(shape=(None,), name="dec_inputs")

    # Padding mask for the encoder's self-attention.
    enc_padding_mask = PaddingMaskLayer()(enc_ids)
    # Look-ahead plus padding mask for the decoder's self-attention.
    look_ahead_mask = LookAheadMaskLayer()(dec_ids)
    # The decoder's second attention reads the encoder outputs, so its padding
    # mask is computed from the encoder ids as well.
    dec_padding_mask = PaddingMaskLayer()(enc_ids)

    encoder_model = encoder(
        vocab_size=vocab_size,
        num_layers=num_layers,
        units=units,
        d_model=d_model,
        num_heads=num_heads,
        dropout=dropout,
    )
    enc_outputs = encoder_model([enc_ids, enc_padding_mask])

    decoder_model = decoder(
        vocab_size=vocab_size,
        num_layers=num_layers,
        units=units,
        d_model=d_model,
        num_heads=num_heads,
        dropout=dropout,
    )
    dec_outputs = decoder_model([dec_ids, enc_outputs, look_ahead_mask, dec_padding_mask])

    logits = tf.keras.layers.Dense(units=vocab_size, name="outputs")(dec_outputs)

    return tf.keras.Model(inputs=[enc_ids, dec_ids], outputs=logits, name=name)

# Optimizer, Loss, and Metric

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

# Cross-entropy on raw logits, one value per target position. The reduction is
# done by hand in `loss` so that padding positions can be left out.
_token_cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')


def loss(y_true, y_pred):
    """Mean cross-entropy over the non-padding target positions.

    Targets are right-padded with id 0. Averaging over those positions as well
    would reward the model for predicting padding, so they are excluded.

    Args:
        y_true: Target ids; 0 marks padding.
        y_pred: Logits with one score per vocabulary entry and position.

    Returns:
        Scalar loss; 0 when the batch holds no real target position.
    """
    per_token = _token_cross_entropy(y_true, y_pred)
    not_padding = tf.cast(tf.not_equal(y_true, 0), per_token.dtype)
    # max(..., 1) avoids 0/0 for a batch made of padding only.
    return tf.reduce_sum(per_token * not_padding) / tf.maximum(tf.reduce_sum(not_padding), 1.0)


class _MaskedSparseCategoricalAccuracy(tf.keras.metrics.SparseCategoricalAccuracy):
    """Sparse categorical accuracy that ignores target positions equal to 0."""

    def update_state(self, y_true, y_pred, sample_weight=None):
        # Padding positions get weight 0, so they count neither as hits nor
        # towards the number of scored positions.
        not_padding = tf.cast(tf.not_equal(y_true, 0), tf.float32)
        return super().update_state(y_true, y_pred, sample_weight=not_padding)


accuracy = _MaskedSparseCategoricalAccuracy(name='accuracy')

# Model

In [None]:
# Build the model from the hyperparameters defined earlier in the notebook.
# The dropout rate is taken from the `dropout` hyperparameter instead of a
# repeated literal, so changing the hyperparameter actually affects the model.
model = transformer(
    vocab_size=vocab_size,
    num_layers=number_of_layer,
    units=unit,
    d_model=d_model,
    num_heads=number_of_head,
    dropout=dropout,
)

model.compile(optimizer=optimizer, loss=[loss], metrics=[accuracy])

# Train

In [None]:
# Train for 60 epochs, evaluating on the validation split after each epoch.
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=60,
)

# Perplexity

In [None]:
# Perplexity is exp(mean cross-entropy per real target token). The mean is
# therefore taken over the non-padding positions of the whole validation set:
# averaging over padded positions, or averaging per-batch means, would weight
# padding and short batches incorrectly.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
total_loss = 0.0
total_tokens = 0.0
for inputs, targets_dict in val_dataset:
    targets = targets_dict['outputs']
    predictions = model(inputs, training=False)
    token_loss = loss_object(targets, predictions)
    # Targets are right-padded with id 0; those positions are excluded.
    not_padding = tf.cast(tf.not_equal(targets, 0), token_loss.dtype)
    total_loss += tf.reduce_sum(token_loss * not_padding)
    total_tokens += tf.reduce_sum(not_padding)
# max(..., 1) keeps the division defined when the validation split is empty.
average_loss = total_loss / tf.maximum(total_tokens, 1.0)
perplexity = tf.exp(average_loss)
a = perplexity.numpy()
print(f"Perplexity: {a}")

# Inference

In [None]:
# Interactive chat: read a sentence, decode greedily, print the reply.
# Typing "exit" ends the loop.
while True:
    user_text = input("\nInput: ")
    if user_text == "exit":
        break

    # Encoder input: the cleaned sentence framed by the start and end ids,
    # with a leading batch axis of size 1.
    cleaned = text_preprocess(user_text)
    encoder_ids = tf.expand_dims(s_token + tokenizer.encode(cleaned) + e_token, axis=0)
    # The decoder starts from the start id alone and grows by one id per step.
    decoder_ids = tf.expand_dims(s_token, 0)

    for _ in range(max_len):
        logits = model(inputs=[encoder_ids, decoder_ids], training=False)
        # Only the scores for the last position decide the next id.
        last_step = logits[:, -1:, :]
        next_id = tf.cast(tf.argmax(last_step, axis=-1), tf.int32)
        # Stop as soon as the end id is predicted; it is not appended.
        if tf.equal(next_id, e_token[0]):
            break
        decoder_ids = tf.concat([decoder_ids, next_id], axis=-1)

    generated = tf.squeeze(decoder_ids, axis=0)
    # Ids at or above the tokenizer's vocabulary size are the start/end
    # markers, which the tokenizer cannot decode.
    reply = tokenizer.decode([tok for tok in generated if tok < tokenizer.vocab_size])
    print('Output: {}'.format(reply))
  