In [None]:
# Mount Google Drive at /gdrive (Colab-only API) and make it the working
# directory so the absolute /gdrive/... paths used below resolve.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


Install packages

In [None]:
pip install -q -U tensorflow-hub tensorflow-text tensorflow-addons

In [None]:
!pip install pydot --quiet
!pip install gensim==3.8.3 --quiet
!pip install tensorflow-datasets --quiet
!pip install -U tensorflow-text --quiet
!pip install transformers --quiet
!pip install pydot --quiet

In [None]:
# tensorflow-recommenders is imported below as `tfrs`.
# NOTE(review): scann is not imported in the visible cells - confirm it is still needed.
!pip install -q tensorflow-recommenders
!pip install -q scann

In [None]:
import pandas as pd
import regex as re
import csv
from itertools import islice
import pickle
import numpy as np
import json
import os
import sys
import argparse
from pathlib import Path
import pprint
import tempfile
import re
from typing import Dict, Text

import sklearn as sk
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import reuters
from nltk.data import find

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import ndcg_score
from sklearn.metrics import average_precision_score

In [None]:
import os
import collections
import json
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_hub as hub
import tensorflow_text as text
import tensorflow_addons as tfa
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from tqdm import tqdm

# Only log TensorFlow messages at ERROR level or above. This hides the
# warnings emitted while loading TF Hub modules, but also every other
# TensorFlow warning/info message.
tf.get_logger().setLevel("ERROR")

In [None]:
from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds
import tensorflow_text as tf_text
import tensorflow_recommenders as tfrs

from transformers import BertTokenizer, TFBertModel

In [None]:
# Work from the FinBERT-QA checkout on Drive so that the `src` package is importable.
%cd /gdrive/MyDrive/nlp-yuan_code/FinBERT-QA
# NOTE(review): the wildcard import hides which names come from
# src.process_data; `load_pickle`, used by the data-loading cell, is
# presumably one of them - prefer importing the needed names explicitly.
from src.process_data import *

/gdrive/MyDrive/nlp-yuan_code/FinBERT-QA


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Setting up the path

In [None]:
# Absolute Google Drive locations used by this notebook.
# NOTE(review): the RO_/WR_ prefixes presumably mean read-only / writable - confirm.
# Only WR_INTERIM_DATA is referenced by the visible cells.
RO_FiQA_DATA_PATH = '/gdrive/MyDrive/nlp-data/nlp-qa-datasets/FiQA/FiQA_train_task2/'
RO_FIQA_INDEX = "/gdrive/MyDrive/nlp-yuan_code/FinBERT-QA/retriever/lucene-index-fiqa/"
WR_PICKLE_DATA = '/gdrive/MyDrive/nlp-yuan_code/FinBERT-QA/data/data_pickle/'
WR_PICKLE_TRANSIENT_DATA = '/gdrive/MyDrive/nlp-yuan_code/FinBERT-QA/data/data_pickle/transient/'
# Directory holding the pre-processed pickles loaded in the next cell.
WR_INTERIM_DATA = '/gdrive/MyDrive/nlp-yuan_code/FinBERT-QA/data/interim/'

In [None]:
# Load the pre-processed FiQA artefacts from the interim data directory.
# NOTE(review): `load_pickle` presumably comes from the `src.process_data`
# wildcard import; pickles must only be loaded from trusted files.
# train_set / test_set: sequences whose items are indexed as seq[0] (question id)
# and seq[1] (answer labels) by the evaluation functions below.
train_set = load_pickle(WR_INTERIM_DATA + "train_set.pickle")
test_set = load_pickle(WR_INTERIM_DATA + "test_set.pickle")
# Lookup tables between ids and raw text (used via [] / .get below).
qid_to_text = load_pickle(WR_INTERIM_DATA + "qid_to_text.pickle")
docid_to_text = load_pickle(WR_INTERIM_DATA + "docid_to_text.pickle")
text_to_docid = load_pickle(WR_INTERIM_DATA + "text_to_docid.pickle")
# Iterable of (question id, answer id) pairs.
qa_pairs = load_pickle(WR_INTERIM_DATA + "qa_pairs.pickle")

Generate the train and test datasets

In [None]:
# Distinct question / answer ids that occur in the (qid, aid) pairs.
unique_qids = np.unique([pair_qid for pair_qid, _ in qa_pairs])
unique_aids = np.unique([pair_aid for _, pair_aid in qa_pairs])

# One text per pair, in qa_pairs order; ids whose text is missing or is not a
# string are replaced by '' so the two datasets stay aligned element by element.
questions = tf.data.Dataset.from_tensor_slices(
    [
        text if isinstance(text, str) else ''
        for text in (qid_to_text.get(pair_qid) for pair_qid, _ in qa_pairs)
    ]
)
answers = tf.data.Dataset.from_tensor_slices(
    [
        text if isinstance(text, str) else ''
        for text in (docid_to_text.get(pair_aid) for _, pair_aid in qa_pairs)
    ]
)

In [None]:
# Sanity check: both datasets must contain exactly one element per (qid, aid) pair.
print(len(unique_qids), len(unique_aids), len(qa_pairs))
len(answers), len(questions)

6648 17110 17110


(17110, 17110)

In [None]:
# Pair every question with its answer and expose them as the feature dict
# ({"question": ..., "answer": ...}) that the dual encoder consumes.
ds = tf.data.Dataset.zip((questions, answers)).map(
    lambda question, answer: {"question": question, "answer": answer}
)

# Fixed seeds and reshuffle_each_iteration=False keep the shuffled order
# identical on every pass over the dataset.
tf.random.set_seed(42)
shuffled = ds.shuffle(17_110, seed=42, reshuffle_each_iteration=False)

In [None]:
# Hold out the first 856 examples of the (deterministically) shuffled dataset
# for validation and train on the remainder. Previously the training set was
# built from *all* examples, so the validation examples leaked into training
# and val_loss was not an out-of-sample estimate.
num_validation_examples = 856
cached_train = shuffled.skip(num_validation_examples).shuffle(17_110).batch(1300)
cached_test = shuffled.take(num_validation_examples).batch(150)

In [None]:
def project_embeddings(
    embeddings, num_projection_layers, projection_dims, dropout_rate
):
    """Project encoder outputs to `projection_dims` with residual blocks.

    A linear Dense layer maps `embeddings` to `projection_dims`; then
    `num_projection_layers` blocks of GELU -> Dense -> Dropout, each followed
    by a skip connection and LayerNormalization, are stacked on top.

    Args:
        embeddings: Keras tensor to project.
        num_projection_layers: number of residual blocks to stack.
        projection_dims: width of the projection and of every block.
        dropout_rate: dropout rate used inside every block.

    Returns:
        The projected Keras tensor.
    """
    block_input = layers.Dense(units=projection_dims)(embeddings)
    for _ in range(num_projection_layers):
        branch = tf.nn.gelu(block_input)
        branch = layers.Dense(projection_dims)(branch)
        branch = layers.Dropout(dropout_rate)(branch)
        # Skip connection around the block, then normalise.
        merged = layers.Add()([block_input, branch])
        block_input = layers.LayerNormalization()(merged)
    return block_input


Using BERT to generate embeddings

In [None]:
def create_text_encoder(num_projection_layers, projection_dims, dropout_rate, trainable=False):
    """Build a Keras model mapping raw strings to projected BERT embeddings.

    Args:
        num_projection_layers: residual projection blocks on top of BERT.
        projection_dims: size of the output embedding.
        dropout_rate: dropout rate used in the projection head.
        trainable: whether the BERT weights are updated during training.

    Returns:
        A `keras.Model` named "text_encoder" taking a batch of scalar strings.
    """
    # TF Hub modules: the BERT tokenizer/packer and the small BERT encoder itself.
    preprocessing_layer = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2",
        name="text_preprocessing",
    )
    bert_layer = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1",
        name="bert",
    )
    # Freeze or unfreeze the base encoder.
    bert_layer.trainable = trainable

    # Raw text in -> BERT inputs -> pooled sentence embedding.
    text_input = layers.Input(shape=(), dtype=tf.string, name="text_input")
    pooled_output = bert_layer(preprocessing_layer(text_input))["pooled_output"]

    # Projection head that brings the embedding down to `projection_dims`.
    projected = project_embeddings(
        pooled_output, num_projection_layers, projection_dims, dropout_rate
    )
    return keras.Model(text_input, projected, name="text_encoder")

Generating the Two Tower Model

In [None]:
class DualEncoder(keras.Model):
    """Two-tower model that embeds questions and answers in a shared space.

    Each tower is an independent text encoder. Training uses a symmetric
    cross-entropy between the question/answer similarity logits and soft
    targets derived from the within-batch question-question and
    answer-answer similarities.

    Args:
        question_encoder: Keras model encoding a batch of question strings.
        answer_encoder: Keras model encoding a batch of answer strings.
        temperature: divisor applied to the similarity logits and targets.
        **kwargs: forwarded to `keras.Model`.
    """

    def __init__(self, question_encoder, answer_encoder, temperature=1.0, **kwargs):
        super(DualEncoder, self).__init__(**kwargs)
        self.question_encoder = question_encoder
        self.answer_encoder = answer_encoder
        self.temperature = temperature
        self.loss_tracker = keras.metrics.Mean(name="loss")

    @property
    def metrics(self):
        # Listing the tracker here lets Keras reset it at the start of each epoch.
        return [self.loss_tracker]

    def call(self, features, training=False):
        """Encode a batch given as {"question": strings, "answer": strings}.

        Returns:
            Tuple `(question_embeddings, answer_embeddings)`.
        """
        # Place each encoder on a separate GPU (if available).
        # TF will fallback on available devices if there are fewer than 2 GPUs.
        # Use the encoders owned by this model rather than whatever the
        # module-level names `question_encoder` / `answer_encoder` happen to
        # point at when the model is called.
        with tf.device("/gpu:0"):
            # Get the embeddings for the questions.
            question_embeddings = self.question_encoder(
                features["question"], training=training
            )
        with tf.device("/gpu:1"):
            # Get the embeddings for the answers.
            answer_embeddings = self.answer_encoder(
                features["answer"], training=training
            )
        return question_embeddings, answer_embeddings

    def compute_loss(self, question_embeddings, answer_embeddings):
        """Symmetric soft-target contrastive loss for one batch.

        Returns:
            The average of the question->answer and answer->question
            cross-entropies (reduced over the batch by `loss_tracker`).
        """
        # logits[i][j] is the dot_similarity(question_i, answer_j).
        logits = (
            tf.matmul(question_embeddings, answer_embeddings, transpose_b=True)
            / self.temperature
        )
        # answer_similarity[i][j] is the dot_similarity(answer_i, answer_j).
        answer_similarity = tf.matmul(
            answer_embeddings, answer_embeddings, transpose_b=True
        )
        # question_similarity[i][j] is the dot_similarity(question_i, question_j).
        question_similarity = tf.matmul(
            question_embeddings, question_embeddings, transpose_b=True
        )
        # targets[i][j] = softmax over the average of the question-question and
        # answer-answer similarities, scaled by the temperature.
        targets = keras.activations.softmax(
            (question_similarity + answer_similarity) / (2 * self.temperature)
        )
        # Loss in the question -> answer direction.
        question_loss = keras.losses.categorical_crossentropy(
            y_true=targets, y_pred=logits, from_logits=True
        )
        # Loss in the answer -> question direction (transpose both matrices).
        answer_loss = keras.losses.categorical_crossentropy(
            y_true=tf.transpose(targets), y_pred=tf.transpose(logits), from_logits=True
        )
        # Average the two directions.
        return (question_loss + answer_loss) / 2

    def train_step(self, features):
        """Run one optimisation step and return the running mean loss."""
        with tf.GradientTape() as tape:
            # Forward pass
            question_embeddings, answer_embeddings = self(features, training=True)
            loss = self.compute_loss(question_embeddings, answer_embeddings)
        # Backward pass
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        # Monitor loss
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

    def test_step(self, features):
        """Evaluate one batch (no weight update) and return the running mean loss."""
        question_embeddings, answer_embeddings = self(features, training=False)
        loss = self.compute_loss(question_embeddings, answer_embeddings)
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

In [None]:
# Training hyper-parameters.
num_epochs = 30  # Upper bound; the early-stopping callback may end training sooner.
# NOTE(review): batch_size is not referenced by the visible cells - the
# datasets are batched with hard-coded sizes (1300 / 150) where they are built.
batch_size = 256
embedding_size = 128

# Two independent towers with identical architecture; BERT stays frozen
# because create_text_encoder defaults to trainable=False.
answer_encoder = create_text_encoder(
    num_projection_layers=1, projection_dims=embedding_size, dropout_rate=0.1
)
question_encoder = create_text_encoder(
    num_projection_layers=1, projection_dims=embedding_size, dropout_rate=0.1
)
dual_encoder = DualEncoder(question_encoder, answer_encoder, temperature=0.05)
# No loss is passed to compile(): DualEncoder computes it in its own train_step.
dual_encoder.compile(
    optimizer=tfa.optimizers.AdamW(learning_rate=0.001, weight_decay=0.001)
)

Executing the models with different embedding size

In [None]:
print(f"Number of GPUs: {len(tf.config.list_physical_devices('GPU'))}")
# Create a learning rate scheduler callback.
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss", factor=0.2, patience=3
)
# Create an early stopping callback.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=5, restore_best_weights=True
)
history = dual_encoder.fit(
    cached_train,
    epochs=num_epochs,
    validation_data=cached_test,
    callbacks=[reduce_lr, early_stopping],
)
# The message used to mention "vision and text encoders"; the models saved
# here are the answer and question towers.
print("Training completed. Saving question and answer encoders...")
answer_encoder.save("answer_encoder_128")
question_encoder.save("question_encoder_128")
print("Models are saved.")

Number of GPUs: 1
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30


In [None]:
# Training vs. validation loss per epoch (curves are drawn in legend order).
for history_key in ("loss", "val_loss"):
    plt.plot(history.history[history_key])
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend(["train", "valid"], loc="upper right")
plt.show()

In [None]:
# Reload the two towers from the SavedModel directories written after training.
# The message used to mention "vision and text encoders"; these are the
# answer and question encoders.
print("Loading question and answer encoders...")
answer_encoder = keras.models.load_model("answer_encoder_128")
question_encoder = keras.models.load_model("question_encoder_128")
print("Models are loaded.")


# (question, answer) text pairs in qa_pairs order, used later to map
# embedding rows back to document ids.
questions_answers = tf.data.Dataset.zip((questions, answers))

In [None]:
# Embed every answer text with the answer tower. `answers` is consumed in its
# original order, so row i of the result corresponds to element i of `answers`.
cached_answers = answers.batch(1300)
answer_embeddings = answer_encoder.predict(
    cached_answers,
    verbose=1,
)
print(f"answer embeddings shape: {answer_embeddings.shape}.")

Evaluate the scores by calculating the similarity

In [None]:
def find_matches(answer_embeddings, questions, k=9, normalize=True):
    """Retrieve the `k` most similar answers for each question.

    Args:
        answer_embeddings: 2-D array/tensor with one answer embedding per row.
        questions: list of question strings.
        k: number of matches to return per question.
        normalize: if True, L2-normalise both sides so that the dot product
            is the cosine similarity.

    Returns:
        `(scores, indices)`: two lists with one inner list per question,
        holding the top-k similarity scores and the matching row indices
        into `answer_embeddings`, best match first.
    """
    # Embed the queries with the module-level question tower.
    query_embedding = question_encoder(tf.convert_to_tensor(questions))
    answer_matrix = answer_embeddings
    if normalize:
        # Normalise BOTH sides. Previously the normalised answers were stored
        # in an unused variable, so the similarity was computed against the
        # raw answer embeddings even when normalize=True.
        answer_matrix = tf.math.l2_normalize(answer_matrix, axis=1)
        query_embedding = tf.math.l2_normalize(query_embedding, axis=1)
    # Similarity between every query and every answer.
    dot_similarity = tf.matmul(query_embedding, answer_matrix, transpose_b=True)
    # Single top-k call; it yields the scores and the indices together.
    top = tf.math.top_k(dot_similarity, k)
    scores = top.values.numpy()
    results = top.indices.numpy()

    return [list(values) for values in scores], [list(indices) for indices in results]
In [None]:
# Example lookup: retrieve the best answers for the question with id 8.
# `query` used to hold an unrelated, unused image caption; bind it to the
# text that is actually searched for.
query = qid_to_text[8]
scores, matches = find_matches(answer_embeddings, [query], normalize=True)

In [None]:
# Peek at the first question/answer pair (text and ids) and at pair number 8.
# NOTE(review): each comprehension materialises the full list before slicing.
print([qid_to_text[qid]  for qid, aid in qa_pairs][:1])
print([qid for qid, aid in qa_pairs][:2])

print([docid_to_text[aid]  for qid, aid in qa_pairs][:1])
print([aid for qid, aid in qa_pairs][:1])

print(qa_pairs[8])

In [None]:
# Show (answer doc id, embedding row) for every match of the example query.
# Row i of answer_embeddings was built from qa_pairs[i], so the doc id is read
# straight from qa_pairs; the previous lookup used `eid_to_docid`, which is
# only defined in a later cell and raised NameError on a fresh top-to-bottom run.
[(qa_pairs[m][1], m) for m in list(matches[0])]

In [None]:
# Inspect one raw training record to compare against the matches above.
train_set[8]

Metrics Evaluation

In [None]:
def run_twotower_scores(data):
  """Score the two-tower retriever on `data` (metrics on a 0-1 scale).

  Each item of `data` is indexed as seq[0] (question id) and seq[1]
  (collection of relevant answer doc ids). Relies on the module-level
  `answer_embeddings`, `qid_to_text` and `eid_to_docid`, so it must be
  called after `eid_to_docid` has been built.

  Returns:
    `(rr, ap, cg)`: per-question reciprocal rank, average precision and
    NDCG lists, in the order of `data`.
  """
  ap = []
  rr = []
  cg = []

  for seq in data:
    qid, ans_labels = seq[0], seq[1]
    _, cand_ans = find_matches(answer_embeddings, [qid_to_text[qid]], normalize=True)

    # find_matches returns row indices into answer_embeddings, not answer
    # texts: map them to doc ids through eid_to_docid (the old code called
    # .decode() on the integer index and failed).
    cands = [eid_to_docid[eid] for eid in cand_ans[0]]
    cnt = len(cands)

    # 1 for every retrieved candidate that is a labelled answer.
    relscores = [1 if docid in ans_labels else 0 for docid in cands]
    # Reciprocal rank of the first relevant candidate (0 if none was retrieved).
    rr_ = next((1 / (rank + 1) for rank, rel in enumerate(relscores) if rel), 0)

    # Rank-based scores (1, 1/2, 1/3, ...) reproduce the retrieval order.
    pos = [1.0/(i+1) for i in range(cnt)]
    ap_ = average_precision_score(relscores, pos) if sum(relscores) != 0 else 0

    # Graded relevance: earlier labels are worth more.
    m = dict([(ai, 1/(i+1)) for i, ai in enumerate(ans_labels)])
    if cnt == 0:
      cg_ = 0
    elif len(ans_labels) <= 1:
      # ndcg_score needs at least two documents; fall back to a top-1 hit test.
      cg_ = 1.0 if cands[0] in ans_labels else 0
    else:
      n_labels = len(ans_labels)
      # Guard the index: there may be more labels than retrieved candidates.
      gains = np.asarray([[m.get(cands[i], 0) if i < cnt else 0 for i in range(n_labels)]])
      pos = np.asarray([[1/(i+1) for i in range(n_labels)]])
      cg_ = ndcg_score(gains, pos)

    ap.append(ap_)
    rr.append(rr_)
    cg.append(cg_)
  return rr, ap, cg

In [None]:
# Materialise the answer texts in dataset order; element i corresponds to
# row i of the answer embedding matrix.
mapped_answers = questions_answers.map(lambda x, y : {"text_input": y})
l = list(mapped_answers)
print(l[:3])

# Decode every answer once and reuse the decoded strings below.
_decoded_answers = [item['text_input'].numpy().decode('utf-8') for item in l]

# Number of answers whose text is empty (ids without a usable text).
cnt = sum(1 for answer_text in _decoded_answers if answer_text == '')

print(cnt)


# Embedding row index -> answer doc id; empty-text rows get no entry.
eid_to_docid = {
    eid: text_to_docid[answer_text]
    for eid, answer_text in enumerate(_decoded_answers)
    if answer_text != ''
}

In [None]:
# Consistency check: report every embedding row whose text differs from the
# text stored for the doc id it was mapped to. No output means the
# index -> doc id mapping round-trips.
for k, v in eid_to_docid.items():
  if l[k]['text_input'].numpy().decode('utf-8') != docid_to_text[v]:
    print(k, v)

Metrics Evaluation:

In [None]:
def ABC(data):
  """Debug helper: print the retrieved answers next to the gold labels.

  For every item of `data` (indexed as seq[0] = question id, seq[1] = answer
  labels) the top matches are looked up and, for each match, the stored
  document text ('XXXX') and the dataset element at that index ('YYYY') are
  printed, followed by the retrieved doc ids and the gold labels.
  Returns None.
  """
  for seq in data:
    qid, ans_labels, _ , _ = seq[0], seq[1], seq[2], seq[3]
    score, cand_ans = find_matches(answer_embeddings, [qid_to_text[qid]], normalize=True)
    flat_scores = np.ravel(score).tolist()

    cands = []
    for eid, _unused_score in zip(cand_ans[0], flat_scores):
      docid = eid_to_docid[eid]
      cands.append(docid)
      print('XXXX', docid_to_text[docid])
      print('YYYY', l[eid])
    print(cands)
    print(ans_labels)


def run_twotower_scores__(data):
  """Score the two-tower retriever on `data` (all metrics on a 0-100 scale).

  Each item of `data` is indexed as seq[0] (question id) and seq[1]
  (collection of relevant answer doc ids). Relies on the module-level
  `answer_embeddings`, `qid_to_text` and `eid_to_docid`.

  Returns:
    `(rr, ap, cg)`: per-question reciprocal rank, average precision and
    NDCG lists, in the order of `data`.

  Raises:
    KeyError: if a retrieved row index has no entry in `eid_to_docid`
      (rows with empty answer text are left out when that map is built).
  """
  ap = []
  rr = []
  cg = []

  for seq in data:
    qid, ans_labels = seq[0], seq[1]
    _, cand_ans = find_matches(answer_embeddings, [qid_to_text[qid]], normalize=True)
    # Map embedding row indices to answer doc ids, best match first.
    cands = [eid_to_docid[eid] for eid in cand_ans[0]]

    # 1 for every retrieved candidate that is a labelled answer.
    relscores = [1 if docid in ans_labels else 0 for docid in cands]
    # Reciprocal rank of the first relevant candidate (0 if none was retrieved).
    rr_ = next((100.0 / (rank + 1) for rank, rel in enumerate(relscores) if rel), 0)

    # Rank-based scores (1, 1/2, 1/3, ...) reproduce the retrieval order.
    pos = [1.0/(i+1) for i in range(len(cands))]
    ap_ = average_precision_score(relscores, pos)*100 if sum(relscores) != 0 else 0

    mcnt = max(len(cands), len(ans_labels))
    # Graded relevance: earlier labels are worth more.
    m = dict([(ai, 1.0/(i+1)) for i, ai in enumerate(ans_labels)])

    # The shortcut branches used to return 1.0 while the general branch
    # returned ndcg * 100, mixing two scales in the averaged NDCG; every
    # branch now reports on the 0-100 scale used for rr and ap.
    if len(ans_labels) == 0 or len(cands) == 0:
      cg_ = 0.0
    elif len(ans_labels) == 1 or len(cands) == 1:
      # A single label or a single candidate: score a top-1 hit test.
      cg_ = 100.0 if cands[0] in ans_labels else 0.0
    else:
      # Pad the shorter side with zero relevance so both rows have mcnt entries.
      gains = np.asarray([[m.get(cands[i], 0) if i < len(cands) else 0 for i in range(mcnt)]])
      pos = np.asarray([[1/(i+1) for i in range(mcnt)]])
      cg_ = ndcg_score(gains, pos)*100

    ap.append(ap_)
    rr.append(rr_)
    cg.append(cg_)
  return rr, ap, cg

In [None]:
# Eyeball the retrieved answers for the first five training questions.
ABC(train_set[:5])

In [None]:
# Score every question of the training split and report the averages.
# NOTE(review): this evaluates on train_set; test_set is loaded but not
# scored in the visible cells - confirm that is intended.
rr, ap, cg = run_twotower_scores__(train_set)

print('Mean Reciprocal Rank (MRR):', np.mean(rr))
print('Mean average Precision (MAP)', np.mean(ap))
print('Normalized Discounted Cumulative Gain (NDCG)', np.mean(cg))

In [None]:
# Reciprocal-rank values of the first five questions (spot check).
rr[:5]