## Text extraction

In [1]:
import re
import pypdfium2 as pdfium

In [3]:
# Open the book; iterating the document yields one page at a time.
pdf = pdfium.PdfDocument("./books/02450_Book.pdf")

# Flatten every page to a single line, then join the pages with a space so
# the last word of one page is not fused with the first word of the next
# (which would also hide sentence breaks that fall on a page boundary).
page_texts = []
for page in pdf:
    textpage = page.get_textpage()
    page_texts.append(" ".join(textpage.get_text_range().splitlines()))
text_all = " ".join(page_texts)

# Split after '.', '?' or '!' when followed by a space. Because the pattern is
# a capture group, re.split keeps the delimiters: the result alternates
# sentence body (even index) / delimiter (odd index) and always ends with a
# body. A raw string is used so the backslashes are not treated as (invalid)
# Python string escapes.
text_split = re.split(r'(\. |\? |\! )', text_all)

# Re-attach each delimiter, minus its trailing space, to the body before it.
# The final body has no delimiter, so it is padded with an empty string.
text_sentences = [
    body + delimiter[:-1]
    for body, delimiter in zip(text_split[0::2], text_split[1::2] + [""])
]

# Keep only sentences longer than 10 characters.
text_sentences = [s for s in text_sentences if len(s) > 10]

## Context generation

In [None]:
import pickle
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
with open('./neural_search/data/2450.pkl', 'rb') as questions_file:
    df_questions = pickle.load(questions_file)

In [None]:
def _pair_neighbours(values, from_end=False):
    """Map each value to the concatenation of itself and its pair partner.

    Consecutive elements are grouped two at a time. With ``from_end=False``
    the pairs are (0, 1), (2, 3), ...; with ``from_end=True`` the pairs are
    aligned to the end of the sequence: ..., (n-4, n-3), (n-2, n-1).
    When the number of values is odd, the one element left without a partner
    (the last one, or the first one when pairing from the end) is not added
    to the mapping.

    Returns a dict ``{value: left_value + right_value}`` containing both
    members of every pair as keys.
    """
    n_values = len(values)
    # Pairing from the end with an odd count leaves index 0 unpaired.
    first = n_values % 2 if from_end else 0
    mapping = {}
    for left in range(first, n_values - 1, 2):
        merged = values[left] + values[left + 1]
        mapping[values[left]] = merged
        mapping[values[left + 1]] = merged
    return mapping


# Short -> medium: merge each pair of consecutive unique contexts.
# Every index is visited, so the last context of an even-length list is
# merged with its partner instead of being silently left short.
u_ctx_short = df_questions['context'].unique()
ctx_mapping_short_to_medium = _pair_neighbours(u_ctx_short)

# Contexts without a partner keep their original text.
df_questions['context_medium'] = [
    ctx_mapping_short_to_medium.get(ctx, ctx) for ctx in df_questions['context']
]

# Medium -> long: same idea, but pairs are aligned to the end of the list.
# Index 0 is included when the count is even, so the first medium context is
# merged with its partner as well.
u_ctx_medium = df_questions['context_medium'].unique()
ctx_mapping_medium_to_long = _pair_neighbours(u_ctx_medium, from_end=True)

df_questions['context_long'] = [
    ctx_mapping_medium_to_long.get(ctx, ctx) for ctx in df_questions['context_medium']
]

## Generating vector embeddings

In [None]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used for both questions and contexts.
embedding_model = SentenceTransformer(
    'sentence-transformers/multi-qa-mpnet-base-dot-v1'
)

In [None]:
# Embed every distinct question once, keyed by the question text.
question_embeddings = {}
for question_text in df_questions['question'].unique():
    question_embeddings[question_text] = embedding_model.encode(question_text)

# Embed every distinct (short) context once, keyed by the context text.
context_embeddings = {}
for context_text in df_questions['context'].unique():
    context_embeddings[context_text] = embedding_model.encode(context_text)

In [None]:
# Persist both embedding dictionaries so they do not have to be recomputed.
for embeddings_path, embeddings in (
    ('./neural_search/data/2450_question_embeddings.pkl', question_embeddings),
    ('./neural_search/data/2450_context_embeddings_1000.pkl', context_embeddings),
):
    with open(embeddings_path, 'wb') as f:
        pickle.dump(embeddings, f)

## Preparing dataset

In [None]:
from collections import defaultdict
import numpy as np
# pandas is needed below (pd.DataFrame / pd.concat) but was never imported
# in this notebook, so the cell failed with NameError on a fresh kernel.
import pandas as pd

contexts = df_questions['context'].unique()

# Build one block of rows per unique question: the question paired with every
# unique context. Iterating over unique questions (in order of first
# appearance) avoids producing duplicate data points for repeated questions.
data = []
for question in df_questions['question'].unique():
    # Contexts this question was generated from -> positive pairs.
    source_contexts = df_questions.loc[df_questions['question'] == question, 'context']

    dft = pd.DataFrame({'context': contexts, 'question': question})
    # Label is 1 if the question was generated from the context, else 0.
    dft['label'] = dft['context'].isin(source_contexts).astype('int64')

    data.append(dft)

df = pd.concat(data, ignore_index=True)

In [None]:
# Seed a dedicated generator so the train/test split (and therefore the test
# labels and predictions saved later) is the same on every run.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

# Split by question, not by row, so all pairs of a question land on the same
# side of the split. 25% of the unique questions are held out for testing.
split_questions = df['question'].unique()
n_test_q = int(0.25 * len(split_questions))
test_q = split_rng.choice(split_questions, n_test_q, replace=False)

# Build fresh, re-indexed frames instead of resetting a filtered slice in place.
is_test_row = df['question'].isin(test_q)
df_test = df.loc[is_test_row].reset_index(drop=True)
df_train = df.loc[~is_test_row].reset_index(drop=True)

In [None]:
#Converting context question pairs to vector embeddings

def _pairs_to_arrays(pairs):
    """Turn a frame of (context, question, label) rows into (X, y) arrays.

    Each row of X is the context embedding followed by the question embedding,
    both looked up in the precomputed embedding dictionaries; y holds the
    labels in the same row order.
    """
    features = []
    for ctx, q in zip(pairs['context'].values, pairs['question'].values):
        features.append(np.concatenate((context_embeddings[ctx], question_embeddings[q])))
    return np.array(features), np.array(list(pairs['label'].values))


X_train, y_train = _pairs_to_arrays(df_train)

X_test, y_test = _pairs_to_arrays(df_test)

## Training NS model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
import matplotlib.pyplot as plt

# Feed-forward binary classifier. The input width 768*2 matches the feature
# rows built earlier (context embedding followed by question embedding);
# the single sigmoid unit outputs a score in [0, 1].
neural_net = Sequential([
    Dense(768, input_dim=768*2, activation='relu'),
    Dense(384, activation='relu'),
    Dense(1, activation='sigmoid'),
])

neural_net.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
import tensorflow as tf
class BatchBalancerSequence(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields class-balanced batches.

    Negatives (``y == 0``) are served in order, ``batch_size // 2`` per batch.
    Each batch is completed with the same number of positives (``y == 1``)
    drawn at random, so positives are oversampled relative to their share of
    the data.
    """

    def __init__(self, x_set, y_set, batch_size):
        """Split the samples by label and store the batch geometry.

        x_set: array of feature rows.
        y_set: array of 0/1 labels aligned with ``x_set``.
        batch_size: target number of samples per batch (half negatives,
            half positives); must be at least 2.

        Raises ValueError if ``batch_size`` is smaller than 2.
        """
        # Newer Keras versions expect the base-class constructor to be called.
        super().__init__()
        if batch_size < 2:
            raise ValueError("batch_size must be at least 2 (one sample per class)")
        self.x_p = x_set[y_set==1]
        self.x_n = x_set[y_set==0]
        self.batch_size = batch_size
        self.batch_size_n = batch_size//2

    def __len__(self):
        """Number of batches needed to serve every negative exactly once."""
        # Count in units of negatives per batch so that no negatives are
        # dropped when batch_size is odd.
        return int(np.ceil(len(self.x_n) / self.batch_size_n))

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(batch_x, batch_y)``.

        The first half of the batch holds negatives (label 0), the second half
        an equal number of randomly chosen positives (label 1).
        """
        low = idx * self.batch_size_n
        # Cap upper bound at array length; the last batch may be smaller
        # if the total number of items is not a multiple of batch size.
        high = min(low + self.batch_size_n, len(self.x_n))
        negatives = self.x_n[low:high]
        n_neg = len(negatives)

        # Draw distinct positives when there are enough of them; otherwise
        # fall back to sampling with replacement instead of raising.
        with_replacement = n_neg > len(self.x_p)
        x_p_idx = np.random.choice(len(self.x_p), n_neg, replace=with_replacement)

        batch_x = np.concatenate((negatives, self.x_p[x_p_idx]))
        batch_y = np.concatenate((np.zeros(n_neg, dtype=int), np.ones(n_neg, dtype=int)))

        return batch_x, batch_y

In [None]:
# Train on class-balanced batches of 128 samples for 10 epochs.
train_batches = BatchBalancerSequence(X_train, y_train, 128)
history = neural_net.fit(train_batches, epochs=10)

## Testing model

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt

def plot_roc(name, labels, predictions, **kwargs):
    """Draw one ROC curve on the current matplotlib axes.

    name: legend prefix; the ROC AUC (3 decimals) is appended to it.
    labels: true labels, passed to ``roc_curve`` / ``roc_auc_score``.
    predictions: predicted scores, passed to the same functions.
    **kwargs: forwarded to the plot call (e.g. color, linestyle).

    Both axes are shown in percent, with equal aspect ratio and a grid.
    """
    false_pos, true_pos, _ = roc_curve(labels, predictions)
    auc_value = roc_auc_score(labels, predictions)

    ax = plt.gca()
    ax.plot(
        100*false_pos,
        100*true_pos,
        label=f'{name} AUC: {auc_value:.3f}',
        linewidth=2,
        **kwargs,
    )
    ax.set_xlabel('False positives [%]')
    ax.set_ylabel('True positives [%]')
    # Small margins keep curves that hug the left/top border visible.
    ax.set_xlim([-0.5,100])
    ax.set_ylim([0,100.5])
    ax.grid(True)
    ax.set_aspect('equal')

In [None]:
# Scores from the trained network.
y_nn_preds = neural_net.predict(X_test)

# Baseline: cosine similarity between the two halves of each feature row
# (context embedding first, question embedding second).
half_width = 1536//2
context_part = X_test[:, :half_width]
question_part = X_test[:, half_width:]

y_cosine_preds = []
for ctx_vec, q_vec in zip(context_part, question_part):
    similarity = np.dot(ctx_vec, q_vec)/(np.linalg.norm(ctx_vec)*np.linalg.norm(q_vec))
    y_cosine_preds.append(similarity)

In [None]:
# Compare the network against the cosine baseline on one ROC plot.
plt.figure(figsize=(16,8))
for curve_name, scores in (("NS", y_nn_preds), ("Cosine", y_cosine_preds)):
    plot_roc(curve_name, y_test, scores)
plt.legend(loc='lower right')

In [None]:
# Persist the test labels and both sets of scores for later comparison.
for output_path, values in (
    ('./neural_search/data/y_2450_labels.pkl', y_test),
    ('./neural_search/data/y_2450_nn.pkl', y_nn_preds),
    ('./neural_search/data/y_2450_cos.pkl', y_cosine_preds),
):
    with open(output_path, 'wb') as f:
        pickle.dump(values, f)