# Deep Model and Training

In [None]:
!pip install trectools

In [None]:
# implement this in torch or keras
import tensorflow as tf
import keras.backend as K


from sklearn.metrics import ndcg_score
from trectools import TrecQrel, TrecRun, TrecEval

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Embedding
from keras.layers import Input
from keras.layers import Conv1D
from keras.layers import Dot
from keras.layers import Dense
from keras.layers import Lambda
from keras.layers import Activation
from keras.layers import Dropout


from keras import Model

In [None]:
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2020-05-29 10:21:53--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.94.93
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.94.93|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2020-05-29 10:23:33 (15.9 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [None]:
!cp "drive/My Drive/data_folds.zip" ./
!unzip data_folds.zip


Archive:  data_folds.zip
   creating: data/
  inflating: data/fold_2.train       
  inflating: data/fold_2.val         
  inflating: data/fold_1.train       
  inflating: data/fold_4.val         
  inflating: data/fold_1.val         
  inflating: data/fold_5.val         
  inflating: data/fold_3.test        
  inflating: data/fold_3.val         
  inflating: data/document_contents.json  
  inflating: data/fold_4.train       
  inflating: data/fold_3.train       
  inflating: data/fold_5.test        
  inflating: data/fold_5.train       
  inflating: data/fold_1.test        
  inflating: data/fold_4.test        
  inflating: data/fold_2.test        


In [None]:
import json

# Load the mapping from document id to document text that later cells index by doc id.
document_contents_path = 'drive/My Drive/document_contents.json'
with open(document_contents_path) as contents_fp:
    document_contents = json.load(contents_fp)

In [None]:
!wget https://raw.githubusercontent.com/Georgetown-IR-Lab/cedr/master/data/robust/queries.tsv
with open('queries.tsv','r') as f:
  queries = {}
  for line in f:
    cols = line.rstrip().split('\t')
    c_type, c_id, c_text = cols
    queries[c_id] = c_text
queries

In [None]:
!wget https://trec.nist.gov/data/robust/qrels.robust2004.txt

--2020-05-29 14:18:41--  https://trec.nist.gov/data/robust/qrels.robust2004.txt
Resolving trec.nist.gov (trec.nist.gov)... 129.6.13.51, 2610:20:6b01:4::36
Connecting to trec.nist.gov (trec.nist.gov)|129.6.13.51|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6543541 (6.2M) [text/plain]
Saving to: ‘qrels.robust2004.txt.1’


2020-05-29 14:18:44 (4.02 MB/s) - ‘qrels.robust2004.txt.1’ saved [6543541/6543541]



In [None]:
# Relevance judgments as a nested mapping: qrels[qid][docid] -> integer grade.
# Each line of the file has four whitespace-separated fields; the second one is unused.
qrels = {}
with open('qrels.robust2004.txt','r') as qrels_fp:
  for judgment in qrels_fp:
    qid, _, docid, score = judgment.split()
    if qid not in qrels:
      qrels[qid] = {}
    qrels[qid][docid] = int(score)

In [None]:
# try one fold for now
fold_name = "fold_1"
# File names of the three splits of the chosen fold (relative to the data/ folder).
training_file, test_file, val_file = (
    fold_name + split_suffix for split_suffix in (".train", ".test", ".val")
)

In [None]:
# Parallel lists describing the training split: index i of every list refers to the
# same (query, document) pair.
training_docs = []
training_doc_ids = []
training_query_ids = []
training_queries = []
training_labels = []
errors = set()  # doc ids of pairs skipped because a lookup failed

with open('data/'+training_file,'r') as f:
  for line in f:
    fields = line.split()
    if len(fields) < 2:
      continue  # blank or truncated line: nothing to load
    query, doc = fields[0], fields[1]

    # Resolve every lookup BEFORE appending anything. A failure halfway through the
    # appends would leave the parallel lists with different lengths.
    try:
      judged = qrels[query]
      doc_text = document_contents[doc]
      query_text = queries[query]
    except KeyError:
      errors.add(doc)
      continue

    # get label (if in qrels and >0 then label = 1)
    label = 1.0 if judged.get(doc, 0) > 0 else 0.0

    training_labels.append(label)
    training_doc_ids.append(doc)
    training_query_ids.append(query)
    training_docs.append(doc_text)
    training_queries.append(query_text)



In [None]:
# manual checking if labels are valid
i = 1000
print(training_doc_ids[i])
print(training_query_ids[i])
print(training_labels[i])
print("real label")
print(qrels[training_query_ids[i]][training_doc_ids[i]])

FT933-939
331
1.0
real label
1


In [None]:
training_queries[0]

'International Organized Crime'

In [None]:
# Length, in whitespace-separated tokens, of the longest training document (0 if there are none).
max_len = max((len(doc_text.split()) for doc_text in training_docs), default=0)
print('longest doc')
max_len

longest doc


110467

In [None]:
# Length, in whitespace-separated tokens, of the longest training query (0 if there are none).
max_len = max((len(query_text.split()) for query_text in training_queries), default=0)
print('longest query')
max_len

longest query


4

In [None]:
# Fixed input lengths of the model (in tokens).
doc_maxlen = 1024
query_max_len = 5

# The vocabulary is fitted on the training documents only.
# NOTE(review): query words that never occur in a training document are presumably
# dropped by texts_to_sequences (no oov_token is configured) - confirm this is acceptable.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(training_docs)
vocab_size = len(tokenizer.word_index) + 1

tokenized_docs = tokenizer.texts_to_sequences(training_docs)
tokenized_queries = tokenizer.texts_to_sequences(training_queries)

# NOTE(review): pad_sequences is called with its default padding/truncating settings;
# check that the part of a long document that survives truncation is the intended one.
X_docs = pad_sequences(tokenized_docs, maxlen=doc_maxlen)
X_queries = pad_sequences(tokenized_queries, maxlen=query_max_len)

In [None]:
# Parallel lists describing the test split: index i of every list refers to the same
# (query, document) pair. Only pairs that have a judgment in qrels are kept.
test_docs = []
test_doc_ids = []
test_query_ids = []
test_queries = []
test_labels = []
errors = []  # [query, doc] pairs skipped because a lookup failed
with open('data/'+test_file,'r') as f:
  for line in f:
    fields = line.split()
    if len(fields) < 2:
      continue  # blank or truncated line: nothing to load
    query, doc = fields[0], fields[1]

    # Resolve every lookup before appending so a missing key cannot leave the
    # parallel lists with different lengths.
    try:
      doc_text = document_contents[doc]
      query_text = queries[query]
      grade = qrels[query][doc]
    except KeyError:
      errors.append([query,doc])
      continue

    test_doc_ids.append(doc)
    test_query_ids.append(query)

    test_docs.append(doc_text)
    test_queries.append(query_text)

    # get label (if in qrels and >0 then label = 1)
    test_labels.append(1 if grade > 0 else 0)


tokenized_docs = tokenizer.texts_to_sequences(test_docs)
tokenized_queries = tokenizer.texts_to_sequences(test_queries)

X_docs_test = pad_sequences(tokenized_docs, maxlen=doc_maxlen)
X_queries_test = pad_sequences(tokenized_queries, maxlen=query_max_len)

In [None]:
len(X_docs_test)

6669

In [None]:
len(test_labels)

6669

In [None]:
from gensim.models.keyedvectors import KeyedVectors
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
print("loading word2vec model…")
word2vec_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
def isInModel(str):
  """Tell whether `str` has an entry in the loaded word2vec model."""
  return str in word2vec_model

def getVector(str):
  """Return the word2vec vector stored for `str`, or None when the model has no entry for it."""
  return word2vec_model[str] if isInModel(str) else None

2020-05-29 18:19:04,362 : INFO : loading projection weights from GoogleNews-vectors-negative300.bin.gz


loading word2vec model…


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2020-05-29 18:21:03,704 : INFO : loaded (3000000, 300) matrix from GoogleNews-vectors-negative300.bin.gz


In [None]:
import numpy as np

EMBEDDING_DIM = 300
# Row r holds the pretrained vector of the word whose tokenizer index is r.
# Words without a pretrained vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for token, row in tokenizer.word_index.items():
  pretrained = getVector(token)
  if pretrained is None:
    continue
  embedding_matrix[row] = pretrained

In [None]:
embedding_matrix.shape

(193308, 300)

In [None]:
# Nested label lookups built from the parallel lists: labels_dicts[qid][docid] -> label.
training_labels_dicts = {}
for pos, train_qid in enumerate(training_query_ids):
  per_query = training_labels_dicts.setdefault(train_qid, {})
  per_query[training_doc_ids[pos]] = training_labels[pos]


test_labels_dicts = {}
for pos, test_qid in enumerate(test_query_ids):
  per_query = test_labels_dicts.setdefault(test_qid, {})
  per_query[test_doc_ids[pos]] = test_labels[pos]

In [None]:
#  Gaussian kernel layer in KNRM
def Kernel(mu, sigma):
  """Return an Activation layer that applies exp(-0.5 * (x - mu)^2 / sigma^2) element-wise.

  mu is the centre of the kernel and sigma its width.
  """

  def kernel(x):
    centered = x - mu
    return tf.math.exp(-0.5 * centered * centered / sigma / sigma)

  return Activation(kernel)

In [None]:
# --- hyper-parameters of the convolutional kernel-pooling ranker ---
n_filters = 50           # output channels of every n-gram convolution
n_kernels = 11           # number of Gaussian kernels applied to each match matrix
max_ngram = 3            # convolutions cover 1-grams .. max_ngram-grams
conv_activation = 'relu'


#TODO use_crossmatch
use_crossmatch= False

# Centre and width of each Gaussian kernel. The similarities fed to the kernels are
# cosine values (normalised dot products), so the centres span [-1, 1]; the last
# kernel (mu=1.0 with a very small sigma) only responds to similarities close to 1.
MUs = [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
SIGMAs = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.001]
assert len(MUs) == len(SIGMAs) == n_kernels, "one (mu, sigma) pair is required per kernel"

query = Input(name='query_input',shape=(query_max_len,))
doc =  Input(name='doc_input',shape=(doc_maxlen,))


# A single embedding layer is shared by both inputs. With two separate trainable
# layers the query and document vectors drift into different spaces during training
# (making their cosine similarities meaningless) and the large matrix is stored twice.
# input_length is left unset so the same layer accepts both sequence lengths.
embedding = Embedding(vocab_size,
                      EMBEDDING_DIM,
                      weights=[embedding_matrix],
                      trainable=True,
                      name='shared_embedding')
q_embed = embedding(query)
d_embed = embedding(doc)


# One convolution per n-gram size, applied with the same weights to query and document.
q_convs = []
d_convs = []
for ngram in range(1, max_ngram + 1):
    c = Conv1D(
        n_filters, ngram,
        activation=conv_activation,
        padding='same'
    )
    q_convs.append(c(q_embed))
    d_convs.append(c(d_embed))

KM = []
for qi in range(max_ngram):
    for di in range(max_ngram):
        # without crossmatch, only n-grams of the same length are matched
        if not use_crossmatch and qi != di:
            continue
        q_ngram = q_convs[qi]
        d_ngram = d_convs[di]
        # cosine similarity of every query n-gram with every document n-gram:
        # shape (batch, query_max_len, doc_maxlen)
        mm = Dot(axes=[2, 2],
                              normalize=True)([q_ngram, d_ngram])

        for mu, sigma in zip(MUs, SIGMAs):
            mm_exp = Kernel(mu, sigma)(mm)
            # soft match count per query position: sum over the document axis
            mm_doc_sum = Lambda(
                lambda x: tf.reduce_sum(x, 2))(
                mm_exp)
            mm_log = Activation(tf.math.log1p)(mm_doc_sum)
            # one feature per kernel: sum over the query axis
            mm_sum = Lambda(
                lambda x: tf.reduce_sum(x, 1))(mm_log)
            KM.append(mm_sum)

# Stack all kernel features into one vector per example: shape (batch, len(KM)).
phi = Lambda(lambda x: tf.stack(x, 1))(KM)
# out = Dense(1000, activation='relu')(phi)
# out = Dropout(0.2)(out)
out = Dense(1, activation='linear')(phi) # ranking
model = Model(inputs=[query, doc], outputs=[out])

In [None]:
def pairwise_rank_loss(y_true,y_pred):
  """Hinge-style ranking loss with a margin of 1.

  Along the last axis, the scores at positions labelled 1 are summed (positive score)
  and the maximum of the scores masked by (1 - y_true) is taken (negative score).
  The loss is the mean of max(negative - positive + 1, 0).

  NOTE(review): with a single-unit model output the last axis presumably has size 1,
  so every example would be penalised on its own instead of against a paired
  negative - confirm that this is the intended behaviour.
  """
  positive_score = K.sum(y_true*y_pred,axis=-1)
  hardest_negative = K.max((1-y_true)*y_pred,axis=-1)
  return K.mean(K.maximum(hardest_negative - positive_score + 1, 0))

In [None]:
model.compile(loss=pairwise_rank_loss, optimizer='adam')


In [None]:
# Parallel lists describing the validation split: index i of every list refers to the
# same (query, document) pair.
val_docs = []
val_doc_ids = []
val_query_ids = []
val_queries = []
val_labels = []
errors = []  # query ids of pairs skipped because a lookup failed
with open('data/'+val_file,'r') as f:
  for line in f:
    fields = line.split()
    if len(fields) < 2:
      continue  # blank or truncated line: nothing to load
    query, doc = fields[0], fields[1]

    # Resolve every lookup before appending so a missing key cannot leave the
    # parallel lists with different lengths.
    try:
      judged = qrels[query]
      doc_text = document_contents[doc]
      query_text = queries[query]
    except KeyError:
      # ignore missing
      errors.append(query)
      continue

    # get label (if in qrels and >0 then label = 1)
    label = 1.0 if judged.get(doc, 0) > 0 else 0.0

    # Fill the val_* lists. Appending to the training_* lists here would both leave
    # the validation data empty and make the training lists longer than the
    # already-built X_docs / X_queries matrices.
    val_labels.append(label)
    val_doc_ids.append(doc)
    val_query_ids.append(query)
    val_docs.append(doc_text)
    val_queries.append(query_text)


tokenized_docs = tokenizer.texts_to_sequences(val_docs)
tokenized_queries = tokenizer.texts_to_sequences(val_queries)

X_docs_val = pad_sequences(tokenized_docs, maxlen=doc_maxlen)
X_queries_val = pad_sequences(tokenized_queries, maxlen=query_max_len)

In [None]:
epochs = 10

# Convert the targets once: an array is accepted by every Keras version, whereas a plain
# Python list is only special-cased by some of them.
y_train = np.asarray(training_labels, dtype='float32')


def mean_ndcg_at_k(query_ids, doc_ids, preds, labels_dicts, k=20):
  """Return the mean over queries of NDCG@k for the given predictions.

  query_ids, doc_ids and preds are parallel: preds[j][0] is the score predicted for the
  pair (query_ids[j], doc_ids[j]). labels_dicts[qid][docid] holds the true label of a pair.
  Returns NaN when there is no query to evaluate.
  """
  preds_by_query = {}
  for j, qid in enumerate(query_ids):
    preds_by_query.setdefault(qid, {})[doc_ids[j]] = preds[j][0]

  scores = []
  for qid, doc_scores in preds_by_query.items():
    # Align labels and scores explicitly by doc id instead of relying on both
    # dictionaries happening to share the same insertion order.
    docs = list(doc_scores)
    true_labels = [labels_dicts[qid][d] for d in docs]
    pred_scores = [doc_scores[d] for d in docs]
    scores.append(ndcg_score([true_labels], [pred_scores], k=k))
  return sum(scores)/len(scores) if scores else float('nan')


# One fit() call per epoch so that ranking quality can be reported after every epoch.
for epoch in range(epochs):
  print(epoch)
  history = model.fit([X_queries,X_docs], y_train,
                      epochs=1,
                      verbose=True,
                      # validation_data=([X_queries_test,X_docs_test], test_labels),
                      batch_size=32)

  training_preds = model.predict([X_queries,X_docs])
  print('training ndcg@20 score = ',
        mean_ndcg_at_k(training_query_ids, training_doc_ids, training_preds, training_labels_dicts))

  test_preds = model.predict([X_queries_test,X_docs_test])
  print('test ndcg@20 score = ',
        mean_ndcg_at_k(test_query_ids, test_doc_ids, test_preds, test_labels_dicts))

0


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1
training ndcg@20 score =  0.911139947318693
test ndcg@20 score =  0.31342377186425563
1
Epoch 1/1
training ndcg@20 score =  0.9888974628839058
test ndcg@20 score =  0.3389081817203834
2
Epoch 1/1
training ndcg@20 score =  0.9926706093884164
test ndcg@20 score =  0.3290152485395734
3
Epoch 1/1
training ndcg@20 score =  0.9927664548125343
test ndcg@20 score =  0.3297702324194314
4
Epoch 1/1
training ndcg@20 score =  0.9930351005420304
test ndcg@20 score =  0.3197413641103011
5
Epoch 1/1

KeyboardInterrupt: ignored

# Experiments



In [None]:
# write a custom qrels file only containing the queries from the training split
# The set is built once: rebuilding it for every qrels line makes the loop quadratic.
training_qid_set = set(training_query_ids)
with open('qrels.robust2004.txt','r') as qrels_file:
  with open('custom_training_qrels.txt','w') as new_file:
    for line in qrels_file:
      # The unpacking also checks that the line has exactly four fields.
      qid, _, docid, score = line.split()
      if qid in training_qid_set:
        new_file.write(line)

In [None]:
# write a custom qrels file only containing the queries from the test split
# The set is built once: rebuilding it for every qrels line makes the loop quadratic.
test_qid_set = set(test_query_ids)
with open('qrels.robust2004.txt','r') as qrels_file:
  with open('custom_test_qrels.txt','w') as new_file:
    for line in qrels_file:
      # The unpacking also checks that the line has exactly four fields.
      qid, _, docid, score = line.split()
      if qid in test_qid_set:
        new_file.write(line)

In [None]:
# Collect every judged (query, document) pair of the test queries as parallel lists:
# index i of qrels_q / qrels_d / qrels_to_test_doc / qrels_to_test_query / qdset
# refers to the same pair.
qrels_q = []
qrels_d = []
qrels_to_test_doc = []
qrels_to_test_query = []
qdset = []
errors = []  # doc ids of pairs skipped because a lookup failed
test_qid_set = set(test_query_ids)  # built once instead of once per qrels line
with open('qrels.robust2004.txt','r') as qrels_file:
  for line in qrels_file:
    qid, _, docid, score = line.split()
    if qid not in test_qid_set:
      continue
    # Resolve both lookups before appending so a missing key cannot leave the
    # parallel lists with different lengths.
    try:
      doc_text = document_contents[docid]
      query_text = queries[qid]
    except KeyError:
      errors.append(docid)
      continue
    qrels_to_test_doc.append(doc_text)
    qrels_q.append(qid)
    qrels_d.append(docid)
    qrels_to_test_query.append(query_text)
    qdset.append("%s,%s"%(qid,docid))


In [None]:
tokenized_docs = tokenizer.texts_to_sequences(qrels_to_test_doc)
tokenized_queries = tokenizer.texts_to_sequences(qrels_to_test_query)

X_docs_test = pad_sequences(tokenized_docs, maxlen=doc_maxlen)
X_queries_test = pad_sequences(tokenized_queries, maxlen=query_max_len)

In [None]:
preds = model.predict([X_queries_test,X_docs_test])

In [None]:
# Write one line per judged pair: query id, 0, doc id, rank (constant 1), score, run tag.
with open('run.txt','w') as new_file:
  for i, _ in enumerate(qrels_to_test_doc):
    run_fields = (qrels_q[i],0,qrels_d[i],1,preds[i][0],"run-name")
    new_file.write("%s %s %s %s %s %s\n" % run_fields)

In [None]:
qrels_to_test_query

[]

In [None]:
import pickle

with open('drive/My Drive/errors', 'wb') as fp:
    pickle.dump(errors, fp)

In [None]:
len(set(errors))

0

In [None]:
set(test_doc_ids) == set(qrels_d)

False

In [None]:
# "qid,docid" keys of the loaded test pairs.
qdset2 = ["%s,%s"%(q,d) for q,d in zip(test_query_ids,test_doc_ids)]


In [None]:
len(set(test_doc_ids))

7311

In [None]:
# NOTE(review): this empties the test id lists while test_docs / test_queries keep their
# contents; cells that index these lists or iterate over set(test_query_ids) afterwards
# will see no data until the test split is loaded again - confirm this reset is wanted.
test_doc_ids, test_query_ids = [], []

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenized_docs = tokenizer.texts_to_sequences(test_docs)
tokenized_queries = tokenizer.texts_to_sequences(test_queries)

X_docs_test = pad_sequences(tokenized_docs, maxlen=doc_maxlen)
X_queries_test = pad_sequences(tokenized_queries, maxlen=query_max_len)

In [None]:
preds = model.predict([X_queries_test,X_docs_test])

In [None]:
preds[i][0]

0.045687955

In [None]:
# Write one line per judged pair: query id, 0, doc id, running index, score, run tag.
# NOTE(review): preds must come from the inputs built from qrels_to_test_doc /
# qrels_to_test_query for the scores to belong to these pairs - verify which predict()
# call ran last before using this file.
with open('run.txt','w') as new_file:
  for i, _ in enumerate(qrels_to_test_doc):
    run_fields = (qrels_q[i],0,qrels_d[i],i,preds[i][0],"run-name")
    new_file.write("%s %s %s %s %s %s\n" % run_fields)

In [None]:
# Write one line per test pair: query id, 0, doc id, running index, score, run tag.
with open('run.txt','w') as new_file:
  for i, _ in enumerate(test_docs):
    run_fields = (test_query_ids[i],0,test_doc_ids[i],i,preds[i][0],"run-name")
    new_file.write("%s %s %s %s %s %s\n" % run_fields)

In [None]:
!ls ../

anserini			       qrels.robust2004.txt
custom_test_qrels.txt		       qrels.robust2004.txt.1
custom_training_qrels.txt	       queries.tsv
data				       queries.tsv.1
data_folds.zip			       queries.tsv.2
drive				       run.txt
GoogleNews-vectors-negative300.bin.gz  sample_data
index-robust04-20191213		       training_run.txt
index-robust04-20191213.tar.gz


In [None]:
# Write every training pair as a run line with a constant rank and a constant score of 1.
with open('training_run.txt','w') as new_file:
  for i, train_qid in enumerate(training_query_ids):
    run_fields = (train_qid,0,training_doc_ids[i],1,1,"run-name")
    new_file.write("%s %s %s %s %s %s\n" % run_fields)

# BM25 Retrieval

In [None]:
%%capture
!pip install pyserini==0.8.1.0

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

In [None]:
!wget https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-robust04-20191213.tar.gz
!tar xvfz index-robust04-20191213.tar.gz

--2020-05-29 14:59:11--  https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-robust04-20191213.tar.gz
Resolving git.uwaterloo.ca (git.uwaterloo.ca)... 129.97.83.4
Connecting to git.uwaterloo.ca (git.uwaterloo.ca)|129.97.83.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1821814915 (1.7G) [application/x-gzip]
Saving to: ‘index-robust04-20191213.tar.gz’


2020-05-29 15:01:38 (12.8 MB/s) - ‘index-robust04-20191213.tar.gz’ saved [1821814915/1821814915]

index-robust04-20191213/
index-robust04-20191213/_h_Lucene50_0.doc
index-robust04-20191213/_h_Lucene50_0.tip
index-robust04-20191213/_h_Lucene50_0.pos
index-robust04-20191213/segments_2
index-robust04-20191213/_h_Lucene50_0.tim
index-robust04-20191213/_h_Lucene80_0.dvd
index-robust04-20191213/_h.fdt
index-robust04-20191213/_h_Lucene80_0.dvm
index-robust04-20191213/_h.nvm
index-robust04-20191213/_h.nvd
index-robust04-20191213/_h.si
index-robust04-20191213/write.lock
index-robust04-20191213/_h.fdx
i

In [None]:
from pyserini.search import pysearch
from pyserini.index import pyutils

searcher = pysearch.SimpleSearcher('index-robust04-20191213')
index_utils = pyutils.IndexReaderUtils('index-robust04-20191213/')

In [None]:
!git clone https://github.com/castorini/anserini.git
%cd anserini
# !mvn clean package appassembler:assemble -DskipTests -Dmaven.javadoc.skip=true
!cd eval && tar xvfz trec_eval.9.0.4.tar.gz && cd trec_eval.9.0.4 && make

In [None]:
from bs4 import BeautifulSoup

# Parallel lists describing the retrieved candidates: index i of qs / ds / test_q /
# test_d refers to the same (query, hit) pair.
qs = []
ds = []
test_q = []
test_d = []

for qid in set(test_query_ids):
  query_text = queries[qid]
  hits = searcher.search(query_text,500)
  for hit in hits:
    # Obtain the document text first and append to all four lists afterwards, so a
    # failure cannot leave the id lists longer than the text lists.
    try:
      raw_document = index_utils.get_raw_document_contents(hit.docid)
      content  = BeautifulSoup(raw_document, "lxml").text.replace('\t', ' ').replace('\r', ' ').replace('\n', ' ').strip()
    except Exception:
      # Fall back to the pre-extracted text. `Exception` instead of a bare except
      # keeps Ctrl-C (KeyboardInterrupt) working during this long loop.
      content = document_contents[hit.docid]

    ds.append(hit.docid)
    qs.append(qid)
    test_d.append(content)
    test_q.append(query_text)


In [None]:
tokenized_docs = tokenizer.texts_to_sequences(test_d)
tokenized_queries = tokenizer.texts_to_sequences(test_q)

X_docs_test = pad_sequences(tokenized_docs, maxlen=doc_maxlen)
X_queries_test = pad_sequences(tokenized_queries, maxlen=query_max_len)

In [None]:
preds = model.predict([X_queries_test,X_docs_test])

In [None]:
# Write one line per retrieved candidate: query id, 0, doc id, rank (constant 0), score, run tag.
with open('run.txt','w') as new_file:
  for i, hit_qid in enumerate(qs):
    run_fields = (hit_qid,0,ds[i],0,preds[i][0],"run-name")
    new_file.write("%s %s %s %s %s %s\n" % run_fields)