# Installing and importing requirements

In [None]:
! pip install xmltodict
! pip install transformers

Collecting xmltodict
  Downloading https://files.pythonhosted.org/packages/28/fd/30d5c1d3ac29ce229f6bdc40bbc20b28f716e8b363140c26eff19122d8a5/xmltodict-0.12.0-py2.py3-none-any.whl
Installing collected packages: xmltodict
Successfully installed xmltodict-0.12.0
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |████████████████████████████████| 890kB 2.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 16.5MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |███████████████

In [None]:
import xmltodict
import json
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import BertTokenizer, BertForNextSentencePrediction, AdamW
import torch
from collections import defaultdict
import scipy
import logging

# this to prevent warning when using bert tokenizer 
logging.getLogger('transformers').setLevel(logging.ERROR)

# load in queries and documents


### Create a dictionary whose keys are queries and whose values are the lists of documents to re-rank for each query

In [None]:
def _append_qrels(qrels_path, linking):
  """Add the judgments from a qrels file to `linking` (topic -> [[docid, gain], ...]).

  Each line is whitespace-split; field 0 is the topic number, field 2 the
  document id and field 3 the gain. Every topic's list is sorted by gain,
  highest first. The sort is stable, so ties keep file order, which is the
  same result as re-sorting after every appended line.
  """
  with open(qrels_path, 'r') as file:
    for line in file:
      splitted = line.split()
      if len(splitted) < 4:
        # blank or truncated line: nothing to record
        continue
      linking.setdefault(splitted[0], []).append([splitted[2], splitted[3]])
  for links in linking.values():
    links.sort(key=lambda x: int(x[1]), reverse=True)
  return linking


def _add_topic_docids(topics_path, mapping):
  """Parse a topics XML file and record topic number -> docid in `mapping`.

  The topic number is the text after the ':' in each <num> element.
  Returns the parsed XML document.
  """
  with open(topics_path) as fd:
    parsed = xmltodict.parse(fd.read(), process_namespaces=True)
  for article in parsed['root']['top']:
    article_num_cleaned = article['num'].split(':')[1].strip()
    mapping[article_num_cleaned] = article['docid']
  return parsed


def _load_json(json_path):
  """Load and return the JSON object stored at `json_path`."""
  with open(json_path, 'r') as fp:
    return json.load(fp)


articles_linking = dict()        # topic number -> [[docid, gain], ...] sorted by gain desc
num_to_docid_mapping = dict()    # topic number -> docid of the query article
article_entity_mapping_dict = dict()  # not used in this cell; kept so the name stays defined

# --- 2018 topics ---
_append_qrels('drive/My Drive/TREC 2020 News/Data/2018 Topics/background qrels.exp-gains.txt', articles_linking)
doc = _add_topic_docids('drive/My Drive/TREC 2020 News/Data/2018 Topics/newsir18-background topics_fixed.xml', num_to_docid_mapping)
article_contents_dict = _load_json('drive/My Drive/entity ranking/Experiments/processed_data/backgroundlinking_article_contents_TREC2018.json')

# --- 2019 topics (merged into the same dictionaries) ---
_append_qrels('drive/My Drive/TREC 2020 News/Data/2019 Topics/newsir19-qrels-background.txt', articles_linking)
doc = _add_topic_docids('drive/My Drive/TREC 2020 News/Data/2019 Topics/newsir19-background-linking-topics_with_xmlroot.xml', num_to_docid_mapping)
test_article_contents_dict = _load_json('drive/My Drive/entity ranking/Experiments/processed_data/backgroundlinking_article_contents_TREC2019.json')

# article id -> article text for both years
article_contents_dict.update(test_article_contents_dict)

# preprocess queries and docs

In [None]:
# we crop documents and queries so they are both less than the max length 512 tokens (combined)
def crop_doc(doc,max_len=210):
  """Return the first `max_len` whitespace-separated words of `doc`.

  The kept words are re-joined with single spaces, so runs of whitespace
  (including newlines) in the input collapse to one space.
  """
  return " ".join(doc.split()[:max_len])

In [None]:
# Build one training example per judged (query article, candidate article) pair.
# Each entry of tuple_list is [query text, doc text, [label], topic number, doc id].
tuple_list = []
# Left over from a disabled negative down-sampling experiment; nothing in this cell reads it.
query_neg_counts = defaultdict(int)
for query_num,articles_list in articles_linking.items():
  if not articles_list:
    continue
  # the query text is the same for every candidate of this topic, so crop it once
  query_text = crop_doc(article_contents_dict[num_to_docid_mapping[query_num]])
  for article_tuple in articles_list:
    doc_id = article_tuple[0]
    doc_text = crop_doc(article_contents_dict[doc_id])
    # label is 1 when the judged gain is 0 (not relevant) and 0 otherwise,
    # i.e. class 0 is the "relevant" class
    label = int(int(article_tuple[1]) == 0)
    tuple_list.append([query_text, doc_text, [label], query_num, doc_id])

np.random.seed(42)
np.random.shuffle(tuple_list)

# text_1, text_2, labels, topic numbers and doc ids, all in the same shuffled order
query_X,docs_X,y,inds_q,inds_d= zip(*tuple_list)

# X must be built AFTER the shuffle so that X[i] lines up with y[i];
# building it from the unshuffled lists pairs each example with another example's label.
X = list(map(list, zip(query_X, docs_X)))

In [None]:
# Tokenizer and next-sentence-prediction model both come from the same
# 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
model = model.cuda()  # requires a CUDA device
model.train()

# The optimizer is created after the model has been moved to the GPU.
optimizer = AdamW(model.parameters(), lr=1e-6)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

# Hyperparameters and training


In [None]:
# Token budget for each encoded (query, document) pair; the tokenizer calls below
# pad and truncate every pair to exactly this length.
max_length = 512
# Number of pairs per forward pass in the training and prediction loops.
batch_size = 8
# update_size = 8

## train for 1 epoch

In [None]:
model.train()
# query_X, docs_X and y are the shuffled tuples produced by the preprocessing cell;
# they share one order, so slicing them together keeps texts and labels aligned.
num_examples = len(query_X)
# ceil division: no phantom extra step when num_examples is a multiple of batch_size
training_steps = (num_examples + batch_size - 1) // batch_size
losses = []
# 1 epoch over the training pairs
with tqdm(total=training_steps) as progress_bar:
  for i in range(0, num_examples, batch_size):
    batch_q = list(query_X[i:i+batch_size])
    batch_d = list(docs_X[i:i+batch_size])
    # each label is stored as a one-element list, giving shape (batch, 1)
    batch_y = torch.tensor(list(y[i:i+batch_size]), dtype=torch.long).cuda()

    encoding = tokenizer(batch_q, batch_d,padding='max_length',truncation="longest_first", max_length  = max_length,return_tensors='pt')
    input_ids = encoding['input_ids'].cuda()
    attention_mask = encoding['attention_mask'].cuda()
    token_type_ids = encoding['token_type_ids'].cuda()

    logits = model(input_ids= input_ids,attention_mask=attention_mask, token_type_ids=token_type_ids)[0]

    # Label-aware loss on the probability of class 0 (the "relevant" class):
    #   relevant pair     (label 0) -> 1 - p_relevant
    #   non-relevant pair (label 1) -> p_relevant
    # Without the labels, every pair is pushed toward class 0 and the model
    # learns to call everything relevant.
    p_relevant = logits.softmax(dim=1)[:, 0]
    is_relevant = (batch_y.view(-1) == 0).to(p_relevant.dtype)
    loss = torch.mean(torch.abs(is_relevant - p_relevant))

    loss.backward()
    optimizer.step()
    model.zero_grad()

    losses.append(loss.item())
    avg_loss = sum(losses)/len(losses)
    progress_bar.update(1)
    progress_bar.set_description("avg loss so far = {}".format(avg_loss))



avg loss so far = 0.0034549802625585733: 100%|██████████| 3021/3021 [1:16:15<00:00,  1.51s/it]


# save or load model

In [None]:
# Location on Drive of the fine-tuned model checkpoint used by the save and load cells below.
path = "drive/My Drive/BERT_FINE_TUNING_TREC2020/backg_2019_pairwise_e2"

In [None]:
# Serialize the whole model object (not only its state_dict) to `path`.
torch.save(model, path)

In [None]:
# Restore the full model object previously written to `path`.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
model = torch.load(path)


# load testing data

In [None]:
# 2019 judgments: topic number -> [[docid, gain], ...], kept sorted by gain (highest first).
test_articles_linking = dict()
with open('drive/My Drive/TREC 2020 News/Data/2019 Topics/newsir19-qrels-background.txt','r') as qrels_file:
  for qrels_line in qrels_file.readlines():
    fields = qrels_line.split()
    topic_links = test_articles_linking.get(fields[0], [])
    topic_links.append([fields[2],fields[3]])
    test_articles_linking[fields[0]] = sorted(topic_links, key=lambda pair: int(pair[1]), reverse=True)


# 2019 topics: topic number (the text after ':' in <num>) -> docid of the query article.
with open('drive/My Drive/TREC 2020 News/Data/2019 Topics/newsir19-background-linking-topics_with_xmlroot.xml') as fd:
    doc = xmltodict.parse(fd.read(), process_namespaces=True)
article_entity_mapping_dict = dict()
test_num_to_docid_mapping = {
  article['num'].split(':')[1].strip(): article['docid']
  for article in doc['root']['top']
}

# Contents of the 2019 articles, loaded from the preprocessed JSON dump.
test_article_contents_dict = dict()
with open('drive/My Drive/entity ranking/Experiments/processed_data/backgroundlinking_article_contents_TREC2019.json', 'r') as fp:
    test_article_contents_dict  = json.load(fp)

KeyboardInterrupt: ignored

In [None]:
# Array of article ids loaded from Drive; the next cell keeps only run entries whose
# document id appears in it (presumably every id in the cleaned collection - verify).
all_article_ids = np.load("/content/drive/My Drive/trec_files/all_article_ids.npy")

In [None]:
# Read the baseline run to re-rank: topic number -> [[docid, field 4 of the run line], ...].
# Topic numbers of entries whose docid is not a known article go to `errors`.
test_articles_linking = defaultdict(list)
errors = []
# `x in ndarray` scans the whole array on every call; a set gives constant-time lookups
known_article_ids = set(np.asarray(all_article_ids).ravel().tolist())
with open('result_wapo_bow_18.txt','r') as file:
  for line in file:
    splitted = line.split()
    if not splitted:
      # tolerate blank lines (e.g. a trailing newline at end of file)
      continue
    if(splitted[2] in known_article_ids):
      test_articles_linking[splitted[0]].append([splitted[2],splitted[4]])
    else:
      errors.append(splitted[0])

In [None]:
# NOTE(review): this displays `needed_articles_ids` before the cell that builds it (the next
# cell); on a fresh kernel it raises NameError - consider moving or removing this cell.
needed_articles_ids

[]

In [None]:
# List of every article id needed to perform re-ranking: for each topic, the query
# article's docid followed by the ids of all of its candidate documents.
needed_articles_ids = []
for doc_num, candidates in test_articles_linking.items():
  needed_articles_ids.append(num_to_docid_mapping[doc_num])
  needed_articles_ids.extend(candidate[0] for candidate in candidates)

In [None]:
import csv
# article bodies can be far longer than the csv module's default field limit
csv.field_size_limit(100000000)
file_path = 'drive/My Drive/entity ranking/cleaned_washington_data/washington_cleaned.csv'

# Membership is tested once per CSV row, so use a set instead of scanning the list each time.
needed_ids_lookup = set(needed_articles_ids)

# article id (column 0) -> article text (column 1), restricted to the needed ids.
# NOTE: this rebinds article_contents_dict, replacing the training-time dictionary.
article_contents_dict = dict()
with open(file_path) as csv_file:  # `with` closes the file even if a row fails to parse
  for i,row in enumerate(csv.reader(csv_file)):
    # rows with fewer than two columns (e.g. blank lines) carry no article
    if(len(row) >= 2 and row[0] in needed_ids_lookup):
      article_contents_dict[row[0]] = row[1]
    if(i%25000 == 0):
      print(i)

0
25000
50000
75000
100000
125000
150000
175000
200000
225000
250000
275000
300000
325000
350000
375000
400000
425000
450000
475000
500000
525000
550000
575000


# preprocess testing data


In [None]:
# Parallel lists describing every (query article, candidate article) pair to score.
test_query_X = []  # cropped query text (text_1)
test_docs_X = []   # cropped candidate text (text_2)
test_y = []        # stays empty: no labels are attached to the pairs here
test_inds_q = []   # topic number of each pair
test_inds_d = []   # candidate document id of each pair

for query_num, candidates in test_articles_linking.items():
  if not candidates:
    continue
  # the query side is identical for all candidates of a topic
  query_text = crop_doc(article_contents_dict[num_to_docid_mapping[query_num]])
  for candidate in candidates:
    candidate_id = candidate[0]
    test_query_X.append(query_text)
    test_docs_X.append(crop_doc(article_contents_dict[candidate_id]))
    test_inds_q.append(query_num)
    test_inds_d.append(candidate_id)

# test_X = list(map(list, zip(test_query_X, test_docs_X)))

# getting predictions

In [None]:
model.eval()
num_pairs = len(test_query_X)
# ceil division: the progress bar total matches the number of batches actually run
eval_steps = (num_pairs + batch_size - 1) // batch_size
# one [class-0 logit, class-1 logit] entry per pair, in the order of test_query_X
list_of_logits = []

# no_grad: inference only, so skip building autograd graphs (saves GPU memory and time)
with torch.no_grad(), tqdm(total=eval_steps) as progress_bar:
  for i in range(0, num_pairs, batch_size):
    batch_q = test_query_X[i:i+batch_size]
    batch_d = test_docs_X[i:i+batch_size]

    encoding = tokenizer(batch_q, batch_d,padding='max_length',truncation="longest_first", max_length  = max_length,return_tensors='pt')
    input_ids = encoding['input_ids'].cuda()
    attention_mask = encoding['attention_mask'].cuda()
    token_type_ids = encoding['token_type_ids'].cuda()

    logits = model(input_ids= input_ids,attention_mask=attention_mask, token_type_ids=token_type_ids)[0]
    list_of_logits.extend(logits.tolist())

    progress_bar.update(1)


100%|█████████▉| 625/626 [05:55<00:00,  1.76it/s]


# evaluation using trectools library to get ndcg@5

In [None]:
!pip install trectools

Collecting trectools
  Downloading https://files.pythonhosted.org/packages/bf/c7/8c6dfc2db67997bc9582e52bb8c994e57d8adbe1af5a1f02f88e1faa519b/trectools-0.0.44.tar.gz
Collecting sarge>=0.1.1
  Downloading https://files.pythonhosted.org/packages/2c/39/d5994d2060edef17c03e70eb8d9c4ac44ffae0294fe7bb3dc953e67133d8/sarge-0.1.6.tar.gz
Building wheels for collected packages: trectools, sarge
  Building wheel for trectools (setup.py) ... [?25l[?25hdone
  Created wheel for trectools: filename=trectools-0.0.44-cp36-none-any.whl size=26300 sha256=9ae3f593e3f5c2492fa8321fbf2535a9e584e319527b5fd329f2f343fcb650e8
  Stored in directory: /root/.cache/pip/wheels/aa/92/11/5468a137dda8190aaecb450bf60c3a505578833c8a4cdbd37a
  Building wheel for sarge (setup.py) ... [?25l[?25hdone
  Created wheel for sarge: filename=sarge-0.1.6-cp36-none-any.whl size=19052 sha256=27b05e7a2f2d77aa1ae05b488a3e0969483e2c67fc86d82aa3631c5ee3649d17
  Stored in directory: /root/.cache/pip/wheels/1a/df/8d/6f4893750b5ad722ee6f7

### write predictions to file

In [None]:
# Write the run file, one line per scored pair:
#   <topic> Q0 <docid> 0 <score> OzU_wiki
# where the score is the class-0 logit of the pair.
# The file is opened once in 'w' mode: appending ('a') would duplicate every entry each
# time this cell is re-run.
with open('bl_1_2019_finetune2','w') as file:
  for i in range(len(test_inds_q)):
    line ='%s Q0 %s 0 %f OzU_wiki' % (test_inds_q[i],test_inds_d[i],list_of_logits[i][0])
    file.write(line)
    file.write('\n')

### We need the original qrels to compare our run against

In [None]:
from trectools import TrecEval, TrecQrel, TrecRun

# Ground-truth judgments and the run file written by the previous cell.
# NOTE(review): these 2018 qrels are also loaded into the training set by the data-loading
# cell - confirm the reported score is not measured on topics the model was trained on.
qrels = TrecQrel('drive/My Drive/TREC 2020 News/Data/2018 Topics/background qrels.exp-gains.txt')
run = TrecRun('bl_1_2019_finetune2')

# nDCG at cutoff 5; left as the cell's last expression so its value is displayed.
trec_eval = TrecEval(run, qrels)
trec_eval.get_ndcg(5)

0.34212823329294684