In [None]:
!cp "drive/My Drive/data_folds.zip" ./
!unzip data_folds.zip

import json
with open('drive/My Drive/document_contents.json') as json_file:
    document_contents = json.load(json_file)

!wget https://raw.githubusercontent.com/Georgetown-IR-Lab/cedr/master/data/robust/queries.tsv
with open('queries.tsv','r') as f:
  queries = {}
  for line in f:
    cols = line.rstrip().split('\t')
    c_type, c_id, c_text = cols
    queries[c_id] = c_text
queries

!wget https://trec.nist.gov/data/robust/qrels.robust2004.txt


qrels = {}
with open('qrels.robust2004.txt','r') as f:
  for line in f:
    qid, _, docid, score = line.split()
    qrels.setdefault(qid, {})[docid] = int(score)

# Only a single fold is used for now; the three split files share its name and
# differ only in their extension.
fold_name = "fold_1"
training_file, test_file, val_file = [
    "{}.{}".format(fold_name, extension)
    for extension in ("train", "test", "val")
]


In [None]:
# Build the parallel training lists (document text, query text, label and the
# matching ids) from the fold's training file.
#
# `errors` collects the doc ids of (query, doc) pairs that had to be skipped
# because the query has no qrels entry or no query text, or because the doc is
# missing from document_contents. Missing documents should still be fetched.

training_docs = []
training_doc_ids = []
training_query_ids = []
training_queries = []
training_labels = []
errors = set()

with open('data/'+training_file,'r') as f:
  for line in f:
    fields = line.split()
    if len(fields) < 2:
      # blank or truncated line: there is no (query, doc) pair to read
      continue
    query, doc = fields[0], fields[1]

    # Check every lookup up front instead of relying on a bare `except:`.
    # Nothing is appended unless all of them succeed, so the lists below can
    # never end up with different lengths, and unrelated errors are no longer
    # silently swallowed.
    if query not in qrels or query not in queries or doc not in document_contents:
      errors.add(doc)
      continue

    # label = 1 only if the doc is judged for this query with a score > 0
    label = 1 if qrels[query].get(doc, 0) > 0 else 0

    training_labels.append(label)
    training_doc_ids.append(doc)
    training_query_ids.append(query)
    training_docs.append(document_contents[doc])
    training_queries.append(queries[query])

errors

{'FBIS4-32079',
 'FBIS4-6139',
 'FT921-15469',
 'FT923-13931',
 'FT923-9920',
 'FT924-2909',
 'FT924-5422',
 'FT933-5923',
 'FT941-15450',
 'LA011689-0082',
 'LA021490-0146',
 'LA051690-0006',
 'LA062889-0137',
 'LA101790-0156'}

In [None]:
# The document, query and label lists are parallel: one entry per training pair.
assert len(training_docs) == len(training_queries) == len(training_labels)

In [None]:
!pip install transformers
import numpy as np
import tensorflow as tf
from tqdm import tqdm

In [None]:
import torch
import transformers
from transformers import BertTokenizer, AdamW, BertForNextSentencePrediction


# Fine-tuning hyper-parameters.
# NOTE(review): 1e-3 is far above the 2e-5..5e-5 range usually used when
# fine-tuning BERT -- confirm this value is intended.
LR = 0.001
max_len = 512      # passed to the tokenizer as max_length in the training loop
BATCH_SIZE = 16    # number of (query, document) pairs per training step


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

# The training loop sends its input tensors to the GPU, so the model has to be
# on the same device or the forward pass fails with a device mismatch.
# It is moved before the optimizer is built so the optimizer is created over
# the parameters that are actually trained.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=LR)



In [None]:
# Fine-tune the model for one pass over the training pairs.

# Inputs and model must live on the same device. `.to()` does nothing when the
# model is already there.
# NOTE(review): ideally the model is moved before the optimizer is constructed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# ceil(len / BATCH_SIZE): exactly the number of iterations the loop below runs.
# The previous int(len / BATCH_SIZE) + 1 overcounted by one whenever the data
# size was a multiple of BATCH_SIZE.
training_steps = (len(training_docs) + BATCH_SIZE - 1) // BATCH_SIZE
losses = []
loss_total = 0.0  # running sum, so the average is not recomputed from scratch
# 1 epoch over X_train
with tqdm(total=training_steps) as progress_bar:
  for i in range(0, len(training_docs), BATCH_SIZE):
    batch_q = training_queries[i:i+BATCH_SIZE]
    batch_d = training_docs[i:i+BATCH_SIZE]
    batch_y = torch.LongTensor(training_labels[i:i+BATCH_SIZE]).to(device)

    # Encode each (query, document) pair as one two-segment input, padded or
    # truncated to max_len tokens.
    encoding = tokenizer(
        batch_q,
        batch_d,
        padding='max_length',
        truncation="longest_first",
        max_length=max_len,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    token_type_ids = encoding['token_type_ids'].to(device)

    logits = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
    )[0]

    # Label-aware classification loss.
    # The earlier loss, mean(1 - softmax(logits)[:, 0]), never used batch_y, so
    # every pair was pushed toward class 0 whether or not it was relevant.
    # Column 0 stays the "relevant" class, as in that earlier loss: a relevant
    # pair (label 1) gets target class 0, a non-relevant pair target class 1.
    targets = 1 - batch_y
    loss = torch.nn.functional.cross_entropy(logits, targets)

    loss.backward()
    optimizer.step()
    model.zero_grad()

    step_loss = loss.item()
    losses.append(step_loss)
    loss_total += step_loss
    avg_loss = loss_total/len(losses)
    progress_bar.update(1)
    progress_bar.set_description("avg loss so far = {}".format(avg_loss))