In [1]:
import xmltodict
import json
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import BertTokenizer, BertForNextSentencePrediction, AdamW
import torch
from collections import defaultdict
import scipy
import logging

# Raise the 'transformers' logger threshold to ERROR so that warning-level
# messages (e.g. the ones emitted while using the BERT tokenizer) are not printed.
logging.getLogger('transformers').setLevel(logging.ERROR)

# load training topics

In [6]:
# Where the TREC mapping files live, and which track year to load.
TREC_DATAFILES_DIR = "/DATA/users/kenanfayoumi/EntityRanking/trec_files"
TREC_YEAR = 2018 # 2019, using V2 for both

# Topic number -> list of linked-article entries (iterated later as
# `article_tuple`, where [0] is an article id and [1] an int-like label).
with open("data_files/articles_linking.json", 'r') as fp:
    articles_linking = json.load(fp)

# Article id -> article text. Wrapped in a defaultdict so that looking up an
# unknown id yields the empty string instead of raising KeyError.
with open("data_files/article_texts.json", 'r') as fp:
    article_texts = defaultdict(str, json.load(fp))

# Topic number -> doc id of the topic's query article, for TREC_YEAR only.
# To add another year, load its num_to_docid_mapping_TREC<year>.json the same
# way and merge it in with num_to_docid_mapping.update(...).
with open(f'{TREC_DATAFILES_DIR}/num_to_docid_mapping_TREC{TREC_YEAR}.json', 'r') as fp:
    num_to_docid_mapping = dict(json.load(fp))

# Inverse lookup: doc id -> topic number (if a doc id occurs more than once,
# the last topic number wins).
docid_to_num_mapping = dict(
    zip(num_to_docid_mapping.values(), num_to_docid_mapping.keys())
)

In [3]:
# Every topic number present in the mapping is used as a training topic.
training_topics = list(num_to_docid_mapping)

In [4]:
# Documents and queries are each cropped so that a (query, document) pair is
# more likely to fit in the 512-token model input. Note the limit here counts
# whitespace-separated words, not tokenizer tokens.
def crop_doc(doc, max_len=210):
  """Return the first `max_len` whitespace-separated words of `doc`.

  The text is split on runs of whitespace and re-joined with single spaces,
  so leading/trailing whitespace is dropped and internal whitespace is
  normalised even when nothing is cropped.
  """
  return " ".join(doc.split()[:max_len])

In [8]:
# NOTE(review): this cell held only the three loop headers below and no loop
# body, which is an IndentationError on a fresh Restart & Run All. A `pass`
# body keeps the notebook runnable; the original body is not available here.
# TODO: restore the intended per-article logic or delete this cell.
for query_num,articles_list in articles_linking.items():
    if query_num in training_topics:
        for article_tuple in articles_list:
            pass

In [13]:
# Build the (query text, document text, label) training examples.
#
# Outputs (all index-aligned with each other after this cell):
#   query_X  - cropped text of the topic's query article   (text_1)
#   docs_X   - cropped text of the linked article          (text_2)
#   y        - [label] per example; label is 1 when the article's grade
#              (article_tuple[1]) is 0, i.e. a negative, and 0 otherwise
#   inds_q   - topic number of each example
#   inds_d   - article id of each example
#   X        - [query text, document text] pairs
query_X = [] #  text_1
docs_X = [] # text_2
y = []
inds_q = []
inds_d = []
tuple_list = []
# keep track of how many negs we have so far (only needed if the dataset is
# balanced per query; balancing is currently disabled)
query_neg_counts = defaultdict(int)

# set membership instead of scanning the training_topics list for every topic
training_topic_set = set(training_topics)

for query_num, articles_list in articles_linking.items():
    if query_num not in training_topic_set:
        continue

    # the query text is the same for every article of this topic: crop it once
    query_text = crop_doc(article_texts[num_to_docid_mapping[query_num]])

    for article_tuple in articles_list:
        doc_id = article_tuple[0]
        label = int(int(article_tuple[1]) == 0)
        tuple_list.append([query_text, crop_doc(article_texts[doc_id]), [label], query_num, doc_id])

# Shuffle whole examples so that texts, labels and ids stay together.
np.random.seed(42)
np.random.shuffle(tuple_list)

# Unzip the shuffled examples into the parallel sequences. Guarded because
# zip(*[]) yields nothing and the 5-way unpacking would raise ValueError.
if tuple_list:
    query_X, docs_X, y, inds_q, inds_d = zip(*tuple_list)

# X is built AFTER the shuffle so that X[i] lines up with y[i], inds_q[i] and
# inds_d[i]. (Building it before the shuffle paired each text pair with the
# label of a different example.)
X = [[q, d] for q, d in zip(query_X, docs_X)]

In [5]:
# Tokenizer and next-sentence-prediction model from the same pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

# Move the weights to the GPU before the optimizer is created, then switch the
# model to training mode.
model = model.cuda()
model.train()

# Optimise all model parameters with a learning rate of 1e-6.
optimizer = AdamW(params=model.parameters(), lr=1e-6)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

In [22]:
# Maximum number of tokens per encoded (query, document) pair; passed to the
# tokenizer as `max_length` in the training loop.
max_length = 512
# Number of (query, document) pairs per optimisation step.
batch_size = 4

In [23]:
# One training epoch over the (query, document) pairs.
#
# Batches are sliced from query_X / docs_X / y, which are unpacked together
# from the same shuffled tuple_list and are therefore index-aligned.
model.train()

num_examples = len(query_X)
# ceiling division: an exact multiple of batch_size must not count an extra step
training_steps = (num_examples + batch_size - 1) // batch_size
losses = []

with tqdm(total=training_steps) as progress_bar:
  for i in range(0, num_examples, batch_size):
    batch_q = list(query_X[i:i+batch_size])
    batch_d = list(docs_X[i:i+batch_size])
    # one class index per example, shaped (batch, 1) for gather() below
    batch_y = torch.LongTensor(list(y[i:i+batch_size])).cuda().view(-1, 1)

    # Pad only up to the longest pair in the batch (still truncated to
    # max_length). Padded positions are masked out by attention_mask, so this
    # matches padding to max_length while using less GPU memory.
    encoding = tokenizer(batch_q, batch_d, padding='longest', truncation="longest_first", max_length=max_length, return_tensors='pt')
    input_ids = encoding['input_ids'].cuda()
    attention_mask = encoding['attention_mask'].cuda()
    token_type_ids = encoding['token_type_ids'].cuda()

    logits = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[0]

    # Loss = 1 - probability assigned to the labelled class, averaged over the
    # batch. Previously the loss always used class 0 and ignored the labels,
    # which pushed negative pairs towards the positive class as well.
    probs = logits.softmax(dim=1)
    loss = torch.mean(1. - probs.gather(1, batch_y))

    loss.backward()
    optimizer.step()
    model.zero_grad()

    losses.append(loss.item())
    avg_loss = sum(losses)/len(losses)
    progress_bar.update(1)
    progress_bar.set_description("avg loss so far = {}".format(avg_loss))



  0%|          | 0/2128 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 7.80 GiB total capacity; 6.61 GiB already allocated; 13.25 MiB free; 6.66 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF