In [1]:
import torch
import transformers
import pickle
import numpy as np
from transformers import AdamW
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaForQuestionAnswering , RobertaForSequenceClassification
from transformers import BertTokenizer , BertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import re






def process_text(text):
    text = str(text)
    text = re.sub(r'[^\w\s]','',text)
    text = text.lower()
    text = text.replace('\n',' ').replace('\t' , ' ')
    text = text.strip()
    return text.split()

def get_rel_score(query,document):
    pass



def get_chunks(document,window_size,stride):
    chunks = []
    for i in range(0,len(document),stride):
        chunks.append(document[i : i+window_size])
    return chunks

# def load_data():
doc_df = pd.read_csv('documents.csv')
docs_id = doc_df['doc_id'].tolist()
docs = doc_df['doc_text'].apply(process_text).tolist()
doc_id2idx = {}
for i in range(len(docs_id)):
    doc_id2idx[docs_id[i]] = i
train_query_df = pd.read_csv('train_queries.csv')
train_queries = train_query_df['query_text'].apply(process_text).tolist()
train_queries_id = train_query_df['query_id'].astype('str').tolist()
train_queries_pos_docs = train_query_df['pos_doc_ids'].tolist()
train_queries_top_bm25 = train_query_df['bm25_top1000'].tolist()
train_queries_top_bm25_scores = train_query_df['bm25_top1000_scores'].tolist()

print(len(train_queries))
print(len(train_queries_id))
print(len(train_queries_pos_docs))
print(len(train_queries_top_bm25))
print(len(train_queries_top_bm25_scores))
test_query_df = pd.read_csv('test_queries.csv')
test_queries = test_query_df['query_text'].apply(process_text).tolist()
test_queries_id = test_query_df['query_id'].astype('str').tolist()
test_queries_top_bm25 = test_query_df['bm25_top1000'].tolist()
test_queries_top_bm25_scores = test_query_df['bm25_top1000_scores'].tolist()

with open('doc_chunks', 'rb') as handle:
    doc_chunks = pickle.load(handle)

print(len(test_queries))
print(len(test_queries_id))
print(len(test_queries_top_bm25))
print(len(test_queries_top_bm25_scores))
print(len(doc_chunks))

pretrained_model = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model)
device='cuda'
checkpoint = 'bert-base-uncased_epoch1_size120000'
model.load_state_dict(torch.load(checkpoint,map_location='cpu'))
model.eval().to(device)

class TestDataset(Dataset):
    def __init__(self, input_ids, token_type_ids , attention_mask):
        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask        
    def __getitem__(self,idx):
        inputid = self.input_ids[idx]
        tokentype = self.token_type_ids[idx]
        attentionmask = self.attention_mask[idx]
        return inputid , tokentype , attentionmask
    def __len__(self):
        return len(self.input_ids)

120
120
120
120
120
80
80
80
80
100000


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [2]:
def get_relqc(query,chunks):
    input_ids = []
    token_type_ids = []
    attention_mask = []
    query = ' '.join(query)
    for chunk in chunks:
        chunk = ' '.join(chunk)
        tmp_dict =  tokenizer(query,
                          chunk,
                          max_length=20,
                          return_tensors='pt',
                          return_token_type_ids = True,
                          pad_to_max_length=True,
                          padding='max_length',
                          truncation=True)
        input_ids.append(tmp_dict['input_ids'][0])
        token_type_ids.append(tmp_dict['token_type_ids'][0])
        attention_mask.append(tmp_dict['attention_mask'][0])
    BATCH_SIZE = 32
    test_set = TestDataset(input_ids, token_type_ids,attention_mask)
    loader =  DataLoader(test_set, batch_size=BATCH_SIZE,shuffle=False)
    model_scores = np.array([])
    for data in loader:
        tokens_tensors, segments_tensors, masks_tensors = [t.to(device) for t in data]
        outputs = model(input_ids=tokens_tensors, 
                token_type_ids=segments_tensors, 
                attention_mask=masks_tensors)
        batch_scores = outputs[0][:,1].detach().cpu().numpy()
        model_scores = np.append(model_scores,batch_scores)

    # topk_indices = model_scores.argsort()[-topk:][::-1]
    # print(topk_indices)
    return model_scores


In [3]:
def get_relcd(chunks,document):
    input_ids = []
    token_type_ids = []
    attention_mask = []
    document = ' '.join(document)
    for chunk in chunks:
        query = ' '.join(chunk)
        tmp_dict =  tokenizer(query,
                          document,
                          max_length=512,
                          return_tensors='pt',
                          return_token_type_ids = True,
                          pad_to_max_length=True,
                          padding='max_length',
                          truncation=True)
        input_ids.append(tmp_dict['input_ids'][0])
        token_type_ids.append(tmp_dict['token_type_ids'][0])
        attention_mask.append(tmp_dict['attention_mask'][0])
    BATCH_SIZE = 16
    test_set = TestDataset(input_ids, token_type_ids,attention_mask)
    loader =  DataLoader(test_set, batch_size=BATCH_SIZE,shuffle=False)
#     print(len(loader))
    model_scores = np.array([])
    for data in loader:
        tokens_tensors, segments_tensors, masks_tensors = [t.to(device) for t in data]
        outputs = model(input_ids=tokens_tensors, 
                token_type_ids=segments_tensors, 
                attention_mask=masks_tensors)
        batch_scores = outputs[0][:,1].detach().cpu().numpy()
        model_scores = np.append(model_scores,batch_scores)

    # topk_indices = model_scores.argsort()[-topk:][::-1]
    # print(topk_indices)
    return model_scores

In [4]:
with open('doc_chunk_dict', 'rb') as handle:
    doc_chunk_dict = pickle.load(handle)
with open('rel_query_topkdoc_chunks', 'rb') as handle:
    rel_query_topkdoc_chunks = pickle.load(handle)
with open('rel_qds', 'rb') as handle:
    rel_qds = pickle.load(handle)


In [5]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0) # only difference

In [6]:
with open('query_rel_cd_scores', 'rb') as handle:
    query_rel_cd_scores = pickle.load(handle)

In [16]:
topkd = 10
topkc = 10
rel_query_topkdoc_chunks = []
phase3_scores = []
query_rel_cd_scores = []

# phase2_scores = []
with open('phase2_scores', 'rb') as handle:
    phase2_scores = pickle.load(handle)
    
with open('query_rel_cd_scores', 'rb') as handle:
    query_rel_cd_scores = pickle.load(handle)

# with open('phase2_scores', 'rb') as handle:
#     phase2_scores = pickle.load(handle)
for i in range(len(test_queries)):
    query = test_queries[i]
    query_id = test_queries_id[i]
    print(i,query_id , query)
    rel_qd = rel_qds[i]
    bm25_top1000_document_ids = test_queries_top_bm25[i].split()


#    phase 1 output
    topk_first_rerank_docs_indices = rel_qd.argsort()[-topkd:][::-1]
    
    
#   phase 2 
    
    be_selected_chunks = []
    
    for rerank_docs_index in topk_first_rerank_docs_indices:
        doc_id = bm25_top1000_document_ids[rerank_docs_index]
        doc_idx = doc_id2idx[doc_id]
        chunks = doc_chunks[doc_idx]
        be_selected_chunks.extend(chunks)
    
#     phase2_score = get_relqc(query,be_selected_chunks)
#     phase2_scores.append(phase2_score)
    
    phase2_score = phase2_scores[i]
    topk_chunks = []
    rel_qc_score = []
    
    topk_chunks_indices = phase2_score.argsort()[-topkc:][::-1]
    for index in topk_chunks_indices:
        topk_chunks.append(be_selected_chunks[index])
        rel_qc_score.append(phase2_score[index])
    
    softmax_rel_qc_score = softmax(rel_qc_score)
    softmax_rel_qc_score = np.array(softmax_rel_qc_score)

#     phase 3
    rel_qcds = []
    rel_cd_scores = query_rel_cd_scores[i]

    #     rel_cd_scores = []
    index = 0
    for doc_id in bm25_top1000_document_ids:
        doc_idx = doc_id2idx[doc_id]
        doc = docs[doc_idx]
        
#         rel_cd_score = get_relcd(topk_chunks,doc)
#         rel_cd_scores.append(rel_cd_score)
        rel_cd_score = rel_cd_scores[index]
        index += 1
        rel_cd_score = np.array(rel_cd_score)
#         print(rel_cd_score.shape,softmax_rel_qc_score.shape)
        final_score = np.dot(rel_cd_score,softmax_rel_qc_score)
        
        rel_qcds.append(final_score)
    phase3_scores.append(rel_qcds)
    
    
    

# with open('query_rel_cd_scores', 'wb') as handle:
#     pickle.dump(query_rel_cd_scores, handle, protocol=pickle.HIGHEST_PROTOCOL) 
        
#     phase3_scores.append(rel_qcds)

# with open('phase2_scores', 'wb') as handle:
#     pickle.dump(phase2_scores, handle, protocol=pickle.HIGHEST_PROTOCOL)      

        
# #     phase 3


    

# # with open('doc_chunk_dict', 'wb') as handle:
# #     pickle.dump(doc_chunk_dict, handle, protocol=pickle.HIGHEST_PROTOCOL) 


# with open('phase3_scores', 'wb') as handle:
#     pickle.dump(phase3_scores, handle, protocol=pickle.HIGHEST_PROTOCOL) 

 

0 301 ['international', 'organized', 'crime']
1 303 ['hubble', 'telescope', 'achievements']
2 304 ['endangered', 'species', 'mammals']
3 306 ['african', 'civilian', 'deaths']
4 311 ['industrial', 'espionage']
5 313 ['magnetic', 'levitationmaglev']
6 314 ['marine', 'vegetation']
7 315 ['unexplained', 'highway', 'accidents']
8 316 ['polygamy', 'polyandry', 'polygyny']
9 317 ['unsolicited', 'faxes']
10 320 ['undersea', 'fiber', 'optic', 'cable']
11 331 ['world', 'bank', 'criticism']
12 333 ['antibiotics', 'bacteria', 'disease']
13 337 ['viral', 'hepatitis']
14 338 ['risk', 'of', 'aspirin']
15 339 ['alzheimers', 'drug', 'treatment']
16 343 ['police', 'deaths']
17 344 ['abuses', 'of', 'email']
18 345 ['overseas', 'tobacco', 'sales']
19 346 ['educational', 'standards']
20 347 ['wildlife', 'extinction']
21 348 ['agoraphobia']
22 353 ['antarctica', 'exploration']
23 355 ['ocean', 'remote', 'sensing']
24 361 ['clothing', 'sweatshops']
25 362 ['human', 'smuggling']
26 365 ['el', 'nino']
27 366 [

In [17]:
with open('rel_qds', 'rb') as handle:
    rel_qds = pickle.load(handle)

# with open('phase3_scores', 'rb') as handle:
#     phase3_scores = pickle.load(handle)
    
print(len(rel_qds))
print(len(phase3_scores))

80
80


In [32]:
def minmax_normal(score):
    return (score - score.min()) / (score.max() - score.min())

In [36]:
for a in np.arange(0.0,1.1,0.1):

    filename = './final/BERT_QE_{}.txt'.format(a)
    print(filename)
    fp = open(filename, 'w')
    print('query_id,ranked_doc_ids', file=fp)

    for i in range(len(test_queries)):
        query = test_queries[i]
        query_id = test_queries_id[i]

        
        print(query_id + ',', file=fp, end='')
        rel_qd = rel_qds[i]
        phase3_score = phase3_scores[i]
        query_bm25_doc_id = test_queries_top_bm25[i].split()
        rel_qd = np.array(rel_qd)
        phase3_score = np.array(phase3_score)

        final_scores = (1-a) * rel_qd + a * phase3_score
        
        doc_dict = dict(zip(query_bm25_doc_id, final_scores))
        sorted_docs = sorted(doc_dict.items(), key=lambda x: x[1], reverse=True)
#         print(sorted_docs[:20])

        for _doc in sorted_docs:
            doc_id, value = _doc
            print(doc_id, file=fp, end=' ')
        print('',file=fp)


    fp.close()

./final/BERT_QE_0.0.txt
./final/BERT_QE_0.1.txt
./final/BERT_QE_0.2.txt
./final/BERT_QE_0.30000000000000004.txt
./final/BERT_QE_0.4.txt
./final/BERT_QE_0.5.txt
./final/BERT_QE_0.6000000000000001.txt
./final/BERT_QE_0.7000000000000001.txt
./final/BERT_QE_0.8.txt
./final/BERT_QE_0.9.txt
./final/BERT_QE_1.0.txt


In [35]:
bm25_alpha = 2.3
for a in np.arange(0.0,1.1,0.1):
    
    

    filename = './final/BERT_QE{}_BM25{}.txt'.format(a,bm25_alpha)
    print(filename)
    fp = open(filename, 'w')
    print('query_id,ranked_doc_ids', file=fp)

    for i in range(len(test_queries)):
        query = test_queries[i]
        query_id = test_queries_id[i]
        tmp_bm25_score = test_queries_top_bm25_scores[i].split()
    

        bm25_score = []
        for score in tmp_bm25_score:
            bm25_score.append(float(score))
        bm25_score = np.array(bm25_score)
        bm25_score = minmax_normal(bm25_score)

        if query_id == '316':
            bm25_score = np.array([0.0]*1000)
        
        
        print(query_id + ',', file=fp, end='')
        rel_qd = rel_qds[i]
        phase3_score = phase3_scores[i]
        query_bm25_doc_id = test_queries_top_bm25[i].split()
        rel_qd = np.array(rel_qd)
        phase3_score = np.array(phase3_score)

        final_scores = (1-a) * rel_qd + a * phase3_score
        final_scores = minmax_normal(final_scores)
        
        final_scores = bm25_score + bm25_alpha * final_scores
        
        doc_dict = dict(zip(query_bm25_doc_id, final_scores))
        sorted_docs = sorted(doc_dict.items(), key=lambda x: x[1], reverse=True)
#         print(sorted_docs[:20])

        for _doc in sorted_docs:
            doc_id, value = _doc
            print(doc_id, file=fp, end=' ')
        print('',file=fp)


    fp.close()

./final/BERT_QE0.0_BM252.3.txt
./final/BERT_QE0.1_BM252.3.txt
./final/BERT_QE0.2_BM252.3.txt
./final/BERT_QE0.30000000000000004_BM252.3.txt
./final/BERT_QE0.4_BM252.3.txt
./final/BERT_QE0.5_BM252.3.txt
./final/BERT_QE0.6000000000000001_BM252.3.txt
./final/BERT_QE0.7000000000000001_BM252.3.txt
./final/BERT_QE0.8_BM252.3.txt
./final/BERT_QE0.9_BM252.3.txt
./final/BERT_QE1.0_BM252.3.txt


In [41]:
test_queries[0]

['international', 'organized', 'crime']

In [24]:
doc_idx = doc_id2idx['FBIS3-26415']
print(docs[doc_idx])

['language', 'f', 'p105', 'chinese', 'f', 'article', 'typebfn', 'f', 'p106', 'special', 'article', 'by', 'staff', 'reporter', 'chang', 'shaowei', '1728', 'f', '1421', '1218', 'china', 'and', 'the', 'united', 'states', 'increase', 'cooperation', 'in', 'cracking', 'down', 'on', 'crimes', 'text', 'the', 'chinese', 'and', 'us', 'governments', 'have', 'clear', 'differences', 'in', 'terms', 'of', 'human', 'rights', 'missile', 'proliferation', 'and', 'trade', 'methods', 'and', 'we', 'neither', 'deny', 'nor', 'ignore', 'them', 'however', 'since', 'internationalized', 'crime', 'poses', 'a', 'common', 'threat', 'to', 'both', 'our', 'countries', 'we', 'are', 'making', 'joint', 'efforts', 'to', 'face', 'squarely', 'the', 'common', 'interests', 'we', 'have', 'on', 'this', 'issue', 'that', 'is', 'the', 'view', 'aired', 'by', 'robert', 'gelbard', 'us', 'assistant', 'secretary', 'of', 'state', 'for', 'international', 'narcotics', 'matters', 'in', 'a', 'speech', 'in', 'hong', 'kong', 'yesterday', 'it',

In [44]:
len(doc_chunk_dict)

770

In [None]:
# calculate rel(C, d)
for i in range(len(test_querues)):
    chunks , scores_qc = rel_query_topkdoc_chunks[i]

    