# Loading features computed in Core IR

In [36]:
import pandas as pd

# Load collection: one row per passage, tab-separated, no header row in the file
passages = pd.read_csv('collections/msmarco-passage/collectionandqueries/collection.tsv', sep='\t', names=['pid', 'passage'])

# NOTE: the relevant / non-relevant frames below are stacked with pd.concat.
# DataFrame.append (used previously) was deprecated in pandas 1.4 and removed
# in pandas 2.0. reset_index() is deliberately called without drop=True so the
# previous row labels are kept as an 'index' column, exactly as before.

# Load training data
training_data = pd.read_csv('core/training_data.csv', index_col=0)
training_data_nonrelevant = pd.read_csv('core/training_data_nonrelevant.csv', index_col=0)
training_data = pd.concat([training_data, training_data_nonrelevant]).reset_index()
queries_train = pd.read_csv('collections/msmarco-passage/collectionandqueries/queries.train.tsv', sep='\t', names=['qid', 'query'])

# Load testing data
testing_data = pd.read_csv('core/testing_data.csv', index_col=0)
testing_data_nonrelevant = pd.read_csv('core/testing_data_nonrelevant.csv', index_col=0)
testing_data = pd.concat([testing_data, testing_data_nonrelevant]).reset_index()
queries_test = pd.read_csv('collections/msmarco-passage/msmarco-test2019-queries.tsv', sep='\t', names=['qid', 'query'])

# Load validation data
validation_data = pd.read_csv('core/validation_data.csv', index_col=0)
validation_data_nonrelevant = pd.read_csv('core/validation_data_nonrelevant.csv', index_col=0)
validation_data = pd.concat([validation_data, validation_data_nonrelevant]).reset_index()
queries_val = pd.read_csv('collections/msmarco-passage/collectionandqueries/queries.dev.small.tsv', sep='\t', names=['qid', 'query'])


In [37]:
# Preview the layout shared by all data sets: first the feature frame,
# then the matching query table.
display(training_data, queries_train)

Unnamed: 0,index,qid,docid,rating,bm25,passage_length,c,df,cf,idf,c_idf
0,0,1185869,0,1,11.560829,325,5,797858,983404,15.112933,173.856118
1,1,1185868,16,1,20.567997,306,9,917868,1175894,44.489958,2765.200357
2,2,597651,49,1,11.347543,305,6,718082,959345,10.137694,94.404182
3,3,403613,60,1,13.854973,521,7,200763,334483,30.979131,551.382704
4,4,1183785,389,1,10.219151,319,3,390237,561290,17.817594,68.195023
...,...,...,...,...,...,...,...,...,...,...,...
542707,9995,92552,5884425,0,0.000000,268,0,221142,264779,19.256276,0.000000
542708,9996,435696,2716350,0,2.086764,304,1,221593,267426,19.153344,19.153344
542709,9997,1181703,8602111,0,0.000000,321,0,486729,732203,20.982625,0.000000
542710,9998,1173262,3786944,0,0.000000,737,0,122353,181461,13.853809,0.000000


Unnamed: 0,qid,query
0,121352,define extreme
1,634306,what does chattel mean on credit history
2,920825,what was the great leap forward brainly
3,510633,tattoo fixers how much does it cost
4,737889,what is decentralization process.
...,...,...
808726,633855,what does canada post regulations mean
808727,1059728,wholesale lularoe price
808728,210839,how can i watch the day after
808729,908165,what to use instead of pgp in windows


In [14]:
import torch
from transformers import AlbertTokenizer, AlbertForQuestionAnswering

# Store the model we want to use
albert = "albert-base-v2"

# The question-answering head (qa_outputs.weight / qa_outputs.bias) is not part
# of this checkpoint and is newly initialised when the model is loaded (see the
# warning printed by from_pretrained). Seed torch first so that initialisation,
# and therefore every loss computed with this model, is repeatable across
# kernel restarts.
torch.manual_seed(0)

# We need to create the model and tokenizer
tokenizer = AlbertTokenizer.from_pretrained(albert)
model = AlbertForQuestionAnswering.from_pretrained(albert)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForQuestionAnswering: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN t

# Obtaining features of text

In [53]:
from tqdm import tqdm

def _first_text_by_id(frame, id_column, text_column, wanted_ids):
    """Return a dict mapping ids found in `frame` to their first matching text.

    Only rows whose `id_column` value occurs in `wanted_ids` are kept, so the
    dict stays small even when `frame` is very large. When an id occurs more
    than once, the first occurrence wins (same choice as `.values[0]` on a
    boolean-mask selection).
    """
    wanted = frame.loc[frame[id_column].isin(wanted_ids), [id_column, text_column]]
    wanted = wanted.drop_duplicates(subset=id_column, keep='first')
    return dict(zip(wanted[id_column], wanted[text_column]))


def compute_NLP_features(data, queries):
    """Add an 'albert_loss' column to `data`, one value per query-passage pair.

    For every row, the query text (looked up in `queries` by 'qid') and the
    passage text (looked up in the module-level `passages` frame by 'docid')
    are tokenized as a pair and fed to the module-level `model`; the loss the
    model returns is stored in 'albert_loss'.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'qid' and 'docid' columns. It is modified IN PLACE: the
        'albert_loss' column is (re)created and filled row by row, so values
        already computed remain in `data` if the run is interrupted. Row
        labels should be unique, since results are written back by label.
    queries : pandas.DataFrame
        Must contain 'qid' and 'query' columns.

    Returns
    -------
    pandas.DataFrame
        The same `data` object, with a float 'albert_loss' column.

    Raises
    ------
    IndexError
        If a 'qid' or 'docid' in `data` has no match in `queries` / `passages`.
        Raised before any model call so a long run does not fail midway.
    """
    # Build both id -> text lookups once, instead of scanning the full
    # `queries` and `passages` frames with a boolean mask for every row.
    query_by_id = _first_text_by_id(queries, 'qid', 'query', data['qid'])
    passage_by_id = _first_text_by_id(passages, 'pid', 'passage', data['docid'])

    missing_qids = set(data['qid']).difference(query_by_id)
    if missing_qids:
        raise IndexError(f"{len(missing_qids)} qid(s) not found in queries, e.g. {next(iter(missing_qids))!r}")
    missing_docids = set(data['docid']).difference(passage_by_id)
    if missing_docids:
        raise IndexError(f"{len(missing_docids)} docid(s) not found in passages, e.g. {next(iter(missing_docids))!r}")

    # NaN (not None) keeps the column numeric rather than object dtype.
    data['albert_loss'] = float('nan')

    # NOTE(review): the target span is hard-coded to token positions 1..3 for
    # every pair; confirm this is the intended target for the loss.
    # The tensors never change, so they are created once outside the loop.
    start_positions = torch.tensor([1])
    end_positions = torch.tensor([3])

    # Iterate over the frame's own row labels so the function also works when
    # the index is not a plain 0..n-1 range.
    pairs = zip(data.index, data['qid'], data['docid'])

    # Only the scalar loss is read, so skip building the autograd graph.
    with torch.no_grad():
        for index, qid, docid in tqdm(pairs, total=len(data)):
            # truncation=True caps the pair at the tokenizer's maximum length,
            # so an unusually long passage cannot exceed the model's input size.
            inputs = tokenizer(query_by_id[qid], passage_by_id[docid],
                               return_tensors='pt', truncation=True)
            outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)

            # Written per row so partial progress is kept in `data`.
            data.at[index, 'albert_loss'] = outputs.loss.item()

    return data

In [54]:
# Compute the albert_loss feature for every training query-passage pair.
training_data_NLP = compute_NLP_features(data=training_data, queries=queries_train)

100%|██████████| 10/10 [00:03<00:00,  3.25it/s]


In [55]:
# Plain-text dump of the frame returned by compute_NLP_features, to check that
# the new albert_loss column is present.
print(training_data_NLP)

        index      qid    docid  rating       bm25  passage_length  c  \
0           0  1185869        0       1  11.560829             325  5   
1           1  1185868       16       1  20.567997             306  9   
2           2   597651       49       1  11.347543             305  6   
3           3   403613       60       1  13.854973             521  7   
4           4  1183785      389       1  10.219151             319  3   
...       ...      ...      ...     ...        ...             ... ..   
542707   9995    92552  5884425       0   0.000000             268  0   
542708   9996   435696  2716350       0   2.086764             304  1   
542709   9997  1181703  8602111       0   0.000000             321  0   
542710   9998  1173262  3786944       0   0.000000             737  0   
542711   9999   650309  7430049       0   0.000000             334  0   

             df       cf        idf        c_idf albert_loss  
0        797858   983404  15.112933   173.856118    5.160336

In [56]:
# Checkpoint the computed features on disk so they are not lost if the kernel
# is stopped during experiments. The row index is written as the first column.
training_data_NLP.to_csv('training_data_NLP.csv', index=True)

# Echo the frame that was just written.
print(training_data_NLP)

        index      qid    docid  rating       bm25  passage_length  c  \
0           0  1185869        0       1  11.560829             325  5   
1           1  1185868       16       1  20.567997             306  9   
2           2   597651       49       1  11.347543             305  6   
3           3   403613       60       1  13.854973             521  7   
4           4  1183785      389       1  10.219151             319  3   
...       ...      ...      ...     ...        ...             ... ..   
542707   9995    92552  5884425       0   0.000000             268  0   
542708   9996   435696  2716350       0   2.086764             304  1   
542709   9997  1181703  8602111       0   0.000000             321  0   
542710   9998  1173262  3786944       0   0.000000             737  0   
542711   9999   650309  7430049       0   0.000000             334  0   

             df       cf        idf        c_idf albert_loss  
0        797858   983404  15.112933   173.856118    5.160336