In [None]:
# !pip install allennlp

In [None]:
import pickle
import numpy as np
import pandas as pd
import re

import torch
from allennlp.modules.elmo import batch_to_ids
from allennlp.modules.token_embedders.elmo_token_embedder import ElmoTokenEmbedder

## Save Elmo Embeddings

In [None]:
class Batcher:
    '''Iterator that yields consecutive slices of `data`, `batch_size` items each.

    Every item is returned exactly once and in order. The last batch is shorter
    when `len(data)` is not a multiple of `batch_size`. An empty batch is never
    produced, so empty `data` yields nothing.

    Args:
        data: Sliceable sequence that supports `len()` (list, numpy array, ...).
        batch_size: Maximum number of items per batch.
    '''
    def __init__(self, data, batch_size=64):
        self.data = data
        self.n = len(data)
        self.batch_size = batch_size
        # Start at the first batch so next() also works without a prior iter()
        self.i = 0
        # Index of the last non-empty batch (-1 when there is no data).
        # A plain `n // batch_size` would add an empty trailing batch
        # whenever n is an exact multiple of batch_size.
        self.max_i = (self.n - 1) // batch_size

    def __iter__(self):
        '''Restarts the iteration from the first batch and returns self'''
        self.i = 0
        return self

    def __next__(self):
        '''Returns the next batch; raises StopIteration when all data was served'''
        if self.i > self.max_i:
            raise StopIteration
        start = self.i * self.batch_size
        stop = start + self.batch_size
        self.i += 1
        return self.data[start:stop]
    

def mean_pooling(x):
    '''Mean pooling over the second-to-last axis that ignores padded positions.

    A position is treated as padding when its whole vector along the last axis
    is zero. Only the remaining positions are counted in the denominator.

    Args:
        x: Array whose last axis holds the embedding values and whose
            second-to-last axis enumerates the tokens of a sequence.

    Returns:
        Array with the token axis reduced away. A sequence that has no
        non-zero token at all yields a zero vector instead of NaN.
    '''
    # Count of non-padding tokens per sequence, kept as a trailing axis for broadcasting
    n_tokens = x.any(-1).sum(-1, keepdims=True)
    # A fully padded sequence sums to zeros; dividing by 1 keeps it a zero
    # vector rather than producing NaN through 0 / 0
    return x.sum(-2) / n_tokens.clip(min=1)

# Matches maximal runs of word characters
pattern = re.compile(r'[\w]+')
def tokenize(text):
    '''Returns the list of word tokens found in `text`, in order of appearance'''
    return [match.group() for match in pattern.finditer(text)]

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# ELMo Small: model options and pretrained weights
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
# Build the embedder and move it to the selected device
embedder = ElmoTokenEmbedder(options_file, weight_file)
embedder = embedder.to(device)
def embed(data):
    '''Creates mean-pooled ELMo embeddings for a sequence of texts.

    The texts are processed batch by batch: each text is tokenized, the batch
    is converted to character ids, embedded on `device` and mean-pooled over
    its tokens.

    Args:
        data: Sliceable sequence of strings (list or numpy array).

    Returns:
        float64 numpy array with one 256-wide row per input text, in input
        order. Empty input gives an array of shape (0, 256).
    '''
    # Seed with an empty float64 block: it fixes the result dtype and gives
    # shape (0, 256) for empty input. 256 is the row width this code expects
    # from the embedder.
    chunks = [np.empty((0, 256))]
    for batch in Batcher(data):
        # Guard against an empty slice - an empty batch cannot be embedded
        if len(batch) == 0:
            continue
        torch.cuda.empty_cache()
        tokenized = list(map(tokenize, batch))
        character_ids = batch_to_ids(tokenized).to(device)
        # Inference only: no_grad avoids recording the autograd graph
        with torch.no_grad():
            embeds = embedder(character_ids).detach().cpu().numpy()
        chunks.append(mean_pooling(embeds))
    # Stack once at the end instead of re-copying the whole result per batch
    return np.vstack(chunks)

def data_from_embeds(df_org, q_dict, p_dict):
    '''Builds a numpy dataset by joining rows with their query and passage embeddings.

    Rows whose pid has no entry in `p_dict` are dropped and the remaining rows
    are ordered by qid. Each output row is laid out as
    [qid, pid, relevancy, query embedding..., passage embedding...].
    '''
    pairs = df_org[['qid', 'pid', 'relevancy']]
    has_passage = pairs['pid'].isin(p_dict.keys())
    # Sorting by qid is convenient for fast retrieval of per-query groups with np.bincount
    pairs = pairs[has_passage].sort_values(by='qid')

    qid_col = pairs['qid'].values
    pid_col = pairs['pid'].values
    labels = pairs['relevancy'].values

    query_vecs = [q_dict[qid] for qid in qid_col]
    passage_vecs = [p_dict[pid] for pid in pid_col]
    features = np.hstack((query_vecs, passage_vecs))
    return np.hstack((qid_col[:, None], pid_col[:, None], labels[:, None], features))

### Get Train data Embeddings

In [None]:
# Load the training split from a tab-separated file
df_train = pd.read_csv('../input/irdmdata/train_data.tsv', sep='\t')
# Show the first rows as the cell output
df_train.head()

Get train query embeddings

In [None]:
# Embed every distinct query only once to avoid duplicated computation.
# Ordering by text length puts queries of similar length into the same
# batch, which is faster for ELMo inference.
df = (
    df_train[['qid', 'queries']]
    .drop_duplicates()
    .sort_values(by='queries', key=lambda col: col.str.len())
)
embeddings = list(embed(df['queries'].values))
# A qid -> embedding dictionary gives O(1) lookup of a query's embedding
q_dict = pd.Series(embeddings, index=df['qid'].values).to_dict()

Get sampled train passage embeddings

In [None]:
# Maximum number of non-relevant passages kept for each query
N_NEG_PER_QUERY = 100

# Randomly sample up to N_NEG_PER_QUERY non-relevant passages per query.
# Shuffling all non-relevant rows once and keeping the first rows of every
# qid group is a uniform sample without replacement within each query, and
# avoids building a boolean mask over the whole frame for every single qid.
# The shuffle draws from the global NumPy random state (random_state=None).
neg_df = df_train.loc[df_train['relevancy'] == 0, ['qid', 'pid', 'passage']]
neg_df = neg_df.sample(frac=1).groupby('qid', sort=False).head(N_NEG_PER_QUERY)

# Add all passages with rel > 0
pos_df = df_train.loc[df_train['relevancy'] > 0, ['pid', 'passage']]

# Get embeddings:
# Only unique passages, sorted by string length for faster inference
df = pd.concat([neg_df[['pid', 'passage']], pos_df]).drop_duplicates()
df = df.sort_values(by='passage', key=lambda x: x.str.len())
embeddings = list(embed(df['passage'].values))
# A pid -> embedding dictionary gives O(1) lookup of a passage's embedding
p_dict = pd.Series(embeddings, index=df['pid'].values).to_dict()

Save train data embeddings

In [None]:
# Join the training rows with their query and passage embeddings
data_train = data_from_embeds(df_train, q_dict, p_dict)
# The array is passed positionally, so NumPy stores it under the default key 'arr_0'
np.savez_compressed('data_train.npz', data_train)

Free memory

In [None]:
import gc
# Drop the references to the training objects before the validation data is processed
del df_train, q_dict, p_dict, data_train
# Run a garbage collection pass right away instead of waiting for an automatic one
gc.collect()

### Validation

In [None]:
# Load the validation split from a tab-separated file
df_val = pd.read_csv('../input/irdmdata/validation_data.tsv', sep='\t')
# Show the first rows as the cell output
df_val.head()

Get validation query embeddings

In [None]:
# Distinct validation queries, ordered by text length so that every batch
# holds queries of similar length
df = (
    df_val[['qid', 'queries']]
    .drop_duplicates()
    .sort_values(by='queries', key=lambda col: col.str.len())
)
embeddings = list(embed(df['queries'].values))
# qid -> embedding lookup
q_dict = pd.Series(embeddings, index=df['qid'].values).to_dict()

Get validation passage embeddings

In [None]:
# Distinct validation passages, ordered by text length so that every batch
# holds passages of similar length
df = (
    df_val[['pid', 'passage']]
    .drop_duplicates()
    .sort_values(by='passage', key=lambda col: col.str.len())
)
embeddings = list(embed(df['passage'].values))
# pid -> embedding lookup
p_dict = pd.Series(embeddings, index=df['pid'].values).to_dict()

Save the validation data as a compressed .npz file with one array per query - this makes it possible to load only a specific query's data into memory. This is very useful for calculating the final metrics while avoiding memory overflow problems.

In [None]:
data_val = data_from_embeds(df_val, q_dict, p_dict)
# Column 0 of data_val holds the qid of every row
qid_column = data_val[:, 0]
data_dict = {}
for qid in q_dict:
    qid_mask = qid_column == qid
    # Store the rows of this query without the qid column, keyed by the qid as a string
    data_dict[str(qid)] = data_val[qid_mask][:, 1:]
# One named array per query inside the archive
np.savez_compressed('data_val.npz', **data_dict)

Save a small validation set with all relevant passages and up to 200 randomly sampled non-relevant passages for each qid - useful for quick tests.

In [None]:
# Maximum number of non-relevant rows kept for each query
N_NEG_PER_QUERY_SMALL = 200

# data_dict is keyed by the qid as a string; recover the integer ids
qids = [int(key) for key in data_dict.keys()]
# Output layout: a qid column followed by the per-query columns of data_dict
n_cols = 1 + data_dict[str(qids[0])].shape[-1]
# Start from an empty float64 block so the result dtype and width are fixed up front
chunks = [np.empty((0, n_cols))]
for qid in qids:
    q_data = data_dict[str(qid)]
    # Column 1 of the per-query data is the relevancy label (column 0 is the pid).
    # Relevant rows are selected with `> 0`, matching how the training data is built.
    pos_ids, = np.where(q_data[:, 1] > 0)
    neg_ids, = np.where(q_data[:, 1] == 0)
    size = min(N_NEG_PER_QUERY_SMALL, len(neg_ids))
    neg_ids = np.random.choice(neg_ids, size=size, replace=False)
    ids = np.append(pos_ids, neg_ids)
    # Separate name for the qid column so the `qids` list being iterated is not rebound
    qid_col = np.full((ids.size, 1), qid, dtype=float)
    chunks.append(np.hstack((qid_col, q_data[ids])))
# Stack once at the end instead of re-copying the growing array per query
data = np.vstack(chunks)
np.savez_compressed('data_val_small.npz', data)