In [1]:
import time
import torch
import string
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from tqdm import tqdm
from trectools import TrecTopics, TrecQrel
from vectors import MultiCCA, VectorVocabField
from torchtext.data import Field, Example, TabularDataset, BucketIterator

In [3]:
# NOTE(review): presumably disables tqdm's background monitor thread
# (an interval of 0) — confirm against the tqdm documentation.
tqdm.monitor_interval = 0 

In [4]:
# Load the document collection and expose its identifier under the name
# `filename`, which is the key the relevance judgements are joined on.
docs = pd.read_csv('docs.csv')
docs['filename'] = docs['id']
# `axis` may no longer be passed positionally in pandas >= 2.0; naming the
# column through `columns=` works on old and new versions alike.
docs = docs.drop(columns='id')

FileNotFoundError: File b'docs.csv' does not exist

In [None]:
# Parse the CLEF topic file. Per the keyword arguments, each topic lives in a
# <top> element, its number in <num>, and the query text is read from the
# <fr-title> element.
topics = TrecTopics.from_file(
    '../material/CLEF/TOPICS00-04/f00-03', 
    topic_tag='top', numberid_tag='num', number_attr=False, querytext_tag='fr-title'
)

In [None]:
# Turn the parsed topics into a two-column frame: the numeric query id
# (topic key with its first character stripped) and the query text.
topic_items = list(topics.topics.items())
ids = [int(topic_key[1:]) for topic_key, _ in topic_items]
query_texts = [topic_text for _, topic_text in topic_items]

queries = pd.DataFrame({'query': ids, 'qtext': query_texts})

In [None]:
# Load the relevance judgements for the French collection; the parsed table is
# accessed later through `qrels.qrels_data`.
qrels = TrecQrel('../material/CLEF/QRELS00-04/qrels_french')

In [None]:
# Attach document columns (by filename) and query text (by query id) to every
# judgement, then keep only queries numbered 200 or lower.
data = (
    qrels.qrels_data
    .merge(docs, on=['filename'], how='left')
    .merge(queries, on='query', how='left')
    .loc[lambda frame: frame['query'] <= 200]
)

In [None]:
# Drop every row that has a missing value in any column. Because the joins
# above are left joins, judgements without a matching document or query end up
# with missing values and are removed here.
data = data.dropna()

In [None]:
#sampled = np.random.choice(data[data.rel == 0].index.values, 4000)

In [None]:
#data = data[data.index.isin(sampled)].append(data[data.rel == 1])

In [None]:
# Split by query id so that no query appears in both sets:
# ids up to and including 150 are used for training, the rest for testing.
train = data.loc[data['query'] <= 150]
test = data.loc[data['query'] > 150]

In [None]:
# Persist both splits to the working directory. `to_csv` is called with its
# defaults, so the DataFrame index is written as the first column; the
# positional field list used when reading these files back skips it as 'ignore'.
train.to_csv('train.csv')
test.to_csv('test.csv')

In [None]:
# Display the column names of the training frame (their order determines the
# positional field list used when the CSV is read back).
train.columns

# Numericalize the Documents

In [None]:
# Instantiate the project-local MultiCCA vectors; they are handed to
# `build_vocab` further down.
# NOTE(review): presumably loads pretrained multilingual embeddings — confirm in `vectors.py`.
vectors = MultiCCA()

In [None]:
# Translation table that deletes every ASCII punctuation character.
translator = str.maketrans('', '', string.punctuation)

def preprocess(l):
    """
    Strip punctuation from every token and tag the survivors as French.

    Parameters
    ----------
    l : iterable of str
        The tokens of a single example.

    Returns
    -------
    list of str
        The cleaned tokens, each prefixed with ``'fr:'``. Tokens that consist
        solely of punctuation are dropped.
    """
    # Filter *before* adding the prefix: once 'fr:' has been prepended no
    # string is ever empty, so filtering afterwards would let bare 'fr:'
    # tokens through.
    stripped = (s.translate(translator) for s in l)
    return ['fr:' + s for s in stripped if s]

def sort_key(ex):
    """
    Return the length of an example's ``text`` attribute.

    Exists because `split` hands back a plain Dataset, which on its own does
    not sort examples by their text.
    """
    tokens = ex.text
    return len(tokens)

# Text fields for document titles and queries: both lowercase their input and
# run `preprocess` on the tokens.
title_field = VectorVocabField(lower=True, preprocessing=preprocess)
query_field = VectorVocabField(lower=True, preprocessing=preprocess)
# Non-sequential label field without an unknown token.
label_field = Field(sequential=False, unk_token=None)


In [None]:
# Read the CSV splits back as torchtext datasets. Fields are matched to the
# CSV columns by position; columns mapped to None are not loaded.
# Note that this rebinds `train` and `test`, which previously held DataFrames.
train, test = TabularDataset.splits(
    path='./',
    train='train.csv',
    test='test.csv',
    format='csv',
    fields=[
        ('ignore', None),
        ('queryid', None),
        ('q0', None),
        ('filename', None),
        ('label', label_field),
        ('text', None),
        ('title', title_field),
        ('query', query_field),
    ],
    # Keep only rows whose label is the string '0' or '1'.
    filter_pred=lambda ex: ex.label == '0' or ex.label == '1',
)

In [None]:
# Build the vocabularies from the training split only; the two text fields
# additionally receive the pretrained vectors.
title_field.build_vocab(train, vectors=vectors)
query_field.build_vocab(train, vectors=vectors)
label_field.build_vocab(train)

In [None]:
# Batch iterators over both splits: batch size 128 for training and 2096 for
# testing, with the query length as sort key and no repetition of the data.
train_iter, test_iter = BucketIterator.splits(
    datasets=(train,test), batch_sizes=(128, 2096), sort_key=lambda x: len(x.query), repeat=False
)

In [None]:
class SiameseDAN(nn.Module):
    """
    Siamese averaging network that emits one value per (document, query) pair.

    Both inputs pass through the same encoder: embedding lookup, mean over
    dimension 1, linear layer, sigmoid, batch norm. The two encodings are
    combined through their element-wise product and sum, and a final linear
    layer followed by batch norm maps the concatenation to a single column.
    No log-softmax is applied to the result.

    Parameters
    ----------
    vocab_size, embedding_dim : int
        Shape of the embedding table.
    pretrained_embeddings : Tensor
        Values copied into the embedding table.
    num_filters : int
        Width of the hidden encoding.
    window_sizes, num_classes
        Accepted but not used by this implementation.
    mode : str
        The embedding weights are trainable only when this equals 'nonstatic'.
    """

    def __init__(self, vocab_size, embedding_dim, pretrained_embeddings,
                 num_filters=100, window_sizes=(3, 4, 5), mode='static', num_classes=2):
        super().__init__()

        # Shared embedding table, initialised from the pretrained values.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight.data.copy_(pretrained_embeddings)
        trainable = mode == 'nonstatic'
        self.embedding.weight.requires_grad = trainable

        # Encoder shared by documents and queries.
        self.hidden = nn.Linear(embedding_dim, num_filters)
        self.norm_hidden = nn.BatchNorm1d(num_filters)

        # Scoring head over the concatenated [product, sum] features.
        self.out = nn.Linear(2 * num_filters, 1)
        self.norm_out = nn.BatchNorm1d(1)

    def forward_one(self, obj):
        """Encode one batch of token indices into `num_filters` features."""
        averaged = self.embedding(obj).mean(dim=1)
        activated = F.sigmoid(self.hidden(averaged))
        return self.norm_hidden(activated)

    def forward(self, d, q):
        """Return the normalised score column for documents `d` and queries `q`."""
        doc_enc = self.forward_one(d)
        query_enc = self.forward_one(q)

        interaction = torch.cat((doc_enc * query_enc, doc_enc + query_enc), 1)
        return self.norm_out(self.out(interaction))

In [None]:
class Ranker(nn.Module):
    """
    Pairwise ranker: scores two candidate documents against the same query.

    Parameters
    ----------
    model : nn.Module, optional
        Scorer called as ``model(document, query)`` and expected to return one
        score column per example. When omitted, a ``SiameseDAN`` is built from
        ``model_kwargs``.
    **model_kwargs
        Keyword arguments forwarded to ``SiameseDAN`` when `model` is omitted.
    """
    def __init__(self, model=None, **model_kwargs):
        # nn.Module must be initialised before a sub-module can be assigned.
        super(Ranker, self).__init__()
        if model is None:
            model = SiameseDAN(**model_kwargs)
        self.model = model

    def forward(self, query, d1, d2):
        """
        Return log-probabilities over the two candidates.

        Column 0 belongs to `d1` and column 1 to `d2`; assuming the scorer
        yields a (batch, 1) tensor, the result is (batch, 2).
        """
        # The scorer takes the document first, then the query.
        s1 = self.model(d1, query)
        s2 = self.model(d2, query)

        # log_softmax needs a tensor, so join the two score columns first.
        return F.log_softmax(torch.cat((s1, s2), dim=1), dim=1)

In [None]:
# Size the embedding table after the title vocabulary's pretrained vectors.
vocab_size, embeddings_dim = title_field.vocab.vectors.shape

# The classifier keeps its embeddings frozen (mode='static') and uses a hidden
# width of 20.
# NOTE(review): the single embedding table is initialised from title_field's
# vocabulary, yet query batches are indexed through query_field's vocabulary —
# confirm that both fields end up with identical vocabularies.
clf = SiameseDAN(vocab_size, embeddings_dim, title_field.vocab.vectors, num_filters=20, mode='static')

In [None]:
def run_epoch(model, loss, iterable, training=True, optimizer=None, max_grad_norm=.25):
    """
    Run one pass over `iterable`, optionally updating the model.

    The function does not switch the model between train and eval mode; the
    caller is expected to call ``model.train()`` / ``model.eval()`` beforehand.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(d, q)`` with the transposed title and query batches.
    loss : callable
        Criterion invoked as ``loss(out, y)``.
        NOTE(review): predictions are taken with an argmax over dimension 1,
        so the model is presumably expected to emit one column per class —
        confirm against the model in use.
    iterable : sized iterable
        Yields batches exposing ``title``, ``query`` and ``label``.
    training : bool
        When true, gradients are computed, clipped and applied.
    optimizer : torch.optim.Optimizer, optional
        Optimizer stepped during training. Falls back to the notebook-level
        `opt` when omitted.
    max_grad_norm : float
        Maximum gradient norm used for clipping.

    Returns
    -------
    tuple
        ``(mean batch accuracy, mean batch loss, elapsed seconds)``.
    """
    if training and optimizer is None:
        # Backward-compatible default: the optimizer defined at notebook level.
        optimizer = opt

    batch_accs, batch_losses = [], []
    epoch_start = time.time()
    for batch in tqdm(iterable, total=len(iterable)):
        # The batches are transposed before being fed to the model.
        d, q, y = batch.title.t(), batch.query.t(), batch.label

        if training:
            model.zero_grad()

        # Skip building the autograd graph when we only evaluate.
        with torch.set_grad_enabled(training):
            out = model(d, q)
            batch_loss = loss(out, y)

        _, preds = torch.max(out, 1)
        accuracy = torch.mean(torch.eq(preds, y).float())

        if training:
            batch_loss.backward()
            # In-place variant; the non-underscore name is deprecated.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()

        # `.item()` extracts the Python number from a 0-dim tensor; indexing
        # such a tensor with [0] raises in current PyTorch.
        batch_accs.append(accuracy.item())
        batch_losses.append(batch_loss.item())

    epoch_end = time.time()
    return np.mean(batch_accs), np.mean(batch_losses), epoch_end - epoch_start

In [None]:
# Optimise only the parameters that require gradients (frozen embeddings are
# filtered out).
opt = optim.Adam(filter(lambda p: p.requires_grad, clf.parameters()), lr=3e-3)
# NOTE(review): nn.NLLLoss expects per-class log-probabilities — confirm that
# the model's output matches.
loss = nn.NLLLoss()

# Baseline accuracies before any training. The model has to be in eval mode
# here: a freshly built module is in train mode, in which BatchNorm normalises
# with per-batch statistics and updates its running averages even though no
# optimisation step is taken.
clf.eval()
init_acc, _, _ = run_epoch(clf, loss, train_iter, training=False)
best_acc, _, _ = run_epoch(clf, loss, test_iter, training=False)

# Histories start with the pre-training measurement (loss recorded as 0.).
trn_losses, trn_accs = [0.], [init_acc]
val_losses, val_accs = [0.], [best_acc]

print(best_acc)

for epoch in range(10):
    clf.train()
    trn_acc, trn_loss, trn_time = run_epoch(clf, loss, train_iter, training=True)
    trn_losses.append(trn_loss)
    trn_accs.append(trn_acc)

    clf.eval()
    val_acc, val_loss, val_time = run_epoch(clf, loss, test_iter, training=False)
    val_losses.append(val_loss)
    val_accs.append(val_acc)

    # Track the best validation accuracy seen so far.
    best_acc = max(best_acc, val_acc)

    print(best_acc)

In [None]:
# Non-sequential fields holding the query and document identifiers; their
# vocabularies map each identifier to an integer index.
qid_field = Field(sequential=False)
did_field = Field(sequential=False)

In [None]:
# Load the topics to rank for: the first CSV column is the query id, the second
# the query text (processed by the already-built `query_field`).
query_dataset = TabularDataset(
    path='topics.csv', format='csv',
    fields=[('qid', qid_field),('query', query_field)]
)

# Index the query ids found in that file.
qid_field.build_vocab(query_dataset)

In [None]:
# Load the full document collection; only the document id and the title are
# kept (the first and third CSV columns are skipped).
docs_dataset = TabularDataset(
    path='docs_no_header.csv', format='csv',
    fields=[('ix', None), ('did', did_field), ('text', None), ('title', title_field)]
)

# Index the document ids so they can be carried through the model as integers.
did_field.build_vocab(docs_dataset)

# Iterate over the collection in batches of 2048, using title length as sort key.
# NOTE(review): device=-1 presumably selects the CPU in this torchtext version — confirm.
docs_iter = BucketIterator(docs_dataset, batch_size=2048, device=-1, sort_key=lambda x: len(x.title), repeat=False)

In [None]:
# Score every document against every query and keep the best-scoring ones.
outputs = {}

# Inference only: use BatchNorm's running statistics and skip autograd.
clf.eval()
with torch.no_grad():
    for example in tqdm(query_dataset.examples):
        query = query_field.numericalize([example.query], device=-1)

        # Collect per-batch results and concatenate once, instead of growing
        # one tensor with a torch.cat per batch.
        chunks = []
        for batch in docs_iter:
            nd = batch.title.t()
            # Tile the single query so that it lines up with every document.
            nq = query.t().repeat(nd.shape[0], 1)

            # The model takes the documents first, then the queries.
            labels = clf(nd, nq).data
            ds = batch.did.data

            # One row per document: model output column(s), then the doc index.
            # NOTE(review): the index is stored as a float; confirm the
            # collection is small enough for float32 to represent it exactly.
            chunks.append(torch.cat((labels, ds.view(-1, 1).float()), dim=1))

        results = torch.cat(chunks, dim=0)

        # The relevance score is the last model output column, i.e. the
        # second-to-last column overall; the last column is the doc index.
        # Addressing both from the end works for a one-column scorer as well
        # as for a two-class output.
        k = min(1000, results.shape[0])
        _, dims = torch.topk(results[:, -2], k=k, largest=True)

        # Keep (score, doc index) for the top-k documents, best first.
        outputs[example.qid] = results[dims][:, -2:]
        del results

In [None]:
# Write the ranked lists to disk, one line per (query, document) pair; the
# rank column is written as a constant 0 and the run tag is 'PSE'.
with open('test.results', 'w') as fp:
    for qid, ranked in outputs.items():
        for entry in ranked:
            score, vocab_index = entry[0], int(entry[1])
            # Map the stored vocabulary index back to the document id.
            docid = did_field.vocab.itos[vocab_index]
            fp.write(f'{qid} Q0 {docid} 0 {score} PSE\n')

In [None]:
# Spot check: map the stored index of the first result for query '1' back to
# its document id.
did_field.vocab.itos[int(outputs['1'][0, 1])]

# Generate Sampled Data

In [None]:
import numpy as np

# Fix NumPy's global random state so the shuffling and sampling below are repeatable.
np.random.seed(10)

def paired_shuffle(*args):
    """
    Shuffle several sequences with one shared random order.

    Elements at the same position stay together. Because the sequences are
    combined with `zip`, anything beyond the shortest sequence is dropped.
    Uses NumPy's global random state.

    Returns
    -------
    list of np.ndarray
        One array per input sequence, in the shuffled order.
    """
    rows = [row for row in zip(*args)]
    np.random.shuffle(rows)
    columns = zip(*rows)
    return [np.array(column) for column in columns]

def sample_docs(qrels, topics, n_irr=4):
    """
    Build shuffled groups of one relevant plus `n_irr` irrelevant documents.

    For every relevant judgement, `n_irr` irrelevant documents of the same
    query are drawn (with replacement, as `np.random.choice` does by default)
    and shuffled together with the relevant one.

    Parameters
    ----------
    qrels : object
        Must expose a ``qrels_data`` DataFrame with the columns ``query``,
        ``filename`` and ``rel``.
    topics
        Not used; kept so existing calls keep working.
    n_irr : int
        Number of irrelevant documents drawn per relevant one.

    Returns
    -------
    (np.ndarray, np.ndarray)
        The document names and the matching 0/1 labels, one row per relevant
        judgement. Relevant documents whose query has no irrelevant judgement
        are skipped, since nothing could be sampled for them.
    """
    judgements = qrels.qrels_data
    rel = judgements[judgements.rel == 1.0]
    irr = judgements[judgements.rel == 0.0]

    # Group the negatives once instead of filtering the whole frame again for
    # every relevant document.
    negatives = {
        query: group.filename.values
        for query, group in irr.groupby('query', sort=False)
    }
    empty_pool = irr.filename.values[:0]

    samples, gold = [], []
    for doc in rel.itertuples():
        # get the possible negative documents
        pool = negatives.get(doc.query, empty_pool)
        if len(pool) == 0 and n_irr > 0:
            # np.random.choice raises on an empty pool, so skip this document.
            continue
        # randomly sample from the possible negative documents
        data = np.random.choice(pool, n_irr)
        data = [doc.filename] + list(data)
        labels = np.array([1] + [0] * n_irr)
        # shuffle the data and labels
        data, labels = paired_shuffle(data, labels)
        samples.append(data)
        gold.append(labels)
    return np.array(samples), np.array(gold)

# Draw one irrelevant document per relevant one.
# NOTE: this rebinds `docs`, which earlier in the notebook held the documents DataFrame.
docs, labels = sample_docs(qrels, topics, 1)

In [None]:
# Display the shape of the sampled label array.
labels.shape