In [None]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from collections import defaultdict
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later removed the old names; pick whichever spelling this install provides.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)

from metrics import average_precision_score, norm_disc_cum_gain_score

In [None]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## Dense Net

In [None]:
class DenseScoreNet(nn.Module):
    """Pointwise MLP scorer mapping 512 input features to one sigmoid score.

    The stack is two ``Linear(512, 512) -> ReLU -> BatchNorm1d -> Dropout(0.2)``
    stages followed by ``Linear(512, 128) -> ReLU -> Linear(128, 1) -> Sigmoid``.
    """

    def __init__(self):
        super().__init__()
        layers = []
        # Two identical hidden stages, created in order so parameter names
        # (module.0, module.4, ...) match a hand-written Sequential.
        for _ in range(2):
            layers.extend([
                nn.Linear(512, 512),
                nn.ReLU(),
                nn.BatchNorm1d(512),
                nn.Dropout(0.2),
            ])
        # Scoring head: squeeze to 128 units, then to a single probability.
        layers.extend([
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ])
        self.module = nn.Sequential(*layers)

    def forward(self, x):
        """Return the sigmoid score for each row of ``x`` (last dim must be 512)."""
        scores = self.module(x)
        return scores


class Batcher:
    """Draws mini-batches with a fixed mix of positive and negative rows.

    Rows where ``y > 0`` count as positives and rows where ``y == 0`` as
    negatives. Each batch holds ``int(batch_size * ratio)`` positives and the
    remainder negatives, sampled with replacement and then shuffled.
    """

    def __init__(self, x, y, batch_size=16, ratio=0.5):
        self.x, self.y = x, y
        self.batch_size, self.ratio = batch_size, ratio

        # np.where on a 1-D condition returns a 1-tuple of index arrays.
        (self.pos_rows,) = np.where(y > 0)
        (self.neg_rows,) = np.where(y == 0)

        self.pos_samples = int(batch_size * ratio)
        self.neg_samples = batch_size - self.pos_samples

    def get_batch(self):
        """Return one ``(features, labels)`` pair of float32 tensors on ``device``.

        ``labels`` is reshaped to a column so it lines up with a
        single-output model.
        """
        picked_pos = np.random.choice(self.pos_rows, self.pos_samples)
        picked_neg = np.random.choice(self.neg_rows, self.neg_samples)
        order = np.append(picked_pos, picked_neg)
        # Shuffle so positives are not always at the front of the batch.
        np.random.shuffle(order)
        features = torch.tensor(self.x[order], dtype=torch.float32, device=device)
        labels = torch.tensor(
            self.y[order].reshape(-1, 1), dtype=torch.float32, device=device
        )
        return features, labels

    def get_batches(self, n):
        """Return a list of ``n`` independently sampled batches."""
        drawn = []
        for _ in range(n):
            drawn.append(self.get_batch())
        return drawn


def train(model, batches, optimizer, criterion, epochs=1):
    """Run ``epochs`` passes over a fixed list of batches.

    Args:
        model: network called as ``model(x_batch)``.
        batches: sized iterable of ``(x_batch, y_batch)`` pairs, reused every epoch.
        optimizer: optimizer passed through to ``take_opt_step``.
        criterion: loss callable passed through to ``take_opt_step``.
        epochs: number of passes over ``batches``.

    Returns:
        ``(losses, accuracies)``: per-epoch summed batch loss and per-epoch
        mean of per-batch accuracy (outputs rounded, compared with labels).
    """
    n_batches = len(batches)
    losses, accuracies = [], []
    for epoch in range(epochs):
        epoch_loss = 0
        accuracy_sum = 0
        for x_batch, y_batch in batches:
            loss, output = take_opt_step(model, x_batch, y_batch, optimizer, criterion)
            epoch_loss += loss
            # Fraction of rows in this batch whose rounded output equals the label.
            hits = (torch.round(output) == y_batch).sum().item()
            accuracy_sum += hits / y_batch.shape[0]
        acc = accuracy_sum / n_batches
        accuracies.append(acc)
        losses.append(epoch_loss)
        print(f'Epoch: {epoch+1}, loss: {epoch_loss}, acc: {acc}')
    return losses, accuracies

def take_opt_step(model, x, y, optimizer, criterion):
    """Perform one optimisation step on a single batch.

    Clears the model's gradients, runs the forward pass, backpropagates
    ``criterion(output, y)`` and calls ``optimizer.step()``.

    Returns:
        ``(loss_value, output)``: the loss as a Python float and the raw
        model output tensor from before the parameter update.
    """
    model.zero_grad()
    prediction = model(x)
    batch_loss = criterion(prediction, y)
    batch_loss.backward()
    optimizer.step()
    loss_value = batch_loss.item()
    return loss_value, prediction

def test_dense(model, file_name, data_path='../input/irdm-data/val_data.npz'):
    """Rank every validation query with ``model`` and report mean AP / nDCG.

    The ``.npz`` archive holds one array per query, keyed by the query id as
    a string. Each array is read as: column 0 = passage id, column 1 =
    relevance label, remaining columns = features fed to the model.

    Args:
        model: scoring module called as ``model(x)``. It is used as-is, so
            the caller should already have put it in eval mode.
        file_name: path of the ranking file to write (top 100 rows per query).
        data_path: location of the validation archive.

    Side effects:
        Writes ``file_name`` and prints the mean AP and mean nDCG.
    """
    # Build the inputs on whatever device the model lives on; a CPU tensor fed
    # to a CUDA model raises a device-mismatch error.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    ndcg, ap = [], []
    # The context manager closes the archive's file handle once all queries are read.
    with np.load(data_path) as test_data, open(file_name, 'w') as f:
        for key in test_data.files:
            qid = int(key)
            data = test_data[key]
            pids, rels = data[:, 0], data[:, 1]
            x = torch.tensor(data[:, 2:], dtype=torch.float32, device=model_device)
            with torch.no_grad():
                scores = model(x).cpu().numpy().reshape(-1)
            # Sort by descending score.
            idxs = np.argsort(-scores)
            pids, scores, rels = pids[idxs], scores[idxs], rels[idxs]
            ap.append(average_precision_score(rels))
            ndcg.append(norm_disc_cum_gain_score(rels, k=100))
            for rank, (pid, score) in enumerate(zip(pids[:100], scores[:100]), start=1):
                # The ids are sliced from the same numeric array as the features,
                # so they may be floats; write them as integer identifiers.
                f.write(f'{qid} A1 {int(pid)} {rank} {score} NN\n')

    print(f'Mean AP: {sum(ap) / len(ap)}')
    print(f'Mean nDCG: {sum(ndcg) / len(ndcg)}')

In [None]:
# Read the training matrix inside a context manager so the .npz file handle
# is closed as soon as the array has been loaded.
with np.load('../input/irdm-data/train_data.npz') as npz_train:
    data_train = npz_train['arr_0']
# Column 2 is used as the label and columns 3+ as the model features.
x, y = data_train[:,3:], data_train[:,2]
# Only the name is dropped here; x and y are slices of the same array.
del data_train
batches = Batcher(x, y).get_batches(20000)

In [None]:
# Fit the pointwise scorer with binary cross-entropy for 10 passes over the
# pre-sampled batches, then switch it to eval mode for ranking.
model = DenseScoreNet().to(device)
model.train()
losses, accuracies = train(
    model,
    batches,
    torch.optim.Adam(model.parameters(), lr=1e-3),
    nn.BCELoss(),
    epochs=10,
)
model.eval()
plt.plot(losses)

In [None]:
# Metrics recorded from an earlier run of this cell:
#   Mean AP:   0.030199232517960098
#   Mean nDCG: 0.08230779196166513
test_dense(model, file_name='NN_ds.txt')

## Dense Pairwise Net

In [None]:
class DensePairwiseNet(nn.Module):
    """Pairwise ranker: one shared scoring tower applied to both items.

    ``subnet`` maps 512 features to a single unbounded score; the forward
    pass returns ``sigmoid(score(x1) - score(x2))``.
    """

    def __init__(self):
        super().__init__()
        tower = []
        # Two identical hidden stages: Linear -> ReLU -> BatchNorm -> Dropout.
        for _ in range(2):
            tower.extend([
                nn.Linear(512, 512),
                nn.ReLU(),
                nn.BatchNorm1d(512),
                nn.Dropout(0.2),
            ])
        # Head ends in a raw score (no sigmoid) so scores can be subtracted.
        tower.extend([
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
        ])
        self.subnet = nn.Sequential(*tower)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x1, x2):
        """Return ``sigmoid(subnet(x1) - subnet(x2))`` for each row pair."""
        score_first = self.subnet(x1)
        score_second = self.subnet(x2)
        margin = score_first - score_second
        return self.sigmoid(margin)
    
    
class PairwiseBatcher:
    """Samples (relevant, non-relevant) item pairs for pairwise ranking.

    ``data_dict`` maps a query id to ``[non_relevant_items, relevant_items]``.
    Every sampled pair comes from one query and is randomly oriented: label 1
    when the relevant item is first, 0 when it is second.

    Args:
        data_dict: the query -> ``[nrel, rel]`` mapping. It must be fully
            populated before construction, because the list of usable
            queries is computed once here.
        is_text: when true the items are returned as plain Python lists
            (e.g. text pairs); otherwise they are stacked into float32 tensors.
        target_device: device for the returned tensors; ``None`` falls back
            to the notebook-level ``device``.
    """

    def __init__(self, data_dict, is_text=True, target_device=None):
        self.data = data_dict
        self.n_queries = len(data_dict)
        self.is_text = is_text
        self.target_device = target_device
        # Only queries with at least one item on each side can form a pair.
        # Filtering once guarantees full-size batches and avoids rebuilding
        # the key list on every draw.
        self.valid_qids = [
            qid for qid, (nrel, rel) in data_dict.items()
            if len(nrel) > 0 and len(rel) > 0
        ]

    def get_batch(self, batch_size=16):
        """Return ``(x1, x2, y)`` holding exactly ``batch_size`` pairs.

        Raises:
            ValueError: if no query has both relevant and non-relevant items.
        """
        if not self.valid_qids:
            raise ValueError('no query has both relevant and non-relevant items to pair')
        target = device if self.target_device is None else self.target_device

        picks = np.random.randint(len(self.valid_qids), size=batch_size)
        x1, x2, y = [], [], []
        for pick in picks:
            nrel, rel = self.data[self.valid_qids[pick]]
            rel_item = rel[np.random.randint(len(rel))]
            nrel_item = nrel[np.random.randint(len(nrel))]
            # Randomise the orientation so both label values occur.
            if np.random.random() < 0.5:
                x1.append(rel_item)
                x2.append(nrel_item)
                y.append(1)
            else:
                x1.append(nrel_item)
                x2.append(rel_item)
                y.append(0)

        labels = torch.tensor(y, dtype=torch.float32, device=target)
        if self.is_text:
            return x1, x2, labels
        # Stack into one ndarray first: building a tensor directly from a
        # list of ndarrays is very slow.
        return (torch.tensor(np.array(x1), dtype=torch.float32, device=target),
                torch.tensor(np.array(x2), dtype=torch.float32, device=target),
                labels)

    def get_batches(self, n_batches, batch_size=16):
        """Return a list of ``n_batches`` independently sampled batches."""
        return [self.get_batch(batch_size) for _ in range(n_batches)]
    
    
def train_pairwise(model, batches, epochs, lr):
    """Train a pairwise model with Adam and binary cross-entropy.

    Args:
        model: module called as ``model(x1, x2)``; its output is compared
            with the labels reshaped to a column.
        batches: iterable of ``(x1, x2, y)`` triples, reused every epoch.
        epochs: number of passes over ``batches``.
        lr: Adam learning rate.

    Returns:
        List with the mean batch loss of each epoch.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCELoss()

    def _step(x1, x2, y):
        # One gradient update on a single batch; returns the scalar loss.
        model.zero_grad()
        loss = criterion(model(x1, x2), y.reshape(-1, 1))
        loss.backward()
        optimizer.step()
        return loss.item()

    losses = []
    for epoch in range(1, epochs + 1):
        epoch_losses = [_step(x1, x2, y) for x1, x2, y in batches]
        losses.append(sum(epoch_losses) / len(epoch_losses))
        print(f'Epoch: {epoch}, Loss: {losses[-1]}')
    return losses

In [None]:
# Read the training matrix inside a context manager so the .npz file handle
# is closed as soon as the array has been loaded.
with np.load('../input/irdm-data/train_data.npz') as npz_train:
    data_train = npz_train['arr_0']

# Per query id (column 0): slot 0 collects the feature rows whose label
# (column 2) is 0, slot 1 collects all other rows.
data_dict = defaultdict(lambda:[[],[]])
for row in data_train:
    qid, rel, x = int(row[0]), int(row[2]), row[3:]
    data_dict[qid][0 if rel == 0 else 1].append(x)

batches = PairwiseBatcher(data_dict, is_text=False).get_batches(20000)

In [None]:
# Train the pairwise ranker on the pre-sampled pairs.
model = DensePairwiseNet().to(device)
model.train()
losses = train_pairwise(model, batches, epochs=10, lr=1e-3)
# Ranking scores one item at a time, so keep only the shared scoring tower
# (in eval mode) and drop the pair wrapper.
model.eval()
model = model.subnet
plt.plot(losses)

In [None]:
# Metrics recorded from an earlier run of this cell:
#   Mean AP:   0.033328058396676935
#   Mean nDCG: 0.08553910330324341
test_dense(model, file_name='NN_dp.txt')

## BERT Embeddings with Pairwise Loss


In [None]:
!pip install sentence_transformers

In [None]:
class BaseBertNet(nn.Module):
    """Scores (query, passage) text pairs from their sentence embeddings.

    ``embedder`` must expose ``encode(texts, convert_to_tensor=...,
    show_progress_bar=...)``. The query and passage embeddings are joined
    side by side (2 * 768 features) and passed through a small MLP that
    outputs one raw score per pair.
    """

    def __init__(self, embedder):
        super().__init__()
        self.embedder = embedder
        head = [
            nn.Linear(2*768, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
        ]
        self.module = nn.Sequential(*head)

    def forward(self, x):
        """Return one score per ``(query, passage)`` element of ``x``."""
        pairs = list(x)
        queries = [pair[0] for pair in pairs]
        passages = [pair[1] for pair in pairs]
        encode_opts = dict(convert_to_tensor=True, show_progress_bar=False)
        query_emb = self.embedder.encode(queries, **encode_opts)
        passage_emb = self.embedder.encode(passages, **encode_opts)
        # Join query and passage embeddings feature-wise for the MLP head.
        joined = torch.hstack((query_emb, passage_emb))
        return self.module(joined)

    
class BertPairwiseNet(nn.Module):
    """Pairwise wrapper around a shared ``BaseBertNet`` scorer.

    The forward pass returns ``sigmoid(score(x1) - score(x2))``, where both
    scores come from the same ``subnet``.
    """

    def __init__(self, embedder):
        super().__init__()
        self.subnet = BaseBertNet(embedder)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x1, x2):
        """Return the sigmoid of the score difference for each pair of inputs."""
        score_first = self.subnet(x1)
        score_second = self.subnet(x2)
        margin = score_first - score_second
        return self.sigmoid(margin)
    
    
def test_bert(model, file_name, has_predict=False,
              data_path='../input/irdmdata/validation_data.tsv'):
    """Rank every validation query with a text model and report mean AP / nDCG.

    Args:
        model: either a module called as ``model(pairs)`` that returns a
            tensor of scores, or (with ``has_predict=True``) an object with
            ``predict(pairs, show_progress_bar=False)``.
        file_name: path of the ranking file to write (top 100 rows per query).
        has_predict: selects ``model.predict`` instead of a direct call.
        data_path: tab-separated validation file with the columns ``qid``,
            ``pid``, ``queries``, ``passage`` and ``relevancy``.

    Side effects:
        Writes ``file_name`` and prints the mean AP and mean nDCG.
    """
    val_df = pd.read_table(data_path, sep='\t')

    ndcg, ap = [], []
    with open(file_name, 'w') as f:
        # A single groupby pass replaces one full-frame boolean mask per query;
        # sort=False keeps queries in order of first appearance.
        for qid, q_df in val_df.groupby('qid', sort=False):
            pairs = list(zip(q_df['queries'], q_df['passage']))
            if has_predict:
                scores = model.predict(pairs, show_progress_bar=False)
            else:
                with torch.no_grad():
                    scores = model(pairs).cpu().numpy().reshape(-1)
            # Sort by descending score.
            idxs = np.argsort(-scores)
            scores = scores[idxs]
            pids = q_df.pid.values[idxs]
            rels = q_df.relevancy.values[idxs]
            ap.append(average_precision_score(rels))
            ndcg.append(norm_disc_cum_gain_score(rels, k=100))
            for rank, (pid, score) in enumerate(zip(pids[:100], scores[:100]), start=1):
                f.write(f'{qid} A1 {pid} {rank} {score} NN\n')

    print(f'Mean AP: {sum(ap) / len(ap)}')
    print(f'Mean nDCG: {sum(ndcg) / len(ndcg)}')

### Fine-tuning BERT embeddings

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

def get_bert_dataset():
    """Build a shuffled DataLoader of query/passage ``InputExample`` objects.

    Keeps every row with ``relevancy == 1`` plus a uniform random sample of
    rows whose size equals the number of relevant rows. The sample is drawn
    from all rows, so it may overlap with the relevant ones.

    Returns:
        ``DataLoader`` with batch size 16 and ``shuffle=True``. Because the
        loader shuffles, the examples are built in frame order rather than
        grouped per query.
    """
    train_df = pd.read_table('../input/irdmdata/train_data.tsv', sep='\t')
    # Take a private, writable copy: the mask is modified in place below and
    # arrays exposed by pandas can be read-only under Copy-on-Write.
    mask = (train_df['relevancy'] == 1).to_numpy(copy=True)
    idxs = np.random.choice(np.arange(mask.size), size=mask.sum(), replace=False)
    mask[idxs] = True
    train_df = train_df[mask]

    # One pass over the columns replaces a full-frame filter per query.
    # Labels are cast to float so they do not depend on how the column was parsed.
    dataset = [
        InputExample(texts=[query, passage], label=float(relevancy))
        for query, passage, relevancy in zip(
            train_df['queries'], train_df['passage'], train_df['relevancy']
        )
    ]

    return DataLoader(dataset, shuffle=True, batch_size=16)

In [None]:
# Load the 'msmarco-distilbert-base-v3' sentence embedder and fine-tune it
# for one epoch with a cosine-similarity objective on the sampled pairs.
embedder = SentenceTransformer('msmarco-distilbert-base-v3', device=device)
dataset = get_bert_dataset()
loss = losses.CosineSimilarityLoss(model=embedder)
embedder.fit(train_objectives=[(dataset, loss)], epochs=1)

### Training the ranking model 

In [None]:
train_df = pd.read_table('../input/irdmdata/train_data.tsv', sep='\t')
# Keep every relevant row plus a random tenth of all rows. The mask is a
# private, writable copy because it is modified in place and arrays exposed
# by pandas can be read-only under Copy-on-Write.
mask = (train_df['relevancy']==1).to_numpy(copy=True)
idxs = np.random.choice(np.arange(mask.size), size=mask.size//10, replace=False)
mask[idxs] = True
train_df = train_df[mask]

# Per query id: slot 0 collects (query, passage) pairs with relevancy 0,
# slot 1 collects all other pairs. Zipping the columns avoids building one
# Series per row as iterrows does.
data_dict = defaultdict(lambda:[[],[]])
for qid, rel, q, p in zip(train_df['qid'], train_df['relevancy'],
                          train_df['queries'], train_df['passage']):
    data_dict[qid][0 if rel == 0 else 1].append((q, p))

del train_df

In [None]:
# Pre-sample 10000 text-pair batches (default batch size) for the BERT ranker.
batches = PairwiseBatcher(data_dict, is_text=True).get_batches(n_batches=10000)

In [None]:
model = BertPairwiseNet(embedder).train().to(device)
# Use a dedicated name for the loss curve: `losses` is also the
# sentence_transformers module imported for the fine-tuning cell, and
# rebinding it here would break re-running that cell.
pairwise_losses = train_pairwise(model, batches, epochs=8, lr=1e-3)
# Ranking scores one (query, passage) list at a time, so keep only the
# shared scorer in eval mode.
model = model.eval().subnet
plt.plot(pairwise_losses)

In [None]:
# Metrics recorded from an earlier run of this cell:
#   Mean AP:   0.22555985292329858
#   Mean nDCG: 0.3447011450899185
test_bert(model, file_name='NN_b.txt')

## BERT Cross Encoder

### Fine-tune 

In [None]:
from sentence_transformers import CrossEncoder

# Load the 'cross-encoder/ms-marco-TinyBERT-L-2' checkpoint and fine-tune it
# for one epoch on the sampled query/passage examples.
model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2', device=device)
dataset = get_bert_dataset()
model.fit(train_dataloader=dataset, epochs=1)

In [None]:
# Metrics recorded from an earlier run of this cell:
#   Mean AP:   0.3683061733102693
#   Mean nDCG: 0.4914604699183384
test_bert(model, file_name='NN.txt', has_predict=True)