In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later dropped the old aliases, so use whichever name this install provides.
plt.style.use('seaborn-whitegrid' if 'seaborn-whitegrid' in plt.style.available
              else 'seaborn-v0_8-whitegrid')

from metrics import average_precision_score, norm_disc_cum_gain_score

In [None]:
class LogisticRegression:
    '''Binary logistic regression trained with plain gradient descent.

    Attributes:
        w: weight vector with one entry per input feature; ``None`` until
           ``initialize``, ``fit`` or ``load`` has been called.
        b: scalar bias; ``None`` until initialised.
    '''

    def __init__(self):
        self.w = None
        self.b = None

    def fit(self, x, y, lr=0.1, tol=1e-4, max_iter=None):
        '''Full-batch gradient descent until the loss change drops to ``tol``.

        Args:
            x: 2-D array, one row per sample.
            y: 1-D array of targets aligned with the rows of ``x``.
            lr: learning rate.
            tol: stop once ``|loss - previous loss| <= tol``.
            max_iter: optional cap on the number of update steps. ``None``
                (default) keeps the original behaviour of iterating until the
                tolerance criterion is met.
        '''
        # Initialize params (kept if the model was already initialised/loaded)
        if self.w is None:
            self.initialize(x.shape[1])

        # Optimize
        prev_loss = np.inf
        h = self.predict(x)
        loss = self.loss(y, h)
        n_iter = 0
        while np.abs(loss - prev_loss) > tol:
            if max_iter is not None and n_iter >= max_iter:
                break
            n_iter += 1
            prev_loss = loss
            self.optim_step(lr, x, h, y)
            h = self.predict(x)
            loss = self.loss(y, h)

    def predict(self, x):
        '''Return sigmoid(x @ w + b) for every row of ``x``.'''
        return self.sigmoid(x@self.w + self.b)

    def initialize(self, in_features):
        '''Draw weights uniformly from [0, 0.1) and reset the bias to zero.'''
        self.w = 0.1 * np.random.rand(in_features)
        self.b = 0.0

    def optim_step(self, lr, x, h, y):
        '''Apply one gradient-descent update (modifies ``w`` in place).

        Args:
            lr: learning rate.
            x: 2-D batch of inputs.
            h: predictions for ``x`` made with the current parameters.
            y: targets for ``x``.
        '''
        err = h - y
        # err broadcasts over the feature rows of x.T; the mean over axis 1
        # averages the per-sample gradient contributions.
        self.w -= lr * np.mean(err*x.T, axis=1)
        self.b -= lr * np.mean(err)

    @staticmethod
    def sigmoid(x):
        '''Stable sigmoid implementation.

        Uses 1/(1+exp(-x)) for x >= 0 and exp(x)/(1+exp(x)) for x < 0 so that
        ``exp`` is only ever evaluated on non-positive arguments.
        '''
        x = np.asarray(x)
        # Integer/bool input would make zeros_like() allocate an integer
        # buffer and truncate every result to 0 or 1; promote it to float.
        if not np.issubdtype(x.dtype, np.floating):
            x = x.astype(float)

        result = np.zeros_like(x)

        pos_mask = x >= 0
        result[pos_mask] = 1 / (1+np.exp(-x[pos_mask]))

        neg_mask = ~pos_mask
        exp_x = np.exp(x[neg_mask])
        result[neg_mask] = exp_x / (1+exp_x)

        return result

    @staticmethod
    def loss(y, h):
        '''Mean binary cross-entropy between targets ``y`` and predictions ``h``.'''
        # eps is far below float32 resolution, so do the arithmetic in float64
        # where adding it actually keeps log() away from zero.
        h = np.asarray(h, dtype=np.float64)
        eps = 1e-100  # add small constant in logs to prevent numerical issues
        return -np.mean(y*np.log(h+eps) + (1-y)*np.log(1-h+eps))

    def save(self, file_name):
        '''Write ``w`` and ``b`` to a compressed .npz archive.

        NumPy appends ``.npz`` to ``file_name`` when it is a path that does
        not already end with it.
        '''
        params = {'w':self.w, 'b':self.b}
        np.savez_compressed(file_name, **params)

    def load(self, file_name):
        '''Restore ``w`` and ``b`` from an archive written by ``save``.

        Accepts the exact name passed to ``save`` even when NumPy appended
        the ``.npz`` suffix on disk.
        '''
        try:
            archive = np.load(file_name)
        except FileNotFoundError:
            # save('model') produces 'model.npz'; retry with the suffix.
            if str(file_name).endswith('.npz'):
                raise
            archive = np.load(f'{file_name}.npz')
        # Close the underlying zip file once both arrays have been read.
        with archive as params:
            self.w = params['w']
            # Stored as a 0-d array; keep the bias a plain scalar as in
            # initialize().
            self.b = float(params['b'])

In [None]:
class BatchGenerator:
    '''Draws mini-batches with a fixed positive/negative mix for doc ranking.

    Rows with ``y > 0`` count as positives and rows with ``y == 0`` as
    negatives. Every batch holds ``int(batch_size * ratio)`` positives and
    the remainder negatives, each drawn with ``np.random.choice``.
    '''

    def __init__(self, x, y, batch_size=64, ratio=0.5, features_processor=lambda x: x):
        # Source arrays and the transform applied to each sampled feature batch
        self.x, self.y = x, y
        self.features_processor = features_processor

        # Row indices of each class
        (self.pos_rows,) = np.nonzero(y > 0)
        (self.neg_rows,) = np.nonzero(y == 0)

        # Batch composition; ratio is the frequency of positive samples
        self.batch_size, self.ratio = batch_size, ratio
        self.pos_samples = int(batch_size * ratio)
        self.neg_samples = batch_size - self.pos_samples

    def get_batch(self):
        '''Return one shuffled ``(features, labels)`` batch.'''
        drawn_pos = np.random.choice(self.pos_rows, self.pos_samples)
        drawn_neg = np.random.choice(self.neg_rows, self.neg_samples)
        picked = np.concatenate((drawn_pos, drawn_neg))
        np.random.shuffle(picked)
        features = self.features_processor(self.x[picked])
        labels = self.y[picked]
        return features, labels

    def get_batches(self, n):
        '''Return a list of ``n`` independently sampled batches.'''
        collected = []
        for _ in range(n):
            collected.append(self.get_batch())
        return collected

In [None]:
def test(model, data, feature_processor=lambda x: x):
    '''Helper function used for model evaluation.

    Args:
        model: object exposing ``predict(x)`` that returns one score per row.
        data: mapping from query id to a 2-D array whose column 0 holds
            passage ids, column 1 relevance labels and columns 2+ features.
        feature_processor: transform applied to the feature columns before
            they are passed to ``model.predict``.

    Returns:
        ``(mean_ap, mean_ndcg)`` averaged over all queries in ``data``, with
        nDCG computed at a cut-off of 100.

    Raises:
        ZeroDivisionError: if ``data`` contains no queries.
    '''
    ap, ndcg = [], []
    # Iterate over the keys of the mapping that was passed in, rather than
    # re-reading a notebook-level global.
    for qid in data.keys():
        rows = data[qid]
        rels, x = rows[:, 1], rows[:, 2:]
        scores = model.predict(feature_processor(x))
        # Relevance labels ordered by descending score
        ranked_rels = rels[np.argsort(-scores)]
        ap.append(average_precision_score(ranked_rels))
        ndcg.append(norm_disc_cum_gain_score(ranked_rels, k=100))
    mean_ap = sum(ap) / len(ap)
    mean_ndcg = sum(ndcg) / len(ndcg)
    return mean_ap, mean_ndcg

In [None]:
# Read the training matrix and close the .npz archive as soon as the array
# has been loaded into memory.
with np.load('./data/data_train.npz') as train_archive:
    train_data = train_archive['arr_0']

In [None]:
# Column 2 is taken as the target and columns 3+ as the features
# (columns 0-2 are skipped; 0-1 are presumably query/passage ids — TODO confirm).
y_train = train_data[:, 2]
x_train = train_data[:, 3:]
# Both slices are views onto train_data's buffer, so this only drops the name;
# the underlying memory stays allocated while the views are alive.
del train_data
x_train.shape, y_train.shape

## Learning Rate Study

In [None]:
epochs = 100
n_batches = 5000
generator = BatchGenerator(x_train, y_train, batch_size=64, ratio=0.5)
# Sample the batches once so every learning rate sees exactly the same data
batches = generator.get_batches(n_batches)

losses_dict = {}  # learning rate -> list of per-epoch mean training losses
for lr in [0.1, 0.01, 0.001]:
    model = LogisticRegression()
    model.initialize(in_features=x_train.shape[-1])
    losses = []
    for epoch in range(1, epochs+1):
        loss = 0
        for x_batch, y_batch in batches:
            preds = model.predict(x_batch)
            model.optim_step(lr, x_batch, preds, y_batch)
            # preds were computed before the update, so this is the
            # pre-step loss on the batch
            loss += model.loss(y_batch, preds)
        losses.append(loss / n_batches)
    losses_dict[lr] = losses

    plt.plot(range(1,len(losses)+1), losses, label=f'lr: {lr}')
plt.xlabel('epoch')
plt.ylabel('training loss')
plt.legend()
plt.savefig('lr_study.png', dpi=300)

## Features

model1 - features: concatenated query and passage embeddings.

In [None]:
# Training configuration
epochs, n_batches = 100, 5000
generator = BatchGenerator(x_train, y_train, batch_size=64, ratio=0.5)
lr_schedule = lambda epoch: 0.01 * 0.98**(epoch-1)  # starts at 0.01, x0.98 per epoch

# Model on the raw feature columns
model1 = LogisticRegression()
model1.initialize(in_features=x_train.shape[-1])

# Fixed pool of batches, replayed in the same order every epoch
losses = []
batches = generator.get_batches(n_batches)
for epoch in range(1, epochs+1):
    lr = lr_schedule(epoch)
    batch_losses = []
    for x_batch, y_batch in batches:
        preds = model1.predict(x_batch)
        model1.optim_step(lr, x_batch, preds, y_batch)
        # loss of the predictions made before this update
        batch_losses.append(model1.loss(y_batch, preds))
    losses.append(sum(batch_losses) / n_batches)
    if not epoch % 10:
        print(f'Epoch: {epoch}, Loss: {losses[-1]}')

# Keep a separate copy of this model's loss curve
losses1 = list(losses)

model2 - features: concatenated query and passage embeddings, absolute element-wise difference, element-wise product and cosine similarity. Inspired by InferSent.

In [None]:
def cosine_similarity(x, z, eps=1e-12):
    '''Row-wise cosine similarity between two 2-D arrays of equal shape.

    Args:
        x, z: arrays whose rows are the vectors to compare.
        eps: lower bound applied to each norm so that an all-zero row yields
            a similarity of 0 instead of NaN from a 0/0 division.

    Returns:
        1-D array with one similarity value per row.
    '''
    x_norm = np.maximum(np.linalg.norm(x, axis=1), eps)
    z_norm = np.maximum(np.linalg.norm(z, axis=1), eps)
    return np.sum(x*z, axis=1) / x_norm / z_norm

def create_features(x):
    '''Extend concatenated (query, passage) embeddings with interaction features.

    The columns of ``x`` are split in half into a query part and a passage
    part. The result stacks, in order: ``x`` itself, the absolute
    element-wise difference, the element-wise product and the cosine
    similarity of the two halves (one extra column).

    Args:
        x: 2-D array with an even number of columns.

    Returns:
        2-D array with ``2 * x.shape[-1] + 1`` columns.

    Raises:
        ValueError: if ``x`` has an odd number of columns, which would make
            the two halves unequal (and could broadcast silently).
    '''
    width = x.shape[-1]
    if width % 2:
        raise ValueError(
            f'expected an even number of feature columns, got {width}')
    idx = width // 2
    q_emb, p_emb = x[:,:idx], x[:,idx:]
    abs_diff = np.abs(q_emb-p_emb)
    prod = q_emb * p_emb
    cos_sim = cosine_similarity(q_emb, p_emb).reshape(-1, 1)
    return np.hstack((x, abs_diff, prod, cos_sim))

In [None]:
# Training configuration; batches are expanded by create_features on sampling
epochs, n_batches = 100, 5000
generator = BatchGenerator(x_train, y_train, batch_size=64,
                           ratio=0.5, features_processor=create_features)
lr_schedule = lambda epoch: 0.01 * 0.98**(epoch-1)  # starts at 0.01, x0.98 per epoch

# Input width after create_features: 4 blocks of half the raw width, plus 1
model2 = LogisticRegression()
in_features = x_train.shape[-1]//2*4 + 1
model2.initialize(in_features)

# Fixed pool of batches, replayed in the same order every epoch
losses = []
batches = generator.get_batches(n_batches)
for epoch in range(1, epochs+1):
    lr = lr_schedule(epoch)
    batch_losses = []
    for x_batch, y_batch in batches:
        preds = model2.predict(x_batch)
        model2.optim_step(lr, x_batch, preds, y_batch)
        # loss of the predictions made before this update
        batch_losses.append(model2.loss(y_batch, preds))
    losses.append(sum(batch_losses) / n_batches)
    if not epoch % 10:
        print(f'Epoch: {epoch}, Loss: {losses[-1]}')

# Keep a separate copy of this model's loss curve
losses2 = list(losses)

### Compare models

In [None]:
# Overlay the per-epoch training loss of both feature sets on one figure
for curve, name in ((losses1, 'Basic Features'), (losses2, 'Additional Features')):
    plt.plot(range(1, len(curve)+1), curve, label=name)
plt.xlabel('epoch')
plt.ylabel('training loss')
plt.legend()
plt.savefig('LR_features.png', dpi=300)

In [None]:
# Drop the training arrays now that both models are fitted. Note that
# `generator` still holds references to them (BatchGenerator stores x and y),
# so the memory is only released once that object goes away as well.
del x_train, y_train
# Validation archive; test() accesses it per query through .keys() and [key].
test_data = np.load('./data/data_val.npz')

In [None]:
# Evaluate model1 on the validation data with the default (identity) feature
# processor. The trailing numbers are presumably the values printed by an
# earlier run.
mean_ap, mean_ndcg = test(model1, test_data)
print(f'Mean AP: {mean_ap}')  # 0.021492473677600318
print(f'Mean nDCG: {mean_ndcg}')  # 0.05529640685379329

In [None]:
# Evaluate model2; it was trained on create_features output, so the same
# transform has to be applied at evaluation time. The trailing numbers are
# presumably the values printed by an earlier run.
mean_ap, mean_ndcg = test(model2, test_data, create_features)
print(f'Mean AP: {mean_ap}')  # 0.030926104220628112
print(f'Mean nDCG: {mean_ndcg}')  # 0.08210214768471422

In [None]:
# Best model: model2 has the higher Mean AP and Mean nDCG according to the
# values noted next to the evaluation prints.
model = model2

## Save Results

In [None]:
# Rank the passages of every validation query with the selected model, write
# the top 100 per query to a run file and recompute the metrics on the way.
qids = [int(qid) for qid in test_data.keys()]
ndcg, ap = [], []
with open('LR.txt', 'w') as f:
    for qid in qids:
        data = test_data[str(qid)]
        pids, rels, x = data[:,0], data[:,1], data[:,2:]
        # The selected model was trained on create_features output
        scores = model.predict(create_features(x))
        idxs = np.argsort(-scores)  # descending score
        pids, scores, rels = pids[idxs], scores[idxs], rels[idxs]
        ap.append(average_precision_score(rels))
        ndcg.append(norm_disc_cum_gain_score(rels, k=100))
        for rank, (pid, score) in enumerate(zip(pids[:100], scores[:100]), start=1):
            # pids are sliced from the same numeric matrix as the features
            # (presumably float), so cast to int to write '123' rather than
            # '123.0'.
            f.write(f'{qid} A1 {int(pid)} {rank} {score} LR\n')

print(f'Mean AP: {sum(ap) / len(ap)}')  # 0.030926104220628112
print(f'Mean nDCG: {sum(ndcg) / len(ndcg)}')  # 0.08210214768471422