# TASK 4 - Neural Network Model

In [114]:
import torch
import torch.autograd as autograd         # computation graph
from torch import Tensor                  # tensor node in the computation graph
import torch.nn as nn                     # neural networks
import torch.nn.functional as F           # layers, activations and more
import torch.optim as optim               # optimizers e.g. gradient descent, ADAM, etc.
import string
import pickle
import numpy as np
import pandas as pd
import matplotlib
import math

In [85]:
train_dataframe = pd.read_csv('train_data.tsv', delimiter='\t')
val_dataframe = pd.read_csv('validation_data.tsv', delimiter='\t')

In [115]:
X_data_train = np.loadtxt('processed_X_train_data.txt')
y_data_train = np.loadtxt('processed_y_train_data.txt')
X_data_val = np.loadtxt('processed_X_val_data.txt')
y_data_val = np.loadtxt('processed_y_val_data.txt')

X_data_train = torch.from_numpy(X_data_train)
y_data_train = torch.from_numpy(y_data_train)
X_data_val = torch.from_numpy(X_data_val)
y_data_val = torch.from_numpy(y_data_val)

In [129]:
# Weights

# train_weights = torch.zeros(len(y_data_train))
# train_weights[y_data_train == 1] = 1 - len(y_data_train[y_data_train == 1]) / len(y_data_train)
# train_weights[y_data_train == 0] = 1 - len(y_data_train[y_data_train == 0]) / len(y_data_train)

# val_weights = torch.ones(len(y_data_val))
# val_weights[y_data_val == 1] = 1 - len(y_data_val[y_data_val == 1]) / len(y_data_val)
# val_weights[y_data_val == 0] = 1 - len(y_data_val[y_data_val == 0]) / len(y_data_val)

train_weights = torch.zeros(len(y_data_train))
train_weights[y_data_train == 1] = 0.9
train_weights[y_data_train == 0] = 0.1

val_weights = torch.ones(len(y_data_val))
val_weights[y_data_val == 1] = 0.9
val_weights[y_data_val == 0] = 0.1


## Using a MLP

In [117]:
class Feedforward(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Feedforward, self).__init__()
        self.input_size = input_size
        self.hidden_size  = hidden_size
        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.relu = torch.nn.ReLU()
        self.fch = torch.nn.Linear(self.hidden_size, self.hidden_size)
        self.fc2 = torch.nn.Linear(self.hidden_size, 1)
        self.sigmoid = torch.nn.Sigmoid()
        # Define proportion or neurons to dropout
        self.dropout = nn.Dropout(0.2)
    
    def forward(self, x): # x is the input layer
        x = self.dropout(self.relu(self.fc1(x))) # hidden layer 1
        x = self.dropout(self.relu(self.fch(x))) # hidden layer 2
        x = self.dropout(self.relu(self.fch(x))) # hidden layer 3
        x = self.dropout(self.relu(self.fch(x))) # hidden layer 4
        x = self.dropout(self.relu(self.fch(x))) # hidden layer 5
        x = self.dropout(self.relu(self.fch(x))) # hidden layer 6
        output = self.sigmoid(self.fc2(x)) # output layer
        return output

In [130]:
input_dim = 5
hidden_dim = 10
model = Feedforward(input_dim, hidden_dim)
criterion = torch.nn.BCELoss(weight=val_weights)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01, weight_decay=0.01)

In [131]:
model.eval()
y_pred = model(X_data_val.float())
before_train = criterion(y_pred.squeeze(), y_data_val.float())
print('Test loss before training' , before_train.item())

Test loss before training 0.06480520963668823


In [132]:
model.train()
criterion = torch.nn.BCELoss(weight=train_weights)
epoch = 20
for epoch in range(epoch):
    optimizer.zero_grad()
    # Forward pass
    y_pred = model(X_data_train.float())
    # Compute Loss
    loss = criterion(y_pred.squeeze(), y_data_train.float())
   
    print('Epoch {}: train loss: {}'.format(epoch+1, loss.item()))
    # Backward pass
    loss.backward()
    optimizer.step()

Epoch 1: train loss: 0.06556297838687897
Epoch 2: train loss: 0.05949615687131882
Epoch 3: train loss: 0.052813392132520676
Epoch 4: train loss: 0.04570887237787247
Epoch 5: train loss: 0.03852064162492752
Epoch 6: train loss: 0.03178482875227928
Epoch 7: train loss: 0.025889089331030846
Epoch 8: train loss: 0.020978713408112526
Epoch 9: train loss: 0.017357463017106056
Epoch 10: train loss: 0.015001445077359676
Epoch 11: train loss: 0.013711116276681423
Epoch 12: train loss: 0.0134737528860569
Epoch 13: train loss: 0.013710679486393929
Epoch 14: train loss: 0.014265657402575016
Epoch 15: train loss: 0.014615845866501331
Epoch 16: train loss: 0.014834432862699032
Epoch 17: train loss: 0.015052419155836105
Epoch 18: train loss: 0.014501589350402355
Epoch 19: train loss: 0.014044645242393017
Epoch 20: train loss: 0.013431651517748833


In [133]:
model.eval()
criterion = torch.nn.BCELoss(weight=val_weights)
y_pred = model(X_data_val.float())
after_train = criterion(y_pred.squeeze(), y_data_val.float()) 
print('Test loss after Training' , after_train.item())

Test loss after Training 0.009435766376554966


In [134]:
y_pred_val = y_pred.detach().numpy()
y_pred_val = y_pred_val.squeeze()

### Store results

In [135]:
# Export NN.txt ranking

unique_qids = val_dataframe['qid'].copy()
unique_qids = unique_qids.drop_duplicates()
unique_qids = unique_qids.reset_index(drop=True)
unique_qids = np.array(unique_qids) # vector of unique qid in the validation set

qid_pid_val = val_dataframe[['qid','pid']].copy()
qid_pid_val = np.array(qid_pid_val) # (n,2) array, where each row corresponds to the (qid, pid) pair of each sample

In [136]:
NN_scores = pd.DataFrame(columns=['qid', 'A', 'pid', 'rank', 'score', 'algoname']) # dataframe where we store global results for all queries

for qid in unique_qids:
    output_info = pd.DataFrame(columns=['qid', 'A', 'pid', 'rank', 'score', 'algoname']) # dataframe where we store 
                                                                                         # the results for the current query
    qid_pairs = qid_pid_val[qid_pid_val[:,0] == qid]
    qid_rels = y_pred_val[qid_pid_val[:,0] == qid]
    indxs = np.argsort(qid_rels)[::-1]
    sorted_qid_pairs = qid_pairs[indxs]
    sorted_qid_rels = qid_rels[indxs]
    
    # Now we just get the top 100 scores (if they are available)
    top_sorted_qid_pairs = sorted_qid_pairs[:100,:]
    top_sorted_qid_rels = sorted_qid_rels[:100]

    # Prepare the array with 'A2'
    A2 = np.array(['A2'] * len(top_sorted_qid_rels))

    # Prepare the array with the ranks
    rank = np.array(range(1,len(top_sorted_qid_rels)+1))

    # Prepare the array with the algonamme 'LR'
    algoname = np.array(['NN'] * len(top_sorted_qid_rels))

    # Put everything together in the output_info dataframe
    output_info['qid'] = top_sorted_qid_pairs[:,0]
    output_info['A'] = A2
    output_info['pid'] = top_sorted_qid_pairs[:,1]
    output_info['rank'] = rank
    output_info['score'] = top_sorted_qid_rels
    output_info['algoname'] = algoname

    # Append this query dataframe to the one with global results
    NN_scores = NN_scores.append(output_info, ignore_index=True)

In [137]:
# Save ranking file as NN.txt
np.savetxt(r'NN.txt', NN_scores.values, fmt=['%d','%s','%d','%d','%f','%s'])

### Assess performance of model on validation data
For this part we use the mAP and NDCG functions defiend in Task 1

In [126]:
# Set up dataframes for the functions
tq = val_dataframe[['qid', 'queries']].copy()
tq = tq.drop_duplicates()
tq = tq.reset_index(drop=True)

relevancies = val_dataframe[['qid', 'pid', 'relevancy']].copy()

In [127]:
# Load ranking data created in previous part
NN_ranking = pd.read_csv('NN.txt', delimiter=' ', header=None, names=['qid', 'A', 'pid', 'rank', 'score', 'algoname'])
ranking = NN_ranking[['qid', 'pid', 'score']].copy()

Now copy the AP and NDCG functions from Task 1

In [98]:
# mAP function
def AP(queries, ranking, relevancies, k):
    '''
    Function that computes the Average Precision (AP) metric for each query in 'queries', based on a ranking determined by 
    a retrieval model, where queries are matched with passages from most relevant to least relevant, and based on relevancies 
    between queries and passages.

    Inputs:
    queries = data frame of queries for which you want to calculate the AP metric (contains qid and actual query)
    ranking = data frame of queries and passages pairs, where higher score pairs are ranked higher (for each query)
    relevancies = data frame of relevancies between each possible (qid,pid) pair
    k = top k passages you want to take into account when calculating the AP metric

    Outputs:
    APs = list of AP@k metric for each query, in the same order of appearance as the input list 'queries'
    mAP = mean Average Precision of all the queries
    '''

    APs = []

    for q in queries['qid']:
        AP_values = []
        cum_rel = 0 # cumulative number of relevant passages found in the ranking

        max_k = len(ranking[ranking['qid'] == q])
        iter = min(k,max_k) # This is because we some queries do not have that many candidate passages

        for i in range(1,iter+1):
            p = int(ranking[ranking['qid'] == q].reset_index(drop=True).iloc[i-1]['pid'])
            relevancy = relevancies[(relevancies['qid'] == q) & (relevancies['pid'] == p)]['relevancy'].values.item()
            if relevancy != 0: # we operate when we encounter a relevant passage
                cum_rel += relevancy
                AP_values.append(cum_rel / i) 

        if len(AP_values) != 0:    
            APs.append(sum(AP_values)/len(AP_values))
        else: # we do this to avoid the computing error of dividing 0/0
            APs.append(0)

    mAP = np.mean(APs)

    return APs, mAP

In [99]:
# NDCG function
def NDCG(queries, ranking, relevancies, k):
    '''
    Function that computes the Normalized Discounted Cumulative Gain (NDCG) metric for each query in 'queries', 
    based on a ranking determined by a retrieval model, where queries are matched with passages from most relevant 
    to least relevant, and based on relevancies between queries and passages.

    Inputs:
    queries = data frame of queries for which you want to calculate the AP metric (contains qid and actual query)
    ranking = data frame of queries and passages pairs, where higher score pairs are ranked higher (for each query)
    relevancies = data frame of relevancies between each possible (qid,pid) pair
    k = top k passages you want to take into account when calculating the AP metric

    Outputs:
    NDCGs = list of AP@k metric for each query, in the same order of appearance as the input list 'queries'
    mNDCG = mean Average Precision of all the queries
    '''

    NDCGs = []

    for q in queries['qid']:
        DCG = 0
        IDCG = 0

        max_k = len(ranking[ranking['qid'] == q])
        iter = min(k,max_k) # This is because we some queries do not have that many candidate passages

        # Get the relevancies for the ideal ranking (of the top k candidates??? - doesn't matter for our data tho), in order
        sorted_revs = relevancies[relevancies['qid'] == q].sort_values(by=['relevancy'], ascending=False)['relevancy'].values

        for i in range(1,iter+1):
            IDCG += (2**sorted_revs[i-1] - 1)/np.log2(i+1)

            p = int(ranking[ranking['qid'] == q].reset_index(drop=True).iloc[i-1]['pid'])
            rel = relevancies[(relevancies['qid'] == q) & (relevancies['pid'] == p)]['relevancy'].values.item()
            DCG += (2**rel - 1)/np.log2(i+1)

        if IDCG != 0:
            NDCGs.append(DCG/IDCG)
        else: # we do this to avoid the computing error of dividing 0/0
            NDCGs.append(0)

    mNDCG = np.mean(NDCGs)

    return NDCGs, mNDCG

Compute performance metrics

In [138]:
AP_3, mAP_3 = AP(tq, ranking, relevancies, 3)
AP_10, mAP_10 = AP(tq, ranking, relevancies, 10)
AP_100, mAP_100 = AP(tq, ranking, relevancies, 100)

NDCG_3, mNDCG_3 = NDCG(tq, ranking, relevancies, 3)
NDCG_10, mNDCG_10 = NDCG(tq, ranking, relevancies, 10)
NDCG_100, mNDCG_100 = NDCG(tq, ranking, relevancies, 100)

In [139]:
print(mAP_3, mAP_10, mAP_100)

print(mNDCG_3, mNDCG_10, mNDCG_100)

0.007113821138211382 0.009674865881311875 0.012530086828311094
0.008409972794300774 0.013653655839167635 0.03325898104597199


# End of Task 4