In [18]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

# data

In [None]:
# Records are later indexed by key ('content' / 'citations') and hold Python
# objects, so the array has to be unpickled; NumPy >= 1.16.3 refuses to do that
# unless allow_pickle=True is passed explicitly.
# SECURITY: unpickling can execute arbitrary code - only load trusted files.
patent_data = np.load('../data/patent_tiny.npy', allow_pickle=True)

In [59]:
# Collect every distinct token and every distinct citation found in the corpus.
vocab = set()
links = set()
for patent in patent_data:
    vocab.update(patent['content'])
    links.update(patent['citations'])

# Give each token / citation an integer index. The numbering follows set
# iteration order, so it is only stable while the sets are left unmodified.
word_to_ix = dict(zip(vocab, range(len(vocab))))
link_to_ix = dict(zip(links, range(len(links))))

In [60]:
# Vocabulary size, then number of distinct citations (one value per line).
print(len(vocab), len(links), sep='\n')
#print(patent_data[100]['content'])

99615
48129


In [264]:
def generate_training_data(data, links_dict):
    """Pair each record's tokens with a multi-hot vector of its citations.

    Args:
        data: iterable of records supporting ``record['content']`` and
            ``record['citations']``.
        links_dict: mapping from citation to column index. For backward
            compatibility a plain collection of citations is also accepted;
            it is then indexed in its own iteration order.

    Returns:
        A list of ``(content, multi_hot)`` tuples where ``multi_hot`` is a 1-D
        ``np.int_`` array of length ``len(links_dict)`` holding 1 at the index
        of every citation of the record and 0 elsewhere.

    Raises:
        KeyError: if a record cites something missing from ``links_dict``.
    """
    # Look citations up in the argument rather than in a notebook-level global,
    # so the function works with whatever index the caller supplies.
    if not isinstance(links_dict, dict):
        links_dict = {link: i for i, link in enumerate(links_dict)}

    n_links = len(links_dict)
    t_data = []
    for record in data:
        multi_hot = np.zeros(n_links, dtype=np.int_)
        for link in record['citations']:
            multi_hot[links_dict[link]] = 1
        t_data.append((record['content'], multi_hot))
    return t_data
# Pass the citation -> index mapping (not the bare set of citations): the
# second parameter of generate_training_data is the lookup table for columns.
d = generate_training_data(patent_data, link_to_ix)
print(len(d))

3526


## rnn

In [265]:
class RNN(nn.Module):
    """Recurrent cell that consumes one token index per call.

    Each step embeds the token, concatenates the embedding with the previous
    hidden state and feeds the result to two linear layers: one producing the
    next hidden state, one producing log-softmax scores of size ``output_size``.

    Args:
        hidden_size: width of the hidden state.
        output_size: width of the per-step output.
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each token embedding.
    """

    def __init__(self, hidden_size, output_size, vocab_size, embedding_dim):
        super(RNN, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.hidden_size = hidden_size
        self.hidden = nn.Linear(embedding_dim + hidden_size, hidden_size)
        self.output = nn.Linear(embedding_dim + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one step.

        Args:
            input: tensor of token indices; its embeddings are flattened to a
                single row, so it is expected to hold one token.
            hidden: previous hidden state of shape ``(1, hidden_size)``.

        Returns:
            ``(output, hidden)``: log-softmax scores of shape
            ``(1, output_size)`` and the next hidden state of shape
            ``(1, hidden_size)``.
        """
        embeds = self.embeddings(input).view((1, -1))
        combined = torch.cat((embeds, hidden), 1)
        hidden = self.hidden(combined)
        output = self.output(combined)
        output = self.softmax(output)
        return output, hidden

    def init_hidden(self):
        """Return an all-zero initial hidden state of shape ``(1, hidden_size)``."""
        # Size comes from this instance (not a notebook-level global) and the
        # tensor is created with the same dtype/device as the layer weights.
        return self.hidden.weight.new_zeros(1, self.hidden_size)

## cnn

In [266]:
class CNN(nn.Module):
    """Placeholder for a convolutional feature extractor (not implemented).

    The constructor accepts the same arguments as ``RNN`` but ignores them and
    registers no layers; ``forward`` returns ``None``.
    """

    def __init__(self, hidden_size, output_size, vocab_size, embedding_dim):
        super().__init__()

    def forward(self, input):
        """Stub: performs no computation."""
        return None

## fc network

In [267]:
class FC(nn.Module):
    """Three-layer fully connected network.

    Layer widths are ``input_size -> 2 * hidden_size -> hidden_size ->
    output_size``.

    Args:
        input_size: width of the incoming feature vector.
        hidden_size: base hidden width (the first layer is twice as wide).
        output_size: width of the returned tensor.
        activation: callable applied after the first and second layer.
            Defaults to ReLU. Pass ``None`` to stack the layers without any
            non-linearity, in which case the whole network is a single affine
            map of its input.
    """

    def __init__(self, input_size, hidden_size, output_size, activation=F.relu):
        super(FC, self).__init__()
        self.first = nn.Linear(input_size, 2 * hidden_size)
        self.second = nn.Linear(2 * hidden_size, hidden_size)
        self.third = nn.Linear(hidden_size, output_size)
        self.activation = activation

    def _activate(self, tensor):
        """Apply the configured activation, or pass through when it is None."""
        return tensor if self.activation is None else self.activation(tensor)

    def forward(self, features):
        """Map ``features`` (last dim ``input_size``) to ``output_size`` raw scores."""
        _first = self._activate(self.first(features))
        _second = self._activate(self.second(_first))
        # No activation on the last layer: downstream modules receive raw scores.
        output = self.third(_second)
        return output

## optimizer

## classification network

In [268]:
class Classify(nn.Module):
    """Prediction head: softmax over dimension 1 of the representation."""

    def __init__(self):
        super().__init__()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, representation):
        """Return ``representation`` normalised so each row sums to 1."""
        probabilities = self.softmax(representation)
        return probabilities

# loss
### cross-entropy loss

In [269]:
# Cross-entropy criterion with PyTorch's default settings.
ce_loss = nn.CrossEntropyLoss()

### multi-label margin loss

In [270]:
# Multi-label margin criterion with PyTorch's default settings.
mlm_loss = nn.MultiLabelMarginLoss()

### bce with logits loss

In [271]:
# Binary cross-entropy on raw logits, with PyTorch's default settings.
bce_loss = nn.BCEWithLogitsLoss()

# metric
### mean average precision implementation
The `getMAP` function below is adapted from the `trectools` package: https://github.com/joaopalotti/trec_tools/blob/master/trectools/trec_eval.py

In [272]:
def getMAP(self, depth=1000, per_query=False, trec_eval=True):
    """Mean average precision (MAP) of a run against binary relevance labels.

    ``self`` must expose ``self.qrels.qrels_data`` (columns ``query``,
    ``docid``, ``rel``), ``self.run.run_data`` (columns ``query``, ``docid``,
    ``score``) and ``self.run.topics()``.

    Args:
        depth: number of top-ranked documents kept per query.
        per_query: when True, return the per-query average precision table
            instead of the mean.
        trec_eval: when True, rank documents by query (ascending), then score
            and docid (both descending) before truncating; otherwise the
            existing row order of the run is used.

    Returns:
        With ``per_query=True`` a DataFrame indexed by ``query`` with one
        ``"MAP@<depth>depth"`` column. Otherwise a float: the per-query values
        summed and divided by the number of topics in the run, or ``0.0`` when
        there is nothing to average.
    """
    label = "MAP@%ddepth" % (depth)

    # We only care for binary evaluation here: any positive grade counts as 1.
    relevant_docs = self.qrels.qrels_data[self.qrels.qrels_data.rel > 0].copy()
    relevant_docs["rel"] = 1

    if trec_eval:
        ranked = self.run.run_data.sort_values(
            ["query", "score", "docid"], ascending=[True, False, False]
        ).reset_index()
    else:
        ranked = self.run.run_data
    # Copy so the columns added below never write into a view of the run data.
    topX = ranked.groupby("query").head(depth)[["query", "docid", "score"]].copy()

    # Number of queries the mean is taken over.
    nqueries = len(self.run.topics())

    # Rank position within each query, starting at 1.
    topX["rank"] = topX.groupby("query").cumcount() + 1

    # Left merge: retrieved documents without a positive judgement get rel=NaN.
    selection = pd.merge(topX, relevant_docs[["query", "docid", "rel"]], how="left")

    # Running count of relevant documents; NaN rows stay NaN and therefore
    # contribute nothing to the per-query sum below.
    selection["rel"] = selection.groupby("query")["rel"].cumsum()
    # Precision at the rank of each relevant document.
    selection[label] = selection["rel"] / selection["rank"]

    # Average precision = sum of those precisions / number of relevant docs.
    map_per_query = selection[["query", label]].groupby("query").sum()
    relevant_docs[label] = relevant_docs["rel"]
    nrel_per_query = relevant_docs[["query", label]].groupby("query").sum()
    map_per_query = map_per_query / nrel_per_query

    if per_query:
        return map_per_query

    if map_per_query.empty or nqueries == 0:
        return 0.0

    # Previously the function fell through here and returned None.
    return float((map_per_query.sum() / nqueries)[label])

In [273]:
def _map():return
    

In [298]:
class Network(nn.Module):
    """Pipeline: token encoder -> aggregator -> classifier -> loss.

    Args:
        f_e: feature extractor exposing ``init_hidden()`` and called once per
            token as ``f_e(token_index, hidden)``; element 0 of its result is
            taken as the step output and element 1 as the next hidden state.
        f_c: module mapping the last step output to a representation.
        classify: module mapping the representation to a prediction.
        loss: criterion called as ``loss(prediction, target)``.
        idx: mapping from token to integer vocabulary index.
        device: ``torch.device`` on which inputs and targets are created.
    """

    def __init__(self, f_e, f_c, classify, loss, idx, device):
        super(Network, self).__init__()
        self.device = device
        self.f_e = f_e
        self.f_c = f_c
        self.classify = classify
        self.loss = loss
        self.word_idx = idx

    def _extract_features(self, input):
        """Feed the tokens through ``f_e`` one by one; return the last output."""
        hidden = self.f_e.init_hidden().to(self.device)
        features = None
        for word in input:
            token = torch.tensor([self.word_idx[word]], dtype=torch.long,
                                 device=self.device)
            step = self.f_e(token, hidden)
            features, hidden = step[0], step[1]
        if features is None:
            # Previously surfaced as an UnboundLocalError further down.
            raise ValueError("input must contain at least one token")
        return features

    def _format_target(self, target):
        """Convert a multi-hot label vector to the layout ``self.loss`` expects.

        ``MultiLabelMarginLoss`` reads its target as a list of class indices
        terminated by -1, so the positive positions are packed at the front and
        the remainder is filled with -1. Any other criterion receives the
        multi-hot vector as float32 with a leading batch dimension of 1.
        """
        multi_hot = torch.as_tensor(target, device=self.device).unsqueeze(0)
        if isinstance(self.loss, nn.MultiLabelMarginLoss):
            positives = torch.nonzero(multi_hot[0]).flatten()
            indices = torch.full(multi_hot.shape, -1, dtype=torch.int64,
                                 device=self.device)
            indices[0, :positives.numel()] = positives
            return indices
        return multi_hot.to(dtype=torch.float32)

    def forward(self, input, target):
        """Compute the loss for one example.

        Args:
            input: non-empty sequence of tokens, each a key of the vocabulary
                mapping given at construction.
            target: 1-D multi-hot label vector (sequence, array or tensor).

        Returns:
            Whatever ``self.loss`` returns for the prediction and the target.

        Raises:
            ValueError: if ``input`` is empty.
            KeyError: if a token is missing from the vocabulary mapping.
        """
        # feature extraction
        features = self._extract_features(input)

        # feature aggregation
        representation = self.f_c(features)

        # prediction
        prediction = self.classify(representation)

        # loss (uses self.device rather than a notebook-level global)
        return self.loss(prediction, self._format_target(target))

In [299]:
# Optimisation and hardware settings.
_lr = 1e-4
cuda = False

# Layer sizes.
n_hidden, embedding_dim, fc_hidden = 128, 100, 512
output_size = len(links)

# Model components. The encoder emits vectors of width embedding_dim, which is
# also the input width of the fully connected network.
rnn = RNN(hidden_size=n_hidden, output_size=embedding_dim,
          vocab_size=len(vocab), embedding_dim=embedding_dim)
fc = FC(input_size=embedding_dim, hidden_size=fc_hidden, output_size=output_size)
classify = Classify()

device = torch.device("cuda:0") if cuda else torch.device("cpu")

In [300]:
net = Network(rnn, fc, classify, mlm_loss, word_to_ix, device)
# Move every registered parameter to the selected device before the optimizer
# captures them; this is a no-op when the device is the CPU.
net = net.to(device)
optimizer = torch.optim.Adam(net.parameters(), lr=_lr)

In [None]:
# One pass over the training examples, one optimizer update per example.
step = 0
for step, (tokens, target) in enumerate(d, start=1):
    optimizer.zero_grad()

    # Call the module itself rather than .forward() so that any registered
    # forward hooks run.
    loss = net(tokens, target)
    loss.backward()

    # Rescale gradients so their global norm does not exceed 5.0.
    torch.nn.utils.clip_grad_norm_(net.parameters(), 5.0)

    optimizer.step()

    print(step)

1
2
3
4
5
