In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import xavier_normal_
from constants import D_MODEL, STACKED_NUM,DK, DV, H, P_DROP, D_FF, MAX_SEQUENCE_LENGTH, MAX_NUM_WORDS, EMBEDDING_DIM
# environment
with_gpu = torch.cuda.is_available()
# with_gpu = False
device = torch.device("cuda:0" if with_gpu else "cpu")

def positional_encoding(pos):
    assert D_MODEL % 2 == 0
    pos = torch.tensor(pos, dtype=torch.float32, requires_grad=False)
    pe = torch.zeros([1,D_MODEL], dtype=torch.float32, requires_grad=False)
    for i in range(D_MODEL//2):
        a = torch.tensor(10000, dtype=torch.float32, requires_grad=False)
        b = torch.tensor(2.*i/float(D_MODEL), dtype=torch.float32, requires_grad=False)
        c = pos / torch.pow(a, b)
        pe[0, 2*i] = torch.sin(c)
        pe[0, 2*i+1] = torch.cos(c)
    return pe
def get_pos_mat(length):
    if length > MAX_SEQUENCE_LENGTH:
        print('sequence length reach PE_MAT_CACHE. %d ' % length)
        ret = torch.cat([positional_encoding(i) for i in range(length)], dim=0).to(device)
        ret.requires_grad = False
        global PE_CACHE_MATRIX
        PE_CACHE_MATRIX = ret
        return ret
    else:
        return PE_CACHE_MATRIX[:length]
    
PE_CACHE_MATRIX = torch.cat([positional_encoding(i) for i in range(0,MAX_SEQUENCE_LENGTH)], dim=0).to(device)
PE_CACHE_MATRIX.requires_grad = False

# construct neuron network

def scaled_dot_attention(Q, K, V, mask=None):
    assert Q.size()[-1] == K.size()[-1]
    dk = torch.tensor(K.size()[-1], dtype=torch.float32, requires_grad=False).to(device)
    out = torch.matmul(Q,K.t()) / torch.sqrt(dk) 
    if mask is not None:
        out = out.masked_fill_(mask, -float('inf'))
        
    return torch.matmul(F.softmax(out, dim=-1), V)
                            
class Transformer(nn.Module):

    def __init__(self, layer_num, dk, dv, dm, h, emb_matrix):
        super(Transformer, self).__init__()
        
        self.emb = Word_Embedding(emb_matrix)
        
        self.emb_drop = nn.Dropout(P_DROP)
        
        self.encoder = Stack_Encoder(layer_num, dk, dv, dm, h)
        self.decoder = Stack_Decoder(layer_num, dk, dv, dm, h)
        self.summary_decoder = Stack_Decoder(2, dk, dv, dm, h)
        
        self.summary_weight = nn.Parameter(torch.FloatTensor(1, dm))
        torch.nn.init.xavier_uniform_(self.summary_weight)
        
        self.output_linear = nn.Linear(3*dm, 1)

    def forward(self, Q, K, Q_fea, K_fea):
        
#         encoder
        K = self.emb(K)
#         print(K.size(), get_pos_mat(MAX_SEQUENCE_LENGTH).size())
        K = K + get_pos_mat(MAX_SEQUENCE_LENGTH)
        K = self.emb_drop(K)
        
        en_out = self.encoder(K)
        
#         decoder
        Q = self.emb(Q)
        seq_len, d = Q.size()
        
        Q = Q + get_pos_mat(MAX_SEQUENCE_LENGTH)
        Q = self.emb_drop(Q)
        
        de_out = self.decoder(Q, en_out)
        
        
        summary = self.summary_decoder(self.summary_weight, de_out)
        x = torch.cat([summary, Q_fea.view([1,-1]), K_fea.view([1,-1])], dim=-1)
        out = self.output_linear(x)
        out = torch.sigmoid(out)
        

        return out

class Word_Embedding(nn.Module):
    def __init__(self, emb_matrix):
        super(Word_Embedding, self).__init__()
        self.emb = nn.Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, padding_idx=0)
        self.emb.weight = nn.parameter.Parameter(torch.FloatTensor(emb_matrix))
        self.emb.weight.requires_grad_(False)
        
        self.linear = nn.Linear(EMBEDDING_DIM, D_MODEL, bias=False)
        


    def forward(self, x):
        x = self.emb(x)
        x = self.linear(x)
        return x
    
class Stack_Encoder(nn.Module):
    """
    Stacked Encoder
    """
    def __init__(self, layer_num, dk, dv, dm, h):
        super(Stack_Encoder, self).__init__()
        self.encoders = nn.ModuleList([Encoder(dk, dv, dm, h) for i in range(layer_num)])

    def forward(self, K):
        # ModuleList can act as an iterable, or be indexed using ints
        for lay in self.encoders:
            K = lay(K)
        return K               
class Encoder(nn.Module):
    def __init__(self, dk, dv, dm, h):
        super(Encoder, self).__init__()
#         attention residual block
        self.multi_head_attention_layer = Multi_Head_attention_layer(dk, dv, dm, h)
        self.attention_norm_lay = nn.LayerNorm([dm,])
        self.att_drop = nn.Dropout(P_DROP)
#         feed forward residual block
        self.fcn = PositionwiseFeedForward(D_MODEL, D_FF)
        self.linear_drop = nn.Dropout(P_DROP)
        self.ff_norm_lay = nn.LayerNorm([dm, ])
        

    def forward(self, K):
#         attention
        attention_out = self.multi_head_attention_layer(K, K, K)
        attention_out = self.att_drop(attention_out)
        att_out = self.attention_norm_lay(K + attention_out)
#         feed forward
        linear_out = self.fcn(att_out)
        linear_out = self.linear_drop(linear_out)
        out = self.ff_norm_lay(att_out + linear_out)
        out = att_out + linear_out
    
        return out
class Stack_Decoder(nn.Module):
    """
    Stacked Encoder
    """
    def __init__(self, layer_num, dk, dv, dm, h):
        super(Stack_Decoder, self).__init__()
        self.decoders = nn.ModuleList([Decoder(dk, dv, dm, h) for i in range(layer_num)])
        
        
    def forward(self, Q, encoder_out):
        # ModuleList can act as an iterable, or be indexed using ints
        Q_len, d = Q.size()
        for lay in self.decoders:
            Q = lay(Q, encoder_out, mask=None)
        return Q           

class Decoder(nn.Module):
    def __init__(self, dk, dv, dm, h):
        super(Decoder, self).__init__()
#         query attention residual block
        self.Q_attention_lay = Multi_Head_attention_layer(dk, dv, dm, h)
        self.Q_attention_norm_lay = nn.LayerNorm([dm, ])
        self.Q_att_drop = nn.Dropout(P_DROP)
    
#         query key attention residual block
        self.QK_attention_lay = Multi_Head_attention_layer(dk, dv, dm, h)
        self.QK_attention_norm_lay = nn.LayerNorm([dm, ])
        self.QK_att_drop = nn.Dropout(P_DROP)
        
    
#         feed forward residual block
        self.fcn = PositionwiseFeedForward(D_MODEL, D_FF)
        self.ff_norm_lay = nn.LayerNorm([dm, ])
        self.linear_drop = nn.Dropout(P_DROP)
        

    def forward(self, Q, encoder_out, mask):
#         query attention
        Q_attention_out = self.Q_attention_lay(Q, Q, Q, mask)
        Q_attention_out = self.Q_att_drop(Q_attention_out)
        Q_att_out = self.Q_attention_norm_lay(Q + Q_attention_out)
#         query key attention
        QK_attention_out = self.QK_attention_lay(Q_att_out, encoder_out, encoder_out)
        QK_attention_out = self.QK_att_drop(QK_attention_out)
        QK_att_out = self.QK_attention_norm_lay(Q_att_out + QK_attention_out)
        
#         feed forward
        linear_out = self.fcn(QK_att_out)
        out = self.ff_norm_lay(QK_att_out + linear_out)
        return out

class Multi_Head_attention_layer(nn.Module):
    def __init__(self, dk, dv, dm, h):
        super(Multi_Head_attention_layer, self).__init__()
        self.Q_linears = nn.ModuleList([nn.Linear(dm, dk) for i in range(h)])
        self.K_linears = nn.ModuleList([nn.Linear(dm, dk) for i in range(h)])
        self.V_linears = nn.ModuleList([nn.Linear(dm, dv) for i in range(h)])
        self.output_linear = nn.Linear(h*dv, dm)
                            

    def forward(self, Q_input, K_input, V_input, mask=None):
        buf = []
        for Q_linear, K_linear, V_linear in zip(self.Q_linears, self.K_linears, self.V_linears):
            Q = Q_linear(Q_input)
            K = K_linear(K_input)
            V = V_linear(V_input)
            buf.append(scaled_dot_attention(Q, K, V, mask))
            
        buf = torch.cat(buf,dim=-1)
        out = self.output_linear(buf)
        
        return out      
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionwiseFeedForward, self).__init__()
        self.cnn1 = nn.Conv1d(d_model, d_ff, 1)
        self.cnn2 = nn.Conv1d(d_ff, d_model, 1)
                            

    def forward(self, x):
        seq_len,_ = x.size()
        x = x.unsqueeze(0)
        x = x.permute(0,2,1)
        x = self.cnn1(x)
        x = F.relu(x)
        x = self.cnn2(x)
        x = x.permute(0,2,1)
        x = x.squeeze(0)
        
        return x      
    
# encoder = Stack_Encoder(6, 64,64,20,8)
# # print net
print 'load done'

load done


In [2]:
import keras
from os.path import join
import os
from bs4 import BeautifulSoup as BS
from constants import MAX_SEQUENCE_LENGTH, MAX_NUM_WORDS, EMBEDDING_DIM
from keras.preprocessing.sequence import pad_sequences
import numpy as np

def quote_title_abstract(xml_path):
    with open(xml_path, 'r') as f:
        data = f.read()
    soup = BS(data)
    title, abstract = soup.find('title').text, soup.find('abstract').text
    return title.strip(), abstract.strip()

# text preprocessing
data_path = join('./','kaggle/')
xml_dir = join(data_path, 't2-doc')
xml_list = [f for f in os.listdir(xml_dir) if f.endswith('.xml')]
# print(len(xml_list))


texts = []

for xml in xml_list:
    path = join(xml_dir,xml)
    title, abstract = quote_title_abstract(path)
    text = title + '' + abstract
    texts.append(text)
#     texts.append(title)
#     texts.append(abstract)
print('read all %d xml files.' % len(xml_list))
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ ',
                                   lower=True, split=' ', char_level=False, oov_token=None)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
xml_id_map = {}
for i,xml in enumerate(xml_list):
    node_id = int(xml.replace('.xml',''))
    xml_id_map[node_id] = data[i,:]


print('Preparing embedding matrix.')
embeddings_index = {}
# with open(os.path.join('./','glove', 'glove.6B.%dd.txt' % EMBEDDING_DIM), 'r', encoding='utf8') as f:
# with open(os.path.join('./','glove', 'glove.6B.%dd.txt' % EMBEDDING_DIM), 'r') as f:
#     for line in f:
#         values = line.split()
#         word = values[0]
#         coefs = np.asarray(values[1:], dtype='float32')
#         embeddings_index[word] = coefs
# prepare embedding matrix
print('done')

Using TensorFlow backend.


read all 17500 xml files.
Found 82709 unique tokens.
Preparing embedding matrix.
done


### only text

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import xavier_normal_
from constants import D_MODEL, STACKED_NUM,DK, DV, H, P_DROP, D_FF, MAX_SEQUENCE_LENGTH, MAX_NUM_WORDS, EMBEDDING_DIM
# environment
with_gpu = torch.cuda.is_available()
# with_gpu = False
device = torch.device("cuda:0" if with_gpu else "cpu")

def positional_encoding(pos):
    assert D_MODEL % 2 == 0
    pos = torch.tensor(pos, dtype=torch.float32, requires_grad=False)
    pe = torch.zeros([1,D_MODEL], dtype=torch.float32, requires_grad=False)
    for i in range(D_MODEL//2):
        a = torch.tensor(10000, dtype=torch.float32, requires_grad=False)
        b = torch.tensor(2.*i/float(D_MODEL), dtype=torch.float32, requires_grad=False)
        c = pos / torch.pow(a, b)
        pe[0, 2*i] = torch.sin(c)
        pe[0, 2*i+1] = torch.cos(c)
    return pe
def get_pos_mat(length):
    if length > MAX_SEQUENCE_LENGTH:
        print('sequence length reach PE_MAT_CACHE. %d ' % length)
        ret = torch.cat([positional_encoding(i) for i in range(length)], dim=0).to(device)
        ret.requires_grad = False
        global PE_CACHE_MATRIX
        PE_CACHE_MATRIX = ret
        return ret
    else:
        return PE_CACHE_MATRIX[:length]
    
PE_CACHE_MATRIX = torch.cat([positional_encoding(i) for i in range(0,MAX_SEQUENCE_LENGTH)], dim=0).to(device)
PE_CACHE_MATRIX.requires_grad = False

# construct neuron network

def scaled_dot_attention(Q, K, V, mask=None):
    assert Q.size()[-1] == K.size()[-1]
    dk = torch.tensor(K.size()[-1], dtype=torch.float32, requires_grad=False).to(device)
    out = torch.matmul(Q,K.t()) / torch.sqrt(dk) 
    if mask is not None:
        out = out.masked_fill_(mask, -float('inf'))
        
    return torch.matmul(F.softmax(out, dim=-1), V)
                            
class Transformer(nn.Module):

    def __init__(self, layer_num, dk, dv, dm, h, emb_matrix):
        super(Transformer, self).__init__()
        
        self.emb = Word_Embedding(emb_matrix)
        
        self.emb_drop = nn.Dropout(P_DROP)
        
        self.encoder = Stack_Encoder(layer_num, dk, dv, dm, h)
        self.decoder = Stack_Decoder(layer_num, dk, dv, dm, h)
        self.summary_decoder = Stack_Decoder(2, dk, dv, dm, h)
        
        self.summary_weight = nn.Parameter(torch.FloatTensor(1, dm))
        torch.nn.init.xavier_uniform_(self.summary_weight)
        
        self.output_linear = nn.Linear(dm, 1)

    def forward(self, Q, K):
        
#         encoder
        K = self.emb(K)
#         print(K.size(), get_pos_mat(MAX_SEQUENCE_LENGTH).size())
        K = K + get_pos_mat(MAX_SEQUENCE_LENGTH)
        K = self.emb_drop(K)
        
        en_out = self.encoder(K)
        
#         decoder
        Q = self.emb(Q)
        seq_len, d = Q.size()
        
        Q = Q + get_pos_mat(MAX_SEQUENCE_LENGTH)
        Q = self.emb_drop(Q)
        
        de_out = self.decoder(Q, en_out)
        
        
        summary = self.summary_decoder(self.summary_weight, de_out)
        out = self.output_linear(summary)
        out = torch.sigmoid(out)
        

        return out

class Word_Embedding(nn.Module):
    def __init__(self, emb_matrix):
        super(Word_Embedding, self).__init__()
        self.emb = nn.Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, padding_idx=0)
        self.emb.weight = nn.parameter.Parameter(torch.FloatTensor(emb_matrix))
        self.emb.weight.requires_grad_(False)
        
        self.linear = nn.Linear(EMBEDDING_DIM, D_MODEL, bias=False)
        


    def forward(self, x):
        x = self.emb(x)
        x = self.linear(x)
        return x
    
class Stack_Encoder(nn.Module):
    """
    Stacked Encoder
    """
    def __init__(self, layer_num, dk, dv, dm, h):
        super(Stack_Encoder, self).__init__()
        self.encoders = nn.ModuleList([Encoder(dk, dv, dm, h) for i in range(layer_num)])

    def forward(self, K):
        # ModuleList can act as an iterable, or be indexed using ints
        for lay in self.encoders:
            K = lay(K)
        return K               
class Encoder(nn.Module):
    def __init__(self, dk, dv, dm, h):
        super(Encoder, self).__init__()
#         attention residual block
        self.multi_head_attention_layer = Multi_Head_attention_layer(dk, dv, dm, h)
        self.attention_norm_lay = nn.LayerNorm([dm,])
        self.att_drop = nn.Dropout(P_DROP)
#         feed forward residual block
        self.fcn = PositionwiseFeedForward(D_MODEL, D_FF)
        self.linear_drop = nn.Dropout(P_DROP)
        self.ff_norm_lay = nn.LayerNorm([dm, ])
        

    def forward(self, K):
#         attention
        attention_out = self.multi_head_attention_layer(K, K, K)
        attention_out = self.att_drop(attention_out)
        att_out = self.attention_norm_lay(K + attention_out)
#         feed forward
        linear_out = self.fcn(att_out)
        linear_out = self.linear_drop(linear_out)
        out = self.ff_norm_lay(att_out + linear_out)
        out = att_out + linear_out
    
        return out
class Stack_Decoder(nn.Module):
    """
    Stacked Encoder
    """
    def __init__(self, layer_num, dk, dv, dm, h):
        super(Stack_Decoder, self).__init__()
        self.decoders = nn.ModuleList([Decoder(dk, dv, dm, h) for i in range(layer_num)])
        
        
    def forward(self, Q, encoder_out):
        # ModuleList can act as an iterable, or be indexed using ints
        Q_len, d = Q.size()
        for lay in self.decoders:
            Q = lay(Q, encoder_out, mask=None)
        return Q           

class Decoder(nn.Module):
    def __init__(self, dk, dv, dm, h):
        super(Decoder, self).__init__()
#         query attention residual block
        self.Q_attention_lay = Multi_Head_attention_layer(dk, dv, dm, h)
        self.Q_attention_norm_lay = nn.LayerNorm([dm, ])
        self.Q_att_drop = nn.Dropout(P_DROP)
    
#         query key attention residual block
        self.QK_attention_lay = Multi_Head_attention_layer(dk, dv, dm, h)
        self.QK_attention_norm_lay = nn.LayerNorm([dm, ])
        self.QK_att_drop = nn.Dropout(P_DROP)
        
    
#         feed forward residual block
        self.fcn = PositionwiseFeedForward(D_MODEL, D_FF)
        self.ff_norm_lay = nn.LayerNorm([dm, ])
        self.linear_drop = nn.Dropout(P_DROP)
        

    def forward(self, Q, encoder_out, mask):
#         query attention
        Q_attention_out = self.Q_attention_lay(Q, Q, Q, mask)
        Q_attention_out = self.Q_att_drop(Q_attention_out)
        Q_att_out = self.Q_attention_norm_lay(Q + Q_attention_out)
#         query key attention
        QK_attention_out = self.QK_attention_lay(Q_att_out, encoder_out, encoder_out)
        QK_attention_out = self.QK_att_drop(QK_attention_out)
        QK_att_out = self.QK_attention_norm_lay(Q_att_out + QK_attention_out)
        
#         feed forward
        linear_out = self.fcn(QK_att_out)
        out = self.ff_norm_lay(QK_att_out + linear_out)
        return out

class Multi_Head_attention_layer(nn.Module):
    def __init__(self, dk, dv, dm, h):
        super(Multi_Head_attention_layer, self).__init__()
        self.Q_linears = nn.ModuleList([nn.Linear(dm, dk) for i in range(h)])
        self.K_linears = nn.ModuleList([nn.Linear(dm, dk) for i in range(h)])
        self.V_linears = nn.ModuleList([nn.Linear(dm, dv) for i in range(h)])
        self.output_linear = nn.Linear(h*dv, dm)
                            

    def forward(self, Q_input, K_input, V_input, mask=None):
        buf = []
        for Q_linear, K_linear, V_linear in zip(self.Q_linears, self.K_linears, self.V_linears):
            Q = Q_linear(Q_input)
            K = K_linear(K_input)
            V = V_linear(V_input)
            buf.append(scaled_dot_attention(Q, K, V, mask))
            
        buf = torch.cat(buf,dim=-1)
        out = self.output_linear(buf)
        
        return out      
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionwiseFeedForward, self).__init__()
        self.cnn1 = nn.Conv1d(d_model, d_ff, 1)
        self.cnn2 = nn.Conv1d(d_ff, d_model, 1)
                            

    def forward(self, x):
        seq_len,_ = x.size()
        x = x.unsqueeze(0)
        x = x.permute(0,2,1)
        x = self.cnn1(x)
        x = F.relu(x)
        x = self.cnn2(x)
        x = x.permute(0,2,1)
        x = x.squeeze(0)
        
        return x      
    
# encoder = Stack_Encoder(6, 64,64,20,8)
# # print net
print('load text model')

load text model


In [5]:
import torch 
import numpy as np
from tqdm import tqdm
model = torch.load('./text_best_loss.pt')
model.eval()

edges_unordered = np.genfromtxt('./kaggle/t2-test.txt', dtype=np.int32)
with torch.no_grad():
    with open('weight.csv', 'w') as f:
        with tqdm(total=edges_unordered.shape[0]) as pbar:
            f.write('query_id,prediction\n')
            for i in range(edges_unordered.shape[0]):
                src, dst = edges_unordered[i, :]

                q = xml_id_map[dst]
                k = xml_id_map[src]


                q,k = torch.LongTensor(q), torch.LongTensor(k)

                output = model(q.cuda(), k.cuda()).flatten().item()
                print output
                wf
                out = 1 if output >= 0.5 else 0
                f.write('%d,%d\n' % (1 + i, out))
                if src not in training_node_set and dst not in training_node_set:
                    g.write('%d,%d\n' % (1 + i, 0))
                else:
                    g.write('%d,%d\n' % (1 + i, out))
                pbar.update(1)
print c
print 'done'



  0%|          | 0/76947 [00:00<?, ?it/s]

0.536920726299





NameError: name 'wf' is not defined

In [3]:
# emb
buf = np.genfromtxt('./t2.emb', skip_header=1, dtype=np.float32)
nodes = buf[:,0].astype(np.int32)
emb = buf[:,1:]

node_emb_dict = {}
for i in range(emb.shape[0]):
    node_id = nodes[i]
    x = emb[i,:]
    node_emb_dict[node_id] = x

In [4]:
import torch 
import numpy as np
from tqdm import tqdm
model = torch.load('./best_loss.pt')
model.eval()

edges_unordered = np.genfromtxt('./kaggle/t2-test.txt', dtype=np.int32)
links = np.genfromtxt('./kaggle/t2-train.txt', dtype=np.int32)
training_node_set = set(links.flatten().tolist())
c = 1
with torch.no_grad():
    with open('pred.txt.csv', 'w') as f:
        with open('guess.csv', 'w') as g:
            with tqdm(total=edges_unordered.shape[0]) as pbar:

                f.write('query_id,prediction\n')
                g.write('query_id,prediction\n')
                for i in range(edges_unordered.shape[0]):

                    src, dst = edges_unordered[i, :]
                        
                    q = xml_id_map[dst]
                    k = xml_id_map[src]

                    q_f = node_emb_dict[dst]
                    k_f = node_emb_dict[src]

                    q,k = torch.LongTensor(q), torch.LongTensor(k)
                    q_f,k_f = torch.FloatTensor(q_f), torch.FloatTensor(k_f)

                    output = model(q.cuda(), k.cuda(), q_f.cuda(), k_f.cuda()).flatten().item()

                    out = 1 if output >= 0.5 else 0
                    f.write('%d,%d\n' % (1 + i, out))
                    if src not in training_node_set and dst not in training_node_set:
                        g.write('%d,%d\n' % (1 + i, 0))
                    else:
                        g.write('%d,%d\n' % (1 + i, out))
                    pbar.update(1)
print c
print 'done'



100%|██████████| 76947/76947 [43:43<00:00, 29.33it/s] 

1
done





## to libsvm type

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import xavier_normal_
from constants import D_MODEL, STACKED_NUM,DK, DV, H, P_DROP, D_FF, MAX_SEQUENCE_LENGTH, MAX_NUM_WORDS, EMBEDDING_DIM
class Transformer2(nn.Module):

    def __init__(self, layer_num, dk, dv, dm, h, emb_matrix):
        super(Transformer2, self).__init__()
        
        self.emb = Word_Embedding(emb_matrix)
        
        self.emb_drop = nn.Dropout(P_DROP)
        
        self.encoder = Stack_Encoder(layer_num, dk, dv, dm, h)
        self.decoder = Stack_Decoder(layer_num, dk, dv, dm, h)
        self.summary_decoder = Stack_Decoder(2, dk, dv, dm, h)
        
        self.summary_weight = nn.Parameter(torch.FloatTensor(1, dm))
        torch.nn.init.xavier_uniform_(self.summary_weight)
        
        self.output_linear = nn.Linear(3*dm, 1)

    def forward(self, Q, K, Q_fea, K_fea):
        
#         encoder
        K = self.emb(K)
#         print(K.size(), get_pos_mat(MAX_SEQUENCE_LENGTH).size())
        K = K + get_pos_mat(MAX_SEQUENCE_LENGTH)
        K = self.emb_drop(K)
        
        en_out = self.encoder(K)
        
#         decoder
        Q = self.emb(Q)
        seq_len, d = Q.size()
        
        Q = Q + get_pos_mat(MAX_SEQUENCE_LENGTH)
        Q = self.emb_drop(Q)
        
        de_out = self.decoder(Q, en_out)
        
        
        summary = self.summary_decoder(self.summary_weight, de_out)
        x = torch.cat([summary, Q_fea.view([1,-1]), K_fea.view([1,-1])], dim=-1)
        out = self.output_linear(x)
        out = torch.sigmoid(out)
        

        return out,x

print ' transformer2 done'

 transformer2 done


In [23]:
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm

def negative_bootsrap_generator(adj_mat, links, idx_map, xml_id_map, training_node_list, node_emb_dict, neighbor_link_rate=0.8):
    
    
    exist_node_list = xml_id_map.keys()
    exist_N = len(training_node_list)
    N = adj_mat.shape[0]
    
#     adj mat
    links = np.array(list(map(idx_map.get, links.flatten())),
                     dtype=np.int32).reshape(links.shape)
    
    adj_sp = sp.coo_matrix((np.ones(links.shape[0]), (links[:, 0], links[:, 1])),
                        shape=(N, N),
                        dtype=np.uint8)
    adj_sp_2 = (sp.coo_matrix.dot(adj_sp,adj_sp) + adj_sp).tocoo()
    
    rev_map = {v:k for k,v in idx_map.items()}
    adj_map = {i:[] for i in range(N)}
    with tqdm(total=len(adj_sp_2.row)) as pbar:
        for i,j,v in zip(adj_sp_2.row, adj_sp_2.col, adj_sp_2.data):
            if adj_mat[i, j] != 1 and v == 1:
                adj_map[i].append(j)
            pbar.update(1)
#             print i,N
                
    while True:
        src = training_node_list[np.random.randint(exist_N)]
        
#         choose neighbor link
        if np.random.rand(1) <= neighbor_link_rate:
        
            i = idx_map[src]
            high = len(adj_map[i])
            while high == 0:
                src = training_node_list[np.random.randint(exist_N)]
                i = idx_map[src]
                high = len(adj_map[i])
                
            idx = np.random.randint(high)
            dst = adj_map[i][idx]
            dst = rev_map[dst]
        else:
            dst = training_node_list[np.random.randint(exist_N)]
            while adj_mat[idx_map[src], idx_map[dst]] == 1:
                dst = training_node_list[np.random.randint(exist_N)]
        Q = xml_id_map[dst]
        K = xml_id_map[src]
        Q_fea = node_emb_dict[dst]
        K_fea = node_emb_dict[src]
        yield Q, K, Q_fea, K_fea

def val_data(edges, xml_id_map):
    Q, K = [],[]
    Q_f, K_f = [],[]
    
    for idx in range(edges.shape[0]):
        src, dst = edges[idx, :]
        q = xml_id_map[dst]
        k = xml_id_map[src]
        q_fea = node_emb_dict[dst]
        k_fea = node_emb_dict[src]
        
        Q.append(q)
        K.append(k)
        Q_f.append(q_fea)
        K_f.append(k_fea)
        
    Q = np.vstack(Q)
    K = np.vstack(K)
    Q_fea = np.vstack(Q_f)
    K_fea = np.vstack(K_f)
    
    return Q, K, Q_fea, K_fea




links = np.genfromtxt(join(data_path,'t2-fake.txt'), dtype=np.int32)
idx_map = {node:idx for idx, node in enumerate(list(set(links.flatten().tolist())))}
N = links.shape[0]
adj_mat = np.zeros([N,N], dtype=np.uint8)
for i in range(links.shape[0]):
    src, dst = links[i].tolist()
    adj_mat[idx_map[src], idx_map[dst]] = 1


idx = np.random.permutation(N)
train_idx = idx[N//10:]
val_idx = idx[:N//10]

training_node_list = list(set(links.flatten().tolist()))
neg_G = negative_bootsrap_generator(adj_mat, links, idx_map, xml_id_map, training_node_list, node_emb_dict)
pos_Q, pos_K, pos_Q_fea, pos_K_fea = val_data(links[train_idx,:], xml_id_map)
val_Q, val_K, val_Q_fea, val_K_fea = val_data(links[val_idx,:], xml_id_map)

q,k,q_f,k_f = next(neg_G)
print(q.shape,k.shape, q_f.shape, k_f.shape)
print(val_Q.shape,val_K.shape, val_Q_fea.shape, val_K_fea.shape)
print(pos_Q.shape,pos_K.shape, pos_Q_fea.shape, pos_K_fea.shape)


100%|██████████| 2003515/2003515 [00:06<00:00, 294116.57it/s]

((150,), (150,), (128,), (128,))
((16161, 150), (16161, 150), (16161, 128), (16161, 128))
((145458, 150), (145458, 150), (145458, 128), (145458, 128))





In [8]:
import numpy as np
from os.path import join
import numpy as np

# emb
buf = np.genfromtxt('./t2.emb', skip_header=1, dtype=np.float32)
nodes = buf[:,0].astype(np.int32)
emb = buf[:,1:]

node_emb_dict = {}
for i in range(emb.shape[0]):
    node_id = nodes[i]
    x = emb[i,:]
    node_emb_dict[node_id] = x
#     
    
    
links = np.genfromtxt('./kaggle/t2-test.txt', dtype=np.int32)
with open('pred.txt.csv', 'w') as f:
    f.write('query_id,prediction\n')
    for i in range(links.shape[0]):

        a = node_emb_dict[src]
        b = node_emb_dict[dst]
        d = cos_d(a, b)
        if d >= 0.242:
            f.write('%d,1\n' % (1 + i))
        else:
            f.write('%d,0\n' % (1 + i))
    
print 'done', i


done 76946


In [6]:
import numpy as np
from os.path import join
import numpy as np

def get_node_set(path):
    # training data
    edges_unordered = np.genfromtxt(path,
                                    dtype=np.int32)
    id_set = set(edges_unordered.flatten().tolist())
    return id_set

data_path = join('./','kaggle')
# emb
buf = np.genfromtxt('./t2.emb', skip_header=1, dtype=np.float32)
nodes = buf[:,0].astype(np.int32)
emb = buf[:,1:]

node_emb_dict = {}
for i in range(emb.shape[0]):
    node_id = nodes[i]
    x = emb[i,:]
    node_emb_dict[node_id] = x
    
# training data
idx_map = {k:i for i,k in enumerate(nodes.tolist())}
links = np.genfromtxt('./kaggle/t2-fake.txt', dtype=np.int32)

N = len(idx_map)
X = []
adj_mat = np.zeros([N,N], dtype=np.uint8)
for i in range(links.shape[0]):
    
    src, dst = links[i].tolist()
    adj_mat[idx_map[src], idx_map[dst]] = 1
    fea = np.concatenate([node_emb_dict[src], node_emb_dict[dst]], axis=-1)
    X.append(fea)
X = np.vstack(X)
    

batch_size = 128
def naive_bootsrap_generator(X, adj_mat, idx_map, node_emb_dict, train_node_set, batch_size=128, neg_rate=1. ):
    train_node_list = list(train_node_set)
    train_N = len(train_node_list)
    num_edge = X.shape[0]
        
    while True:
        idx = np.random.choice(num_edge, batch_size)
        pos_X = X[idx, :]
        
        neg_count = int(batch_size*neg_rate)
        neg_idx = np.random.randint(train_N, size=[neg_count, 2])
        neg_X = []
        for i in range(neg_count):
            src, dst = neg_idx[i]
            src = train_node_list[src]
            dst = train_node_list[dst]
            if src != dst and adj_mat[idx_map[src], idx_map[dst]] == 0:
                fea = np.concatenate([node_emb_dict[src], node_emb_dict[dst]], axis=-1)
                neg_X.append(fea)
        neg_X = np.vstack(neg_X)

        ret_X = np.vstack([pos_X, neg_X])
        ret_Y = np.zeros([ret_X.shape[0], 1])
        ret_Y[:batch_size, 0] = 1
        yield ret_X, ret_Y

N = X.shape[0]
idx = np.random.permutation(N)
train_idx = idx[N//10:]
val_idx = idx[:N//10]

train_X = X[train_idx,:]
val_X = X[val_idx,:]

train_node_set = get_node_set('./kaggle/t2-train.txt')
G = naive_bootsrap_generator(train_X, adj_mat, idx_map, node_emb_dict, train_node_set, batch_size=batch_size)
val_G = naive_bootsrap_generator(val_X, adj_mat, idx_map, node_emb_dict, train_node_set,batch_size=batch_size, neg_rate=0.1)
x,y = next(G)
# print x.shape,y.shape
x,y = next(val_G)
# print x.shape,y.shape

N = X.shape[0]
print N
xx = []
yy = []

for i in range((N//2) // batch_size +1):
    x,y = next(G)
#     x = x[batch_size:,:]
#     y = y[batch_size:,:]
    yy.append(y)
XX = np.vstack(xx)
YY = np.vstack(yy)


# XX = np.vstack([X,XX])
# YY = np.vstack([np.ones([X.shape[0],1]),YY])
print XX.shape, YY.shape

    
def cos_d(x,y):
    assert x.shape==y.shape==(128,)
    return np.dot(x,y)/(np.linalg.norm(x, ord=2) * np.linalg.norm(y, ord=2))

def foo(X, Y):
    N,dim = X.shape
    out = []
    
    for i in range(N):
        src, dst = X[i,:dim//2], X[i,dim//2:]
        d = cos_d(src, dst)
        out.append(d)
    out = np.array(out)
    label = Y.flatten().astype(np.bool)
    best_acc = best_threshold = 0
    for threshold in np.linspace(0,1,100):
        buf = out >= threshold
        acc = np.mean(buf == label)
#         print '%.2f%%' % acc, threshold
    
        if acc > best_acc:
            best_acc = acc
            best_threshold = threshold
    return best_acc, best_threshold, out
acc, threshold, out = foo(XX,YY)
print '%.3f%%, %.3f' % (acc, threshold)

# XX = np.vstack([X,XX])
# YY = np.vstack([np.ones([X.shape[0],1]),YY])
acc, threshold, out2 = foo(XX,YY)
buf = np.sort(out2)
l = len(buf)
print buf[l//4]



161619
(161715, 256) (161715, 1)
0.923%, 0.242
0.11969484


In [4]:
import numpy as np
from os.path import join
with open(join('./','t2_fake.emb')) as f:
    num_nodes, D = f.readline().strip().split(' ')
    num_nodes = int(num_nodes)
    D = int(D)
    
    ls = f.readlines()
node_emb_dict = {}
for l in ls:
    buf = l.strip().split(' ')
    node_id, emb = int(buf[0]), buf[1:]
    x = np.asarray([float(i) for i in emb], dtype=np.float32)
    node_emb_dict[node_id] = x

In [11]:
print embedding_matrix.shape

(10001, 300)


In [34]:
print q.type()

torch.LongTensor


In [37]:
import torch 
import numpy as np
from tqdm import tqdm
from constants import D_MODEL, STACKED_NUM,DK, DV, H, P_DROP, D_FF, MAX_SEQUENCE_LENGTH, MAX_NUM_WORDS, EMBEDDING_DIM
with_gpu = torch.cuda.is_available()
# with_gpu = False
device = torch.device("cuda:0")


tmp = torch.load('./best_loss.pt')
model = Transformer2(STACKED_NUM, DK, DV, D_MODEL, H, emb_matrix=np.zeros([10001, 300])).to(device)
model.load_state_dict(tmp.state_dict())
model.to(device)
model.eval()

edges_unordered = np.genfromtxt('./kaggle/t2-test.txt', dtype=np.int32)
# links = np.genfromtxt('./kaggle/t2-train.txt', dtype=np.int32)
# training_node_set = set(links.flatten().tolist())
mid_X = []
val_mid_X = []
weight = []
neg_rate = 1
total_N = pos_Q.shape[0] + int(pos_Q.shape[0]*neg_rate) + 2*val_Q.shape[0]
with torch.no_grad():
    for i in range(edges_unordered.shape[0]):
        with tqdm(total=total_N) as pbar:
    #         positive train
            for i in range(pos_Q.shape[0]):
                q,k = pos_Q[i,:], val_K[i,:]
                q_f,k_f = pos_Q_fea[i,:], pos_K_fea[i,:]
                q,k = torch.LongTensor(q), torch.LongTensor(k)
                q_f,k_f = torch.FloatTensor(q_f), torch.FloatTensor(k_f)

                output, mid = model(q.to(device), k.to(device), q_f.to(device), k_f.to(device))
#                 output, mid = model(q, k, q_f, k_f)
                mid_X.append(mid)
                weight.append(1)
                pbar.update(1)
    #         negative train
            for i in range(pos_Q.shape[0]*neg_rate):
                q,k,q_f,k_f = next(neg_G)
                q,k = torch.LongTensor(q), torch.LongTensor(k)
                q_f,k_f = torch.FloatTensor(q_f), torch.FloatTensor(k_f)

                output, mid = model(q.to(device), k.to(device), q_f.to(device), k_f.to(device))
                mid_X.append(mid)
                weight.append(1./neg_rate)
                pbar.update(1)
    #         positive val
            for val_i in range(val_Q.shape[0]):
                q,k = val_Q[val_i,:], val_K[val_i,:]
                q_f,k_f = val_Q_fea[val_i,:], val_K_fea[val_i,:]
                q,k = torch.LongTensor(q), torch.LongTensor(k)
                q_f,k_f = torch.FloatTensor(q_f), torch.FloatTensor(k_f)

                output, mid = model(q.to(device), k.to(device), q_f.to(device), k_f.to(device))
                val_mid_X.append(mid)
                pbar.update(1)

    #         negative val
            for val_i in range(val_Q.shape[0]):
                q,k,q_f,k_f = next(neg_G)
                q,k = torch.LongTensor(q), torch.LongTensor(k)
                q_f,k_f = torch.FloatTensor(q_f), torch.FloatTensor(k_f)

                output, mid = model(q.to(device), k.to(device), q_f.to(device), k_f.to(device))
                val_mid_X.append(mid)
                pbar.update(1)



  0%|          | 135/323238 [00:09<6:02:09, 14.87it/s]


KeyboardInterrupt: 

In [10]:
import keras
from os.path import join
import os
from bs4 import BeautifulSoup as BS
from constants import MAX_SEQUENCE_LENGTH, MAX_NUM_WORDS, EMBEDDING_DIM
from keras.preprocessing.sequence import pad_sequences
import numpy as np
np.random.seed(1337)
def quote_title_abstract(xml_path):
    with open(xml_path, 'r') as f:
        data = f.read()
    soup = BS(data)
    title, abstract = soup.find('title').text, soup.find('abstract').text
    return title.strip(), abstract.strip()

# text preprocessing
data_path = join('./','kaggle/')
xml_dir = join(data_path, 't2-doc')
xml_list = [f for f in os.listdir(xml_dir) if f.endswith('.xml')]
# print(len(xml_list))


texts = []

for xml in xml_list:
    path = join(xml_dir,xml)
    title, abstract = quote_title_abstract(path)
    text = title + '' + abstract
    texts.append(text)
#     texts.append(title)
#     texts.append(abstract)
print('read all %d xml files.' % len(xml_list))
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ ',
                                   lower=True, split=' ', char_level=False, oov_token=None)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
xml_id_map = {}
for i,xml in enumerate(xml_list):
    node_id = int(xml.replace('.xml',''))
    xml_id_map[node_id] = data[i,:]


print('Preparing embedding matrix.')
embeddings_index = {}
# with open(os.path.join('./','glove', 'glove.6B.%dd.txt' % EMBEDDING_DIM), 'r', encoding='utf8') as f:
with open(os.path.join('./','glove', 'glove.6B.%dd.txt' % EMBEDDING_DIM), 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print('done')


read all 17500 xml files.
Found 82709 unique tokens.
Preparing embedding matrix.
done


In [39]:
import torch 
import numpy as np
from tqdm import tqdm
from constants import D_MODEL, STACKED_NUM,DK, DV, H, P_DROP, D_FF, MAX_SEQUENCE_LENGTH, MAX_NUM_WORDS, EMBEDDING_DIM
with_gpu = torch.cuda.is_available()
# with_gpu = False
device = torch.device("cuda:0")


tmp = torch.load('./best_loss_2.pt')
model = Transformer2(STACKED_NUM, DK, DV, D_MODEL, H, emb_matrix=np.zeros([10001, 300])).to(device)
model.load_state_dict(tmp.state_dict())
model.to(device)
model.eval()

edges_unordered = np.genfromtxt('./kaggle/t2-test.txt', dtype=np.int32)
# links = np.genfromtxt('./kaggle/t2-train.txt', dtype=np.int32)
# training_node_set = set(links.flatten().tolist())
test_mid_X = []
test_Y = []
with torch.no_grad():
#     with tqdm(total=total_N) as pbar:
    with tqdm(total=edges_unordered.shape[0]) as pbar:
        for i in range(edges_unordered.shape[0]):

            src, dst = edges_unordered[i, :]

            q = xml_id_map[dst]
            k = xml_id_map[src]

            q_f = node_emb_dict[dst]
            k_f = node_emb_dict[src]

            q,k = torch.LongTensor(q), torch.LongTensor(k)
            q_f,k_f = torch.FloatTensor(q_f), torch.FloatTensor(k_f)

            output,mid = model(q.cuda(), k.cuda(), q_f.cuda(), k_f.cuda())

            out = 1 if output.flatten().item() >= 0.5 else 0
            test_mid_X.append(mid)
            test_Y.append(out)
            pbar.update(1)

        
test_mid = np.vstack(test_mid_X)
test_out = np.array(test_Y)
with open('./test_mid', 'wb') as f:
    np.save(f,test_mid)
with open('./test_out', 'wb') as f:
    np.save(f,test_out)
print 'write done'
    





100%|██████████| 76947/76947 [1:30:33<00:00, 14.16it/s]


write done
