# generate fake link

In [3]:
# randomly sample test link
import numpy as np
from tqdm import tqdm
from os.path import join
def get_node_set(path):
    # training data
    edges_unordered = np.genfromtxt(path,
                                    dtype=np.int32)
    id_set = set(edges_unordered.flatten().tolist())
    return id_set

data_path = join('./','kaggle')
# training data
train_node_set = get_node_set(join(data_path,'t2-train.txt'))
test_node_set = get_node_set(join(data_path,'t2-test.txt'))
node_set = set.union(train_node_set, test_node_set)
idx_map = {k:i for i,k in enumerate(list(node_set))}
N = len(node_set)
adj_mat = np.zeros([N,N], dtype=np.uint8)

links = np.genfromtxt(join(data_path,'t2-train.txt'), dtype=np.int32)
for i in range(links.shape[0]):
    src, dst = links[i].tolist()
    adj_mat[idx_map[src], idx_map[dst]] = 1

out_degree = np.sum(adj_mat, axis=1).flatten()

means, std = np.mean(out_degree), np.std(out_degree)

rev_map = {v:k for k,v in idx_map.items()}
total_link_num = links.shape[0] + int(np.sum(out_degree))
with tqdm(total=total_link_num) as pbar:
    with open(join(data_path,'t2-fake.txt'), 'w') as f:
        for i in range(links.shape[0]):
            src, dst = links[i].tolist()
            s = '%d %d\n' % (src, dst)
            f.write(s)
            pbar.update(1)
        train_node_list = list(train_node_set)
        for node_id in list(test_node_set):
            i = idx_map[node_id]
            d = int(np.round(np.random.normal(means, std)))
            d = max(1, d)
            
            for j in range(d):
                idx = np.random.randint(len(train_node_list))
                dst = idx_map[train_node_list[idx]]
                while adj_mat[i, dst] == 1 or dst == i:
                    idx = np.random.randint(len(train_node_list))
                    dst = idx_map[train_node_list[idx]]
                
                adj_mat[i, dst] = 1
                s = '%d %d\n' % (rev_map[i], rev_map[dst])
                f.write(s)
            
                pbar.update(1)
    
print 'done', np.sum(adj_mat)
    

 93%|█████████▎| 161619/173364 [00:00<00:00, 262344.67it/s]


done 161619


In [1]:
import keras
from os.path import join
import os
from bs4 import BeautifulSoup as BS
from constants import MAX_SEQUENCE_LENGTH, MAX_NUM_WORDS, EMBEDDING_DIM
from keras.preprocessing.sequence import pad_sequences
import numpy as np

def quote_title_abstract(xml_path):
    with open(xml_path, 'r') as f:
        data = f.read()
    soup = BS(data)
    title, abstract = soup.find('title').text, soup.find('abstract').text
    return title.strip(), abstract.strip()

# text preprocessing
data_path = join('./','kaggle/')
xml_dir = join(data_path, 't2-doc')
xml_list = [f for f in os.listdir(xml_dir) if f.endswith('.xml')]
# print(len(xml_list))


texts = []

for xml in xml_list:
    path = join(xml_dir,xml)
    title, abstract = quote_title_abstract(path)
    text = title + '' + abstract
    texts.append(text)
#     texts.append(title)
#     texts.append(abstract)
print('read all %d xml files.' % len(xml_list))
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ ',
                                   lower=True, split=' ', char_level=False, oov_token=None)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
xml_id_map = {}
for i,xml in enumerate(xml_list):
    node_id = int(xml.replace('.xml',''))
    xml_id_map[node_id] = data[i,:]


print('Preparing embedding matrix.')
embeddings_index = {}
# with open(os.path.join('./','glove', 'glove.6B.%dd.txt' % EMBEDDING_DIM), 'r', encoding='utf8') as f:
with open(os.path.join('./','glove', 'glove.6B.%dd.txt' % EMBEDDING_DIM), 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print('done')

Using TensorFlow backend.


read all 17500 xml files.
Found 82709 unique tokens.
Preparing embedding matrix.
done


In [2]:
# emb
buf = np.genfromtxt('./t2.emb', skip_header=1, dtype=np.float32)
nodes = buf[:,0].astype(np.int32)
emb = buf[:,1:]

node_emb_dict = {}
for i in range(emb.shape[0]):
    node_id = nodes[i]
    x = emb[i,:]
    node_emb_dict[node_id] = x

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import xavier_normal_
from constants import D_MODEL, STACKED_NUM,DK, DV, H, P_DROP, D_FF, MAX_SEQUENCE_LENGTH, MAX_NUM_WORDS, EMBEDDING_DIM
# environment
with_gpu = torch.cuda.is_available()
# with_gpu = False
device = torch.device("cuda:0" if with_gpu else "cpu")

def positional_encoding(pos):
    assert D_MODEL % 2 == 0
    pos = torch.tensor(pos, dtype=torch.float32, requires_grad=False)
    pe = torch.zeros([1,D_MODEL], dtype=torch.float32, requires_grad=False)
    for i in range(D_MODEL//2):
        a = torch.tensor(10000, dtype=torch.float32, requires_grad=False)
        b = torch.tensor(2.*i/float(D_MODEL), dtype=torch.float32, requires_grad=False)
        c = pos / torch.pow(a, b)
        pe[0, 2*i] = torch.sin(c)
        pe[0, 2*i+1] = torch.cos(c)
    return pe
def get_pos_mat(length):
    if length > MAX_SEQUENCE_LENGTH:
        print('sequence length reach PE_MAT_CACHE. %d ' % length)
        ret = torch.cat([positional_encoding(i) for i in range(length)], dim=0).to(device)
        ret.requires_grad = False
        global PE_CACHE_MATRIX
        PE_CACHE_MATRIX = ret
        return ret
    else:
        return PE_CACHE_MATRIX[:length]
    
PE_CACHE_MATRIX = torch.cat([positional_encoding(i) for i in range(0,MAX_SEQUENCE_LENGTH)], dim=0).to(device)
PE_CACHE_MATRIX.requires_grad = False

# construct neuron network

def scaled_dot_attention(Q, K, V, mask=None):
    assert Q.size()[-1] == K.size()[-1]
    dk = torch.tensor(K.size()[-1], dtype=torch.float32, requires_grad=False).to(device)
    out = torch.matmul(Q,K.t()) / torch.sqrt(dk) 
    if mask is not None:
        out = out.masked_fill_(mask, -float('inf'))
        
    return torch.matmul(F.softmax(out, dim=-1), V)
                            
class Transformer(nn.Module):

    def __init__(self, layer_num, dk, dv, dm, h, emb_matrix):
        super(Transformer, self).__init__()
        
        self.emb = Word_Embedding(emb_matrix)
        
        self.emb_drop = nn.Dropout(P_DROP)
        
        self.encoder = Stack_Encoder(layer_num, dk, dv, dm, h)
        self.decoder = Stack_Decoder(layer_num, dk, dv, dm, h)
        
        self.summary_weight = nn.Parameter(torch.FloatTensor(1, dm))
        torch.nn.init.xavier_uniform_(self.summary_weight)
        
        self.output_linear = nn.Linear(3*dm, 1)

    def forward(self, Q, K, Q_fea, K_fea):
        
#         encoder
        K = self.emb(K)
#         print(K.size(), get_pos_mat(MAX_SEQUENCE_LENGTH).size())
        K = K + get_pos_mat(MAX_SEQUENCE_LENGTH)
        K = self.emb_drop(K)
        
        en_out = self.encoder(K)
        
#         decoder
        Q = self.emb(Q)
        seq_len, d = Q.size()
        
        Q = Q + get_pos_mat(MAX_SEQUENCE_LENGTH)
        Q = self.emb_drop(Q)
        
        de_out = self.decoder(Q, en_out)
        
        
        summary = scaled_dot_attention(self.summary_weight, de_out, de_out)
        x = torch.cat([summary, Q_fea.view([1,-1]), K_fea.view([1,-1])], dim=-1)
        out = self.output_linear(x)
        out = torch.sigmoid(out)
        

        return out

class Word_Embedding(nn.Module):
    def __init__(self, emb_matrix):
        super(Word_Embedding, self).__init__()
        self.emb = nn.Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, padding_idx=0)
        self.emb.weight = nn.parameter.Parameter(torch.FloatTensor(emb_matrix))
        self.emb.weight.requires_grad_(False)
        
        self.linear = nn.Linear(EMBEDDING_DIM, D_MODEL, bias=False)
        


    def forward(self, x):
        x = self.emb(x)
        x = self.linear(x)
        return x
    
class Stack_Encoder(nn.Module):
    """
    Stacked Encoder
    """
    def __init__(self, layer_num, dk, dv, dm, h):
        super(Stack_Encoder, self).__init__()
        self.encoders = nn.ModuleList([Encoder(dk, dv, dm, h) for i in range(layer_num)])

    def forward(self, K):
        # ModuleList can act as an iterable, or be indexed using ints
        for lay in self.encoders:
            K = lay(K)
        return K               
class Encoder(nn.Module):
    def __init__(self, dk, dv, dm, h):
        super(Encoder, self).__init__()
#         attention residual block
        self.multi_head_attention_layer = Multi_Head_attention_layer(dk, dv, dm, h)
        self.attention_norm_lay = nn.LayerNorm([dm,])
        self.att_drop = nn.Dropout(P_DROP)
#         feed forward residual block
        self.fcn = PositionwiseFeedForward(D_MODEL, D_FF)
        self.linear_drop = nn.Dropout(P_DROP)
        self.ff_norm_lay = nn.LayerNorm([dm, ])
        

    def forward(self, K):
#         attention
        attention_out = self.multi_head_attention_layer(K, K, K)
        attention_out = self.att_drop(attention_out)
        att_out = self.attention_norm_lay(K + attention_out)
#         feed forward
        linear_out = self.fcn(att_out)
        linear_out = self.linear_drop(linear_out)
        out = self.ff_norm_lay(att_out + linear_out)
        out = att_out + linear_out
    
        return out
class Stack_Decoder(nn.Module):
    """
    Stacked Encoder
    """
    def __init__(self, layer_num, dk, dv, dm, h):
        super(Stack_Decoder, self).__init__()
        self.decoders = nn.ModuleList([Decoder(dk, dv, dm, h) for i in range(layer_num)])
        
        
    def forward(self, Q, encoder_out):
        # ModuleList can act as an iterable, or be indexed using ints
        Q_len, d = Q.size()
        for lay in self.decoders:
            Q = lay(Q, encoder_out, mask=None)
        return Q           

class Decoder(nn.Module):
    def __init__(self, dk, dv, dm, h):
        super(Decoder, self).__init__()
#         query attention residual block
        self.Q_attention_lay = Multi_Head_attention_layer(dk, dv, dm, h)
        self.Q_attention_norm_lay = nn.LayerNorm([dm, ])
        self.Q_att_drop = nn.Dropout(P_DROP)
    
#         query key attention residual block
        self.QK_attention_lay = Multi_Head_attention_layer(dk, dv, dm, h)
        self.QK_attention_norm_lay = nn.LayerNorm([dm, ])
        self.QK_att_drop = nn.Dropout(P_DROP)
        
    
#         feed forward residual block
        self.fcn = PositionwiseFeedForward(D_MODEL, D_FF)
        self.ff_norm_lay = nn.LayerNorm([dm, ])
        self.linear_drop = nn.Dropout(P_DROP)
        

    def forward(self, Q, encoder_out, mask):
#         query attention
        Q_attention_out = self.Q_attention_lay(Q, Q, Q, mask)
        Q_attention_out = self.Q_att_drop(Q_attention_out)
        Q_att_out = self.Q_attention_norm_lay(Q + Q_attention_out)
#         query key attention
        QK_attention_out = self.QK_attention_lay(Q_att_out, encoder_out, encoder_out)
        QK_attention_out = self.QK_att_drop(QK_attention_out)
        QK_att_out = self.QK_attention_norm_lay(Q_att_out + QK_attention_out)
        
#         feed forward
        linear_out = self.fcn(QK_att_out)
        out = self.ff_norm_lay(QK_att_out + linear_out)
        return out

class Multi_Head_attention_layer(nn.Module):
    def __init__(self, dk, dv, dm, h):
        super(Multi_Head_attention_layer, self).__init__()
        self.Q_linears = nn.ModuleList([nn.Linear(dm, dk) for i in range(h)])
        self.K_linears = nn.ModuleList([nn.Linear(dm, dk) for i in range(h)])
        self.V_linears = nn.ModuleList([nn.Linear(dm, dv) for i in range(h)])
        self.output_linear = nn.Linear(h*dv, dm)
                            

    def forward(self, Q_input, K_input, V_input, mask=None):
        buf = []
        for Q_linear, K_linear, V_linear in zip(self.Q_linears, self.K_linears, self.V_linears):
            Q = Q_linear(Q_input)
            K = K_linear(K_input)
            V = V_linear(V_input)
            buf.append(scaled_dot_attention(Q, K, V, mask))
            
        buf = torch.cat(buf,dim=-1)
        out = self.output_linear(buf)
        
        return out      
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionwiseFeedForward, self).__init__()
        self.cnn1 = nn.Conv1d(d_model, d_ff, 1)
        self.cnn2 = nn.Conv1d(d_ff, d_model, 1)
                            

    def forward(self, x):
        seq_len,_ = x.size()
        x = x.unsqueeze(0)
        x = x.permute(0,2,1)
        x = self.cnn1(x)
        x = F.relu(x)
        x = self.cnn2(x)
        x = x.permute(0,2,1)
        x = x.squeeze(0)
        
        return x      
    
# encoder = Stack_Encoder(6, 64,64,20,8)
# # print net
Q = torch.randint(10000,[MAX_SEQUENCE_LENGTH,], dtype=torch.long).to(device)
V = torch.randint(10000,[MAX_SEQUENCE_LENGTH,], dtype=torch.long).to(device)
Q_fea = torch.rand([D_MODEL,]).to(device)
K_fea = torch.rand([D_MODEL,]).to(device)
net = Transformer(STACKED_NUM, DK, DV, D_MODEL, H, embedding_matrix).to(device)
print(Q.dtype)
o = net(Q, V, Q_fea, K_fea)
# print t
print(o.size())
# print o
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(count_parameters(net))

torch.int64
torch.Size([1, 1])
4263425


In [4]:
# tmp_m = torch.load('./best_acc.pt')
# tmp_m
# net.load_state_dict(tmp_m.state_dict())
# torch.nn.init.xavier_uniform_(net.output_linear.weight)
# print 'load weight done'

In [5]:




links = np.genfromtxt(join(data_path,'t2-fake.txt'), dtype=np.int32)
idx_map = {node:idx for idx, node in enumerate(list(set(links.flatten().tolist())))}
N = links.shape[0]
adj_mat = np.zeros([N,N], dtype=np.uint8)
for i in range(links.shape[0]):
    src, dst = links[i].tolist()
    adj_mat[idx_map[src], idx_map[dst]] = 1




In [61]:
import scipy.sparse as sp
# adj_sp = sp.coo_matrix((np.ones(links.shape[0]), (links[:, 0], links[:, 1])),
#                         shape=(N, N),
#                         dtype=np.int8)
# adj_csr = adj_sp.tocsr() 
# r = adj_csr.multiply(adj_csr)
l = np.asarray([[0,1],[1,2],[1,3],[1,4],[3,4],[3,5],[5,2]])

adj_sp = sp.coo_matrix((np.ones(l.shape[0]), (l[:, 0], l[:, 1])),
                        shape=(6, 6),
                        dtype=np.int8)
adj_csr = adj_sp.tocsr() 
# r = sp.csr_matrix.dot(adj_csr,adj_csr)
r = sp.coo_matrix.dot(adj_sp,adj_sp)
print adj_sp.todense() 
print r.todense()
print r.tocoo().row
print r.tocoo().col

r2 = adj_sp+r > 0
print r2
print r2.astype(np.uint8).todense()
print adj_sp.row
print adj_sp

[[0 1 0 0 0 0]
 [0 0 1 1 1 0]
 [0 0 0 0 0 0]
 [0 0 0 0 1 1]
 [0 0 0 0 0 0]
 [0 0 1 0 0 0]]
[[0 0 1 1 1 0]
 [0 0 0 0 1 1]
 [0 0 0 0 0 0]
 [0 0 1 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]
[0 0 0 1 1 3]
[4 3 2 5 4 2]
  (0, 1)	True
  (0, 2)	True
  (0, 3)	True
  (0, 4)	True
  (1, 2)	True
  (1, 3)	True
  (1, 4)	True
  (1, 5)	True
  (3, 2)	True
  (3, 4)	True
  (3, 5)	True
  (5, 2)	True
[[0 1 1 1 1 0]
 [0 0 1 1 1 1]
 [0 0 0 0 0 0]
 [0 0 1 0 1 1]
 [0 0 0 0 0 0]
 [0 0 1 0 0 0]]
[0 1 1 1 3 3 5]
  (0, 1)	1
  (1, 2)	1
  (1, 3)	1
  (1, 4)	1
  (3, 4)	1
  (3, 5)	1
  (5, 2)	1


In [71]:
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm

def adj_iter(adj_mat):
    
    N = adj_mat.shape[0]
    ret = adj_mat.copy()
    with tqdm(total=N*N) as pbar:
        for i in range(N):
            for j in range(N):
                if adj[i, j] == 1:
                    for k in range(N):
                        if adj[j, k] == 1:
                            ret[i, k] = 1
                pbar.update(1)
def positive_bootsrap_generator(edges, xml_id_map, node_emb_dict):
    num_edge = len(edges)
        
    while True:
        for idx in np.random.permutation(num_edge):
            src, dst = edges[idx, :]
            Q = xml_id_map[dst]
            K = xml_id_map[src]
            Q_fea = node_emb_dict[dst]
            K_fea = node_emb_dict[src]
            yield Q, K, Q_fea, K_fea
def negative_bootsrap_generator(adj_mat, links, idx_map, xml_id_map, training_node_list, node_emb_dict, neighbor_link_rate=0.8):
    
    
    exist_node_list = xml_id_map.keys()
    exist_N = len(training_node_list)
    N = adj_mat.shape[0]
    
#     adj mat
    links = np.array(list(map(idx_map.get, links.flatten())),
                     dtype=np.int32).reshape(links.shape)
    
    adj_sp = sp.coo_matrix((np.ones(links.shape[0]), (links[:, 0], links[:, 1])),
                        shape=(N, N),
                        dtype=np.uint8)
    adj_sp_2 = (sp.coo_matrix.dot(adj_sp,adj_sp) + adj_sp).tocoo()
    
    rev_map = {v:k for k,v in idx_map.items()}
    adj_map = {i:[] for i in range(N)}
    with tqdm(total=len(adj_sp_2.row)) as pbar:
        for i,j,v in zip(adj_sp_2.row, adj_sp_2.col, adj_sp_2.data):
            if adj_mat[i, j] != 1 and v == 1:
                adj_map[i].append(j)
            pbar.update(1)
#             print i,N
                
    while True:
        src = training_node_list[np.random.randint(exist_N)]
        
#         choose neighbor link
        if np.random.rand(1) <= neighbor_link_rate:
        
            i = idx_map[src]
            high = len(adj_map[i])
            while high == 0:
                src = training_node_list[np.random.randint(exist_N)]
                i = idx_map[src]
                high = len(adj_map[i])
                
            idx = np.random.randint(high)
            dst = adj_map[i][idx]
            dst = rev_map[dst]
        else:
            dst = training_node_list[np.random.randint(exist_N)]
            while adj_mat[idx_map[src], idx_map[dst]] == 1:
                dst = training_node_list[np.random.randint(exist_N)]
        Q = xml_id_map[dst]
        K = xml_id_map[src]
        Q_fea = node_emb_dict[dst]
        K_fea = node_emb_dict[src]
        yield Q, K, Q_fea, K_fea

def val_data(edges, xml_id_map):
    Q, K = [],[]
    Q_f, K_f = [],[]
    
    for idx in range(edges.shape[0]):
        src, dst = edges[idx, :]
        q = xml_id_map[dst]
        k = xml_id_map[src]
        q_fea = node_emb_dict[dst]
        k_fea = node_emb_dict[src]
        
        Q.append(q)
        K.append(k)
        Q_f.append(q_fea)
        K_f.append(k_fea)
        
    Q = np.vstack(Q)
    K = np.vstack(K)
    Q_fea = np.vstack(Q_f)
    K_fea = np.vstack(K_f)
    
    return Q, K, Q_fea, K_fea
    
N = links.shape[0]
idx = np.random.permutation(N)
train_idx = idx[N//10:]
val_idx = idx[:N//10]

pos_G = positive_bootsrap_generator(links[train_idx,:], xml_id_map, node_emb_dict)
training_node_list = list(set(links[train_idx,:].flatten().tolist()))
neg_G = negative_bootsrap_generator(adj_mat, links, idx_map, xml_id_map, training_node_list, node_emb_dict)
val_Q, val_K, val_Q_fea, val_K_fea = val_data(links[val_idx,:], xml_id_map)
q,k,q_f,k_f = next(pos_G)
print(q.shape,k.shape, q_f.shape, k_f.shape)
q,k,q_f,k_f = next(neg_G)
print(q.shape,k.shape, q_f.shape, k_f.shape)
print(val_Q.shape,val_K.shape, val_Q_fea.shape, val_K_fea.shape)


((150,), (150,), (128,), (128,))


100%|██████████| 2003515/2003515 [00:06<00:00, 299428.19it/s]

((150,), (150,), (128,), (128,))
((16161, 150), (16161, 150), (16161, 128), (16161, 128))





In [None]:
from collections import deque
import time
def dump_log(model, n_iter, loss, acc, val_loss, val_acc, log_file_stream, tmp_model_path):
    log_text = '%.7d<split>%.5f<split>%.5f<split>%.5f<split>%.5f\n' % (n_iter, loss, acc, val_loss, val_acc)
    log_file_stream.write(log_text)
    if n_iter % 100 == 0 :
        log_file_stream.flush()
        torch.save(model, tmp_model_path)

acc_q = deque(maxlen=1000)
loss_q = deque(maxlen=1000)
val_acc_q = deque(maxlen=1000)
val_loss_q = deque(maxlen=1000)
criterion = nn.BCELoss()
# 
model = net
model.cuda()
optimizer = torch.optim.Adam(model.parameters(),lr=0.0001)
#
interval = 100
t = time.time()
print 'start training.'
best_acc  = 0
best_loss = float('inf')
for i in range(1,1000000):
    with open('log.txt', 'a') as f:
        
        optimizer.zero_grad()
        model.train()
    #     positive
        q,k,q_f,k_f = next(pos_G)
        q,k = torch.LongTensor(q), torch.LongTensor(k)
        q_f,k_f = torch.FloatTensor(q_f), torch.FloatTensor(k_f)
        output = model(q.cuda(), k.cuda(), q_f.cuda(), k_f.cuda())
        acc = 1 if output.flatten().item() > 0.5 else 0
        acc_q.append(acc)
        pos_loss = criterion(output, torch.FloatTensor([[1]]).cuda() )

#         negative
        q,k,q_f,k_f = next(neg_G)
        q,k = torch.LongTensor(q), torch.LongTensor(k)
        q_f,k_f = torch.FloatTensor(q_f), torch.FloatTensor(k_f)
        
        output = model(q.cuda(), k.cuda(), q_f.cuda(), k_f.cuda())
        acc = 1 if output.flatten().item() < 0.5 else 0
        acc_q.append(acc)
        neg_loss = criterion(output, torch.FloatTensor([[0]]).cuda())
        loss = pos_loss + neg_loss
        loss_q.append(loss.item())
        loss.backward()
        optimizer.step()
    #     val
        model.eval()
        with torch.no_grad():
            val_i = i % val_Q.shape[0]
            q,k = val_Q[val_i,:], val_K[val_i,:]
            q_f,k_f = val_Q_fea[val_i,:], val_K_fea[val_i,:]
            q,k = torch.LongTensor(q), torch.LongTensor(k)
            q_f,k_f = torch.FloatTensor(q_f), torch.FloatTensor(k_f)
        
            output = model(q.cuda(), k.cuda(), q_f.cuda(), k_f.cuda())
            val_acc = 1 if output.flatten().item() > 0.5 else 0
            val_acc_q.append(val_acc)

            val_loss = criterion(output, torch.FloatTensor([[1]]).cuda() )
            val_loss_q.append(val_loss.item())

        model.eval()
        acc = float(np.mean(acc_q))
        loss = float(np.mean(loss_q))
        val_acc = float(np.mean(val_acc_q))
        val_loss = float(np.mean(val_loss_q))

        if i % interval == 0:
            print('iter: {:04d}'.format(i+1),
                  'loss_train: {:.4f}'.format(loss),
                  'acc_train: {:.4f}'.format(acc),
                  'loss_val: {:.4f}'.format(val_loss),
                  'acc_val: {:.4f}'.format(val_acc),
                  'time: {:.4f}s'.format((time.time() - t)))
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model, './best_acc.pt')
            with open('./best.txt', 'a') as g:
                g.write('best acc at %d with %.5f\n' % (i+1, best_acc))
                
        if val_loss < best_loss:
            best_loss = val_loss
            torch.save(model, './best_loss.pt')
            with open('./best.txt', 'a') as g:
                g.write('best loss at %d with %.5f\n' % (i+1, best_loss))
            
        dump_log(model, i+1, loss, acc, val_loss, val_acc, f, './tmp.pt')


# Train model
print("Optimization Finished!")
# print("Total time elapsed: {:.4f}s".format(time.time() - t_total))

start training.
('iter: 0101', 'loss_train: 1.3817', 'acc_train: 0.5350', 'loss_val: 0.6835', 'acc_val: 0.5600', 'time: 15.4182s')
('iter: 0201', 'loss_train: 1.3931', 'acc_train: 0.5150', 'loss_val: 0.6881', 'acc_val: 0.5450', 'time: 30.3636s')
('iter: 0301', 'loss_train: 1.3954', 'acc_train: 0.5017', 'loss_val: 0.6917', 'acc_val: 0.5367', 'time: 45.3021s')


In [None]:
model.eval()

edges_unordered = np.genfromtxt('./kaggle/t2-test.txt', dtype=np.int32)
with torch.no_grad():
    with open('pred.txt.csv', 'w') as f:
        f.write('query_id,prediction\n')
        for i in range(edges_unordered.shape[0]):
            src, dst = edges_unordered[i, :]
            if src not in node_emb_dict or dst not in node_emb_dict:
                f.write('%d,%d\n' % (1 + i, out))
                continue
            q = xml_id_map[dst]
            k = xml_id_map[src]
            q_f = node_emb_dict[dst]
            k_f = node_emb_dict[src]

            q,k = torch.LongTensor(q), torch.LongTensor(k)
            q_f,k_f = torch.FloatTensor(q_f), torch.FloatTensor(k_f)

            output = model(q.cuda(), k.cuda(), q_f.cuda(), k_f.cuda()).flatten().item()

            out = 1 if output >= 0.5 else 0
            f.write('%d,%d\n' % (1 + i, out))
print 'done'



In [None]:
# import numpy as np
# batch_size = 128
# # xml_id_map[113].shape
# def positive_bootsrap_generator(edges, xml_id_map):
#     num_edge = len(edges)
        
#     while True:
#         for idx in np.random.permutation(num_edge):
#             src, dst = edges[idx, :]
#             Q = xml_id_map[dst]
#             K = xml_id_map[src]
#             yield Q, K
# def negative_bootsrap_generator(adj_mat, idx_map, xml_id_map, training_node_list):
#     exist_node_list = xml_id_map.keys()
#     exist_N = len(training_node_list)
        
#     while True:
#         src = training_node_list[np.random.randint(exist_N)]
#         dst = training_node_list[np.random.randint(exist_N)]
#         while adj_mat[idx_map[src], idx_map[dst]] == 1:
#             dst = training_node_list[np.random.randint(exist_N)]
#         Q = xml_id_map[dst]
#         K = xml_id_map[src]
#         yield Q, K
# def val_data(edges, xml_id_map):
#     Q, K = [],[]
    
#     for idx in range(edges.shape[0]):
#         src, dst = edges[idx, :]
#         q = xml_id_map[dst]
#         k = xml_id_map[src]
#         Q.append(q)
#         K.append(k)
#     Q = np.vstack(Q)
#     K = np.vstack(K)
    
#     return Q, K
    
# N = edges.shape[0]
# idx = np.random.permutation(N)
# train_idx = idx[N//10:]
# val_idx = idx[:N//10]

# pos_G = positive_bootsrap_generator(edges[train_idx,:], xml_id_map)
# training_node_list = list(set(edges[train_idx,:].flatten().tolist()))
# neg_G = negative_bootsrap_generator(adj_mat, idx_map, xml_id_map, training_node_list)
# val_Q, val_K = val_data(edges[val_idx,:], xml_id_map)
# q,k = next(pos_G)
# print(q.shape,k.shape)
# q,k = next(neg_G)
# print(q.shape,k.shape)
# print(val_Q.shape,val_K.shape)
# # 