# process text info and time info

In [1]:
import keras
from os.path import join
import os
from bs4 import BeautifulSoup as BS
from constants import MAX_SEQUENCE_LENGTH, MAX_NUM_WORDS, EMBEDDING_DIM
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from datetime import datetime
from tqdm import tqdm
import re

np.random.seed(1337)
def replace_timezone_fn(s):
    """Normalise the time-zone abbreviation after every ``HH:MM:SS`` stamp to ``GMT``.

    Only an upper-case three-letter abbreviation that directly follows the
    time (separated by one space) is rewritten; everything else in ``s`` is
    returned unchanged.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    return re.sub(r'(\d){2}:(\d){2}:(\d){2} [A-Z]{3}', _sub_tz, s)
def _sub_tz(matched):
    """``re.sub`` callback: swap the three-letter zone in the matched stamp for ``GMT``."""
    return re.sub(r'[A-Za-z]{3}', 'GMT', matched.group(0))
     
def retrive_date(pat_obj_1, pat_obj_2, pat_obj_3, s):
    """Extract a calendar date from free-form text.

    The three compiled patterns are tried in priority order and the first one
    that matches wins; later patterns are only consulted when every earlier
    one failed.

    Args:
        pat_obj_1: pattern whose groups are (day, month name, 4-digit year).
        pat_obj_2: pattern whose groups are (day, month name, 2-digit year);
            the year is prefixed with ``19``.
        pat_obj_3: pattern whose groups are (month name, day, year).
        s: text to search; it is lower-cased before matching.

    Returns:
        A ``datetime`` for the matched day/month/year.

    Raises:
        AssertionError: no pattern matched (callers catch this type).
        ValueError: the matched text is not a valid date for ``strptime``.
    """
    s = s.lower()

    match = pat_obj_1.search(s)
    if match is not None:
        days, months, years = match.groups()
    else:
        match = pat_obj_2.search(s)
        if match is not None:
            days, months, years = match.groups()
            years = '19' + years
        else:
            match = pat_obj_3.search(s)
            if match is None:
                # Raised explicitly so the check survives ``python -O``.
                raise AssertionError('no recognisable date in %r' % s)
            months, days, years = match.groups()

    date_str = '%s %s %s' % (days, months, years)
    # Names longer than three letters are full month names (%B), else abbreviations (%b).
    fmt = '%d %B %Y' if len(months) > 3 else '%d %b %Y'
    return datetime.strptime(date_str, fmt)
    

def prep_date(date):
    """Return the number of whole weeks between 1990-09-09 and ``date``.

    The reference day lies before the earliest training date (1991-12-31).
    The result is a NumPy float; dates before the reference floor towards
    negative infinity.
    """
    origin = datetime.strptime('1990-9-9', '%Y-%m-%d')
    seconds_per_week = 60*60*24*7
    elapsed = (date - origin).total_seconds()
    return np.round(elapsed // seconds_per_week)

def quote_title_date_abstract(xml_path):
    """Read one document file and return its ``(title, abstract, date)`` text.

    The file is parsed with BeautifulSoup and the text of the first
    ``title``, ``abstract`` and ``date`` elements is returned with
    surrounding whitespace stripped. A missing element raises
    ``AttributeError`` because ``find`` then returns ``None``.
    """
    with open(xml_path, 'r') as handle:
        soup = BS(handle.read())
    raw = [soup.find(tag).text for tag in ('title', 'abstract', 'date')]
    return tuple(field.strip() for field in raw)

# text preprocessing
# Where the raw documents live.
data_path = join('./', 'kaggle/')
xml_dir = join(data_path, 't3-doc')
xml_list = [name for name in os.listdir(xml_dir) if name.endswith('.xml')]

# Accumulators filled by the parsing loop: document text, week index, and
# the number of documents whose date could not be parsed.
texts, weeks = [], []
c = 0

# Month alternatives: three-letter abbreviations first, then full names.
month_pattern = '(%s)' % '|'.join([
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
])
# 1: "<day> <month> <year>"   2: "<day> <month> 9x"   3: "<month> <day> ... <year>"
pat_obj_1 = re.compile('([0-9]+) ' + month_pattern + ' ([0-9]{4}|20[0-9]{3})')
pat_obj_2 = re.compile('([0-9]+) ' + month_pattern + ' (9[0-9])')
pat_obj_3 = re.compile(month_pattern + ' ([0-9]+) .* ([0-9]{4}|20[0-9]{3})')
# 

# Parse every document. ``parsed_xml_list`` records the files that were
# actually kept so that texts/weeks/data stay aligned with their file names
# even when a document is skipped because its date could not be parsed.
parsed_xml_list = []
with tqdm(total=len(xml_list)) as pbar:
    for xml in xml_list:
        path = join(xml_dir, xml)
        title, abstract, date_str = quote_title_date_abstract(path)
        # NOTE(review): title and abstract are joined without a separator, so
        # the last title word fuses with the first abstract word. Left as is
        # because the tokenizer vocabulary depends on it - confirm intent.
        text = title + '' + abstract
        # date special case
        # 01/01/93 13:35:33 GMT+0100 12330.xml
        if xml == '12330.xml':
            date_str = '1 Jan 93'

        try:
            date = retrive_date(pat_obj_1, pat_obj_2, pat_obj_3, date_str)
        except AssertionError:
            print('%s %s' % (date_str, xml))
            c += 1
        else:
            texts.append(text)
            weeks.append(prep_date(date))
            parsed_xml_list.append(xml)
        pbar.update(1)
print('read all %d xml files.' % len(xml_list))
print('error count %d' % c)
min_w, avg_w, std_w = (min(weeks), np.mean(weeks), np.std(weeks))
print('min weeks in training set : %.0f, avg weeks : %.0f' % (min_w, avg_w))
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ ',
                                   lower=True, split=' ', char_level=False, oov_token=None)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# node id -> (padded token-id sequence, standardised week index as a 1-element array)
xml_id_map = {}
for i, xml in enumerate(parsed_xml_list):
    node_id = int(xml.replace('.xml', ''))
    w = float(weeks[i] - avg_w)/std_w
    w = np.array([w,])
    xml_id_map[node_id] = (data[i, :], w)


print('Preparing embedding matrix.')


Using TensorFlow backend.
100%|██████████| 17500/17500 [00:09<00:00, 1868.50it/s]


read all 17500 xml files.
error count 0
min weeks in training set : 68, avg weeks : 396
Found 82615 unique tokens.
Preparing embedding matrix.


In [2]:
# emb
# Load the node-embedding table: the first line is skipped, column 0 holds
# the node id and the remaining columns hold the embedding vector.
buf = np.genfromtxt('./t3.emb', skip_header=1, dtype=np.float32)
nodes = buf[:, 0].astype(np.int32)
emb = buf[:, 1:]

# node id -> embedding row
node_emb_dict = {}
for node_id, x in zip(nodes, emb):
    node_emb_dict[node_id] = x

In [3]:




# Edge list: one "src dst" pair of node ids per row.
links = np.genfromtxt(join(data_path,'t3-fake.txt'), dtype=np.int32)
# node id -> dense row/column index (sorted so the mapping is reproducible)
idx_map = {node: idx for idx, node in enumerate(sorted(set(links.flatten().tolist())))}
# The adjacency matrix is indexed by node, so it must be sized by the number
# of distinct nodes, not by the number of edges.
N = len(idx_map)
adj_mat = np.zeros([N, N], dtype=np.uint8)
for src, dst in links.tolist():
    adj_mat[idx_map[src], idx_map[dst]] = 1




In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import xavier_normal_
from constants import D_MODEL, STACKED_NUM,DK, DV, H, P_DROP, D_FF, MAX_SEQUENCE_LENGTH, MAX_NUM_WORDS, EMBEDDING_DIM
# environment: run on the first CUDA device when one is available, else on CPU
with_gpu = torch.cuda.is_available()
# with_gpu = False  # uncomment to force CPU execution
device = torch.device("cuda:0" if with_gpu else "cpu")

def positional_encoding(pos):
    """Return the ``1 x D_MODEL`` sinusoidal encoding of position ``pos``.

    Column ``2*i`` holds ``sin(pos / 10000**(2*i/D_MODEL))`` and column
    ``2*i + 1`` the matching cosine. ``D_MODEL`` must be even.
    """
    assert D_MODEL % 2 == 0
    pos = torch.tensor(pos, dtype=torch.float32, requires_grad=False)
    base = torch.tensor(10000, dtype=torch.float32, requires_grad=False)
    pe = torch.zeros([1, D_MODEL], dtype=torch.float32, requires_grad=False)
    for pair in range(D_MODEL // 2):
        exponent = torch.tensor(2.*pair/float(D_MODEL), dtype=torch.float32, requires_grad=False)
        angle = pos / torch.pow(base, exponent)
        pe[0, 2*pair] = torch.sin(angle)
        pe[0, 2*pair + 1] = torch.cos(angle)
    return pe
def get_pos_mat(length):
    """Return the positional encodings for positions ``0 .. length-1``.

    Encodings are served from the module-level ``PE_CACHE_MATRIX``. When a
    request is longer than the cached table, a larger table is built, stored
    as the new cache and returned, so later requests of that size are served
    from the cache instead of being recomputed.

    Args:
        length: number of positions required.

    Returns:
        A ``length x D_MODEL`` tensor on ``device`` that does not require grad.
    """
    global PE_CACHE_MATRIX
    # Compare against the cache's current size, which may have grown.
    if length > PE_CACHE_MATRIX.size(0):
        print('sequence length reach PE_MAT_CACHE. %d ' % length)
        grown = torch.cat([positional_encoding(i) for i in range(length)], dim=0).to(device)
        grown.requires_grad = False
        PE_CACHE_MATRIX = grown
        return grown
    return PE_CACHE_MATRIX[:length]
    
# Positional-encoding table for positions 0 .. MAX_SEQUENCE_LENGTH-1, built
# once at import time, moved to `device`, and read by get_pos_mat().
PE_CACHE_MATRIX = torch.cat([positional_encoding(i) for i in range(0,MAX_SEQUENCE_LENGTH)], dim=0).to(device)
# The table is a constant, so it is excluded from gradient tracking.
PE_CACHE_MATRIX.requires_grad = False

# construct the neural network

def scaled_dot_attention(Q, K, V, mask=None):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_k)) V``.

    Works on plain ``(len, dim)`` matrices and on inputs with leading batch
    dimensions, because only the last two axes of ``K`` are swapped.

    Args:
        Q: queries, last dimension ``d_k``.
        K: keys, last dimension ``d_k``.
        V: values, one row per key.
        mask: optional boolean tensor broadcastable to the score matrix;
            positions that are true are excluded from the softmax.

    Returns:
        The attention-weighted combination of ``V``, one row per query.

    Raises:
        AssertionError: the last dimensions of ``Q`` and ``K`` differ.
    """
    assert Q.size()[-1] == K.size()[-1]
    # A Python float scale avoids allocating a tensor on the device per call.
    scale = float(K.size(-1)) ** 0.5
    # 1-D keys are left untouched, matching what ``K.t()`` does for vectors.
    K_t = K.transpose(-2, -1) if K.dim() >= 2 else K
    scores = torch.matmul(Q, K_t) / scale
    if mask is not None:
        scores = scores.masked_fill(mask, -float('inf'))

    return torch.matmul(F.softmax(scores, dim=-1), V)
                            
class Transformer(nn.Module):
    """Scores a (query document, key document) pair with a value in (0, 1).

    The key token sequence is encoded, the query token sequence is decoded
    against that encoding, and a learned one-row "summary" query attends over
    the decoder output. The summary is concatenated with both node feature
    vectors and both week scalars and passed through three linear layers
    followed by a sigmoid.

    Args:
        layer_num: number of layers in the encoder and in the main decoder.
        dk, dv: per-head key/query and value sizes.
        dm: model width. ``output_linear`` expects ``3*dm + 2`` inputs, so the
            two feature vectors together with the two week scalars must
            supply ``2*dm + 2`` values.
        h: number of attention heads.
        emb_matrix: pretrained word-embedding matrix passed to ``Word_Embedding``.
    """

    def __init__(self, layer_num, dk, dv, dm, h, emb_matrix):
        super(Transformer, self).__init__()

        self.emb = Word_Embedding(emb_matrix)

        self.emb_drop = nn.Dropout(P_DROP)

        self.encoder = Stack_Encoder(layer_num, dk, dv, dm, h)
        self.decoder = Stack_Decoder(layer_num, dk, dv, dm, h)
        # A separate, fixed two-layer decoder pools the sequence into one row.
        self.summary_decoder = Stack_Decoder(2, dk, dv, dm, h)

        # Learned query used to summarise the decoder output.
        self.summary_weight = nn.Parameter(torch.FloatTensor(1, dm))
        torch.nn.init.xavier_uniform_(self.summary_weight)

        self.output_linear = nn.Linear(3*dm+2, dm)
        self.output_linear2 = nn.Linear(dm, dm)
        self.output_linear3 = nn.Linear(dm, 1)

    def forward(self, Q, K, Q_fea, K_fea, Q_w, K_w):
        """Return the link score for one pair as a ``1 x 1`` tensor.

        Args:
            Q, K: 1-D tensors of token ids for the query and key documents.
            Q_fea, K_fea: node feature vectors of the two documents.
            Q_w, K_w: one-element tensors holding each document's week value.
        """
        # encoder: positions are added for the actual sequence length rather
        # than a fixed MAX_SEQUENCE_LENGTH, so shorter inputs also work.
        K = self.emb(K)
        K = K + get_pos_mat(K.size(0))
        K = self.emb_drop(K)

        en_out = self.encoder(K)

        # decoder
        Q = self.emb(Q)
        Q = Q + get_pos_mat(Q.size(0))
        Q = self.emb_drop(Q)

        de_out = self.decoder(Q, en_out)
        summary = self.summary_decoder(self.summary_weight, de_out)

        # The raw week scalars are concatenated directly (no learned projection).
        x = torch.cat([summary, Q_fea.view([1,-1]), K_fea.view([1,-1]), Q_w.view([1,-1]), K_w.view([1,-1])], dim=-1)
        out = self.output_linear(x)

        out = self.output_linear2(F.selu(out))
        out = self.output_linear3(F.selu(out))

        out = torch.sigmoid(out)

        return out

class Word_Embedding(nn.Module):
    """Frozen pretrained word vectors followed by a bias-free projection to ``D_MODEL``.

    ``emb_matrix`` replaces the weights of the embedding table and is
    excluded from gradient updates; only the projection is trainable.
    """

    def __init__(self, emb_matrix):
        super(Word_Embedding, self).__init__()
        table = nn.Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, padding_idx=0)
        table.weight = nn.parameter.Parameter(torch.FloatTensor(emb_matrix))
        table.weight.requires_grad_(False)
        self.emb = table

        self.linear = nn.Linear(EMBEDDING_DIM, D_MODEL, bias=False)

    def forward(self, x):
        """Look up the token ids in ``x`` and project the vectors to ``D_MODEL``."""
        return self.linear(self.emb(x))
    
class Stack_Encoder(nn.Module):
    """
    A stack of ``layer_num`` Encoder layers applied one after another.
    """
    def __init__(self, layer_num, dk, dv, dm, h):
        super(Stack_Encoder, self).__init__()
        layers = [Encoder(dk, dv, dm, h) for _ in range(layer_num)]
        self.encoders = nn.ModuleList(layers)

    def forward(self, K):
        # Each layer consumes the previous layer's output.
        hidden = K
        for layer in self.encoders:
            hidden = layer(hidden)
        return hidden
class Encoder(nn.Module):
    """One encoder layer: self-attention, then a position-wise feed-forward net.

    Each sub-layer is followed by dropout, a residual connection and
    LayerNorm.

    NOTE: the feed-forward LayerNorm used to be computed and then discarded
    (the un-normalised sum was returned). It is now applied, so checkpoints
    trained with the old forward pass will produce different outputs and
    should be retrained or re-validated.
    """

    def __init__(self, dk, dv, dm, h):
        super(Encoder, self).__init__()
        # attention residual block
        self.multi_head_attention_layer = Multi_Head_attention_layer(dk, dv, dm, h)
        self.attention_norm_lay = nn.LayerNorm([dm,])
        self.att_drop = nn.Dropout(P_DROP)
        # feed forward residual block (sized by dm so it follows the layer width)
        self.fcn = PositionwiseFeedForward(dm, D_FF)
        self.linear_drop = nn.Dropout(P_DROP)
        self.ff_norm_lay = nn.LayerNorm([dm, ])

    def forward(self, K):
        """Transform the ``(seq_len, dm)`` input and return a tensor of the same shape."""
        # attention
        attention_out = self.multi_head_attention_layer(K, K, K)
        attention_out = self.att_drop(attention_out)
        att_out = self.attention_norm_lay(K + attention_out)
        # feed forward
        linear_out = self.fcn(att_out)
        linear_out = self.linear_drop(linear_out)
        return self.ff_norm_lay(att_out + linear_out)
class Stack_Decoder(nn.Module):
    """
    A stack of ``layer_num`` Decoder layers; every layer attends to the same
    encoder output and no self-attention mask is used.
    """
    def __init__(self, layer_num, dk, dv, dm, h):
        super(Stack_Decoder, self).__init__()
        layers = [Decoder(dk, dv, dm, h) for _ in range(layer_num)]
        self.decoders = nn.ModuleList(layers)

    def forward(self, Q, encoder_out):
        # Unpacking into two names doubles as a check that Q is 2-D.
        _rows, _cols = Q.size()
        hidden = Q
        for layer in self.decoders:
            hidden = layer(hidden, encoder_out, mask=None)
        return hidden

class Decoder(nn.Module):
    """One decoder layer: self-attention, encoder attention, feed-forward.

    Each sub-layer is followed by dropout, a residual connection and
    LayerNorm. The feed-forward dropout (``linear_drop``) was previously
    created but never applied; it is now used, matching ``Encoder``. Dropout
    is inactive in ``eval()`` mode, so inference results are unchanged.
    """

    def __init__(self, dk, dv, dm, h):
        super(Decoder, self).__init__()
        # query attention residual block
        self.Q_attention_lay = Multi_Head_attention_layer(dk, dv, dm, h)
        self.Q_attention_norm_lay = nn.LayerNorm([dm, ])
        self.Q_att_drop = nn.Dropout(P_DROP)

        # query key attention residual block
        self.QK_attention_lay = Multi_Head_attention_layer(dk, dv, dm, h)
        self.QK_attention_norm_lay = nn.LayerNorm([dm, ])
        self.QK_att_drop = nn.Dropout(P_DROP)

        # feed forward residual block (sized by dm so it follows the layer width)
        self.fcn = PositionwiseFeedForward(dm, D_FF)
        self.ff_norm_lay = nn.LayerNorm([dm, ])
        self.linear_drop = nn.Dropout(P_DROP)

    def forward(self, Q, encoder_out, mask):
        """Decode ``Q`` against ``encoder_out``.

        Args:
            Q: ``(query_len, dm)`` decoder input.
            encoder_out: ``(key_len, dm)`` encoder output used as keys and values.
            mask: optional mask for the self-attention scores, or ``None``.

        Returns:
            A ``(query_len, dm)`` tensor.
        """
        # query attention
        Q_attention_out = self.Q_attention_lay(Q, Q, Q, mask)
        Q_attention_out = self.Q_att_drop(Q_attention_out)
        Q_att_out = self.Q_attention_norm_lay(Q + Q_attention_out)
        # query key attention
        QK_attention_out = self.QK_attention_lay(Q_att_out, encoder_out, encoder_out)
        QK_attention_out = self.QK_att_drop(QK_attention_out)
        QK_att_out = self.QK_attention_norm_lay(Q_att_out + QK_attention_out)

        # feed forward
        linear_out = self.fcn(QK_att_out)
        linear_out = self.linear_drop(linear_out)
        out = self.ff_norm_lay(QK_att_out + linear_out)
        return out

class Multi_Head_attention_layer(nn.Module):
    """Multi-head attention with an independent set of projections per head.

    Every head projects the inputs to ``dk``/``dk``/``dv`` features, runs
    scaled dot-product attention, and the concatenated head outputs are
    mapped back to ``dm`` features.
    """

    def __init__(self, dk, dv, dm, h):
        super(Multi_Head_attention_layer, self).__init__()
        self.Q_linears = nn.ModuleList([nn.Linear(dm, dk) for _ in range(h)])
        self.K_linears = nn.ModuleList([nn.Linear(dm, dk) for _ in range(h)])
        self.V_linears = nn.ModuleList([nn.Linear(dm, dv) for _ in range(h)])
        self.output_linear = nn.Linear(h*dv, dm)

    def forward(self, Q_input, K_input, V_input, mask=None):
        """Attend from ``Q_input`` over ``K_input``/``V_input``; ``mask`` is passed to every head."""
        projections = zip(self.Q_linears, self.K_linears, self.V_linears)
        heads = [
            scaled_dot_attention(q_proj(Q_input), k_proj(K_input), v_proj(V_input), mask)
            for q_proj, k_proj, v_proj in projections
        ]
        return self.output_linear(torch.cat(heads, dim=-1))
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward net applied to every sequence position independently.

    It is implemented with kernel-size-1 convolutions: ``d_model -> d_ff``,
    ReLU, ``d_ff -> d_model``.
    """

    def __init__(self, d_model, d_ff):
        super(PositionwiseFeedForward, self).__init__()
        self.cnn1 = nn.Conv1d(d_model, d_ff, 1)
        self.cnn2 = nn.Conv1d(d_ff, d_model, 1)

    def forward(self, x):
        """Map a ``(seq_len, d_model)`` tensor to a tensor of the same shape."""
        # Unpacking into two names doubles as a check that x is 2-D.
        _rows, _cols = x.size()
        # Conv1d wants (batch, channels, length): add a batch axis, swap the last two.
        channels_first = x.unsqueeze(0).transpose(1, 2)
        hidden = F.relu(self.cnn1(channels_first))
        projected = self.cnn2(hidden)
        return projected.transpose(1, 2).squeeze(0)
    
# encoder = Stack_Encoder(6, 64,64,20,8)
# # print net
# Random inputs with the shapes Transformer.forward expects, for a manual
# smoke test: two token-id sequences, two feature vectors, two week scalars.
# NOTE(review): `V` here looks like it plays the role of the key sequence - confirm.
Q = torch.randint(10000,[MAX_SEQUENCE_LENGTH,], dtype=torch.long).to(device)
V = torch.randint(10000,[MAX_SEQUENCE_LENGTH,], dtype=torch.long).to(device)
Q_fea = torch.rand([D_MODEL,]).to(device)
K_fea = torch.rand([D_MODEL,]).to(device)
Q_w = torch.rand([1,]).to(device)
K_w = torch.rand([1,]).to(device)


print 'model '

model 


In [None]:
import torch 
import numpy as np
from tqdm import tqdm
# Load the trained model onto the device selected earlier in the notebook
# (`device`), so the cell also runs on machines without CUDA.
model = torch.load('./best_loss.pt', map_location=device)
model.eval()

# One "src dst" pair of node ids per row.
edges_unordered = np.genfromtxt('./kaggle/t3-test.txt', dtype=np.int32)
c = 1
with torch.no_grad():
    with open('pred.txt.csv', 'w') as f:
        with tqdm(total=edges_unordered.shape[0]) as pbar:

            f.write('query_id,prediction\n')
            for i in range(edges_unordered.shape[0]):

                src, dst = edges_unordered[i, :]

                q, q_w = xml_id_map[dst]
                k, k_w = xml_id_map[src]

                q_f = node_emb_dict[dst]
                k_f = node_emb_dict[src]

                if k_w < q_w:
                    # The source document is older than the destination:
                    # predict "no link" without running the model.
                    out = 0
                else:
                    q, k = torch.LongTensor(q), torch.LongTensor(k)
                    q_f, k_f = torch.FloatTensor(q_f), torch.FloatTensor(k_f)
                    q_w, k_w = torch.FloatTensor(q_w), torch.FloatTensor(k_w)

                    output = model(q.to(device), k.to(device),
                                   q_f.to(device), k_f.to(device),
                                   q_w.to(device), k_w.to(device)).flatten().item()
                    out = 1 if output >= 0.5 else 0

                # query ids in the submission file are 1-based
                f.write('%d,%d\n' % (1 + i, out))
                pbar.update(1)
print(c)
print('done')



 21%|██        | 15808/74798 [10:36<38:56, 25.25it/s]

In [17]:
# Marker cell: prints a fixed string so it is visible when earlier cells have finished.
print 'DDDDone'

DDDDone


### find best threshold, test cosine similarity

In [8]:
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm

def positive_bootsrap_generator(edges, xml_id_map, node_emb_dict):
    """Endlessly yield positive (linked) samples, one edge at a time.

    Every pass visits all rows of ``edges`` once in a fresh random order.
    Each sample is ``(Q, K, Q_fea, K_fea, Q_w, K_w)`` where the ``Q*`` items
    belong to the edge's destination node and the ``K*`` items to its source.
    """
    edge_count = len(edges)

    while True:
        order = np.random.permutation(edge_count)
        for row in order:
            src, dst = edges[row, :]
            (Q, Q_w), (K, K_w) = xml_id_map[dst], xml_id_map[src]
            yield Q, K, node_emb_dict[dst], node_emb_dict[src], Q_w, K_w
def negative_bootsrap_generator(adj_mat, links, idx_map, xml_id_map, training_node_list, node_emb_dict, neighbor_link_rate=0.8):
    """Endlessly yield negative (non-linked) samples.

    With probability ``neighbor_link_rate`` the destination is drawn from a
    precomputed candidate list for the source (node pairs that are not
    directly linked in ``adj_mat`` but whose entry in ``A.A + A`` equals 1);
    otherwise it is a random training node that ``adj_mat`` does not link
    from the source.

    Args:
        adj_mat: dense square adjacency matrix indexed via ``idx_map``.
        links: array of (src, dst) node-id pairs.
        idx_map: node id -> row/column index in ``adj_mat``.
        xml_id_map: node id -> (token sequence, week value).
        training_node_list: node ids that sources/destinations are drawn from.
        node_emb_dict: node id -> feature vector.
        neighbor_link_rate: probability of using the candidate list.

    Yields:
        ``(Q, K, Q_fea, K_fea, Q_w, K_w)``; the ``Q*`` items belong to the
        destination node and the ``K*`` items to the source node.
    """
    
    
    # NOTE(review): exist_node_list is never read below.
    exist_node_list = xml_id_map.keys()
    exist_N = len(training_node_list)
    N = adj_mat.shape[0]
    
#     adj mat: translate node ids in the edge list to matrix indices
    links = np.array(list(map(idx_map.get, links.flatten())),
                     dtype=np.int32).reshape(links.shape)
    
    adj_sp = sp.coo_matrix((np.ones(links.shape[0]), (links[:, 0], links[:, 1])),
                        shape=(N, N),
                        dtype=np.uint8)
    # A.A + A: combines two-step paths with the direct links.
    # NOTE(review): the data type is uint8, so large path counts may wrap - confirm.
    adj_sp_2 = (sp.coo_matrix.dot(adj_sp,adj_sp) + adj_sp).tocoo()
    
    # matrix index -> node id
    rev_map = {v:k for k,v in idx_map.items()}
    # matrix index -> list of candidate destination indices
    adj_map = {i:[] for i in range(N)}
    with tqdm(total=len(adj_sp_2.row)) as pbar:
        for i,j,v in zip(adj_sp_2.row, adj_sp_2.col, adj_sp_2.data):
            # keep pairs without a direct link whose combined entry is exactly 1
            if adj_mat[i, j] != 1 and v == 1:
                adj_map[i].append(j)
            pbar.update(1)
                
    while True:
        src = training_node_list[np.random.randint(exist_N)]
        
#         choose neighbor link
        if np.random.rand(1) <= neighbor_link_rate:
        
            i = idx_map[src]
            high = len(adj_map[i])
            # redraw the source until it has at least one candidate
            # NOTE(review): loops forever if no training node has candidates.
            while high == 0:
                src = training_node_list[np.random.randint(exist_N)]
                i = idx_map[src]
                high = len(adj_map[i])
                
            idx = np.random.randint(high)
            dst = adj_map[i][idx]
            dst = rev_map[dst]
        else:
            # uniform random destination, redrawn while it is a real link
            dst = training_node_list[np.random.randint(exist_N)]
            while adj_mat[idx_map[src], idx_map[dst]] == 1:
                dst = training_node_list[np.random.randint(exist_N)]
        Q, Q_w = xml_id_map[dst]
        K, K_w = xml_id_map[src]
        Q_fea = node_emb_dict[dst]
        K_fea = node_emb_dict[src]
        yield Q, K, Q_fea, K_fea, Q_w, K_w

def val_data(edges, xml_id_map, emb_dict=None):
    """Stack the model inputs for every edge into dense arrays.

    Args:
        edges: ``(n, 2)`` array of (src, dst) node ids; must be non-empty.
        xml_id_map: node id -> (token sequence, week value).
        emb_dict: node id -> feature vector. When omitted, the module-level
            ``node_emb_dict`` is used, which is what this function always
            did implicitly.

    Returns:
        ``(Q, K, Q_fea, K_fea, Q_w, K_w)`` with one row per edge; the ``Q*``
        arrays describe the destination nodes and the ``K*`` arrays the
        source nodes.
    """
    if emb_dict is None:
        emb_dict = node_emb_dict

    Q, K = [], []
    Q_f, K_f = [], []
    Q_w, K_w = [], []

    for src, dst in edges:
        q, q_w = xml_id_map[dst]
        k, k_w = xml_id_map[src]

        Q.append(q)
        K.append(k)
        Q_f.append(emb_dict[dst])
        K_f.append(emb_dict[src])
        Q_w.append(q_w)
        K_w.append(k_w)

    return (np.vstack(Q), np.vstack(K),
            np.vstack(Q_f), np.vstack(K_f),
            np.vstack(Q_w), np.vstack(K_w))
    
# Random 90/10 split of the edge rows: the first tenth of the permutation is
# held out for validation, the rest is used for training.
N = links.shape[0]
idx = np.random.permutation(N)
train_idx = idx[N//10:]
val_idx = idx[:N//10]

# Positive samples come from training edges only.
pos_G = positive_bootsrap_generator(links[train_idx,:], xml_id_map, node_emb_dict)
training_node_list = list(set(links[train_idx,:].flatten().tolist()))
# The negative sampler receives the full edge list and adjacency matrix, so
# held-out validation edges are also treated as existing links.
neg_G = negative_bootsrap_generator(adj_mat, links, idx_map, xml_id_map, training_node_list, node_emb_dict)
val_Q, val_K, val_Q_fea, val_K_fea, val_Q_w, val_K_w = val_data(links[val_idx,:], xml_id_map)
# Draw one sample from each generator and print the shapes as a sanity check.
q,k,q_f,k_f,q_w,k_w = next(pos_G)
print(q.shape,k.shape, q_f.shape, k_f.shape, q_w.shape, k_w.shape)

q,k,q_f,k_f,q_w,k_w = next(neg_G)
print(q.shape,k.shape, q_f.shape, k_f.shape, q_w.shape, k_w.shape)
print(val_Q.shape,val_K.shape, val_Q_fea.shape, val_K_fea.shape, val_Q_w.shape, val_K_w.shape)


((150,), (150,), (128,), (128,), (1,), (1,))


100%|██████████| 880464/880464 [00:03<00:00, 263946.81it/s]

((150,), (150,), (128,), (128,), (1,), (1,))
((11058, 150), (11058, 150), (11058, 128), (11058, 128), (11058, 1), (11058, 1))





In [13]:

def cos_d(x,y):
    """Return the cosine similarity of two 1-D vectors of equal length.

    Any vector length is accepted; the earlier hard-coded ``(128,)`` shape
    check is replaced by the same shape rule used by the later definition of
    this function in the notebook.

    Raises:
        AssertionError: the shapes differ or the inputs are not 1-D.
    """
    assert x.shape==y.shape and len(x.shape)==1
    return np.dot(x,y)/(np.linalg.norm(x, ord=2) * np.linalg.norm(y, ord=2))

# Re-read the edge list; its row count bounds the number of sampling rounds.
links = np.genfromtxt('./kaggle/t3-fake.txt', dtype=np.int32)

N = links.shape[0]
print N
# Feature vectors of the destination (Q) and source (K) node of every kept
# sample, with label 1 for real links and 0 for sampled non-links.
Q = []
K = []
Y = []
for i in range(N):
    q,k,q_f,k_f,q_w,k_w = next(pos_G)
    # Skip pairs whose destination week is later than the source week.
    # NOTE: this `continue` also skips the negative draw of this round.
    if q_w > k_w:
        continue
    Q.append(q_f)
    K.append(k_f)
    Y.append(1)
    
    q,k,q_f,k_f,q_w,k_w = next(neg_G)
    if q_w > k_w:
        continue
    Q.append(q_f)
    K.append(k_f)
    Y.append(0)
    
Q = np.vstack(Q)
K = np.vstack(K)
Y = np.array(Y)


print Q.shape, K.shape, Y.shape

    
def cos_d(x,y):
    """Cosine similarity of two equal-length 1-D vectors.

    Raises ``AssertionError`` when the shapes differ or the inputs are not
    one-dimensional.
    """
    assert x.shape == y.shape and len(x.shape) == 1
    norm_product = np.linalg.norm(x, ord=2) * np.linalg.norm(y, ord=2)
    return np.dot(x, y) / norm_product

def foo(Q, K, Y):
    """Find the cosine-similarity threshold that best separates the labels.

    Args:
        Q, K: ``(n, dim)`` arrays; row ``i`` of each forms one pair.
        Y: array of 0/1 labels, one per pair.

    Returns:
        ``(best_acc, best_threshold, out)``: the highest accuracy reached by
        predicting "link" when similarity >= threshold, the first threshold
        achieving it (100 evenly spaced values in [0, 1] are tried), and the
        array of per-pair similarities.
    """
    N, _ = Q.shape
    out = []

    with tqdm(total=N) as pbar:
        for i in range(N):
            out.append(cos_d(Q[i, :], K[i, :]))
            pbar.update(1)
    out = np.array(out)
    # The builtin ``bool`` is used because the ``np.bool`` alias no longer
    # exists in current NumPy releases.
    label = Y.flatten().astype(bool)
    best_acc = best_threshold = 0
    for threshold in np.linspace(0, 1, 100):
        acc = np.mean((out >= threshold) == label)

        if acc > best_acc:
            best_acc = acc
            best_threshold = threshold
    return best_acc, best_threshold, out


# Run the threshold search on the collected samples and report:
#   best accuracy and its threshold,
#   the first-quartile similarity value,
#   the share of positive labels next to the share predicted positive.
acc, threshold, out = foo(Q,K,Y)
print acc, threshold
print np.sort(out)[len(out)//4]
print np.mean(Y),'Y', np.mean(out>=threshold)



110585


  5%|▌         | 11208/209527 [00:00<00:01, 112063.27it/s]

(209527, 128) (209527, 128) (209527,)


100%|██████████| 209527/209527 [00:01<00:00, 108935.23it/s]


0.6151522238184101 0.4040404040404041
0.37809783
0.5267053888043068 Y 0.7105766798551022


In [14]:
# random
import torch 
import numpy as np
from tqdm import tqdm


# 

# Baseline submission: predict links on the test edges from the week order
# and the cosine similarity of the node features, without the neural model.
edges_unordered = np.genfromtxt('./kaggle/t3-test.txt', dtype=np.int32)
# links = np.genfromtxt('./kaggle/t3-train.txt', dtype=np.int32)
# training_node_set = set(links.flatten().tolist())
Y = []  # every prediction written, in file order
c = 0   # number of edges rejected by the week-order rule
with open('guess.csv', 'w') as g:
    with tqdm(total=edges_unordered.shape[0]) as pbar:

        g.write('query_id,prediction\n')
        for i in range(edges_unordered.shape[0]):

            src, dst = edges_unordered[i, :]
            src_v, src_w = xml_id_map[src]
            dst_v, dst_w = xml_id_map[dst]
            src_f = node_emb_dict[src]
            dst_f = node_emb_dict[dst]
            
            
#             if src_w < dst_w or abs(abs(src_w-dst_w) - 95) > 87*2:
            # A source with a smaller week value than its destination is
            # always predicted as "no link".
            if src_w < dst_w :
#             if False:
                g.write('%d,%d\n' % (1 + i, 0))
                Y.append(0)
                c +=1
            else:
                # NOTE(review): hard-coded threshold; it equals 39/99, a grid
                # value of the threshold search - confirm it is the intended one.
                out = 1 if cos_d(src_f,dst_f) > 0.39393939393939 else 0
                g.write('%d,%d\n' % (1 + i, out))
                Y.append(out)
#                 if src not in training_node_set and dst not in training_node_set:
#                     g.write('%d,%d\n' % (1 + i, 0))
            pbar.update(1)
# share of positive predictions, week-rule rejections, total predictions
print 'done', np.mean(Y), c, len(Y)



100%|██████████| 74798/74798 [00:01<00:00, 50817.01it/s]

done 0.10827829621112864 101 74798



