In [2]:
import numpy as np
from os.path import join
def get_node_set(path):
    """Return the set of distinct integer ids found in an edge-list file.

    The file at ``path`` is parsed with ``np.genfromtxt`` as int32 values and
    every parsed value, whichever column it sits in, is treated as a node id.
    """
    edge_table = np.genfromtxt(path, dtype=np.int32)
    return set(edge_table.ravel().tolist())

# Directory that holds the edge-list text files read below.
data_path = join('./','kaggle')
# emb
# Load the node embeddings from GCN.emb. The first line is parsed as two
# space-separated integers (num_nodes, D); every following line is parsed as
# "<node_id> <v1> <v2> ..." with single-space separators.
with open(join('./','GCN.emb')) as f:
# with open(join('./','t1.emb')) as f:
    num_nodes, D = f.readline().strip().split(' ')
    num_nodes = int(num_nodes)
    D = int(D)
    
    ls = f.readlines()
# node_emb_dict: int node id -> float32 embedding vector.
node_emb_dict = {}
for l in ls:
    buf = l.strip().split(' ')
    node_id, emb = int(buf[0]), buf[1:]
    x = np.asarray([float(i) for i in emb], dtype=np.float32)
    node_emb_dict[node_id] = x
    
# training data
# NOTE(review): t1-merge.txt is read twice -- once here as raw lines and once
# more inside get_node_set() via np.genfromtxt.
with open(join(data_path,'t1-merge.txt')) as f:
    ls = f.readlines()
node_set = get_node_set(join(data_path,'t1-merge.txt'))
# idx_map: node id -> row/column index into adj_mat. The numbering follows the
# iteration order of node_set.
idx_map = {k:i for i,k in enumerate(list(node_set))}
N = len(node_set)
X = []
# Dense N x N uint8 matrix, i.e. N*N bytes of memory.
adj_mat = np.zeros([N,N], dtype=np.uint8)
for l in ls:
    buf = l.strip().split(' ')
    src, dst = int(buf[0]), int(buf[1])
    # Directed edge src -> dst.
    adj_mat[idx_map[src], idx_map[dst]] = 1
    # One feature row per edge: source embedding followed by destination
    # embedding. Raises KeyError if either id has no embedding.
    fea = np.concatenate([node_emb_dict[src], node_emb_dict[dst]], axis=-1)
    X.append(fea)
X = np.vstack(X)

# test data
with open(join(data_path,'t1-test.txt')) as f:
    ls = f.readlines()
# NOTE(review): N2 is not referenced again in the visible source.
N2 = len(ls)
test_X = []
for l in ls:
    buf = l.strip().split(' ')
    src, dst = int(buf[0]), int(buf[1])
    # Test nodes without an embedding are replaced by fixed substitute ids.
    # NOTE(review): 37019 and 37482 are unexplained magic numbers -- presumably
    # hand-picked stand-in nodes; confirm and name them. A KeyError follows if
    # either id is itself missing from node_emb_dict.
    if src not in node_emb_dict:
        src = 37019
    if dst not in node_emb_dict:
        dst = 37482
    fea = np.concatenate([node_emb_dict[src], node_emb_dict[dst]], axis=-1)
    
    test_X.append(fea)
test_X = np.vstack(test_X)
print(X.shape, test_X.shape)
# print 'done'
    

(285789, 256) (88074, 256)


In [10]:
import numpy as np
# Mini-batch size passed to both generators and used for the Keras step counts.
batch_size = 128
def naive_bootsrap_generator(X, adj_mat, idx_map, node_emb_dict, train_node_set, batch_size=128, neg_rate=1.):
    """Endlessly yield ``(features, labels)`` mini-batches for link prediction.

    Each batch stacks ``batch_size`` positive rows, sampled with replacement
    from ``X``, on top of randomly drawn negative node pairs.

    Args:
        X: 2-D array with one feature row per known (positive) edge.
        adj_mat: square matrix indexed through ``idx_map``; a non-zero entry
            marks an existing edge that must not be used as a negative.
        idx_map: dict mapping node id -> row/column index in ``adj_mat``.
        node_emb_dict: dict mapping node id -> 1-D embedding vector.
        train_node_set: iterable of node ids negatives are drawn from.
        batch_size: number of positive rows per batch.
        neg_rate: ``int(batch_size * neg_rate)`` negative candidates are drawn
            per batch.

    Yields:
        ``(ret_X, ret_Y)`` where ``ret_Y`` has shape ``(rows, 1)`` and holds 1
        for the leading positive rows and 0 for the negatives.

    Candidates that are self pairs or existing edges are dropped and not
    resampled, so a batch may hold fewer negatives than requested. A batch
    with no surviving negative holds positives only (the previous code raised
    ``ValueError`` from ``np.vstack([])`` in that case).
    """
    train_node_list = list(train_node_set)
    train_N = len(train_node_list)
    num_edge = X.shape[0]

    while True:
        # Positive part: bootstrap sample of known edges.
        idx = np.random.choice(num_edge, batch_size)
        pos_X = X[idx, :]

        # Negative part: uniformly drawn (src, dst) positions in the node list.
        neg_count = int(batch_size * neg_rate)
        if neg_count > 0 and train_N > 0:
            neg_idx = np.random.randint(train_N, size=[neg_count, 2])
        else:
            neg_idx = []

        neg_rows = []
        for src_pos, dst_pos in neg_idx:
            src = train_node_list[src_pos]
            dst = train_node_list[dst_pos]
            if src != dst and adj_mat[idx_map[src], idx_map[dst]] == 0:
                neg_rows.append(
                    np.concatenate([node_emb_dict[src], node_emb_dict[dst]], axis=-1))

        # Stacking the 1-D negative rows directly also covers the empty case.
        ret_X = np.vstack([pos_X] + neg_rows)
        ret_Y = np.zeros([ret_X.shape[0], 1])
        ret_Y[:pos_X.shape[0], 0] = 1
        yield ret_X, ret_Y

# Split the positive edge rows 90% / 10% into training and validation sets.
# NOTE(review): N is rebound here to the number of edge rows; the first cell
# uses the same name for the number of nodes.
# NOTE(review): no seed is set before this permutation in the visible source,
# so the split changes between runs.
N = X.shape[0]
idx = np.random.permutation(N)
train_idx = idx[N//10:]
val_idx = idx[:N//10]

train_X = X[train_idx,:]
val_X = X[val_idx,:]

# Negatives for both generators are drawn from the nodes of t1-train.txt.
train_node_set = get_node_set('./kaggle/t1-train.txt')
G = naive_bootsrap_generator(train_X, adj_mat, idx_map, node_emb_dict, train_node_set, batch_size=batch_size)
val_G = naive_bootsrap_generator(val_X, adj_mat, idx_map, node_emb_dict, train_node_set,batch_size=batch_size, neg_rate=0.1)

# Smoke-test one batch from each generator. The formatted form prints the same
# text on Python 2 and Python 3; the former `print a, b` statement is a
# SyntaxError on Python 3.
x,y = next(G)
print('{} {}'.format(x.shape, y.shape))
x,y = next(val_G)
print('{} {}'.format(x.shape, y.shape))

(256, 256) (256, 1)
(140, 256) (140, 1)


In [11]:
import keras
from keras.models import *
from keras.layers import *

# NOTE(review): the fit_generator call in this file passes epochs=1000
# literally; this constant is not referenced in the visible source.
epochs = 100
def build_model():
    """Build and compile the dense binary classifier over 256-d edge features.

    Layout: two SELU Dense layers of width 256, then for each width in
    (128, 64, 32, 16) a Dropout(0.2) followed by two SELU Dense layers of that
    width, and finally a single sigmoid unit. Compiled with binary
    cross-entropy, Adam (lr=0.001) and the accuracy metric.
    """
    model = Sequential()

    # Input stage.
    model.add(Dense(256, activation='selu', input_shape=(256,)))
    model.add(Dense(256, activation='selu'))

    # Narrowing stages, each preceded by dropout.
    for width in (128, 64, 32, 16):
        model.add(Dropout(0.2))
        model.add(Dense(width, activation='selu'))
        model.add(Dense(width, activation='selu'))

    # Probability that the edge exists.
    model.add(Dense(1, activation='sigmoid'))

    optimizer = keras.optimizers.Adam(lr=0.001)
    model.compile(loss=keras.losses.binary_crossentropy,
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model
# Draw one batch and print its shapes. The formatted form prints the same text
# on Python 2 and Python 3; the former `print a, b` statement is a SyntaxError
# on Python 3.
x,y = next(G)
print('{} {}'.format(x.shape, y.shape))
# NOTE(review): the seed is set after a batch has already been drawn from G,
# so that first batch is not covered by it.
np.random.seed(1337)
model = build_model()
# Single fit on the one batch drawn above, before the generator-driven run.
model.fit(x,y)
# Keep only the checkpoint with the lowest validation loss (full model, not
# just weights); the prediction cell loads this file.
ck = keras.callbacks.ModelCheckpoint('./weights.hdf5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)
tfb = keras.callbacks.TensorBoard(log_dir='./logs')
# One "epoch" is as many bootstrap batches as would cover train_X once.
model.fit_generator(G,
                    steps_per_epoch=train_X.shape[0]//batch_size,
                    epochs=1000, verbose=1,
                    validation_data=val_G,
                    validation_steps=val_X.shape[0]//batch_size,
                    callbacks=[ck,tfb]
                    )



Using TensorFlow backend.


(256, 256) (256, 1)
Epoch 1/1
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000

KeyboardInterrupt: 

In [2]:
from keras.models import *
# Load the checkpoint saved by ModelCheckpoint and score the test edge features.
model = load_model('./weights.hdf5')
z = model.predict(test_X)

# Step 1: one thresholded 0/1 label per line.
with open('pred.txt', 'w') as f:
    for i in range(z.shape[0]):
        p = z[i,0]
        ans = 1 if p >= 0.5 else 0
        f.write('%d\n' % ans)

# Step 2: re-read the labels and emit the submission CSV with 1-based query ids.
pred_file = 'pred.txt'
with open(pred_file, 'r') as f, open(pred_file + '.csv', 'w') as g:
    g.write('query_id,prediction\n')
    for idx, line in enumerate(f):
        g.write('%d,%d\n' % (1 + idx, int(line)))


# Single-argument print calls produce the same output on Python 2 and 3; the
# former print statements are a SyntaxError on Python 3.
print('done')
print(z.shape)

Using TensorFlow backend.


done
(88074, 1)


In [2]:
from svm import *
from svmutil import *
# prepare data for SVM
def generate_neg_X(X, adj_mat, node_emb_dict, neg_rate=1.5, idx_map=None):
    """Build a labelled data set of positive edges plus sampled non-edges.

    Args:
        X: 2-D array with one feature row per known (positive) edge.
        adj_mat: square matrix whose non-zero entries mark existing edges.
        node_emb_dict: dict mapping node id -> 1-D embedding vector; negative
            pairs are drawn uniformly from its keys.
        neg_rate: ``int(X.shape[0] * neg_rate)`` candidate pairs are drawn.
        idx_map: optional dict mapping node id -> row/column index in
            ``adj_mat``. Pass it whenever ``adj_mat`` was built through such a
            mapping (as ``naive_bootsrap_generator`` assumes). When omitted,
            node ids are used as indices directly, which is the former
            behaviour.

    Returns:
        ``(ret_X, ret_Y)``: ``ret_X`` stacks ``X`` on top of the accepted
        negative rows; ``ret_Y`` has shape ``(rows, 1)`` with +1 for the
        positive rows and -1 for the negatives.

    Raises:
        KeyError: if ``idx_map`` is given and a sampled node id is not in it.

    Candidates that are self pairs or existing edges are dropped and not
    resampled, so fewer negatives than requested may be returned.
    """
    # list(...) keeps the keys indexable on Python 3, where keys() is a view.
    exist_node_list = list(node_emb_dict.keys())
    exist_N = len(exist_node_list)
    num_edge = X.shape[0]

    pos_X = X

    neg_count = int(num_edge * neg_rate)
    if neg_count > 0 and exist_N > 0:
        neg_idx = np.random.randint(exist_N, size=[neg_count, 2])
    else:
        neg_idx = []

    neg_rows = []
    for src_pos, dst_pos in neg_idx:
        src = exist_node_list[src_pos]
        dst = exist_node_list[dst_pos]
        if src == dst:
            continue
        # Translate node ids to matrix positions when a mapping is supplied.
        if idx_map is None:
            row, col = src, dst
        else:
            row, col = idx_map[src], idx_map[dst]
        if adj_mat[row, col] == 0:
            neg_rows.append(
                np.concatenate([node_emb_dict[src], node_emb_dict[dst]], axis=-1))

    # Stacking the 1-D negative rows directly also covers the empty case, where
    # np.vstack([]) used to raise ValueError.
    ret_X = np.vstack([pos_X] + neg_rows)
    ret_Y = np.ones([ret_X.shape[0], 1])
    ret_Y[pos_X.shape[0]:, 0] = -1
    return ret_X, ret_Y
# Build the SVM training set: every positive edge plus sampled negatives.
# NOTE(review): generate_neg_X is called without an id -> index mapping and so
# indexes adj_mat with raw node ids, whereas the first cell fills adj_mat
# through idx_map -- confirm the negative filter checks the intended entries.
# NOTE(review): train_X is rebound here; an earlier cell uses the same name for
# the 90% split of X.
train_X, train_Y = generate_neg_X(X, adj_mat, node_emb_dict,neg_rate=1.5)
# Single-argument print calls behave the same on Python 2 and 3; the former
# print statements are a SyntaxError on Python 3.
print(train_X.shape)
prob = svm_problem(train_Y.flatten(),train_X)
print('prob done')

(714296, 256)
prob done


In [None]:
# Train a libsvm model on `prob` from the previous cell.
# Option string, per libsvm's documented flags: -s 0 selects C-SVC, -t 2 the
# RBF kernel, -m 3000 the kernel cache size in MB.
param = svm_parameter('-s 0 -t 2 -m 3000')
m = svm_train(prob, param)
# An empty list is passed as the label argument, so no ground-truth labels are
# supplied for test_X. NOTE(review): p_acc is presumably meaningless in that
# case -- confirm against libsvm's svm_predict documentation.
p_labels, p_acc, p_vals = svm_predict([], test_X, m)

### GCN version

In [3]:
# GCN version
import numpy as np
import scipy.sparse as sp
import torch


def encode_onehot(labels):
    """One-hot encode a sequence of hashable labels.

    Args:
        labels: re-iterable sequence of hashable class labels.

    Returns:
        int32 array with one row per label and one column per distinct class.
        The column assigned to a class follows the iteration order of
        ``set(labels)``, so it is not guaranteed to be stable between runs.
    """
    classes = set(labels)
    # Build the identity matrix once; the former code rebuilt it per class.
    eye = np.identity(len(classes))
    classes_dict = {c: eye[i, :] for i, c in enumerate(classes)}
    labels_onehot = np.array(list(map(classes_dict.get, labels)),
                             dtype=np.int32)
    return labels_onehot


def load_data(path, idx_map):
    """Read an edge list and return its raw and normalised adjacency matrices.

    Args:
        path: text file parsed by ``np.genfromtxt`` as int32; column 0 is used
            as the row (source) id and column 1 as the column (destination) id.
        idx_map: dict mapping node id -> matrix index; its length fixes the
            matrix size.

    Returns:
        ``(src_adj, adj)``: ``src_adj`` is the directed scipy COO matrix built
        straight from the file; ``adj`` is the symmetrised matrix with the
        identity added, row-normalised and converted to a torch sparse tensor.
    """
    print('Loading from file {} ...'.format(path))

    # Parse the file and translate every node id to its matrix index.
    raw_edges = np.genfromtxt(path, dtype=np.int32)
    node_total = len(idx_map)
    mapped_ids = [idx_map.get(node_id) for node_id in raw_edges.flatten()]
    edges = np.array(mapped_ids, dtype=np.int32).reshape(raw_edges.shape)

    # Directed adjacency: one unit weight per listed edge.
    unit_weights = np.ones(edges.shape[0])
    src_adj = sp.coo_matrix((unit_weights, (edges[:, 0], edges[:, 1])),
                            shape=(node_total, node_total),
                            dtype=np.float32)

    # Symmetrise: wherever the transposed entry is larger, take it instead.
    transpose_larger = src_adj.T > src_adj
    symmetric = (src_adj
                 + src_adj.T.multiply(transpose_larger)
                 - src_adj.multiply(transpose_larger))

    # Add self loops, row-normalise, and hand the result over to torch.
    with_self_loops = symmetric + sp.eye(symmetric.shape[0])
    normalized = normalize(with_self_loops)
    return src_adj, sparse_mx_to_torch_sparse_tensor(normalized)


def normalize(mx):
    """Row-normalize a sparse matrix.

    Every row is scaled by the reciprocal of its sum; rows whose sum is zero
    are left as all zeros (their infinite reciprocal is replaced by 0).
    """
    row_totals = np.array(mx.sum(1))
    reciprocal = np.power(row_totals, -1).flatten()
    # A zero row sum yields inf; neutralise it so the row stays zero.
    reciprocal[np.isinf(reciprocal)] = 0.
    scaling = sp.diags(reciprocal)
    return scaling.dot(mx)


def accuracy(output, labels):
    """Fraction of rows whose highest-scoring column equals the label.

    Returns a double tensor holding ``correct / len(labels)``.
    """
    _, predicted = output.max(1)
    hits = predicted.type_as(labels).eq(labels).double().sum()
    return hits / len(labels)
def accuracy_mse(output, labels):
    """Fraction of regression outputs within 0.5 (exclusive) of their target."""
    within_half = (output - labels).abs() < 0.5
    hit_count = within_half.sum().item()
    return hit_count / len(labels)
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor.

    The matrix is converted to COO format with float32 values; the result is a
    float32 sparse COO tensor of the same shape.
    """
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    # Shape (2, nnz): row indices on top of column indices, as torch expects.
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    # torch.sparse_coo_tensor replaces the deprecated
    # torch.sparse.FloatTensor(indices, values, shape) constructor.
    return torch.sparse_coo_tensor(indices, values, shape)

def get_node_set(path):
    """Return the set of distinct integer ids found in an edge-list file.

    Behaves the same as the ``get_node_set`` defined in the first cell of this
    notebook: the file is parsed with ``np.genfromtxt`` as int32 and every
    parsed value is collected into a set.
    """
    edge_table = np.genfromtxt(path, dtype=np.int32)
    return set(edge_table.ravel().tolist())
# load_data returns (src_adj, adj): `adj` here is therefore the directed,
# unnormalised scipy COO matrix and `sym_adj` the symmetrised, row-normalised
# torch sparse tensor.
# NOTE(review): idx_map is not defined in this cell; it comes from the first
# cell and must already exist in the kernel.
adj, sym_adj = load_data('./kaggle/t1-merge.txt', idx_map)
# print adj.shape


# GCN version
import torch.nn as nn
import torch.nn.functional as F
from layers import GraphConvolution
import time
import torch.optim as optim


class GCN(nn.Module):
    """Three-layer graph convolutional network over learned node embeddings.

    Args:
        node_num: number of rows in the embedding table.
        nhid: width of the embedding and of both hidden graph convolutions.
        nclass: width of the output graph convolution.
        dropout_rate: dropout probability applied after each hidden layer.
    """

    def __init__(self, node_num, nhid, nclass, dropout_rate):
        super(GCN, self).__init__()
        self.emb = nn.Embedding(node_num, nhid)
        self.gc1 = GraphConvolution(nhid, nhid)
        self.mid = GraphConvolution(nhid, nhid)
        self.gc2 = GraphConvolution(nhid, nclass)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, adj):
        """Return ``(output, hidden)`` for node indices ``x`` and graph ``adj``.

        ``hidden`` is the SELU activation of the middle layer, taken before the
        second dropout; ``output`` is the raw result of the last graph
        convolution (no softmax is applied).
        """
        embedded = self.emb(x)
        first = self.dropout(F.selu(self.gc1(embedded, adj)))
        hidden = F.selu(self.mid(first, adj))
        output = self.gc2(self.dropout(hidden), adj)
        return output, hidden


def count_degree(adj):
    """Unfinished placeholder: reads the row count of ``adj`` and returns None."""
    _node_total = adj.shape[0]  # evaluated but never used
    return None
def count_neighbor(adj):
    """Count the distinct neighbours of every node, ignoring edge direction.

    Only stored entries whose value equals 1 are treated as edges; each such
    entry makes its row and column nodes neighbours of one another.

    Returns:
        Array of shape ``(N, 1)`` with one neighbour count per node.
    """
    total = adj.shape[0]
    neighbours = {node: set() for node in range(total)}

    rows, cols, values = sp.find(adj)
    for pos in range(len(values)):
        if values[pos] == 1:
            a, b = rows[pos], cols[pos]
            neighbours[a].add(b)
            neighbours[b].add(a)

    counts = [len(neighbours[node]) for node in range(total)]
    return np.array(counts).reshape([-1, 1])

# Load data
# `adj` is the first value returned by load_data: the directed, unnormalised
# scipy COO matrix (rows taken from column 0 of the edge file, columns from
# column 1).
node_num = adj.shape[0]
# The only input feature of a node is its own index, which GCN.forward feeds
# to the nn.Embedding table.
features = np.arange(node_num)


# Column sums / row sums of adj: total incoming / outgoing edge weight per node.
in_degree = np.sum(adj, axis=0).flatten()
out_degree = np.sum(adj, axis=1).flatten()
neighbor_count = count_neighbor(adj)
# NOTE(review): `degree` is not referenced again in the visible source.
degree = in_degree + out_degree
# NOTE(review): presumably intended as the matrix product adj @ adj (two-hop
# path weights) -- confirm that np.dot does that for a scipy sparse operand.
second_order_adj = np.dot(adj, adj)
second_order_degree = np.sum(second_order_adj, axis=0).flatten() + np.sum(second_order_adj, axis=1).flatten()


# Regression targets: output column 0 is trained against labels_1 (in-degree);
# output column 1 is paired with labels_2, but that loss term has weight 0.
labels_1 = in_degree.reshape([-1, 1])
labels_2 = second_order_degree.reshape([-1, 1])
idx = np.random.permutation(node_num)
# NOTE(review): the training set is the FIRST tenth of the nodes and validation
# the remaining nine tenths -- the opposite proportions of the edge split used
# for the Keras model. Confirm this is intended.
train_idx, val_idx = idx[:node_num//10], idx[node_num//10:]

# 
features = torch.LongTensor(features)
    
labels_1 = torch.FloatTensor(labels_1)
labels_2 = torch.FloatTensor(labels_2)
# NOTE(review): label_neighbor_count is only printed; the assignment that would
# use it as a target is commented out below.
label_neighbor_count = torch.FloatTensor(neighbor_count)
# labels_1 = label_neighbor_count
print(label_neighbor_count.size())
# Model and optimizer
model = GCN(
            node_num=node_num,
            nhid=128,
            nclass=2,
            dropout_rate=0.3)
optimizer = optim.Adam(model.parameters(),
                       lr=0.001)
criterion = nn.MSELoss()
l1_criterion = nn.L1Loss()
# Move the model and every tensor used in the loop to the GPU.
model.cuda()
features = features.cuda()
sym_adj = sym_adj.cuda()
labels_1 = labels_1.cuda()
labels_2 = labels_2.cuda()
# Full-batch training: each iteration is one forward/backward pass over all
# nodes. The loop only ends after 999,999 iterations; the captured output shows
# the run being stopped with a KeyboardInterrupt.
for i in range(1,1000000):
        
    t = time.time()
    optimizer.zero_grad()
    # `l1` here is the hidden activation returned by GCN.forward, not a loss.
    output, l1 = model(features, sym_adj)
    loss_train = 1. * criterion(output[train_idx,0], labels_1[train_idx,0]) + \
                0.*criterion(output[train_idx,1], labels_2[train_idx,0]) 
    # Penalties that pull the hidden activations towards zero: mean absolute
    # value (l1_loss) and mean squared value (l2_loss).
    l1_loss = l1_criterion(l1, torch.zeros_like(l1).cuda())
    l2_loss = criterion(l1, torch.zeros_like(l1).cuda())
#     loss_train = criterion(output[train_idx,0], labels_1[train_idx,0])
    # Weights of the two activation penalties in the optimised objective.
    a = 500
    b = 3000
    (loss_train + a*l1_loss + b*l2_loss).backward()
#     print(a*l1_loss.item() ,b*l2_loss.item())
#     (loss_train).backward()
    optimizer.step()
#     val
    # NOTE(review): validation reuses `output` from the training forward pass
    # above; model.eval() is never called in the visible code, so dropout is
    # active and the values predate this iteration's optimizer step.
    loss_val = 1. * criterion(output[val_idx,0], labels_1[val_idx,0]) + \
                0.*criterion(output[val_idx,1], labels_2[val_idx,0])
#     loss_val = criterion(output[val_idx,0], labels_1[val_idx,0])

    acc1 = accuracy_mse(output[val_idx,0], labels_1[val_idx,0])
    acc2 = accuracy_mse(output[val_idx,1], labels_2[val_idx,0])
    if i % 10 ==0:
        # NOTE(review): the label shows i+1 (0011, 0021, ... in the captured
        # output), and the elapsed time is multiplied by 100 although it is
        # labelled "s".
        print('Epoch: {:04d}'.format(i+1),
              'loss_train: {:.4f}'.format(loss_train.item()),
              'loss_val: {:.4f}'.format(loss_val.item()),
              'acc1_val: {:.4f}'.format(acc1),
              'acc2_val: {:.4f}'.format(acc2),
              'time: {:.4f}s'.format((time.time() - t)*100))

    # NOTE(review): bare attribute reference -- nothing is called, so this line
    # has no effect.
    nn.init.xavier_uniform_

# Train model
print("Optimization Finished!")
# print("Total time elapsed: {:.4f}s".format(time.time() - t_total))




Loading from file ./kaggle/t1-merge.txt ...
torch.Size([29402, 1])
Epoch: 0011 loss_train: 591.0887 loss_val: 487.7440 acc1_val: 0.1408 acc2_val: 0.0183 time: 14.8307s
Epoch: 0021 loss_train: 588.7114 loss_val: 485.4294 acc1_val: 0.1406 acc2_val: 0.0197 time: 14.9518s
Epoch: 0031 loss_train: 588.4152 loss_val: 485.2107 acc1_val: 0.1397 acc2_val: 0.0197 time: 14.7854s
Epoch: 0041 loss_train: 588.4144 loss_val: 485.2620 acc1_val: 0.1399 acc2_val: 0.0201 time: 14.8511s
Epoch: 0051 loss_train: 588.0554 loss_val: 484.9533 acc1_val: 0.1399 acc2_val: 0.0197 time: 15.1941s
Epoch: 0061 loss_train: 587.6581 loss_val: 484.6120 acc1_val: 0.1398 acc2_val: 0.0202 time: 14.8462s
Epoch: 0071 loss_train: 587.0911 loss_val: 484.1024 acc1_val: 0.1391 acc2_val: 0.0193 time: 14.9412s
Epoch: 0081 loss_train: 586.4742 loss_val: 483.5817 acc1_val: 0.1388 acc2_val: 0.0185 time: 14.8397s
Epoch: 0091 loss_train: 585.7252 loss_val: 482.8959 acc1_val: 0.1400 acc2_val: 0.0188 time: 14.8448s
Epoch: 0101 loss_train: 

Epoch: 0821 loss_train: 316.9384 loss_val: 361.8237 acc1_val: 0.0594 acc2_val: 0.0163 time: 15.4442s
Epoch: 0831 loss_train: 315.0992 loss_val: 363.2898 acc1_val: 0.0589 acc2_val: 0.0163 time: 14.9397s
Epoch: 0841 loss_train: 312.8734 loss_val: 361.7029 acc1_val: 0.0609 acc2_val: 0.0163 time: 15.0882s
Epoch: 0851 loss_train: 310.4143 loss_val: 362.5493 acc1_val: 0.0607 acc2_val: 0.0163 time: 15.0976s
Epoch: 0861 loss_train: 307.3359 loss_val: 362.6578 acc1_val: 0.0585 acc2_val: 0.0164 time: 15.1904s
Epoch: 0871 loss_train: 306.6123 loss_val: 361.9621 acc1_val: 0.0574 acc2_val: 0.0164 time: 14.9796s
Epoch: 0881 loss_train: 302.2325 loss_val: 362.1649 acc1_val: 0.0559 acc2_val: 0.0163 time: 15.3409s
Epoch: 0891 loss_train: 301.8675 loss_val: 362.4531 acc1_val: 0.0593 acc2_val: 0.0164 time: 15.1403s
Epoch: 0901 loss_train: 300.3988 loss_val: 363.4804 acc1_val: 0.0580 acc2_val: 0.0164 time: 14.8650s
Epoch: 0911 loss_train: 298.0191 loss_val: 362.5672 acc1_val: 0.0561 acc2_val: 0.0163 time:

Epoch: 1641 loss_train: 202.2951 loss_val: 398.9748 acc1_val: 0.0448 acc2_val: 0.0164 time: 14.9870s
Epoch: 1651 loss_train: 204.3381 loss_val: 396.5375 acc1_val: 0.0449 acc2_val: 0.0163 time: 15.2906s
Epoch: 1661 loss_train: 201.7617 loss_val: 395.2855 acc1_val: 0.0452 acc2_val: 0.0164 time: 15.1136s
Epoch: 1671 loss_train: 201.4439 loss_val: 397.5036 acc1_val: 0.0448 acc2_val: 0.0164 time: 15.0747s
Epoch: 1681 loss_train: 199.7089 loss_val: 400.7058 acc1_val: 0.0475 acc2_val: 0.0164 time: 15.1904s
Epoch: 1691 loss_train: 198.7265 loss_val: 402.2788 acc1_val: 0.0466 acc2_val: 0.0164 time: 14.9898s
Epoch: 1701 loss_train: 199.6071 loss_val: 397.7327 acc1_val: 0.0464 acc2_val: 0.0163 time: 14.9878s
Epoch: 1711 loss_train: 198.8903 loss_val: 397.4721 acc1_val: 0.0452 acc2_val: 0.0163 time: 15.1448s
Epoch: 1721 loss_train: 195.9617 loss_val: 405.5149 acc1_val: 0.0454 acc2_val: 0.0164 time: 15.1911s
Epoch: 1731 loss_train: 196.5045 loss_val: 404.0506 acc1_val: 0.0451 acc2_val: 0.0164 time:

Epoch: 2461 loss_train: 148.5095 loss_val: 467.8900 acc1_val: 0.0380 acc2_val: 0.0164 time: 15.0961s
Epoch: 2471 loss_train: 148.8033 loss_val: 468.5698 acc1_val: 0.0373 acc2_val: 0.0164 time: 15.0874s
Epoch: 2481 loss_train: 146.6314 loss_val: 471.4716 acc1_val: 0.0385 acc2_val: 0.0164 time: 15.1905s
Epoch: 2491 loss_train: 146.2769 loss_val: 474.3130 acc1_val: 0.0393 acc2_val: 0.0164 time: 14.9920s
Epoch: 2501 loss_train: 145.0467 loss_val: 478.4380 acc1_val: 0.0372 acc2_val: 0.0163 time: 14.9844s
Epoch: 2511 loss_train: 141.6173 loss_val: 484.0628 acc1_val: 0.0406 acc2_val: 0.0164 time: 15.1719s
Epoch: 2521 loss_train: 143.6904 loss_val: 477.2281 acc1_val: 0.0393 acc2_val: 0.0163 time: 15.1913s
Epoch: 2531 loss_train: 142.7814 loss_val: 481.2914 acc1_val: 0.0381 acc2_val: 0.0163 time: 15.0448s
Epoch: 2541 loss_train: 147.8294 loss_val: 466.7887 acc1_val: 0.0379 acc2_val: 0.0165 time: 15.2903s
Epoch: 2551 loss_train: 141.2921 loss_val: 483.5156 acc1_val: 0.0384 acc2_val: 0.0163 time:

KeyboardInterrupt: 

In [7]:
# output GCN emb
# torch.no_grad() only disables gradient tracking; dropout is controlled by the
# module's train/eval flag. Switch to eval mode so the exported activations are
# deterministic, then restore whatever mode the model was in.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        output, l1 = model(features, sym_adj)
finally:
    model.train(was_training)

# `l1` is the hidden activation returned second by GCN.forward; it is what gets
# exported as the node embedding.
l1 = l1.cpu().numpy()
# Matrix index -> original node id (inverse of idx_map).
rev_map = {v:k for k,v in idx_map.items()}
# File layout: a "<node_count> <dim>" header line, then one
# "<node_id> <v1> <v2> ..." line per node, values written with '%f'.
with open('GCN.emb','w') as f:
    f.write('%d %d\n' % (node_num, l1.shape[1]))
    for i in range(l1.shape[0]):
        f.write('%d' % rev_map[i])
        for x in l1[i,:].flatten().tolist():
            f.write(' %f' % x)
        f.write('\n')
# Single-argument print call works on Python 2 and 3 alike; the former print
# statement is a SyntaxError on Python 3.
print('done')
        


done
