In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import networkx.convert_matrix
import os.path as osp

import torch
from torch_geometric.data import Data

import torch
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

from torch_geometric.utils import negative_sampling
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv
from torch_geometric.utils import train_test_split_edges
from tqdm import tqdm
from torch_geometric.nn import SAGEConv
from sklearn.utils.extmath import softmax

In [2]:
def pre(train, content):
    
    
    content.rename(columns={ content.columns[0]: "id" }, inplace=True)
    contentArray = content.values
    contentArray_sort = contentArray[contentArray[:,0].argsort()]
    contentArray_sort = pd.DataFrame(contentArray_sort)

    unconnected_pairs = []
    matrix =np.zeros( (contentArray_sort.shape[0], contentArray_sort.shape[0]) )
    
    for index,row in train.iterrows():
        if row[3]==1:
            x = row[1]
            y = row[2]
            matrix[x, y] = 1
            matrix[y, x] = 1
        else:
            x = row[1]
            y = row[2]
            matrix[x, y] = 0
            matrix[y, x] = 0

    return np.asmatrix(matrix), contentArray_sort
def edge_sort_train(train):
    train_link = train.loc[:,['to', 'from', 'label']]

    for i in train_link.index:
        t = train_link.loc[i]['to']
        f = train_link.loc[i]['from']
        if t > f :
            train_link.loc[i]['to'] = f
            train_link.loc[i]['from'] = t

    train_link = train_link.sort_values(by='to')
    train_link = train_link[~train_link.duplicated()] # -> with duplicated edge exists
    train_link.index = range(train_link.shape[0])

    return train_link

def edge_sort_test(train):
    train_link = train.loc[:,['id', 'to', 'from']]

    for i in train_link.index:
        t = train_link.loc[i]['to']
        f = train_link.loc[i]['from']
        if t > f :
            train_link.loc[i]['to'] = f
            train_link.loc[i]['from'] = t

    train_link = train_link.sort_values(by='to')
    train_link.index = range(train_link.shape[0])

    return train_link

def get_fea1(e_sort, str_fea):
    
    fea = []
    for i in e_sort.index:

        t = str_fea.loc[e_sort.loc[i, 'to'], :].values
        f = str_fea.loc[e_sort.loc[i, 'from'], :].values
        fea.append(t * f)

    fea = np.array(fea)
    
    return fea

def get_fea2(e_sort, str_fea):
    
    fea = []
    for i in e_sort.index:

        t = str_fea.loc[e_sort.loc[i, 'to'], :].values
        f = str_fea.loc[e_sort.loc[i, 'from'], :].values
        fea.append(np.dot(t, f))

    fea = np.array(fea)
    
    return fea

In [3]:
edge_test = pd.read_csv('test_3.csv')
e_sort_test = edge_sort_test(edge_test)
u_ex = pd.read_csv('upload_3.csv')
train = pd.read_csv('train_3.csv')
edge_train = edge_sort_train(train)
content = pd.read_csv('content_3.csv', delimiter='\t',header=None)
content = content.drop(0, axis=1)
train_adj, content_p = pre(train, content)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_link.loc[i]['to'] = f
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_link.loc[i]['from'] = t


In [4]:
n_fea = pd.read_csv('n_100fea3.csv')


In [5]:
edge_c = np.concatenate((edge_train.loc[:, ['from', 'to']].values, edge_train.loc[:, ['to', 'from']].values), axis=0)
edge_index = torch.tensor(edge_c, dtype=torch.long)

In [6]:
data = Data(edge_index=edge_index.t())
data.num_nodes= content.shape[0]
data.x = torch.tensor(n_fea.values, dtype=torch.float32)
data = train_test_split_edges(data,  test_ratio=0.001)

In [7]:


class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(data.x.shape[1], 70)
        self.conv2 = GCNConv(70, 40)
        self.conv3 = GCNConv(40, 16)

    def encode(self):
        x = self.conv1(data.x, data.train_pos_edge_index)
        x = x.relu()
        x = self.conv2(x, data.train_pos_edge_index)
        x = x.relu()
        return self.conv3(x, data.train_pos_edge_index)

    def decode(self, z, pos_edge_index, neg_edge_index):
        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1)
        logits = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)
        return logits

    def decode_all(self, z):
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model, data = Net().to(device), data.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)


def get_link_labels(pos_edge_index, neg_edge_index):
    E = pos_edge_index.size(1) + neg_edge_index.size(1)
    link_labels = torch.zeros(E, dtype=torch.float, device=device)
    link_labels[:pos_edge_index.size(1)] = 1.
    return link_labels


def train():
    model.train()

    neg_edge_index = negative_sampling(
        edge_index=data.train_pos_edge_index, num_nodes=data.num_nodes,
        num_neg_samples=data.train_pos_edge_index.size(1))

    optimizer.zero_grad()
    z = model.encode()
    link_logits = model.decode(z, data.train_pos_edge_index, neg_edge_index)
    link_labels = get_link_labels(data.train_pos_edge_index, neg_edge_index)
    loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
    loss.backward()
    optimizer.step()

    return loss


@torch.no_grad()
def test():
    model.eval()
    perfs = []
    for prefix in ["val", "test"]:
        pos_edge_index = data[f'{prefix}_pos_edge_index']
        neg_edge_index = data[f'{prefix}_neg_edge_index']

        z = model.encode()
        link_logits = model.decode(z, pos_edge_index, neg_edge_index)
        link_probs = link_logits.sigmoid()
        link_labels = get_link_labels(pos_edge_index, neg_edge_index)
        perfs.append(roc_auc_score(link_labels.cpu(), link_probs.cpu()))
    return perfs


In [8]:
best_val_perf = test_perf = 0
for epoch in range(1, 500):
    train_loss = train()
    val_perf, tmp_test_perf = test()
    if val_perf > best_val_perf:
        best_val_perf = val_perf
        test_perf = tmp_test_perf
    log = 'Epoch: {:03d}, Loss: {:.4f}, Val: {:.4f}, Test: {:.4f}'
    print(log.format(epoch, train_loss, best_val_perf, test_perf))



Epoch: 001, Loss: 0.6884, Val: 0.6636, Test: 0.5000
Epoch: 002, Loss: 0.6756, Val: 0.6834, Test: 0.5000
Epoch: 003, Loss: 0.6728, Val: 0.6859, Test: 0.5000
Epoch: 004, Loss: 0.6628, Val: 0.6859, Test: 0.5000
Epoch: 005, Loss: 0.6556, Val: 0.6859, Test: 0.5000
Epoch: 006, Loss: 0.6432, Val: 0.6859, Test: 0.5000
Epoch: 007, Loss: 0.6280, Val: 0.6859, Test: 0.5000
Epoch: 008, Loss: 0.6109, Val: 0.6859, Test: 0.5000
Epoch: 009, Loss: 0.5995, Val: 0.6859, Test: 0.5000
Epoch: 010, Loss: 0.5942, Val: 0.6859, Test: 0.5000
Epoch: 011, Loss: 0.5954, Val: 0.6859, Test: 0.5000
Epoch: 012, Loss: 0.5934, Val: 0.6859, Test: 0.5000
Epoch: 013, Loss: 0.5814, Val: 0.6859, Test: 0.5000
Epoch: 014, Loss: 0.5795, Val: 0.6859, Test: 0.5000
Epoch: 015, Loss: 0.5790, Val: 0.6859, Test: 0.5000
Epoch: 016, Loss: 0.5806, Val: 0.6859, Test: 0.5000
Epoch: 017, Loss: 0.5691, Val: 0.6859, Test: 0.5000
Epoch: 018, Loss: 0.5634, Val: 0.6859, Test: 0.5000
Epoch: 019, Loss: 0.5646, Val: 0.6859, Test: 0.5000
Epoch: 020, 

Epoch: 166, Loss: 0.4799, Val: 0.6897, Test: 1.0000
Epoch: 167, Loss: 0.4764, Val: 0.6897, Test: 1.0000
Epoch: 168, Loss: 0.4777, Val: 0.6897, Test: 1.0000
Epoch: 169, Loss: 0.4831, Val: 0.6897, Test: 1.0000
Epoch: 170, Loss: 0.4798, Val: 0.6897, Test: 1.0000
Epoch: 171, Loss: 0.4841, Val: 0.6897, Test: 1.0000
Epoch: 172, Loss: 0.4783, Val: 0.6897, Test: 1.0000
Epoch: 173, Loss: 0.4851, Val: 0.6897, Test: 1.0000
Epoch: 174, Loss: 0.4808, Val: 0.6897, Test: 1.0000
Epoch: 175, Loss: 0.4810, Val: 0.6897, Test: 1.0000
Epoch: 176, Loss: 0.4815, Val: 0.6897, Test: 1.0000
Epoch: 177, Loss: 0.4729, Val: 0.6897, Test: 1.0000
Epoch: 178, Loss: 0.4705, Val: 0.6897, Test: 1.0000
Epoch: 179, Loss: 0.4810, Val: 0.6897, Test: 1.0000
Epoch: 180, Loss: 0.4713, Val: 0.6897, Test: 1.0000
Epoch: 181, Loss: 0.4754, Val: 0.6897, Test: 1.0000
Epoch: 182, Loss: 0.4804, Val: 0.6897, Test: 1.0000
Epoch: 183, Loss: 0.4810, Val: 0.6897, Test: 1.0000
Epoch: 184, Loss: 0.4759, Val: 0.6897, Test: 1.0000
Epoch: 185, 

Epoch: 331, Loss: 0.4624, Val: 0.6897, Test: 1.0000
Epoch: 332, Loss: 0.4702, Val: 0.6897, Test: 1.0000
Epoch: 333, Loss: 0.4663, Val: 0.6897, Test: 1.0000
Epoch: 334, Loss: 0.4652, Val: 0.6897, Test: 1.0000
Epoch: 335, Loss: 0.4629, Val: 0.6897, Test: 1.0000
Epoch: 336, Loss: 0.4591, Val: 0.6897, Test: 1.0000
Epoch: 337, Loss: 0.4608, Val: 0.6897, Test: 1.0000
Epoch: 338, Loss: 0.4631, Val: 0.6897, Test: 1.0000
Epoch: 339, Loss: 0.4618, Val: 0.6897, Test: 1.0000
Epoch: 340, Loss: 0.4611, Val: 0.6897, Test: 1.0000
Epoch: 341, Loss: 0.4600, Val: 0.6897, Test: 1.0000
Epoch: 342, Loss: 0.4604, Val: 0.6897, Test: 1.0000
Epoch: 343, Loss: 0.4543, Val: 0.6897, Test: 1.0000
Epoch: 344, Loss: 0.4573, Val: 0.6897, Test: 1.0000
Epoch: 345, Loss: 0.4632, Val: 0.6897, Test: 1.0000
Epoch: 346, Loss: 0.4579, Val: 0.6897, Test: 1.0000
Epoch: 347, Loss: 0.4547, Val: 0.6897, Test: 1.0000
Epoch: 348, Loss: 0.4537, Val: 0.6897, Test: 1.0000
Epoch: 349, Loss: 0.4622, Val: 0.6897, Test: 1.0000
Epoch: 350, 

Epoch: 496, Loss: 0.4567, Val: 0.6897, Test: 1.0000
Epoch: 497, Loss: 0.4463, Val: 0.6897, Test: 1.0000
Epoch: 498, Loss: 0.4458, Val: 0.6897, Test: 1.0000
Epoch: 499, Loss: 0.4493, Val: 0.6897, Test: 1.0000


In [9]:
e_sort_train = edge_sort_train(edge_train)
e_sort_test = edge_sort_test(edge_test)
y_train = e_sort_train.label.values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_link.loc[i]['to'] = f
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_link.loc[i]['from'] = t


In [10]:
con_fea = pd.read_csv('con_100fea3.csv')
x_train_con1 = get_fea1(e_sort_train, con_fea)
x_test_con1 = get_fea1(e_sort_test, con_fea)

x_train_con2 = get_fea2(e_sort_train, con_fea)
x_test_con2 = get_fea2(e_sort_test, con_fea)

In [11]:
from sklearn.linear_model import RidgeClassifier
import lightgbm as lgbm

lgb = lgbm.LGBMClassifier(random_state=0)
lgb.fit(x_train_con1, y_train)

ridge = RidgeClassifier()
ridge.fit(x_train_con2.reshape(-1, 1), y_train)

con1_result = lgb.predict_proba(x_test_con1)[:,1]

d = ridge.decision_function(x_test_con2.reshape(-1, 1))
d_2d = np.c_[-d, d]
con2_result = softmax(d_2d)[:, 1]


In [12]:
z = model.encode()
emb = pd.DataFrame(z.cpu().detach().numpy())

In [13]:
x_train_emb1 = get_fea1(e_sort_train, emb)
x_test_emb1 = get_fea1(e_sort_test, emb)

x_train_emb2 = get_fea2(e_sort_train, emb)
x_test_emb2 = get_fea2(e_sort_test, emb)

In [14]:
lgb = lgbm.LGBMClassifier(random_state=0)
lgb.fit(x_train_emb1, y_train)

ridge = RidgeClassifier()
ridge.fit(x_train_emb2.reshape(-1, 1), y_train)

emb1_result = lgb.predict_proba(x_test_emb1)[:,1]

d = ridge.decision_function(x_test_emb2.reshape(-1, 1))
d_2d = np.c_[-d, d]
emb2_result = softmax(d_2d)[:, 1]

In [15]:
test_edge = torch.tensor(e_sort_test.loc[:,['to', 'from']].values.T, dtype=torch.int64).cuda()
logits = (z[test_edge[0]] * z[test_edge[1]]).sum(dim=-1)
nn_result = logits.sigmoid().cpu().detach().numpy()

In [16]:
final = np.mean([con1_result, con2_result, emb1_result, emb2_result, nn_result], axis=0)

In [17]:
d = {'id': e_sort_test.id.values, 'prob': final}
pred = pd.DataFrame(d)
for i in u_ex.index:
    u_ex.loc[i, 'prob'] = pred[pred.id == u_ex.loc[i, 'id']].prob.values
    u_ex.to_csv('u3.csv', index= False)