# MAXP 2021初赛数据探索和处理-3

使用步骤1里处理好的节点的ID，来构建DGL的graph所需要的边列表。

In [None]:
import pandas as pd
import numpy as np
import os

import dgl

In [51]:
# Relative locations of the published competition data.
base_path = '../'
publish_path = 'MAXPdata'

# Both CSV inputs live in the same directory, so build their paths in one pass.
link_p1_path, nodes_path = (
    os.path.join(base_path, publish_path, csv_name)
    for csv_name in ('link_phase1.csv', 'IDandLabels.csv')
)

### 读取节点列表

In [None]:
# Load the node table; the 'Label' column is forced to str dtype.
nodes_df = pd.read_csv(nodes_path, dtype=dict(Label=str))
print(nodes_df.shape)
nodes_df.head(n=4)

### 读取边列表

In [None]:
# Load the raw edge list (one row per link in link_phase1.csv).
edges_df = pd.read_csv(filepath_or_buffer=link_p1_path)
print(edges_df.shape)
edges_df.head(n=5)

## Join点列表和边列表以生成从0开始的边列表

DGL的节点ID默认从0开始编号，并按最大节点ID（加1）确定Graph的节点数量，因此这里我们先把边列表中的节点映射为从0开始的ID。

In [None]:
# Attach node-table columns twice: first matched on the citing paper
# ('paper_id'), then on the cited paper ('reference_paper_id'). Columns that
# clash in the second merge receive pandas' default '_x' / '_y' suffixes.
edges = (
    edges_df
    .merge(nodes_df, on='paper_id', how='left')
    .merge(nodes_df, left_on='reference_paper_id', right_on='paper_id', how='left')
)
print(edges.shape)
edges.head(n=4)

#### 修改node_idx_* 列的名称作为新的node id，并只保留需要的列

In [None]:
# Give the merge-suffixed columns their final names ...
edges = edges.rename(
    columns={'paper_id_x': 'paper_id', 'node_idx_x': 'src_nid', 'node_idx_y': 'dst_nid'},
)
# ... and keep only the four columns used from here on.
edges = edges.loc[:, ['src_nid', 'dst_nid', 'paper_id', 'reference_paper_id']]
edges.head(n=4)

## 构建DGL的Graph

In [None]:
# Convert the source and destination node-id columns to NumPy arrays.
src_nid, dst_nid = (edges[col].to_numpy() for col in ('src_nid', 'dst_nid'))

In [None]:
# Sorted, de-duplicated ids of every node that is an endpoint of some edge.
AllNodes = np.unique(np.concatenate([src_nid, dst_nid]))

In [None]:
# Wrap the edge-endpoint ids in a single-column DataFrame named 'EdgeNodes'.
test_dict = dict(EdgeNodes=AllNodes)
EdgeNodes = pd.DataFrame(data=test_dict)

In [None]:
# From here on AllNodes is the full node-index column of the node table
# (a one-column DataFrame), replacing the array of edge endpoints.
AllNodes = nodes_df.loc[:, ['node_idx']]

# Nodes of the node table that never occur as an endpoint of any edge.
DiffNodes = AllNodes.loc[~AllNodes['node_idx'].isin(EdgeNodes['EdgeNodes'])]
DiffNodes

In [None]:
# Writing the .npy files fails with FileNotFoundError if the output directory
# is missing, so create it first (no-op when it already exists).
os.makedirs('../MAXPdata/AddEdges', exist_ok=True)

# Ids of nodes that take part in at least one edge.
with open('../MAXPdata/AddEdges/EdgeNodes.npy', 'wb') as f:
    np.save(f, EdgeNodes['EdgeNodes'].to_numpy())

# Ids of nodes that take part in no edge.
with open('../MAXPdata/AddEdges/DiffNodes.npy', 'wb') as f:
    np.save(f, DiffNodes['node_idx'].to_numpy())

In [None]:
import pandas as pd
import numpy as np
import os
import dgl

# Reload the two node-id arrays written earlier (now plain NumPy arrays).
EdgeNodes, DiffNodes = (
    np.load(npy_path)
    for npy_path in (
        '../MAXPdata/AddEdges/EdgeNodes.npy',
        '../MAXPdata/AddEdges/DiffNodes.npy',
    )
)
# Feature matrix, indexed by node id along its first axis in the next cell.
features = np.load(os.path.join('../MAXPdata', 'features.npy'))

In [None]:
# Feature rows of the edge-less nodes and of the nodes that have edges.
# Row k of each result belongs to node DiffNodes[k] / EdgeNodes[k] respectively.
DiffFeat = features[DiffNodes, :]
EdgeFeat = features[EdgeNodes, :]

In [None]:
def compute_distances_no_loops(MatrA, X):
    """
    Compute all pairwise Euclidean distances between rows of X and MatrA.

    Uses the expansion ||x - a||^2 = ||x||^2 - 2 * x.a + ||a||^2 so the whole
    distance matrix comes from one matrix product, with no explicit loops.

    Parameters
    ----------
    MatrA : array-like, shape (num_train, dim)
        Reference points, one per row.
    X : array-like, shape (num_test, dim)
        Query points, one per row.

    Returns
    -------
    numpy.ndarray, shape (num_test, num_train)
        Entry [i, j] is the Euclidean distance between X[i] and MatrA[j].
    """
    MatrA = np.asarray(MatrA)
    X = np.asarray(X)

    sq_a = np.sum(np.square(MatrA), axis=1)                 # (num_train,)
    sq_x = np.sum(np.square(X), axis=1)[:, np.newaxis]      # (num_test, 1)
    sq_dists = -2 * np.dot(X, MatrA.T) + sq_a + sq_x

    # Rounding in the expansion can leave tiny negative values for (nearly)
    # identical points; clamp them so sqrt does not return NaN.
    return np.sqrt(np.maximum(sq_dists, 0))

In [None]:
import os

# Restrict CUDA to device "0"; this is set before torch is imported.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

In [75]:
import torch

def EuclideanDistances(a,b):
    """Return the pairwise Euclidean distance matrix between rows of a and b.

    Args:
        a: 2-D tensor of shape [m, d].
        b: 2-D tensor of shape [n, d].

    Returns:
        Tensor of shape [m, n]; entry [i, j] is the distance between a[i]
        and b[j], computed as sqrt(|a|^2 + |b|^2 - 2 * a.b).
    """
    sum_sq_a = torch.sum(a ** 2, dim=1).unsqueeze(1)  # [m] -> [m, 1]
    sum_sq_b = torch.sum(b ** 2, dim=1).unsqueeze(0)  # [n] -> [1, n]
    sq_dists = sum_sq_a + sum_sq_b - 2 * a.mm(b.t())
    # Floating-point cancellation can push the squared distance of (nearly)
    # identical rows slightly below zero, which would make sqrt return NaN.
    return torch.sqrt(torch.clamp(sq_dists, min=0))

# Tiny hand-checkable example: two identical reference points against seven
# query points that differ only in the second coordinate.
Ma = np.array([[1, 2]] * 2)
Mb = np.array([
    [1, 1],
    [1, 1],
    [1, 2],
    [1, 8],
    [1, 2],
    [1, 2],
    [1, 5],
])
# The helper returns one row per row of Mb; transpose to one row per row of Ma.
Dists = np.transpose(compute_distances_no_loops(Ma, Mb))
print(Dists)

# with torch.no_grad():
#     Ma = torch.from_numpy(Ma).float().cuda()
#     Mb = torch.from_numpy(Mb).float().cuda()
#     Dists=EuclideanDistances(Ma, Mb)
# print(Dists)

# del Ma, Mb, Dists
# torch.cuda.empty_cache()

[[1. 1. 0. 6. 0. 0. 3.]
 [1. 1. 0. 6. 0. 0. 3.]]


In [32]:
# For every row of DiffFeat, find the row of EdgeFeat with the smallest
# Euclidean distance. DiffFeat is processed in chunks of `Step` rows so that
# only a (Step x num_edge_rows) distance matrix exists at any time.
RANGE = DiffFeat.shape[0]
Step = 300
# Use the GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
nearest_chunks = []
with torch.no_grad():
    Mb = torch.from_numpy(EdgeFeat).float().to(device)
    for i in range(0, RANGE, Step):
        # Slicing already stops at the end of the array on the last chunk.
        Ma = torch.from_numpy(DiffFeat[i:i + Step]).float().to(device)
        Dists = EuclideanDistances(Ma, Mb)
        nearest_chunks.append(torch.argmin(Dists, dim=1).cpu())

# Concatenate once at the end (int64 throughout) instead of growing a float
# tensor inside the loop. NOTE: entries of `save` are row positions in
# EdgeFeat, not node ids.
if nearest_chunks:
    save = torch.cat(nearest_chunks, 0)
else:
    save = torch.empty(0, dtype=torch.long)


In [46]:
# Each edge-less node becomes the source of one new edge.
AddSrc = DiffNodes.astype(np.int32)
# `save` holds row positions into EdgeFeat (= features[EdgeNodes]), not node
# ids, so translate each position back to the node id stored in EdgeNodes.
# reshape(-1) accepts both the 1-D array loaded from disk and a one-column table.
nearest_rows = save.numpy().astype(np.int64)
AddDst = np.asarray(EdgeNodes).reshape(-1)[nearest_rows].astype(np.int32)

In [73]:
# Load the graph list stored in graph.bin; the second return value of
# dgl.load_graphs is discarded.
graphs, _ = dgl.load_graphs(os.path.join(base_path, publish_path, 'graph.bin'))
# Display the first (index 0) graph.
graphs[0]

Graph(num_nodes=3655452, num_edges=29792735,
      ndata_schemes={}
      edata_schemes={})

In [71]:
# Add the new edges AddSrc[k] -> AddDst[k] to the loaded graph (modifies graphs[0]).
graphs[0].add_edges(AddSrc, AddDst)
# Display the graph again to check its edge count.
graphs[0]

Graph(num_nodes=3655452, num_edges=29792735,
      ndata_schemes={}
      edata_schemes={})

In [None]:
# Build a DGL graph directly from the (src_nid, dst_nid) edge list.
# NOTE(review): this `graph` is not the object written to NEWgraph.bin in the
# final cell (that one is graphs[0]) — confirm whether this cell is still needed.
graph = dgl.graph((src_nid, dst_nid))
print(graph)

In [72]:
# Store the augmented graph in DGL's binary format so that later modelling
# steps can load it quickly.
graph_path = os.path.join(base_path, publish_path, 'NEWgraph.bin')
dgl.save_graphs(graph_path, [graphs[0]])