# MAXP 2021初赛数据探索和处理-1

由于节点的Feature维度比较高，所以先处理节点的ID，并保存。Feature的处理放到第2部分。

In [1]:
import pandas as pd
import numpy as np
import os
import gc

In [2]:
# path
base_path = "root/fl/maxp_baseline_model/"

link_p1_path =  'data/link_phase1.csv'
train_nodes_path = 'data/train_nodes.csv'
val_nodes_path =  'data/validation_nodes.csv'

### 读取边列表并统计节点数量

In [3]:
edge_df = pd.read_csv(link_p1_path)
print(edge_df.shape)
edge_df.head()

(29168650, 3)


Unnamed: 0,paper_id,reference_paper_id,phase
0,f10da75ad1eaf16eb2ffe0d85b76b332,711ef25bdb2c2421c0131af77b3ede1d,phase1
1,9ac5a4327bd4f3dcb424c93ca9b84087,2d91c73304c5e8a94a0e5b4956093f71,phase1
2,9d91bfd4703e55dd814dfffb3d63fc33,33d4fdfe3967a1ffde9311bfe6827ef9,phase1
3,e1bdbce05528952ed6579795373782d4,4bda690abec912b3b7b228b01fb6819a,phase1
4,eb623ac4b10df96835921edabbde2951,c1a05bdfc88a73bf2830e705b2f39dbb,phase1


In [4]:
edge_df.phase.describe()

count     29168650
unique           1
top         phase1
freq      29168650
Name: phase, dtype: object

In [5]:
nodes = pd.concat([edge_df['paper_id'], edge_df['reference_paper_id']])
nodes = pd.DataFrame(nodes.drop_duplicates())
nodes.rename(columns={0:'paper_id'}, inplace=True)

print(nodes.shape)
nodes.head(4)

(3031367, 1)


Unnamed: 0,paper_id
0,f10da75ad1eaf16eb2ffe0d85b76b332
1,9ac5a4327bd4f3dcb424c93ca9b84087
2,9d91bfd4703e55dd814dfffb3d63fc33
3,e1bdbce05528952ed6579795373782d4


#### 在边列表，一共出现了3,031,367个节点(paper_id)

## 读取并查看train_nodes和validation_nodes里面的节点

In [7]:
def process_node(line):
    nid, feat_json, label = line.strip().split('\"')
    
    feat_list = [float(feat[1:-1]) for feat in feat_json[1:-1].split(', ')]
    
    if len(feat_list) != 300:
        print('此行数据有问题 {}'.format(line))
    
    return nid[:-1], feat_list, label[1:]





In [8]:
# 先构建ID和Label的关系，保证ID的顺序和Feature的顺序一致即可
nid_list = []
label_list = []
tr_val_list = []

with open(train_nodes_path, 'r') as f:
    i = 0
    
    for line in f:
        if i > 0:
            nid, _, label = process_node(line)
            nid_list.append(nid)
            label_list.append(label)
            tr_val_list.append(0)             # 0表示train的点
        i += 1
        if i % 100000 == 0:
            print('Processed {} train rows'.format(i))

with open(val_nodes_path, 'r') as f:
    i = 0
    
    for line in f:
        if i > 0:
            nid, _, label = process_node(line)
            nid_list.append(nid)
            label_list.append(label)
            tr_val_list.append(1)             # 1表示validation的点
        i += 1
        if i % 100000 == 0:
            print('Processed {} validation rows'.format(i))
            
nid_arr = np.array(nid_list)
label_arr = np.array(label_list)
tr_val_arr = np.array(tr_val_list)
    
nid_label_df = pd.DataFrame({'paper_id':nid_arr, 'Label': label_arr, 'Split_ID':tr_val_arr})

Processed 100000 train rows
Processed 200000 train rows
Processed 300000 train rows
Processed 400000 train rows
Processed 500000 train rows
Processed 600000 train rows
Processed 700000 train rows
Processed 800000 train rows
Processed 900000 train rows
Processed 1000000 train rows
Processed 1100000 train rows
Processed 1200000 train rows
Processed 1300000 train rows
Processed 1400000 train rows
Processed 1500000 train rows
Processed 1600000 train rows
Processed 1700000 train rows
Processed 1800000 train rows
Processed 1900000 train rows
Processed 2000000 train rows
Processed 2100000 train rows
Processed 2200000 train rows
Processed 2300000 train rows
Processed 2400000 train rows
Processed 2500000 train rows
Processed 2600000 train rows
Processed 2700000 train rows
Processed 2800000 train rows
Processed 2900000 train rows
Processed 3000000 train rows
Processed 100000 validation rows
Processed 200000 validation rows
Processed 300000 validation rows
Processed 400000 validation rows
Process

In [9]:
nid_label_df.reset_index(inplace=True)
nid_label_df.rename(columns={'index':'node_idx'}, inplace=True)
print(nid_label_df.shape)
nid_label_df.head(4)

(3655033, 4)


Unnamed: 0,node_idx,paper_id,Label,Split_ID
0,0,bfdee5ab86ef5e68da974d48a138c28e,S,0
1,1,78f43b8b62f040347fec0be44e5f08bd,,0
2,2,a971601a0286d2701aa5cde46e63a9fd,G,0
3,3,ac4b88a72146bae66cedfd1c13e1552d,,0


In [10]:
# 检查ID在Train和Validation是否有重复
ids = nid_label_df.paper_id.drop_duplicates()
ids.shape

(3655033,)

#### train和validation一共有3,655,033个节点

#### 下面交叉比对边列表里的paper id和节点列表里的ID，检查是否有匹配不上的节点

In [11]:
inboth = nid_label_df.merge(nodes, on='paper_id', how='inner')
print(inboth.shape)

(3030948, 4)


In [12]:
edge_node = nodes.merge(nid_label_df, on='paper_id', how='left')
print(edge_node.shape)
print('共有{}边列表的节点在给出的节点列表里没有对应，缺乏特征'.format(edge_node[edge_node.node_idx.isna()].shape[0]))
edge_node[edge_node.node_idx.isna()].head(4)

(3031367, 4)
共有419边列表的节点在给出的节点列表里没有对应，缺乏特征


Unnamed: 0,paper_id,node_idx,Label,Split_ID
1124,cc388eaec8838ce383d8a8792014fedb,,,
1184,5d899f41e52f751fef843cf7b1d05b4a,,,
14342,2b2004ec3c99a44b5cb6045ca547453e,,,
15803,d657c4451a9617f4eec96d3b2e6092c7,,,


#### 合并边列表里独特的节点和train和validation的节点到一起，构成全部节点列表

In [13]:
# 获取未能匹配上的节点，并构建新的节点DataFrame，然后和原有的Train/Validation节点Concat起来
diff_nodes = edge_node[edge_node.node_idx.isna()]
diff_nodes.ID = diff_nodes.paper_id
diff_nodes.Split_ID = 1
diff_nodes.node_idx = 0
diff_nodes.reset_index(inplace=True)
diff_nodes.drop(['index'], axis=1, inplace=True)
diff_nodes.node_idx = diff_nodes.node_idx + diff_nodes.index + 3655033
diff_nodes = diff_nodes[['node_idx', 'paper_id', 'Label', 'Split_ID']]
diff_nodes.head(4)

  diff_nodes.ID = diff_nodes.paper_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,node_idx,paper_id,Label,Split_ID
0,3655033,cc388eaec8838ce383d8a8792014fedb,,1
1,3655034,5d899f41e52f751fef843cf7b1d05b4a,,1
2,3655035,2b2004ec3c99a44b5cb6045ca547453e,,1
3,3655036,d657c4451a9617f4eec96d3b2e6092c7,,1


In [14]:
# Concatenate这419个未匹配到的节点到总的node的最后，从而让nid能接上
nid_label_df = pd.concat([nid_label_df, diff_nodes])
nid_label_df.tail(4)

Unnamed: 0,node_idx,paper_id,Label,Split_ID
415,3655448,caed47d55d1e193ecb1fa97a415c13dd,,1
416,3655449,c82eb6be79a245392fb626b9a7e1f246,,1
417,3655450,926a31f6b378575204aae30b5dfa6dd3,,1
418,3655451,bbace2419c3f827158ea4602f3eb35fa,,1


In [15]:
# 保存ID和Label到本地文件
publish_path = 'data'
nid_label_df.to_csv(os.path.join(publish_path, './IDandLabels.csv'), index=False)
# 保存未匹配上的节点用于feature的处理
diff_nodes.to_csv(os.path.join(publish_path, './diff_nodes.csv'), index=False)


In [1]:
import numpy as np
data = np.load('data/pre_label_probs.npy')

In [2]:
data.shape

(3655452, 23)

In [3]:
a = data[:5]
a

array([[2.35140091e-03, 8.26791078e-02, 6.63134903e-02, 6.87445998e-02,
        6.71470864e-03, 5.67902718e-03, 1.29754506e-02, 5.50491139e-02,
        2.14257725e-02, 4.34312820e-02, 3.14764977e-02, 4.58440036e-02,
        5.99833913e-02, 1.15502581e-01, 1.23626133e-03, 2.13552210e-02,
        2.95734946e-02, 1.26863969e-02, 1.61622062e-01, 8.33789408e-02,
        4.86054979e-02, 9.96521581e-03, 1.34065319e-02],
       [1.13657152e-03, 5.86355291e-03, 3.63325812e-02, 7.96677079e-03,
        1.95025478e-03, 9.13111959e-04, 5.58269722e-03, 3.21920991e-01,
        1.09521404e-01, 3.11725698e-02, 8.83769337e-03, 8.99567306e-02,
        2.31528375e-02, 2.30967075e-01, 2.70626275e-03, 2.24672612e-02,
        1.44280191e-03, 3.39710340e-03, 2.06427765e-04, 6.66781969e-04,
        8.70764107e-02, 1.69942342e-03, 5.06270444e-03],
       [4.47253417e-03, 9.24442932e-02, 3.79132032e-02, 2.03443110e-01,
        8.16826429e-03, 2.50501791e-03, 1.08155340e-01, 2.25047376e-02,
        1.18661579e-02

In [4]:
np.sum(a, axis=-1)

array([1.00000005, 1.00000002, 0.99999996, 0.99999998, 0.99999997])