In [1]:
import os

import numpy as np
import pandas as pd
import torch
from tqdm.notebook import tqdm

In [2]:
# Root folder of the Twibot-20 dataset. The original machine-specific location is
# kept as the default; set TWIBOT20_DATASETS_ROOT to run the notebook elsewhere
# without editing this cell.
datasets_root = os.environ.get("TWIBOT20_DATASETS_ROOT", r"E:/social-bot-data/datasets/Twibot-20")
# Folder that receives the intermediate .pt files written by the later cells.
tmp_files_root = r"./tmp-files"
# torch.save does not create missing parent directories, so make sure the output
# folder exists before any later cell writes into it.
os.makedirs(tmp_files_root, exist_ok=True)

In [3]:
# Load the node-id table; node_id is read as text and num_id as an integer.
node2id_list = pd.read_csv(rf"{datasets_root}/node2id.csv", dtype={"node_id": str,"num_id": int}) # users: 1-229580 33488193-33713010
# Build the node_id -> num_id lookup by zipping the two columns directly.
# DataFrame.iterrows() materialises a Series for every row, which is extremely slow
# on a table of this size; zipping the columns produces the same mapping
# (later duplicates of a node_id still overwrite earlier ones).
node2id = dict(zip(node2id_list["node_id"], node2id_list["num_id"]))

Generate node2id dict.: 0it [00:00, ?it/s]

### 利用原始edge文件和node文件生成异构图中所有边和边的类型

In [4]:
# Load the raw edge list. A raw f-string with a forward slash is used because the
# previous "\edge.csv" contained the invalid escape sequence "\e" and a
# Windows-only path separator (the node2id cell already uses "/").
# The id columns are read as text so they match the string keys of node2id.
edge = pd.read_csv(rf"{datasets_root}/edge.csv", dtype={"source_id": str, "target_id": str})

# Split the edge list by relation type.
follow = edge[edge["relation"] == "follow"]
friend = edge[edge["relation"] == "friend"]
post = edge[edge["relation"] == "post"]

In [5]:
# Value subtracted from the numeric id of every post target so that those ids are
# re-based. NOTE(review): presumably this is the number of user nodes (the note on
# the node2id load mentions "users: 1-229580") - TODO confirm.
POST_DST_OFFSET = 229580


def _to_num_ids(node_ids, desc, offset=0):
    """Translate a column of raw node ids into a list of numeric ids.

    Args:
        node_ids: pandas Series of raw node ids (keys of ``node2id``).
        desc: label shown on the tqdm progress bar.
        offset: value subtracted from every looked-up numeric id.

    Returns:
        A list with one numeric id per input element, in input order.

    Raises:
        KeyError: if an id is not present in ``node2id``.
    """
    return [node2id[node_id] - offset for node_id in tqdm(node_ids.tolist(), desc=desc)]


follow_src = _to_num_ids(follow["source_id"], "follow_src")
follow_dst = _to_num_ids(follow["target_id"], "follow_dst")

friend_src = _to_num_ids(friend["source_id"], "friend_src")
friend_dst = _to_num_ids(friend["target_id"], "friend_dst")

post_src = _to_num_ids(post["source_id"], "post_src")
# Only post targets are re-based; every other endpoint keeps its looked-up id.
post_dst = _to_num_ids(post["target_id"], "post_dst", offset=POST_DST_OFFSET)

follow_src:   0%|          | 0/110869 [00:00<?, ?it/s]

follow_dst:   0%|          | 0/110869 [00:00<?, ?it/s]

friend_src:   0%|          | 0/117110 [00:00<?, ?it/s]

friend_dst:   0%|          | 0/117110 [00:00<?, ?it/s]

post_src:   0%|          | 0/33488192 [00:00<?, ?it/s]

post_dst:   0%|          | 0/33488192 [00:00<?, ?it/s]

In [6]:
# One two-column table (source_id, target_id) of numeric ids per relation type.
follow_df = pd.DataFrame({
    "source_id": follow_src,
    "target_id": follow_dst,
})
friend_df = pd.DataFrame({
    "source_id": friend_src,
    "target_id": friend_dst,
})
post_df = pd.DataFrame({
    "source_id": post_src,
    "target_id": post_dst,
})

In [7]:
# Turn each edge table into a tensor whose first row holds the source ids and whose
# second row holds the target ids (the table is transposed before conversion).
follow_tensor, friend_tensor, post_tensor = (
    torch.tensor(np.array(edge_df).T)
    for edge_df in (follow_df, friend_df, post_df)
)

In [8]:
# Persist the edge-index tensors built from the complete edge tables.
for _tensor, _path in (
    (follow_tensor, rf"{tmp_files_root}/all_follow_edge_index.pt"),
    (friend_tensor, rf"{tmp_files_root}/all_friend_edge_index.pt"),
    (post_tensor, rf"{tmp_files_root}/all_post_edge_index.pt"),
):
    torch.save(_tensor, _path)

In [10]:
def _is_kept_id(ids):
    """Element-wise mask that is True where an id is < 11826 or >= 229580."""
    return ids.lt(11826) | ids.ge(229580)


def _is_low_id(ids):
    """Element-wise mask that is True where an id is < 11826."""
    return ids.lt(11826)


# follow / friend: the "reduced" table keeps rows where BOTH endpoints pass the
# kept-id mask; the "contrast" table keeps rows where both endpoints are low ids.
follow_reduced_df = follow_df[
    _is_kept_id(follow_df["source_id"]) & _is_kept_id(follow_df["target_id"])
]
follow_reduced_df_contrast = follow_df[
    _is_low_id(follow_df["source_id"]) & _is_low_id(follow_df["target_id"])
]

friend_reduced_df = friend_df[
    _is_kept_id(friend_df["source_id"]) & _is_kept_id(friend_df["target_id"])
]
friend_reduced_df_contrast = friend_df[
    _is_low_id(friend_df["source_id"]) & _is_low_id(friend_df["target_id"])
]

# post: only the source endpoint is filtered.
post_reduced_df = post_df[_is_kept_id(post_df["source_id"])]
post_reduced_df_contrast = post_df[_is_low_id(post_df["source_id"])]

In [13]:
# Rebuild the edge-index tensors from the reduced tables (first row: source ids,
# second row: target ids). These assignments reuse, and therefore replace, the
# follow_tensor / friend_tensor / post_tensor names created from the full tables.
follow_tensor = torch.tensor(np.array(follow_reduced_df).T)
friend_tensor = torch.tensor(np.array(friend_reduced_df).T)
post_tensor = torch.tensor(np.array(post_reduced_df).T)

In [14]:
# Persist the current follow / friend / post edge-index tensors.
for _tensor, _path in (
    (follow_tensor, rf"{tmp_files_root}/follow_edge_index.pt"),
    (friend_tensor, rf"{tmp_files_root}/friend_edge_index.pt"),
    (post_tensor, rf"{tmp_files_root}/post_edge_index.pt"),
):
    torch.save(_tensor, _path)