In [1]:
import json
import torch
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [2]:
DATA_DIR = Path('/data/University/TwiBot-22')
USER_DIR = DATA_DIR / 'processed' / 'users'
IN_DIR = DATA_DIR / 'processed' / 'derived' / 'combined'
OUT_DIR = DATA_DIR / "processed" / 'embeddings' / "edges"

In [3]:
edge_file = next(f for f in IN_DIR.glob("*.csv"))
edge = pd.read_csv(edge_file)

### Create user and relation index mapping

In [4]:
for rel in edge["relation"].unique():
    edge_rel = edge.loc[edge["relation"] == rel].drop_duplicates()
    print(f"-- {rel}: {edge_rel.shape[0]}")

-- followers: 1116655
-- following: 2626979
-- retweeted_user: 657383
-- co_retweeted: 251544
-- co_hashtag: 6954818


In [5]:
user = pd.read_json(DATA_DIR / 'original' / 'user.json') 
uid_index = {uid:index for index,uid in enumerate(user['id'].values)}

In [6]:
relation_map = {rel: i for i, rel in enumerate(sorted(edge["relation"].unique()))}
relation_map

{'co_hashtag': 0,
 'co_retweeted': 1,
 'followers': 2,
 'following': 3,
 'retweeted_user': 4}

### Extracting edge index and type.

In [7]:
edge_index = []
edge_type = []
for i in tqdm(range(len(edge))):
    sid = edge['source_id'][i]
    tid = edge['target_id'][i]
    relation = edge['relation'][i]
    try:
        edge_index.append([uid_index[sid],uid_index[tid]])
        edge_type.append(relation_map[relation])
    except KeyError:
        raise 

100%|█████████████████████████████████████████████████████████████████████████████| 12530639/12530639 [01:26<00:00, 144453.25it/s]


### Save edge index and type

In [8]:
torch.save(torch.LongTensor(edge_index).t().contiguous(), OUT_DIR / f"edge_index.pt")
torch.save(torch.LongTensor(edge_type), OUT_DIR / f"edge_type.pt")