In [None]:
import pandas as pd
import numpy as np
from argparse import ArgumentParser
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
import random
import json

In [None]:
def parse_args():
    """Build the configuration namespace used by the rest of the notebook.

    Every option is registered on an ``ArgumentParser``, but the parser is
    run against an empty argument list, so the returned values are always
    the defaults listed below (the process command line is not consulted).

    Returns:
        argparse.Namespace with the split portions, the random seed, the
        raw/data directories and the input/output file names.
    """
    # (option name, type converter, default value), kept in display order.
    option_specs = (
        # data preprocess
        ("train_portion", float, 0.8),
        ("dev_portion", float, 0.1),
        ("test_portion", float, 0.1),
        # random seed
        ("seed", int, 0),
        # directory path
        ("raw_dir", Path, "./raw/"),
        ("data_dir", Path, "./data/"),
        # input file
        ("raw_input_name", str, "active_links_valued.csv"),
        ("label_input_name", str, "active_nodes_onlyIdAndMainCriminal.csv"),
        # output file
        ("tr_output_name", str, "train.edgelist"),
        ("dev_output_name", str, "dev.edgelist"),
        ("test_output_name", str, "test.edgelist"),
    )

    parser = ArgumentParser()
    for option_name, converter, default_value in option_specs:
        parser.add_argument("--" + option_name, type=converter, default=default_value)

    # Parse an explicit empty list so the kernel's own argv is never read.
    return parser.parse_args(args=[])

In [None]:
def tr_dev_test_split(X, y, args):
    """Split ``X`` into train, dev and test parts with two successive splits.

    Args:
        X: The samples to split (anything ``train_test_split`` accepts).
        y: Targets aligned with ``X``; they are split alongside ``X`` but
            the resulting target parts are discarded.
        args: Object exposing ``dev_portion`` and ``test_portion`` as
            fractions of the whole data set. ``train_portion`` is not read
            here: the train part is whatever remains after the dev and
            test portions are held out.

    Returns:
        Tuple ``(X_train, X_dev, X_test)``.
    """
    print("="*30)
    print("Start spliting data...")
    holdout_portion = args.dev_portion + args.test_portion

    # First split into train vs. (dev + test).
    X_train, X_tmp, y_train, y_tmp = train_test_split(
        X, y, test_size=holdout_portion
    )

    # Then split (dev + test) into dev vs. test. ``test_size`` sizes the
    # SECOND returned part (the test set), so it must be the test share of
    # the held-out data, not the dev share.
    X_dev, X_test, y_dev, y_test = train_test_split(
        X_tmp, y_tmp, test_size=args.test_portion / holdout_portion
    )
    print("Finish spliting data...")
    print("="*30)
    return X_train, X_dev, X_test

In [None]:
def show_split_info(tr, dev, test, args):
    """Print link-pair, node and criminal-node counts for each split.

    The label CSV at ``args.raw_dir / args.label_input_name`` is read and
    joined onto both endpoints (``V1`` and ``V2``) of every link. The joins
    are pandas' default inner merges, so a link whose endpoint has no row in
    the label file is left out of the reported counts.

    Args:
        tr, dev, test: DataFrames with at least the columns ``V1`` and ``V2``.
        args: Object exposing ``raw_dir`` and ``label_input_name``; the label
            file must provide the columns ``id`` and ``mainCriminal``.
    """
    print("="*30)
    print("Showing split data info...")
    label_df = pd.read_csv(args.raw_dir/args.label_input_name)

    # The renamed label tables do not depend on the split, so build them once.
    v1_labels = label_df.rename(columns={"id": "V1", "mainCriminal": "V1_label"})
    v2_labels = label_df.rename(columns={"id": "V2", "mainCriminal": "V2_label"})

    for name, split in zip(["train", "dev", "test"], [tr, dev, test]):
        split = split.merge(v1_labels, on="V1").merge(v2_labels, on="V2")

        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        all_nodes = pd.concat([split["V1"], split["V2"]])
        criminal_nodes = pd.concat([
            split.loc[split["V1_label"] == True, "V1"],
            split.loc[split["V2_label"] == True, "V2"],
        ])

        print("{} data info: ".format(name))
        print("\tNumber of link pair: {}".format(len(split)))
        print("\tNumber of node: {}".format(len(all_nodes.unique())))
        print("\tNumber of criminal node: {}".format(len(criminal_nodes.unique())))
    print("="*30)

In [None]:
def save_tr_dev_test(tr, dev, test, args):
    """Write the three splits as space-separated files without header or index.

    Args:
        tr, dev, test: DataFrames to write, one output file each.
        args: Object exposing ``data_dir`` (a ``pathlib.Path``) and the file
            names ``tr_output_name``, ``dev_output_name`` and
            ``test_output_name``.
    """
    print("="*30)
    print("Saving tr, dev, test at ./{}...".format(str(args.data_dir)))

    # Create the output directory up front so a fresh checkout does not fail
    # on the first write.
    args.data_dir.mkdir(parents=True, exist_ok=True)

    outputs = (
        (tr, args.tr_output_name),
        (dev, args.dev_output_name),
        (test, args.test_output_name),
    )
    for frame, file_name in outputs:
        frame.to_csv(args.data_dir/file_name, sep=' ', index=False, header=False)
    print("="*30)

In [None]:
def main(args):
    """Run the pipeline: load the links, split them, report and save.

    Args:
        args: Configuration object providing ``seed``, ``raw_dir`` and
            ``raw_input_name``, plus whatever the split, report and save
            helpers read from it.
    """
    # Seed NumPy's global RNG before splitting; the split helper passes no
    # explicit random_state to train_test_split.
    np.random.seed(args.seed)

    # train_test_split is called with a target vector, so attach an all-None
    # placeholder column to serve as one.
    links = pd.read_csv(args.raw_dir/args.raw_input_name).assign(tmp_y=None)

    edge_columns = ["V1", "V2", "counts"]
    train_part, dev_part, test_part = tr_dev_test_split(
        links[edge_columns], links["tmp_y"], args
    )

    show_split_info(train_part, dev_part, test_part, args)

    save_tr_dev_test(train_part, dev_part, test_part, args)

In [7]:
# Entry point: build the configuration namespace and run the whole
# split / report / save pipeline. `args` is left as a module-level name.
if __name__ == "__main__":
    args = parse_args()
    main(args)

Start spliting data...
Finish spliting data...
Showing split data info...
train data info: 
	Number of link pair: 2073830
	Number of node: 50696
	Number of criminal node: 679
dev data info: 
	Number of link pair: 259229
	Number of node: 38662
	Number of criminal node: 492
test data info: 
	Number of link pair: 259229
	Number of node: 38639
	Number of criminal node: 488
Saving tr, dev, test at ./data...
