# Load Data

In [1]:
# From the repository
from util import *
from read_data import *
data_names = list(name2file_name.keys())
print(data_names)

# Basic modules
import os
import glob
import numpy as np
import pandas as pd
import random

#name = "wiod2016"
name = data_names[3]
print("We are going to use: " + name)
# Enable original_format to use the format as originally provided
data_dict = get_data(name,original_format=False)
data_dict.keys()
#df_nodes = data_dict["df_nodes"]
df_edges = data_dict["df_edges"]

# Add weights (no peculiar meaning)
# Remove self-loops 
df_edges["weight"] = 1.0
cond = df_edges["source"] != df_edges["target"]
df_edges = df_edges.loc[cond].copy()
df_edges = df_edges[["source","target","weight"]].copy()
df_edges.drop_duplicates(inplace=True)

['blogcatalog', 'homosapiens', 'wikipos', 'enron', 'unvote', 'untrade', 'uslegis_net', 'uslegis_net_small_dyn', 'uslegis_net_dyn', 'uslegis_hyp_dyn', 'contacts', 'dawn_net', 'dawn_hyp', 'ndc_net', 'ndc_hyp', 'coauth_dblp_net', 'coauth_dblp_hyp', 'wiod2016', 'wiod2013', 'wiodlong', 'eth', 'bitcoinalpha', 'bitcoinotc', 'uscourt']
We are going to use: enron


# Set seed number and number of negative samples

In [2]:
# Seed number => to get identical result
seed_num = 12345

# This is the number of negative samples we make against the positve ones
num_neg = 5


# Making undirected edgelist

In [3]:
undirected = "True"
if undirected == "True":
    out = []
    for i in range(len(df_edges)):
        if df_edges["source"].iloc[i] < df_edges["target"].iloc[i]:
            out.append([
            df_edges["source"].iloc[i],df_edges["target"].iloc[i],df_edges["weight"].iloc[i]])
        else:
            out.append([
            df_edges["target"].iloc[i],df_edges["source"].iloc[i],df_edges["weight"].iloc[i]])

    df_edges = pd.DataFrame(out)
    df_edges.columns = ["source","target","weight"]
    
df_edges_2 = df_edges.drop_duplicates(subset=["source","target"],keep='first')

df_edges_2["source"] = df_edges_2["source"].astype(str)
df_edges_2["target"] = df_edges_2["target"].astype(str)

print("Number of original edges:" + str(len(df_edges)))
print("Number of unique undirected edges:" + str(len(df_edges_2)))



Number of original edges:3007
Number of unique undirected edges:2097


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_edges_2["source"] = df_edges_2["source"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_edges_2["target"] = df_edges_2["target"].astype(str)


# Create Train and Test
# We make sure that there is at least one edge for all nodes in train

In [4]:
%%time
random.seed(seed_num)
node2cnt = dict()
index_test = []
index_vali = []
index_train = []
for i in range(len(df_edges_2)):
    
    source = df_edges_2["source"].iloc[i]
    target = df_edges_2["target"].iloc[i]
    
    if source in node2cnt and target in node2cnt:
        if random.uniform(0,1) < 0.415:
            if random.uniform(0,1) < 0.5:
                index_test.append(i)
            else:
                index_vali.append(i)        
        else:
            index_train.append(i)
        
    else:
        index_train.append(i)
        if source in node2cnt:
            node2cnt[source] += 1
        else:
            node2cnt.update({source:1})
        if target in node2cnt:
            node2cnt[target] += 1
        else:
            node2cnt.update({target:1})

CPU times: user 70.3 ms, sys: 2.9 ms, total: 73.2 ms
Wall time: 71.5 ms


In [5]:
print(len(index_train) / (len(index_train) + len(index_test) + len(index_vali)))
print(len(index_train) + len(index_test) + len(index_vali) == len(df_edges_2))

df_train = df_edges_2.iloc[index_train].copy()
df_valid = df_edges_2.iloc[index_vali].copy()
df_test = df_edges_2.iloc[index_test].copy()

df_node2cnt = pd.DataFrame(node2cnt.items())
df_node2cnt.columns = ["node","cnt"]


0.6003814973772056
True


# Sanity Check

In [6]:
%%time
pair_train = dict()
pos_pair = dict()
pos_pair_test = dict()
for i in range(len(df_train)):
    pair_a = df_train["source"].iloc[i] + "," + df_train["target"].iloc[i]
    pair_b = df_train["target"].iloc[i] + "," + df_train["source"].iloc[i]
    pair_train.update({pair_a:1})
    pair_train.update({pair_b:1}) 
    pos_pair.update({pair_a:1})
    if df_train["source"].iloc[i] == df_train["target"].iloc[i]:
        print("Error 1: self loop" )
        
for i in range(len(df_valid)):
    pair_a = df_valid["source"].iloc[i] + "," + df_valid["target"].iloc[i]
    pair_b = df_valid["target"].iloc[i] + "," + df_valid["source"].iloc[i]
    pos_pair.update({pair_a:1})
    pos_pair_test.update({pair_a:1})
    if pair_a in pair_train:
        print("Error 2: Found same edge in train (validation)")
    if pair_b in pair_train:
        print("Error 3: Found same edge in train (validation)")
    if df_valid["source"].iloc[i] == df_valid["target"].iloc[i]:
        print("Error 1: self loop" )
        
for i in range(len(df_test)):
    pair_a = df_test["source"].iloc[i] + "," + df_test["target"].iloc[i]
    pair_b = df_test["target"].iloc[i] + "," + df_test["source"].iloc[i]
    pos_pair.update({pair_a:1})
    
    pos_pair_test.update({pair_a:1})
    if pair_a in pair_train:
        print("Error 4: Found same edge in train (test)")
    if pair_b in pair_train:
        print("Error 5: Found same edge in train (test)")
    if df_test["source"].iloc[i] == df_test["target"].iloc[i]:
        print("Error 1: self loop" )

CPU times: user 198 ms, sys: 2.21 ms, total: 200 ms
Wall time: 200 ms


# Create Negative samples

In [7]:
%%time
random.seed(seed_num)
out_train_neg = []
neg_pair = dict()
while len(out_train_neg) < num_neg*len(df_train):
    neg_s = df_node2cnt.sample(1)["node"].iloc[0]
    neg_t = df_node2cnt.sample(1)["node"].iloc[0]
    tmp_pair_a = neg_s + "," + neg_t
    tmp_pair_b = neg_t + "," + neg_s
    if neg_s != neg_t:
        if (tmp_pair_a not in pos_pair) &  (tmp_pair_b not in pos_pair):
            if (tmp_pair_a not in pos_pair_test) &  (tmp_pair_b not in pos_pair_test):
                if (tmp_pair_a not in neg_pair) & (tmp_pair_b not in neg_pair):
                    out_train_neg.append([neg_s,neg_t])
                    out_train_neg.append([neg_t,neg_s])
                    neg_pair.update({tmp_pair_a:1})
                    neg_pair.update({tmp_pair_b:1})
print("Finished train_neg")
    
out_valid_neg = []
while len(out_valid_neg) < num_neg*len(df_valid):
    neg_s = df_node2cnt.sample(1)["node"].iloc[0]
    neg_t = df_node2cnt.sample(1)["node"].iloc[0]
    tmp_pair_a = neg_s + "," + neg_t
    tmp_pair_b = neg_t + "," + neg_s
    if neg_s != neg_t:
        if (tmp_pair_a not in pos_pair) &  (tmp_pair_b not in pos_pair):
            if (tmp_pair_a not in pos_pair_test) &  (tmp_pair_b not in pos_pair_test):
                if (tmp_pair_a not in neg_pair) & (tmp_pair_b not in neg_pair):
                    out_valid_neg.append([neg_s,neg_t])
                    out_valid_neg.append([neg_t,neg_s])
                    neg_pair.update({tmp_pair_a:1})
                    neg_pair.update({tmp_pair_b:1})
print("Finished valid_neg")
    
out_test_neg = []
while len(out_test_neg) < num_neg*len(df_test):
    neg_s = df_node2cnt.sample(1)["node"].iloc[0]
    neg_t = df_node2cnt.sample(1)["node"].iloc[0]
    tmp_pair_a = neg_s + "," + neg_t
    tmp_pair_b = neg_t + "," + neg_s
    if neg_s != neg_t:
        if (tmp_pair_a not in pos_pair) &  (tmp_pair_b not in pos_pair):
            if (tmp_pair_a not in pos_pair_test) &  (tmp_pair_b not in pos_pair_test):
                if (tmp_pair_a not in neg_pair) & (tmp_pair_b not in neg_pair):
                    out_test_neg.append([neg_s,neg_t])
                    out_test_neg.append([neg_t,neg_s])
                    neg_pair.update({tmp_pair_a:1})
                    neg_pair.update({tmp_pair_b:1})
print("Finished test_neg")

Finished train_neg
Finished valid_neg
Finished test_neg
CPU times: user 3.07 s, sys: 439 ms, total: 3.51 s
Wall time: 2.93 s


# Create train validation test set

In [8]:
df_train_neg = pd.DataFrame(out_train_neg)
df_train_neg.columns = ["source","target"]
df_valid_neg = pd.DataFrame(out_valid_neg)
df_valid_neg.columns = ["source","target"]  
df_test_neg = pd.DataFrame(out_test_neg)
df_test_neg.columns = ["source","target"]  

# train
edges_train = np.array(pd.concat([df_train[["source","target"]],df_train_neg]))
labels_train = [1 for _ in range(len(df_train))]
labels_train.extend([0 for _ in range(len(df_train_neg))])
labels_train = np.array(labels_train)

# valid
edges_valid = np.array(pd.concat([df_valid[["source","target"]],df_valid_neg]))
labels_valid = [1 for _ in range(len(df_valid))]
labels_valid.extend([0 for _ in range(len(df_valid_neg))])
labels_valid = np.array(labels_valid)

# test
edges_test = np.array(pd.concat([df_test[["source","target"]],df_test_neg]))
labels_test = [1 for _ in range(len(df_test))]
labels_test.extend([0 for _ in range(len(df_test_neg))])
labels_test = np.array(labels_test)

# Save

In [9]:
np.savez('./tables/enron_staticlinkpred', 
         edges_train, labels_train,
         edges_valid, labels_valid,
         edges_test, labels_test,
          df_node2cnt)