In [8]:
import random
import csv
import numpy as np # linear algebra
import networkx as nx
import pandas as pd
from tqdm import tqdm
from math import log
from networkx.algorithms import community
import os

In [9]:
# Read all the edges
edge_list = []
with open('./comp90051-2020-sem2-proj1/train.txt', 'r') as training_data_set:
    for line in training_data_set:
        line_array = line.split()
        follower = int(line_array[0])
        for i in range(1, len(line_array)):
            following = int(line_array[i])
            edge_list.append([follower, following])
# Remove duplicate edge
edge_list = np.array(edge_list)
edge_list = np.unique(edge_list, axis=0)
print(len(edge_list))
print(edge_list)

23946602
[[     18   17464]
 [     18   36718]
 [     18   38399]
 ...
 [4867036 4744223]
 [4867036 4762830]
 [4867036 4855480]]


In [10]:
# Generate the testing dataset
# Read the test dataset
test_list = []
with open('./comp90051-2020-sem2-proj1/test-public.txt', 'r') as test_data_set:
    for line in test_data_set:
        line_array = line.split()
        if line_array[0] == "Id":
            continue
        else:
            test_list.append([int(line_array[1]), int(line_array[2])])
test_list = np.array(test_list)
print(len(test_list))
print(test_list)

2000
[[3563811 3600160]
 [2052043 1401960]
 [4517994 1690636]
 ...
 [4242514 1413468]
 [ 555531 1290080]
 [1707829 2373045]]


In [11]:
source_list = []
sink_list = []
for edge in edge_list:
    source_list.append(edge[0])
    sink_list.append(edge[1])
source_list = np.array(source_list)
sink_list = np.array(sink_list)

edge_df = pd.DataFrame({'Source': source_list, 'Sink': sink_list})
edge_df.to_csv('./data_processing/edge_df.csv')

test_source_list = []
test_sink_list = []
for edge in test_list:
    test_source_list.append(edge[0])
    test_sink_list.append(edge[1])
test_source_list = np.array(test_source_list)
test_sink_list = np.array(test_sink_list)

test_df = pd.DataFrame({'Source': test_source_list, 'Sink': test_sink_list})
test_df.to_csv('./data_processing/test_df.csv')

# This code is commented due to time consuming
# Generate training data
SAMPLING_SIZE = 5000
edge_df_temp = edge_df.copy()
total_nodes = len(set(edge_df_temp['Source']).union(set(edge_df_temp['Sink'])))
# empty list to store removable links
removable_links_index = []

iterative_count = 0
while len(removable_links_index) < SAMPLING_SIZE or iterative_count > SAMPLING_SIZE*100:
    if iterative_count % 100 == 0:
        print(len(removable_links_index))
        print(len(set(edge_df_temp['Source']).union(set(edge_df_temp['Sink']))))
    i = random.choice(edge_df_temp.index)
    if i not in removable_links_index:
        edge_df_temp_temp = edge_df_temp.drop(index = i)
        if len(set(edge_df_temp_temp['Source']).union(set(edge_df_temp_temp['Sink']))) == total_nodes:
            removable_links_index.append(i)
            edge_df_temp = edge_df_temp_temp
    iterative_count += 1

removable_links_index = np.array(removable_links_index)
print("removable_links_index: "+str(len(removable_links_index)))
positive_index = removable_links_index[0:SAMPLING_SIZE]
positive_data = edge_df[edge_df.index.isin(positive_index)]
positive_data = positive_data.iloc[0:SAMPLING_SIZE,:]
graph_df = edge_df.iloc[~edge_df.index.isin(positive_index)]
graph_df.to_csv('./data_processing/graph_df.csv')

print(len(positive_data))
print(positive_data)

In [12]:
G = nx.from_pandas_edgelist(edge_df, "Source", "Sink", create_using=nx.Graph())
# Expected 4867136
print(len(G.nodes()))

4867136


In [13]:
# Generate training data
SAMPLING_SIZE = 10000
positive_index = random.sample(range(0, edge_df.shape[0]), SAMPLING_SIZE*5)
x=edge_df.Sink.value_counts()==1
single_appearance= set(x[x==True].index)

# Exclude the edge, the removal of which may lead to isolated node
# Then in the feature extraction, remove the edge before processing this edge
# and add back after processing this edge, which can reduce the information leak
positive_data = edge_df[edge_df.index.isin(positive_index) & ~edge_df.Sink.isin(single_appearance)]
positive_data = positive_data.iloc[0:SAMPLING_SIZE,:]
print(len(positive_data))
print(positive_data)

# get negative data
neg_sample_source = random.sample(list(edge_df.Source), SAMPLING_SIZE*5)
neg_sample_sink = random.sample(list(edge_df.Sink), SAMPLING_SIZE*5)

neg_sample_candidate1 = pd.DataFrame({'Source':neg_sample_source, 'Sink':neg_sample_sink})

common = edge_df.merge(neg_sample_candidate1,on=['Source','Sink'])
neg_sample_candidate2 = neg_sample_candidate1[(~neg_sample_candidate1.Source.isin(common.Source))&(~neg_sample_candidate1.Sink.isin(common.Sink))]

common2 = test_df.merge(neg_sample_candidate2,on=['Source','Sink'])
neg_sample_candidate3 = neg_sample_candidate2[(~neg_sample_candidate2.Source.isin(common2.Source))&(~neg_sample_candidate2.Sink.isin(common2.Sink))]

coincide_df = pd.DataFrame({'Source': G.nodes(), 'Sink': G.nodes()})
common3 = coincide_df.merge(neg_sample_candidate3,on=['Source','Sink'])
negative_data = neg_sample_candidate3[(~neg_sample_candidate3.Source.isin(common3.Source))&(~neg_sample_candidate3.Sink.isin(common3.Sink))]

negative_data = negative_data.iloc[0:SAMPLING_SIZE,:]
print(len(negative_data))
print(negative_data)

negative_data['Label']=0
positive_data['Label']=1
positive_data.to_csv('./data_processing/positive_data.csv')
negative_data.to_csv('./data_processing/negative_data.csv')

10000
          Source     Sink
365          135  3648164
470          177  4651182
893          873  2180497
1676         877  1505989
1900         877  1916061
...          ...      ...
5308100  1250021  3348219
5308386  1250021  3365154
5308404  1250021  3365832
5308805  1250021  3387138
5309349  1250021  3419523

[10000 rows x 2 columns]
10000
        Source     Sink
2        69534   910199
3      2197375   861609
5      2150261    84224
6       441734  3668648
13     1192480  4726096
...        ...      ...
25973  1868397  3611021
25974  1736217  3445604
25976   435064  1235816
25978  4159765    14428
25981   182728  3013905

[10000 rows x 2 columns]


In [14]:
# generate community
communities = list(community.asyn_fluidc(G,100))
print(len(communities))
np.save('./data_processing/communities.npy', communities)

100
