In [None]:
# import necessary stuff
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import time
from multiprocessing import Pool, cpu_count

In [None]:
# Switch for the export cell further down: when True, the graph is also
# written to disk as an edge list; when False, that step is skipped.
EXPORT_AS_EDGE_LIST = False

In [None]:
# locations of the BlogCatalog csv files; every file lives in one dataset folder
dataset_path = 'data/BlogCatalog-dataset/data/'
friend_edges_csv_path, group_edges_csv_path, groups_csv_path, bloggers_csv_path = (
    dataset_path + csv_name
    for csv_name in ('edges.csv', 'group-edges.csv', 'groups.csv', 'nodes.csv')
)

In [None]:
# read each csv file into its own dataframe; the files have no header row and
# the id columns are parsed as strings
csv_layout = {'sep': ',', 'header': None}
friend_edges_df = pd.read_csv(friend_edges_csv_path, dtype={0: str, 1: str}, **csv_layout)
group_edges_df = pd.read_csv(group_edges_csv_path, dtype={0: str, 1: str}, **csv_layout)
groups_df = pd.read_csv(groups_csv_path, dtype={0: str}, **csv_layout)
bloggers_df = pd.read_csv(bloggers_csv_path, dtype={0: str}, **csv_layout)

In [None]:
# give bloggers and groups unique node-ids ('b…' for bloggers, 'g…' for groups)
def _add_prefix(ids, prefix):
    """Return ``ids`` (a Series of id strings) with ``prefix`` prepended.

    Values that already start with ``prefix`` are returned unchanged, so
    running this cell more than once does not stack prefixes ('b1' -> 'bb1').
    Missing values stay missing.

    Assumes the raw ids in the csv files never start with the prefix letter
    themselves — TODO confirm against the dataset.
    """
    already_prefixed = ids.str.startswith(prefix, na=False)
    return ids.where(already_prefixed, prefix + ids)


bloggers_df[0] = _add_prefix(bloggers_df[0], 'b')
# both endpoints of a friend edge are bloggers, so every column gets the 'b' prefix
friend_edges_df = friend_edges_df.apply(lambda column: _add_prefix(column, 'b'))
groups_df[0] = _add_prefix(groups_df[0], 'g')
# group edges go from a blogger (column 0) to a group (column 1)
group_edges_df[0] = _add_prefix(group_edges_df[0], 'b')
group_edges_df[1] = _add_prefix(group_edges_df[1], 'g')

In [None]:
# define networkx graph
# Starts out empty; nodes and edges are added by the cells below.
# nx.Graph is networkx's undirected graph type and stores at most one edge
# per pair of nodes.
blog_catalog_graph = nx.Graph()

In [None]:
# define node and edge label constants
# Edge labels: stored as the `label` attribute of the edges added to the graph.
IS_MEMBER_OF = 'is_member_of'
IS_FRIEND_WITH = 'is_friend_with'
# Node type labels.
BLOGGER = 'blogger'
GROUP = 'group'

In [None]:
# add blogger and group nodes to graph
# Each node is tagged with its type through the `label` node attribute, in the
# same way the edges below carry a `label` attribute for their relation type.
blog_catalog_graph.add_nodes_from(bloggers_df[0].tolist(), label=BLOGGER)
print("{} nodes in graph".format(blog_catalog_graph.number_of_nodes()))
blog_catalog_graph.add_nodes_from(groups_df[0].tolist(), label=GROUP)
print("{} nodes in graph".format(blog_catalog_graph.number_of_nodes()))

In [None]:
# turn every (column 0, column 1) row of the edge dataframes into a plain tuple
group_edges = list(group_edges_df[[0, 1]].itertuples(index=False, name=None))
friend_edges = list(friend_edges_df[[0, 1]].itertuples(index=False, name=None))

In [None]:
# connect each blogger to the groups it belongs to via (blogger)-[is_member_of]-(group)
# edges, then report the size of the graph
blog_catalog_graph.add_edges_from(group_edges, label=IS_MEMBER_OF)
print(
    "{} edges in graph".format(blog_catalog_graph.number_of_edges()),
    "{} nodes in graph".format(blog_catalog_graph.number_of_nodes()),
    sep="\n",
)

In [None]:
# connect befriended bloggers via (blogger)-[is_friend_with]-(blogger) edges,
# then report the size of the graph
blog_catalog_graph.add_edges_from(friend_edges, label=IS_FRIEND_WITH)
print(
    "{} edges in graph".format(blog_catalog_graph.number_of_edges()),
    "{} nodes in graph".format(blog_catalog_graph.number_of_nodes()),
    sep="\n",
)

In [None]:
# export graph as edge list to given path (only when EXPORT_AS_EDGE_LIST is set)
if EXPORT_AS_EDGE_LIST:
    # the edge list is written next to the input csv files
    edge_list_export_path = dataset_path + 'blogcatalog_edgelist.csv'
    # data=False writes only the two node ids per edge, without edge attributes.
    # NOTE(review): no delimiter is passed, so networkx's default separator is
    # used even though the file is named .csv — confirm downstream readers
    # expect that format.
    nx.write_edgelist(blog_catalog_graph, edge_list_export_path, data=False)

In [None]:
# average degree over every node of the graph (bloggers and groups alike)
node_degrees = np.array(
    [degree for _, degree in blog_catalog_graph.degree()],
    dtype=np.int64,
)
avg_node_degree = node_degrees.mean()
print("The avg. node degree is {}".format(np.round(avg_node_degree, decimals=2)))

In [None]:
# define random walk parameters
# number of loop iterations per walk; each iteration records one node,
# beginning with the start node itself
walk_length = 10
# maps a start node to the list of nodes sampled for it; filled once the
# sampling has finished
sim_G_sampling = {}
# number of random walks to run from each start node
samples_per_node = 100000

In [None]:
# run single random walk
def run_single_random_walk(start_node):
    """Run one uniform random walk on ``blog_catalog_graph``.

    The walk records ``start_node`` first and then repeatedly moves to a
    neighbour of the current node chosen uniformly at random, until
    ``walk_length`` nodes have been recorded.

    Parameters
    ----------
    start_node : node id present in ``blog_catalog_graph``.

    Returns
    -------
    list
        The visited nodes in order, starting with ``start_node``. The list has
        ``walk_length`` entries unless the walk reaches a node without
        neighbours, in which case it ends at that node and is shorter.
    """
    visited_nodes = []
    current_node = start_node

    for step in range(walk_length):
        visited_nodes.append(current_node)

        if step == walk_length - 1:
            # the last node has been recorded; another draw would be discarded
            break

        neighbors = list(blog_catalog_graph.neighbors(current_node))
        if not neighbors:
            # dead end (isolated node): stop here instead of failing inside
            # the random draw on an empty neighbour list
            break

        # draw an index rather than sampling the list itself, which would first
        # convert all neighbour ids into a NumPy array on every step
        current_node = neighbors[np.random.randint(len(neighbors))]

    return visited_nodes

In [None]:
def create_samples_for_node(node):
    """Collect the end nodes of ``samples_per_node`` random walks from ``node``.

    Each sample is the last node returned by a fresh
    ``run_single_random_walk(node)`` call. A progress line is printed every
    10000 samples and the elapsed wall-clock time is printed once at the end.

    Returns the list of sampled end nodes in the order they were drawn.
    """
    began = time.time()
    walk_endpoints = []

    for sample_no in range(samples_per_node):
        if not sample_no % 10000:
            print("Collected {} samples for node {}".format(sample_no, node))

        walk = run_single_random_walk(node)
        walk_endpoints.append(walk[-1])

    elapsed = time.time() - began
    print("Sampling {} nodes for node {} took {} sec.".format(samples_per_node, node, np.around(elapsed, decimals=2)))

    return walk_endpoints

In [None]:
# sample walk end nodes for a few start nodes, one pool task per start node
nodes_list = ['b1','b2','b3','b4']
start_time = time.time()

# `initializer=np.random.seed` makes every worker reseed NumPy's global random
# generator when it starts. Without it, forked workers inherit the parent's
# generator state and all of them draw the same sequence of random numbers,
# which makes the walks of different start nodes depend on each other.
pool = Pool(cpu_count(), initializer=np.random.seed)
try:
    # one entry per start node, in the order of `nodes_list`
    results = pool.map(create_samples_for_node, nodes_list)
finally:
    # always release the worker processes, even if sampling raised
    pool.close()
    pool.join()

end_time = time.time()
computation_time = end_time - start_time
print("Whole sampling process took {} sec.".format(np.around(computation_time, decimals=2)))

In [None]:
# file each result list under the start node it was computed for
# (`results` is ordered like `nodes_list`)
for start_node, sampled_nodes in zip(nodes_list, results):
    sim_G_sampling[start_node] = sampled_nodes

In [None]:
# sanity check: number of samples stored for start node 'b1'
len(sim_G_sampling['b1'])