# Modules and data

In [17]:
# From the repository
from util import *
from read_data import *
# Names of all datasets known to the repository, taken from the keys of
# `name2file_name` (brought in by the star imports above).
data_names = [*name2file_name.keys()]
print(data_names)

# Basic modules
import os
import glob
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from graph_tool.all import *

# Select the dataset by its explicit name instead of by position in
# `data_names`: a positional index silently switches to another dataset
# whenever the registry order changes. Any entry of `data_names` can be used
# here (e.g. "wiod2016").
name = "enron"
if name not in data_names:
    raise ValueError(f"Unknown dataset {name!r}; choose one of {data_names}")
print("We are going to use: " + name)

# Enable original_format to use the format as originally provided
data_dict = get_data(name, original_format=False)
# NOTE(review): data_dict presumably also holds a "df_nodes" table (the
# original cell had a commented-out lookup for it); only the edges are used.

# Keep a single row per (source, target) pair ...
df_edges = data_dict["df_edges"][["source", "target"]].drop_duplicates()

# ... and drop self-loops. The .copy() detaches the result from the frame
# it was sliced from.
cond = df_edges["source"] != df_edges["target"]
df_edges = df_edges.loc[cond].copy()

['blogcatalog', 'homosapiens', 'wikipos', 'enron', 'unvote', 'untrade', 'uslegis_net', 'uslegis_net_small_dyn', 'uslegis_net_dyn', 'uslegis_hyp_dyn', 'contacts', 'dawn_net', 'dawn_hyp', 'ndc_net', 'ndc_hyp', 'coauth_dblp_net', 'coauth_dblp_hyp', 'wiod2016', 'wiod2013', 'wiodlong', 'eth', 'bitcoinalpha', 'bitcoinotc', 'uscourt']
We are going to use: enron


# Create graph object

In [18]:
# Empty graph to be filled with vertices and edges below.
# directed=True is graph-tool's default, spelled out here for the reader.
g = Graph(directed=True)

In [19]:
# Build the graph from the edge table, renaming nodes to indices 0..N-1.

# Start from an empty graph so that re-running this cell rebuilds `g`
# instead of appending a second copy of every vertex and edge.
g.clear()

# Node labels in order of first appearance (all sources, then all targets).
# Unlike iterating over a set, this order is identical on every run, so the
# label -> index mapping is reproducible across kernels.
uni_nodes = pd.concat([df_edges["source"], df_edges["target"]]).unique().tolist()
node2index = {node: index for index, node in enumerate(uni_nodes)}

g.add_vertex(len(uni_nodes))

# add edges: translate both endpoint columns to vertex indices and insert
# them in one call instead of one Python-level add_edge per row.
edge_index_pairs = np.column_stack([
    df_edges["source"].map(node2index).to_numpy(),
    df_edges["target"].map(node2index).to_numpy(),
]).astype(np.int64)
g.add_edge_list(edge_index_pairs)

# Every edge gets unit weight; the map is stored on the graph as the
# internal edge property "weight".
weight = g.new_edge_property("double", val=1.0)
g.edge_properties["weight"] = weight

# Fit the SBM by minimizing the description length (an approximation, but it works well)

In [20]:
%%time
state_ndc = minimize_blockmodel_dl(g, state_args=dict(deg_corr=False))
state_dc  = minimize_blockmodel_dl(g, state_args=dict(deg_corr=True))

print("Non-degree-corrected DL:\t", state_ndc.entropy())
print("Degree-corrected DL:\t", state_dc.entropy())

Non-degree-corrected DL:	 7733.478574577481
Degree-corrected DL:	 7736.433947096408
CPU times: user 1min 56s, sys: 344 ms, total: 1min 56s
Wall time: 2.18 s


# Refinements using merge-split MCMC

In [21]:
%%time
S1 = state_ndc.entropy()

for i in range(1000): # this should be sufficiently large
    state_ndc.multiflip_mcmc_sweep(beta=np.inf, niter=10)

S2 = state_ndc.entropy()

print("Improvement:", S2 - S1)

Improvement: -12.197947184529767
CPU times: user 16.1 s, sys: 14.9 ms, total: 16.1 s
Wall time: 16.1 s


In [22]:
%%time
# Alternative refinement, currently disabled: equilibrate state_ndc with
# mcmc_equilibrate. NOTE(review): the original comment read "All use this",
# presumably meaning "also use this" -- confirm whether it should be enabled.
#mcmc_equilibrate(state_ndc, wait=1000, nbreaks=2, mcmc_args=dict(niter=10))

CPU times: user 10 µs, sys: 0 ns, total: 10 µs
Wall time: 18.8 µs


# Get block assignments

In [23]:
# Block assignment of the refined non-degree-corrected state; the next cell
# indexes it by vertex index to read each node's block label.
b = state_ndc.get_blocks()
#B0 = state_ndc.get_matrix()

In [24]:
# One row per node: its original label and its vertex index in `g`.
df_nodes = pd.DataFrame(node2index.items())
df_nodes.columns = ["nodes", "index"]

# Look up the block label of every vertex index in `b`, row by row.
out = [b[vertex_index] for vertex_index in df_nodes["index"].to_numpy()]

df_nodes["sbm"] = out

# Result

In [25]:
# Number of nodes assigned to each inferred block, largest block first.
df_nodes["sbm"].value_counts()

sbm
7     35
4     22
11    17
6     16
10    16
5     15
0     14
1     13
8     11
9      9
3      8
2      6
Name: count, dtype: int64