In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import networkx as nx
from collections import defaultdict
# python 3.7.3
# matplotlib 3.1.0
# networkx 2.3

#  “Broder et al” butterfly picture of the directed network

In [2]:
# Compute SCC
def scc(G):
    """
    Compute the strongly connected components of a directed graph with
    two depth-first passes (Kosaraju's algorithm).

    G maps each node to the set of its successors. G[node] is looked up
    for every node that is reached, so G must also answer for nodes that
    have no outgoing edges (a defaultdict(set) does).

    WARNING: G is consumed. To save memory the graph is reversed by popping
    every entry out of G, so G is empty when this function returns. Pass a
    copy if the caller still needs the graph afterwards.

    Returns a defaultdict(set) with one entry per component: the key is the
    node the second pass started from, the value is the set of nodes in
    that component.
    """
    # Pass 1: record the DFS finish order of every vertex. The visited set
    # is shared between calls so a disconnected graph is fully covered.
    # list(G) snapshots the keys because looking up a missing node in a
    # defaultdict adds a key while we iterate.
    visited, finish_times = set(), []
    for u in list(G):
        if u not in visited:
            finish_times.extend(dfs(G, u, visited))

    # Reverse graph G in-place to save memory
    GR = defaultdict(set)
    while G:
        u, successors = G.popitem()
        for v in successors:
            GR[v].add(u)

    # Pass 2: explore the reversed graph in reversed finish order. Each
    # search that starts from a not-yet-visited node collects exactly one
    # component; the shared visited set keeps a component from being
    # explored more than once.
    visited, components = set(), defaultdict(set)
    for u in reversed(finish_times):
        if u not in visited:
            components[u].update(dfs(GR, u, visited))
    return components

# Run iterative DFS to prevent stack overflows in large graphs
def dfs(G, source, visited):
    """
    Generate the nodes reachable from source in DFS finish (post) order.

    G maps each node to the set of its successors. G[node] is looked up
    for every node reached, so G must answer for nodes without outgoing
    edges too (a defaultdict(set) does, and gains an empty entry for them).

    visited is shared with the caller and updated in place: nodes already
    in it are not explored, and every node reached here is added to it.

    Each node is yielded exactly once, after every node first reached
    through it has been yielded.
    """
    stack, popped = [source], set()
    while stack:
        node = stack[-1]
        if node not in visited:
            # Mark the node BEFORE computing what to push. Otherwise a
            # self-loop (node in G[node]) pushes the node back on top of
            # its own unexplored children, and that copy is popped and
            # yielded before the children have finished.
            visited.add(node)
            stack.extend(G[node] - visited)
        else:
            # Second time on top: all children are done, so the node is
            # finished. A node can sit on the stack several times (pushed
            # by several parents); popped makes sure it is yielded once.
            stack.pop()
            if node not in popped:
                popped.add(node)
                yield node

In [3]:
G = defaultdict(set)
# Load data into the graph data structure.
# Each data line is "<source>\t<target>"; lines starting with "#" are comments.
with open("soc-Slashdot0902.txt", "r") as ins:
    for line in ins:
        # Skip comment lines and blank lines (e.g. a trailing empty line)
        if line.startswith("#") or not line.strip():
            continue
        u, v = (int(i) for i in line.split("\t"))
        G[u].add(v)

# Save Graph stats
# A node counts if it appears on either end of an edge. Collecting only the
# targets would miss every node that has outgoing edges but no incoming one.
node_ids, edges = set(), 0
for u, targets in G.items():
    node_ids.add(u)
    node_ids.update(targets)
    edges += len(targets)
nodes = len(node_ids)

In [4]:
# scc() empties the graph it is given, so hand it a shallow copy: G stays
# intact and this cell can be re-run without max() failing on an empty result.
# Only the dict is copied; the successor sets are shared, and scc() only
# reads them. The cost is that G's sets stay in memory next to the reversed graph.
components = scc(G.copy()).values()
giant = max(components, key=len)

In [8]:
# Summary of the graph and its largest strongly connected component,
# one statistic per line.
print(
    "Nodes: {}".format(nodes),
    "Edges: {}".format(edges),
    "Nodes in giant SCC: {}".format(len(giant)),
    sep="\n",
)

Nodes: 82168
Edges: 948464
Nodes in giant SCC: 71307
