In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import networkx as nx
from collections import defaultdict
from functools import reduce

#  “Broder et al” butterfly picture of the directed network

In [2]:
SNAP_network = "web-Stanford.txt"

G = defaultdict(set)
# Load data into the graph data structure
with open(SNAP_network, "r") as ins:
    for line in ins:
        if not line.startswith("#"):
            u, v = [int(i) for i in line.split()]
            G[u].add(v)

In [3]:
# Compute SCC
def scc(G):
    """
    Run Depth First Search to find out the finish times 
    to find each vertex in graph G.
    Pass the visited set in case we have a disconnected graph
    """
    visited, finish_times = set(), []
    for u in list(G.keys()):
        if u not in visited:
            finish_times.extend([v for v in dfs(G, u, visited)])
    
    # Reverse graph G
    GR = defaultdict(set)
    for u, nodes in G.items():
        for v in nodes:
            GR[v].add(u)
    
    """
    Compute strongly connected components by running Depth First Search
    on a reversed graph G, by exploring in reversed finish_times order.
    Pass the visited set to no explore the same components multiple times
    """
    visited, scc = set(), defaultdict(set)
    for u in reversed(finish_times):
        if u not in visited:
            scc[u].update([node for node in dfs(GR, u, visited)])
    return scc

# Run iterative DFS to prevent stack overflow in large graphs
def dfs(G, source, visited):
    stack, popped = [source], set()
    while stack:
        node = stack[-1]
        to_visit = G[node] - visited
        if node not in visited:
            visited.add(node)
            stack.extend(to_visit)
        else:
            stack.pop()
            if node not in popped:
                popped.add(node)
                yield node

In [4]:
components = sorted(scc(G).values(), key=len)
# largest SCC
giant = components[-1]
# nodes not in largest SCC
not_giant = reduce(set.union, components[:-1])

In [5]:
in_nodes = 0
for u in not_giant:
    if G[u].intersection(giant):
        in_nodes += 1

In [6]:
out_nodes = 0
for u in giant:
    if G[u].intersection(not_giant):
        out_nodes += 1

In [7]:
print("Nodes in the giant SCC: {}".format(len(giant)))
print("Nodes in the IN section of the graph: {}".format(in_nodes))
print("Nodes in the OUT section of the grap: {}".format(out_nodes))

Nodes in the giant SCC: 150532
Nodes in the IN section of the graph: 26239
Nodes in the OUT section of the grap: 2604


# Graphs