The file contains the edges of a directed graph. Vertices are labeled as positive integers from 1 to 875714. Every row indicates an edge: the vertex label in the first column is the tail and the vertex label in the second column is the head (recall the graph is directed, and the edges are directed from the first column vertex to the second column vertex). So for example, the 11th row looks like: "2 47646". This just means that the vertex with label 2 has an outgoing edge to the vertex with label 47646.

Your task is to code up the algorithm from the video lectures for computing strongly connected components (SCCs), and to run this algorithm on the given graph.

Output Format: You should output the sizes of the 5 largest SCCs in the given graph, in decreasing order of sizes, separated by commas (avoid any spaces). So if your algorithm computes the sizes of the five largest SCCs to be 500, 400, 300, 200 and 100, then your answer should be "500,400,300,200,100" (without the quotes). If your algorithm finds less than 5 SCCs, then write 0 for the remaining terms. Thus, if your algorithm computes only 3 SCCs whose sizes are 400, 300, and 100, then your answer should be "400,300,100,0,0" (without the quotes). (Note also that your answer should not have any spaces in it.)

WARNING: This is the most challenging programming assignment of the course. Because of the size of the graph you may have to manage memory carefully. The best way to do this depends on your programming language and environment, and we strongly suggest that you exchange tips for doing this on the discussion forums.

In [1]:
# Read an adjacency-list graph from an edge-list file (rows may be in any order)
def read_graph(filename):
    """Build a ``{tail: [heads]}`` adjacency dict from an edge-list file.

    Each non-blank line holds either two integers ``tail head`` (a directed
    edge) or a single integer (a vertex listed without outgoing edges).
    Blank lines are skipped.  A line with more than two fields raises
    ``ValueError`` from the tuple unpacking.

    Every vertex mentioned anywhere in the file, as a tail, a head or on its
    own, gets a key in the result, so a traversal can index ``G[v]`` for any
    vertex it reaches.  Vertex 1 is always present, even for an empty file.
    """
    G = {1: []}
    # `with` guarantees the file handle is closed, even if parsing fails
    with open(filename, 'r') as infile:
        for line in infile:
            ls = line.split()
            if not ls:
                continue  # tolerate blank / trailing empty lines
            if len(ls) == 1:
                # bare vertex: register it without touching edges already read
                G.setdefault(int(ls[0]), [])
            else:
                s, f = ls
                s, f = int(s), int(f)
                # accumulate rather than overwrite, so rows need not be
                # grouped or sorted by tail
                G.setdefault(s, []).append(f)
                # make sure sink vertices are present as keys too
                G.setdefault(f, [])
    return G

In [2]:
# Recursive version of DFS
def DFS(G, s, explored=None, current_label=None):
    """Depth-first search from ``s`` that can hand out topological labels.

    G             -- adjacency dict ``{vertex: [successors]}``
    s             -- start vertex
    explored      -- set of visited vertices, updated in place; a fresh set
                     is created when omitted
    current_label -- next label to assign, or None to traverse without
                     labelling

    When ``current_label`` is given, each vertex is reported with its label
    as soon as all of its successors are finished, and the label is then
    decremented.  Returns the next unused label (None when no labelling was
    requested).
    """
    if explored is None:
        explored = set()
    explored.add(s)
    for v in G[s]:
        if v not in explored:
            current_label = DFS(G, v, explored, current_label)

    if current_label is not None:
        # single-argument call form prints identically on Python 2 and 3
        print("Ordering of {0}: {1}".format(s, current_label))
        current_label -= 1
    return current_label

In [3]:
# Topological ordering
def topological_ordering(G):
    """Print a topological label for every vertex of the graph ``G``.

    G is an adjacency dict ``{vertex: [successors]}``.  Labels run from
    ``len(G)`` down to 1 and are printed by ``DFS`` as each vertex finishes.
    Returns None.
    """
    explored = set()
    current_label = len(G)
    for node in G:
        if node not in explored:
            # Carry the counter over to the next DFS root; otherwise every
            # root would restart at len(G) and labels would repeat.
            current_label = DFS(G, node, explored, current_label)

In [4]:
# Load the small ordering fixture, show its adjacency dict, then label it
filename = "ordering_test.txt"
graph = read_graph(filename)
# call form of print: same output on Python 2, also valid on Python 3
print(graph)

topological_ordering(graph)

{1: [2, 3], 2: [4], 3: [4], 4: []}
Ordering of 4: 4
Ordering of 2: 3
Ordering of 3: 2
Ordering of 1: 1


# Strongly connected components via Kosaraju's algorithm

```python
def Kosaraju(G):
  1. Compute reverse graph Grev
  2. a. Run depth-first search (DFS) loop (DFS_Loop) on Grev
     b. Record finishing time on each node
  3. Run DFS_Loop on G starting from nodes with large finishing time
  4. SCCs are the nodes with the same leaders
```

```python
def DFS_Loop(G):
  Global t # count finishing time
  Global s # leader of the SCC
  for i in range(n, 0, -1):
    if i not in explored:
      s = i
      DFS(G, i)
```

```python
def DFS(G, i):
  explored.add(i)
  leader[i] = s
  for edge(i, j) in G:
    if j not in explored:
      DFS(G, j)
  t += 1
  finishing_time[i] = t
```

In [5]:
# Global switch: the functions below print their intermediate data when it is true
DEBUG = True

# Generate graph and reverse graph (we need the maximum number of nodes)
def read_graph(filename, n):
    """Read an edge list and return ``(G, Grev)`` adjacency dicts.

    filename -- text file with one ``tail head`` pair of integers per line;
                blank lines are skipped
    n        -- number of vertices; both dicts get a key for every label
                in 1..n, including vertices that have no edges

    ``G[v]`` lists the heads of v's outgoing edges, ``Grev[v]`` the tails of
    its incoming edges.  When the module-level ``DEBUG`` flag is true, both
    dicts are printed.

    Raises ValueError if an edge mentions a label outside 1..n, or if a
    non-blank line does not contain exactly two fields.
    """
    G, Grev = {}, {}
    for i in range(1, n + 1):
        G[i] = []
        Grev[i] = []

    # `with` guarantees the file handle is closed, even if parsing fails
    with open(filename, 'r') as infile:
        for line in infile:
            fields = line.split()
            if not fields:
                continue  # tolerate blank / trailing empty lines
            s, f = fields
            s, f = int(s), int(f)
            # Keys 1..n are pre-populated, so comparing n with max(G) after
            # the loop can never detect a mismatch; validate each label here.
            if not (1 <= s <= n and 1 <= f <= n):
                raise ValueError("Number of nodes does not match.")
            G[s].append(f)
            Grev[f].append(s)

    if DEBUG:
        print("Graph from file {0}".format(filename))
        print(G)
        print("Reversed graph from file {0}".format(filename))
        print(Grev)

    return G, Grev

# Kosaraju's two-pass algorithm
def Kosaraju(G, Grev, n):
    """Compute the leader of every vertex with Kosaraju's two DFS passes.

    G    -- adjacency dict of the graph, keyed by vertex labels 1..n
    Grev -- adjacency dict of the reversed graph
    n    -- number of vertices

    Returns a dict ``{vertex: leader}``; vertices that share a leader form
    one strongly connected component.  Intermediate results are printed
    when the module-level ``DEBUG`` flag is true.
    """
    # run the 1st DFS_Loop on Grev
    finishing_time = DFS_Loop(Grev, n)
    if DEBUG:
        print("Finishing time from Grev")
        print(finishing_time)

    # determine ordering for the 2nd DFS_Loop: finishing times are 1..n, so
    # n - time places the vertex that finished last at index 0
    ordering = [0] * n
    for node in finishing_time:
        ordering[n - finishing_time[node]] = node

    # run the 2nd DFS_Loop
    leaders = {}
    DFS_Loop(G, n, ordering, leaders)
    if DEBUG:
        print("Leaders")
        print(leaders)

    return leaders

# DFS-Loop
def DFS_Loop(G, n, ordering=None, leaders=None):
    """Run DFS from every still-unexplored vertex, in the given order.

    G        -- adjacency dict keyed by vertex labels
    n        -- number of vertices (labels 1..n)
    ordering -- iterable of start vertices; defaults to n, n-1, ..., 1
    leaders  -- optional dict passed through to DFS, which records each
                reached vertex's leader in it

    Resets the module-level globals ``t`` (finishing-time counter) and ``s``
    (current leader) before starting.  Returns ``{vertex: finishing_time}``.
    """
    global t, s
    t = 0
    s = None
    explored = set()
    finishing_time = {}
    if ordering is None:
        ordering = range(n, 0, -1)

    for i in ordering:
        if i not in explored:
            # every vertex reached from i in this call gets i as its leader
            s = i
            DFS(G, i, explored, finishing_time, leaders)

    return finishing_time

# DFS with finishing time
def DFS(G, i, explored=None, finishing_time=None, leaders=None):
    """Depth-first search from ``i`` recording finishing times and leaders.

    G              -- adjacency dict ``{vertex: [successors]}``
    i              -- start vertex
    explored       -- set of visited vertices, updated in place
    finishing_time -- dict updated in place with ``{vertex: finish order}``
    leaders        -- optional dict; every vertex reached is mapped to the
                      current value of the global ``s``

    Uses the module-level globals ``t`` (incremented once per finished
    vertex) and ``s`` (current leader), which ``DFS_Loop`` initialises.
    Returns None.

    The traversal keeps its own stack instead of recursing, so long paths
    do not depend on the interpreter's recursion limit.  Vertices are
    visited and finished in the same order as the recursive formulation.
    """
    global t, s
    if explored is None:
        explored = set()
    if finishing_time is None:
        finishing_time = {}

    explored.add(i)
    if leaders is not None:
        leaders[i] = s

    # Each stack entry pairs a vertex with an iterator over its successors,
    # so the scan resumes where it stopped once a child is finished.
    stack = [(i, iter(G[i]))]
    while stack:
        node, successors = stack[-1]
        for j in successors:
            if j not in explored:
                explored.add(j)
                if leaders is not None:
                    leaders[j] = s
                stack.append((j, iter(G[j])))
                break  # descend into j before looking at further successors
        else:
            # all successors handled: the vertex is finished
            stack.pop()
            t += 1
            finishing_time[node] = t
    return

# find the five largest SCCs
def find_large_SCC(leaders, n=5):
    """Return the sizes of the ``n`` largest SCCs in decreasing order.

    leaders -- dict ``{vertex: leader}``; vertices sharing a leader belong
               to the same component
    n       -- how many sizes to return (default 5)

    The result always has ``n`` entries, padded with zeros when there are
    fewer than ``n`` components; it is empty for ``n <= 0``.  When the
    module-level ``DEBUG`` flag is true, the components are printed.
    """
    # group vertices by leader in a single pass
    counts = {}
    for node in leaders:
        counts.setdefault(leaders[node], []).append(node)
    if DEBUG:
        print("SCCs")
        print(counts)

    sizes = sorted((len(scc) for scc in counts.values()), reverse=True)
    biggest = sizes[:max(n, 0)]
    # pad with zeros; a non-positive repeat count yields an empty list
    biggest.extend([0] * (n - len(biggest)))
    return biggest

In [6]:
DEBUG = True

# Run the whole pipeline on the two small fixtures: (edge-list file, vertex count)
for filename, n in (("test1.txt", 9), ("test2.txt", 12)):
    g, gr = read_graph(filename, n)
    leaders = Kosaraju(g, gr, n)
    # call form of print: same output on Python 2, also valid on Python 3
    print(find_large_SCC(leaders))

Graph from file test1.txt
{1: [4], 2: [8], 3: [6], 4: [7], 5: [2], 6: [9], 7: [1], 8: [5, 6], 9: [7, 3]}
Reversed graph from file test1.txt
{1: [7], 2: [5], 3: [9], 4: [1], 5: [8], 6: [3, 8], 7: [4, 9], 8: [2], 9: [6]}
Finishing time from Grev
{1: 7, 2: 3, 3: 1, 4: 8, 5: 2, 6: 5, 7: 9, 8: 4, 9: 6}
Leaders
{1: 7, 2: 8, 3: 9, 4: 7, 5: 8, 6: 9, 7: 7, 8: 8, 9: 9}
SCCs
{8: [2, 5, 8], 9: [3, 6, 9], 7: [1, 4, 7]}
[3, 3, 3, 0, 0]
Graph from file test2.txt
{1: [2], 2: [3, 4, 5], 3: [6], 4: [5, 7], 5: [2, 6, 7], 6: [3, 8], 7: [8, 10], 8: [7], 9: [7], 10: [9, 11], 11: [12], 12: [10]}
Reversed graph from file test2.txt
{1: [], 2: [1, 5], 3: [2, 6], 4: [2], 5: [2, 4], 6: [3, 5], 7: [4, 5, 8, 9], 8: [6, 7], 9: [10], 10: [7, 12], 11: [10], 12: [11]}
Finishing time from Grev
{1: 1, 2: 3, 3: 5, 4: 4, 5: 2, 6: 6, 7: 9, 8: 7, 9: 8, 10: 10, 11: 11, 12: 12}
Leaders
{1: 1, 2: 4, 3: 6, 4: 4, 5: 4, 6: 6, 7: 12, 8: 12, 9: 12, 10: 12, 11: 12, 12: 12}
SCCs
{1: [1], 4: [2, 4, 5], 6: [3, 6], 12: [7, 8, 9, 10, 11, 

In [7]:
# check resources: show the (soft, hard) limits for stack and data segment size
import resource
# call form of print: same output on Python 2, also valid on Python 3
print(resource.getrlimit(resource.RLIMIT_STACK))
print(resource.getrlimit(resource.RLIMIT_DATA))

(8388608, 67104768)
(9223372036854775807, 9223372036854775807)


In [8]:
# Lift Python's recursion cap and raise the soft stack limit to the hard
# limit, presumably so the recursive DFS can handle the full-size graph.
# NOTE: `resource` is not imported in this cell; it relies on the earlier
# `import resource`.
import sys
sys.setrecursionlimit(2 ** 20)
# getrlimit returns (soft, hard); index 1 is the hard limit
hardlimit = resource.getrlimit(resource.RLIMIT_STACK)[1]
resource.setrlimit(resource.RLIMIT_STACK, (hardlimit, hardlimit))

# timer grabbed from 
# https://stackoverflow.com/questions/7370801/measure-time-elapsed-in-python
from timeit import default_timer as timer
class benchmark(object):
    """Context manager that times its ``with`` body and prints the result.

    On exit it prints ``"<msg> : <seconds> seconds"`` and stores the elapsed
    seconds in the ``time`` attribute.  Exceptions raised inside the body
    are not suppressed.
    """

    def __init__(self, msg, fmt="%0.3g"):
        # msg labels the printed line; fmt is the %-format for the seconds
        self.msg, self.fmt = msg, fmt

    def __enter__(self):
        self.start = timer()
        return self

    def __exit__(self, *exc_info):
        elapsed = timer() - self.start
        template = "%s : " + self.fmt + " seconds"
        print(template % (self.msg, elapsed))
        self.time = elapsed

# Full-size run: silence the debug dumps and time each stage separately
DEBUG = False
filename = "SCC.txt"
n = 875714

with benchmark("Read graph from file") as r:
    g, gr = read_graph(filename, n)

with benchmark("Perform Kosaraju") as k:
    leaders = Kosaraju(g, gr, n)

with benchmark("Count SCCs") as c:
    biggest = find_large_SCC(leaders)

# call form of print: same output on Python 2, also valid on Python 3
print(biggest)

Read graph from file : 14.8 seconds
Perform Kosaraju : 6.48 seconds
Count SCCs : 1.48 seconds
