In [3]:
import numpy as np

In [12]:
# implement a union-find data structure
# thru union-by-rank and path-compression

class DisjointSet():
    def __init__(self, num_of_objs):
        """
            initialize the disjoint set using indexes instead of values
        """
        assert isinstance(num_of_objs, int)
        self.num_of_objs = num_of_objs
        self.num_of_sets = num_of_objs
        self.parents = list(range(num_of_objs))
        self.ranks   = [0] * num_of_objs
    
        return
    
    def find(self, x):
        
        p = self.parents[x]
        if(p == x): return p
        
        while (p != self.parents[p]):
            p = self.parents[p]
        
        # path compression
        if(p != self.parents[x]): self.parents[x] = p
        
        return p
    
    def union(self, x, y):
        s1 = self.find(x)
        s2 = self.find(y)
        
        if(s1 == s2): return
        
        r1 = self.ranks[s1]
        r2 = self.ranks[s2]
        
        if(r1 > r2):
            self.parents[s2] = s1
        else:
            self.parents[s1] = s2
            # rank goes up when equal
            if (r1 == r2): self.ranks[s2] +=1
        
        self.num_of_sets -= 1
        
        return

# Q1 

'cluster1.txt' describes a distance function (equivalently, a complete graph with edge costs). It has the following format:

[number_of_nodes]

[edge 1 node 1] [edge 1 node 2] [edge 1 cost]

[edge 2 node 1] [edge 2 node 2] [edge 2 cost]

...

There is one edge $(i,j)$ for each choice of $1≤i<j≤n$, where $n$ is the number of nodes.

For example, the third line of the file is "1 3 5250", indicating that the distance between nodes 1 and 3 (equivalently, the cost of the edge (1,3)) is 5250. You can assume that distances are positive, but you should NOT assume that they are distinct.

Your task in this problem is to run the clustering algorithm from lecture on this data set, where the target number k of clusters is set to 4. What is the maximum spacing of a 4-clustering?

ADVICE: If you're not getting the correct answer, try debugging your algorithm using some small test cases. And then post them to the discussion forum!

In [3]:
with open('cluster1.txt', 'r') as f:
    lines = f.readlines()

n_nodes = int(lines[0])
edges = list(map(lambda x: list(map(int, x.split())), lines[1:]))

In [4]:
edges.sort(key=lambda x: x[2], reverse=True)
len(edges)

124750

In [5]:
dj_set = DisjointSet(n_nodes)

# clustering
while (dj_set.num_of_sets != 4):
    e = edges.pop()
    dj_set.union(e[0]-1,e[1]-1)

len(edges)

123542

In [6]:
# count the remaining edge length
cluster_spacings = {}
for e in edges:
    c1 = dj_set.find(e[0]-1)
    c2 = dj_set.find(e[1]-1)
    if(c1 == c2): continue
    
    c12 = (min(c1,c2),max(c1,c2))
    if(c12 in cluster_spacings.keys()):
        if(e[2] < cluster_spacings[c12]):
            cluster_spacings[c12] = e[2]
    else:
        cluster_spacings[c12] = e[2]

In [7]:
cluster_spacings

{(413, 422): 106,
 (422, 461): 107,
 (383, 422): 123,
 (383, 461): 1641,
 (383, 413): 1162,
 (413, 461): 746}

# Q2 

'cluster2.txt' has the format is:

[# of nodes] [# of bits for each node's label]

[first bit of node 1] ... [last bit of node 1]

[first bit of node 2] ... [last bit of node 2]

...

For example, the third line of the file "0 1 1 0 0 1 1 0 0 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1" denotes the 24 bits associated with node #2.

The distance between two nodes $u$ and $v$ in this problem is defined as the Hamming distance--- the number of differing bits --- between the two nodes' labels. For example, the Hamming distance between the 24-bit label of node #2 above and the label "0 1 0 0 0 1 0 0 0 1 0 1 1 1 1 1 1 0 1 0 0 1 0 1" is 3 (since they differ in the 3rd, 7th, and 21st bits).

The question is: what is the largest value of $k$ such that there is a $k$-clustering with spacing at least 3? That is, how many clusters are needed to ensure that no pair of nodes with all but 2 bits in common get split into different clusters?

NOTE: The graph implicitly defined by the data file is so big that you probably can't write it out explicitly, let alone sort the edges by cost. So you will have to be a little creative to complete this part of the question. For example, is there some way you can identify the smallest distances without explicitly looking at every pair of nodes?

In [8]:
with open('cluster2.txt', 'r') as f:
    lines = f.readlines()

n_nodes, n_bits = list(map(int, lines[0].split()))
nodes = np.array(list(map(lambda x: list(map(int, x.split())), lines[1:])))

In [9]:
# hammimng_distance
sum(nodes[0] == nodes[1])

15

In [10]:
nodes = nodes[:200]
n_nodes = len(nodes)

In [55]:
dj_set = DisjointSet(n_nodes)

for i in range(n_nodes):
    for j in range(i+1, n_nodes):
        ci = dj_set.find(i)
        cj = dj_set.find(j)
        if (ci != cj):
            # hammimng_distance 
            if(sum(nodes[i] == nodes[j]) < 3):
                dj_set.union(i,j)
    
    print('finish node %i now,  there are %i clusters' %(i,dj_set.num_of_sets))

print('finished!')

finish node 0 now,  there are 200 clusters
finish node 1 now,  there are 200 clusters
finish node 2 now,  there are 200 clusters
finish node 3 now,  there are 200 clusters
finish node 4 now,  there are 200 clusters
finish node 5 now,  there are 200 clusters
finish node 6 now,  there are 200 clusters
finish node 7 now,  there are 200 clusters
finish node 8 now,  there are 200 clusters
finish node 9 now,  there are 200 clusters
finish node 10 now,  there are 200 clusters
finish node 11 now,  there are 200 clusters
finish node 12 now,  there are 200 clusters
finish node 13 now,  there are 200 clusters
finish node 14 now,  there are 200 clusters
finish node 15 now,  there are 200 clusters
finish node 16 now,  there are 200 clusters
finish node 17 now,  there are 200 clusters
finish node 18 now,  there are 200 clusters
finish node 19 now,  there are 200 clusters
finish node 20 now,  there are 200 clusters
finish node 21 now,  there are 200 clusters
finish node 22 now,  there are 200 cluster

In [125]:
dj_set.num_of_sets

100

### dealing with large data set using bitwise operator




In [4]:
from timeit import default_timer as timer

In [212]:
def hammimng_distance(x, y, nbits):
    result = x ^ y
    return (sum([(result >> i) & 1 for i in range(nbits)]))

In [197]:
x = 0b0101
y = 0b1110
nbits = 4
hammimng_distance(x,y,nbits)

[1, 1, 0, 1]


3

In [198]:
def hammimng_distance_array(x, y):
    return sum(x == y)

In [199]:
x = np.array([1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,1,0,1,0,1])
y = np.array([1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,1,1,0])
start = timer()
a = hammimng_distance_array(x,y)
end = timer()
end - start

0.00012661900836974382

In [200]:
x = 0b11111111110001010101
y = 0b11111111111110100110
nbits = 7
start = timer()
a = hammimng_distance(x,y,nbits)
end = timer()
end - start

[1, 1, 0, 0, 1, 1, 1]


0.00038758100708946586

In [206]:
with open('cluster2.txt', 'r') as f:
    lines = f.readlines()

n_nodes, n_bits = list(map(int, lines[0].split()))
nodes = np.array(list(map(lambda x: list(map(int, x.split())), lines[1:])))

In [208]:
nodes = nodes[:200]
n_nodes = len(nodes)

In [209]:
# test 1
start = timer()
dj_set = DisjointSet(n_nodes)

for i in range(n_nodes):
    for j in range(i+1, n_nodes):
        ci = dj_set.find(i)
        cj = dj_set.find(j)
        if (ci != cj):
            # hammimng_distance 
            if(hammimng_distance_array(nodes[i],nodes[j]) < 3):
                dj_set.union(i,j)
    
    #print('finish node %i now,  there are %i clusters' %(i,dj_set.num_of_sets))

#print('finished!')
end = timer()
end - start

1.1032967650098726

In [223]:
with open('cluster2.txt', 'r') as f:
    lines = f.readlines()

n_nodes, n_bits = list(map(int, lines[0].split()))
nodes = list(map(lambda x: int(x.replace(' ', ''),2), lines[1:]))

In [224]:
nodes = nodes[:1000]
n_nodes = len(nodes)

In [225]:
start = timer()
dj_set = DisjointSet(n_nodes)

for i in range(n_nodes):
    for j in range(i+1, n_nodes):
        ci = dj_set.find(i)
        cj = dj_set.find(j)
        if (ci != cj):
            # hammimng_distance 
            if(hammimng_distance(nodes[i],nodes[j],nbits) < 3):
                dj_set.union(i,j)
    
    #print('finish node %i now,  there are %i clusters' %(i,dj_set.num_of_sets))

#print('finished!')
end = timer()
end - start

0.44373776798602194

In [5]:
with open('cluster2.txt', 'r') as f:
    lines = f.readlines()

n_nodes, n_bits = list(map(int, lines[0].split()))
nodes = list(map(lambda x: int(x.replace(' ', ''),2), lines[1:]))
nodes = list(set(nodes))
n_nodes = len(nodes)

In [6]:
# make dictionary for each nodes:
nodes_map = {}
for i in range(n_nodes):
    nodes_map[nodes[i]] = i

In [7]:
# make some bit masks
def gen_bit_masks(n_bits, n_diff_bits, 
                 passed_bit=None,
                 cur_bit=None,
                 cur_start=None,
                 cur_end=None):
    
    assert n_diff_bits >= 1
    assert n_bits >= n_diff_bits
    
    if passed_bit is None:
        cur_bit    = 1
        passed_bit = 0
        cur_start  = 0
        cur_end    = n_bits - n_diff_bits + 1
    
    masks = []
    if(cur_bit == n_diff_bits):
        masks += [passed_bit | 1 << i for i in range(cur_start,cur_end)]
    else:
        for i in range(cur_start,cur_end):
            passing_bit = passed_bit | 1 << i 
            masks += gen_bit_masks(n_bits, n_diff_bits, passed_bit=passing_bit,
                                   cur_bit=cur_bit+1, cur_start=i+1,cur_end=cur_end+1)      
    return masks

In [8]:
# generate nodes masks with distance less than 3
bit_masks = [0]
bit_masks += gen_bit_masks(n_bits, 1)
bit_masks += gen_bit_masks(n_bits, 2)    

In [10]:
[bin(i) for i in bit_masks]

['0b0',
 '0b1',
 '0b10',
 '0b100',
 '0b1000',
 '0b10000',
 '0b100000',
 '0b1000000',
 '0b10000000',
 '0b100000000',
 '0b1000000000',
 '0b10000000000',
 '0b100000000000',
 '0b1000000000000',
 '0b10000000000000',
 '0b100000000000000',
 '0b1000000000000000',
 '0b10000000000000000',
 '0b100000000000000000',
 '0b1000000000000000000',
 '0b10000000000000000000',
 '0b100000000000000000000',
 '0b1000000000000000000000',
 '0b10000000000000000000000',
 '0b100000000000000000000000',
 '0b11',
 '0b101',
 '0b1001',
 '0b10001',
 '0b100001',
 '0b1000001',
 '0b10000001',
 '0b100000001',
 '0b1000000001',
 '0b10000000001',
 '0b100000000001',
 '0b1000000000001',
 '0b10000000000001',
 '0b100000000000001',
 '0b1000000000000001',
 '0b10000000000000001',
 '0b100000000000000001',
 '0b1000000000000000001',
 '0b10000000000000000001',
 '0b100000000000000000001',
 '0b1000000000000000000001',
 '0b10000000000000000000001',
 '0b100000000000000000000001',
 '0b110',
 '0b1010',
 '0b10010',
 '0b100010',
 '0b1000010',
 '0b

In [13]:
# looking for connections between each pair of nodes
start = timer()
dj_set = DisjointSet(n_nodes)
nj = len(bit_masks)

for i in range(n_nodes):
    target_nodes = [ nodes[i] ^ mask for mask in bit_masks]
    for nodes_j in target_nodes:
        if nodes_j in nodes_map.keys():
            dj_set.union(i,nodes_map[nodes_j])

end = timer()
end - start

16.638952145993244

In [239]:
dj_set.num_of_sets

6118

In [14]:
dj_set.num_of_sets

6118