In [1]:
import csv
import random
from utils import *
from collections import defaultdict
import os
import time
import tqdm
import matplotlib.pyplot as plt
#import seaborn as sns
import numpy as np

## generate SBMs and Erdos-Renyi graph

In [2]:
class SBMGraphStream():
    '''
    The class of Graph Stream from the stochastic block model
    ----- Parameters -----
    # n_vertex: the number of vertices in the graph
    # p_intra: the probability for + edge (u,v) for the same cluster
    # p_inter: the probability for + edge (u,v) for different clusters
    # k_cluster: number of clusters in the clustering
    ----- Methods ----
    # read_next_edge(): read the next edge and move the index +1
    ----- Representation ----
    The graph is representation with an indexed array of vertices and a dictionary with (u_i, u_j): labels
    '''
    
    def __init__(self, n_vertex, p_intra=0.8, p_inter=0.2, k_cluster=7):
        '''
        :param n_vertex: the the number of vertices in the graph
        '''
        self.n_vertex = n_vertex
        self.p_intra = p_intra
        self.p_inter = p_inter
        self.k_cluster = k_cluster
        
        # initialize the vertex set and the cluster labels
        self.vertex_set = np.array([self.n_vertex])
        num_v_per_cluster = n_vertex//self.k_cluster
        n_residual = n_vertex % num_v_per_cluster
        cluster_labels_list = []
        for i_cluster in range(k_cluster):
            cluster_labels_list.append(i_cluster*np.ones([num_v_per_cluster]))
        if n_residual!=0:
            cluster_labels_list.append((k_cluster-1)*np.ones([n_residual]))
        # collect them as a 1-d array
        self.cluster_labels = np.reshape(np.hstack(cluster_labels_list).astype(int), [-1])
        # initialize the edges -- using +1 and -1 to represent the edge labels
        # also compute the cost
        self.cc_cost = 0
        self.edge_dict = {}
        for u_i in tqdm.tqdm(range(self.n_vertex)):
            for u_j in np.arange(u_i+1, self.n_vertex):
                if self.cluster_labels[u_i] == self.cluster_labels[u_j]:
                    if np.random.rand() <= p_intra:
                        self.edge_dict[(u_i,u_j)] = 1
                    else:
                        self.edge_dict[(u_i,u_j)] = -1
                        self.cc_cost = self.cc_cost + 1
                else:
                    if np.random.rand() <= p_inter:
                        self.edge_dict[(u_i,u_j)] = 1
                        self.cc_cost = self.cc_cost + 1
                    else:
                        self.edge_dict[(u_i,u_j)] = -1
        # randomize the order of edge arrival
        self.edge_names = list(self.edge_dict.keys())
        random.shuffle(self.edge_names)
        self.num_edges = len(self.edge_names)
        # maintain a pointer of the number of edges
        self.current_stream_ind = 0
        
    def read_next_edge(self):
        
        this_edge_name = self.edge_names[self.current_stream_ind]
        this_edge_label = self.edge_dict[this_edge_name]
        self.current_stream_ind = self.current_stream_ind + 1
        if self.current_stream_ind>=self.num_edges-1:
            return None, None
        
        return this_edge_name, this_edge_label
    
    def write_edges(self, write_path=None):
        if not write_path:
            raise ValueError('the writing path has to be specified!')
        file_name = 'SBM_n='+str(self.n_vertex)+'_p='+str(self.p_intra)+'_k=' + str(self.k_cluster) +'.csv'
        with open(os.path.join(write_path, file_name), 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for edge in self.edge_dict:
                if self.edge_dict[edge]>0:
                    writer.writerow([f'{edge[0]} {edge[1]}'])
    
    def reset_index(self):
        '''
        reset the pointer
        '''
        self.current_stream_ind = 0

In [3]:
sbm_graph_stream = SBMGraphStream(n_vertex=1000,p_intra=0.95, p_inter=0.05, k_cluster=4)

100%|██████████| 1000/1000 [00:00<00:00, 1838.39it/s]


In [4]:
sbm_graph_stream.write_edges(write_path='../data/')

## The functions that implements our algorithm

In [175]:
def test_sparse_vertex(current_graph, eps=0.2):
    # sample log n neighbors for each vertex
    current_num_vertex = len(current_graph)
    num_sample = max((int)(5*np.log(current_num_vertex)/eps), 20)
    sample_dict = {}
    for this_vertex in current_graph:
        sample_vertex_set = list(current_graph[this_vertex].getRandom(i=num_sample))
        sample_dict[this_vertex] = sample_vertex_set
    sparse_vertex_list = []
    # test sparsity for each vertex
    for this_vertex in current_graph:
        num_neighbor_diff = 0
        neighbors_this = sample_dict[this_vertex]
        for comp_vertex in sample_dict[this_vertex]:
            neighbors_comp = sample_dict[comp_vertex]
            sample_vertex = np.random.choice(current_num_vertex, num_sample)
            # the intersections of the neighbor
            total_diff = 0
            for test_vertex in sample_vertex:
                if (test_vertex in current_graph[this_vertex]) and (test_vertex not in current_graph[comp_vertex]):
                    total_diff = total_diff + 1
                if (test_vertex in current_graph[comp_vertex]) and (test_vertex not in current_graph[this_vertex]):
                    total_diff = total_diff + 1
            if (total_diff>=eps*num_sample):
                num_neighbor_diff = num_neighbor_diff + 1
        if num_neighbor_diff >=eps*num_sample:
#             print('==========================')
#             print(num_neighbor_diff)
#             print(num_sample)
            sparse_vertex_list.append(this_vertex)
    
    # return the list of sparse vertices
    return sparse_vertex_list

In [232]:
def sparse_dense_decop(current_graph, eps=0.2):
    current_num_vertex = len(current_graph)
    # the returned clusters
    SDD_clustering = {}
    # check sparse vertices
    current_sparse_vertice = test_sparse_vertex(current_graph, eps=eps)
    for sparse_vertex in current_sparse_vertice:
        SDD_clustering[sparse_vertex]=sparse_vertex
    # sample from the dense vertices
    dense_subgraph = {vertex: current_graph[vertex] 
                      for vertex in current_graph if vertex not in current_sparse_vertice}
    anchor_vertex_dict = {}
    for this_vertex in dense_subgraph:
        # rejection sampling
        dense_sample_prob = max((2*np.log(current_num_vertex))/(eps*dense_subgraph[this_vertex].degree), 0.02)
        if dense_sample_prob>=np.random.uniform(low=0.0, high=1.0):
            anchor_vertex_dict[this_vertex] = dense_subgraph[this_vertex]
    # recursively form almost-cliques
    num_sample = max((int)(5*np.log(current_num_vertex)), 20)
    AC_dict = {}
    # maintain a list of covered vertices
    covered_AC_vertex = []
    for this_anchor_vertex in anchor_vertex_dict:
        if this_anchor_vertex in covered_AC_vertex:
            continue
        AC_dict[this_anchor_vertex] = []
        SDD_clustering[this_anchor_vertex] = this_anchor_vertex  # assign to the cluster represented by self
        covered_AC_vertex.append(this_anchor_vertex)
        anchor_neighbor_samples = list(anchor_vertex_dict[this_anchor_vertex].getRandom(i=num_sample))
        for candidate_vertex in anchor_vertex_dict[this_anchor_vertex]:
            if (candidate_vertex in current_sparse_vertice) or (candidate_vertex in covered_AC_vertex):
                continue
            # test whether their symmetric difference is large enough
            # the intersections of the neighbor
            total_diff = 0
            for anchor_neighbor in anchor_neighbor_samples:
                if anchor_neighbor not in list(dense_subgraph[candidate_vertex]):
                    total_diff = total_diff + 1
#             anchor_neighbor_in_sample = np.intersect1d(sample_vertex, list(dense_subgraph[this_anchor_vertex]))
#             cand_neighbor_in_sample = np.intersect1d(sample_vertex, list(dense_subgraph[candidate_vertex]))
#             dif1 = np.setdiff1d(anchor_neighbor_in_sample, cand_neighbor_in_sample)
#             dif2 = np.setdiff1d(cand_neighbor_in_sample, anchor_neighbor_in_sample)
#             if sbm_graph_stream.cluster_labels[this_anchor_vertex]!=sbm_graph_stream.cluster_labels[candidate_vertex]:
#                 print('============= Different cluster happens! ==================')
#                 print(anchor_neighbor_samples)
#                 print(cand_neighbor_samples)
#                 print(len(dif1[0]))
#                 print(len(dif2[0]))
#             total_diff = len(dif1)+len(dif2)
            if (total_diff<=2*eps*num_sample):
                AC_dict[this_anchor_vertex].append(candidate_vertex)
                SDD_clustering[candidate_vertex] = this_anchor_vertex # assign the candidate vertex to the anchor
                covered_AC_vertex.append(candidate_vertex)
            # this line is for debugging purpose -- remove later
            else:
                pass
    # add codes to add vertices to the almost-cliques
   
    dense_vertex_list = [v for v in current_graph if v not in current_sparse_vertice]
    
    return current_sparse_vertice, AC_dict, SDD_clustering, anchor_vertex_dict

In [233]:
def singleton_cluster_alg(current_graph):
    return {vertex: vertex for vertex in current_graph}

## Read the edges and maintain clustering

In [234]:
adjacency_list, edge_list = create_graph_from_csv("../data/SBM_n=1000_p=0.95_k=4.csv")

In [235]:
'''
TODO

See if functions are doable

'''

'\nTODO\n\nSee if functions are doable\n\n'

In [236]:
no_edges = len(edge_list)  # No. of edges

prob_del = 0.2      # Probability to delete edge
eps_param = 0.1

current_graph = {}
current_edge_list = []

available_edge_list = np.random.permutation(edge_list).tolist()

stream_length = (int)(0.5*no_edges)

track_update_num = {}
track_update_benckmark = {}

for i in range(stream_length):
    # Insertion
    if available_edge_list: #
        current_edge_list.append(available_edge_list[i])
        u = available_edge_list[i][0]
        v = available_edge_list[i][1]
        if u not in current_graph.keys():
            current_graph[u] = OptList()
        current_graph[u].insert(v)
        if v not in current_graph.keys():
            current_graph[v] = OptList()
        current_graph[v].insert(u)
        available_edge_list.pop(0)
        
        # keep track of the benchmark for the updates
        if u not in track_update_benckmark:
            track_update_benckmark[u] = current_graph[u].degree
        if v not in track_update_benckmark:
            track_update_benckmark[v] = current_graph[v].degree
        # update the tracking of the updates on u and v
        if u not in track_update_num:
            track_update_num[u] = 1
        else:
            track_update_num[u] = track_update_num[u] + 1
            
        if v not in track_update_num:
            track_update_num[v] = 1
        else:
            track_update_num[v] = track_update_num[v] + 1
        
        
        '''
        Code for SDD and PIVOT goes here
        '''
        if (track_update_num[u]>max(100, eps_param*track_update_benckmark[u])):
            # tests
            start_SDD = time.time()
            current_sparse_vertex_list, almost_cliques, SDD_clustering, anchor_vertex_dict = sparse_dense_decop(adjacency_list, eps=0.2)
            end_SDD = time.time()
            start_pivot = time.time()
            pivot_clustering = classical_pivot(adjacency_list)
            end_pivot = time.time()
            singleton_clustering = singleton_cluster_alg(adjacency_list)
            # clear the number of updates
            track_update_num[u] = 0
            track_update_benckmark = current_graph[u].degree
            # =========== TODO: add this as a test for whether the SDD succeeds ======== 
            all_vertex_list = [v for v in current_graph]
            AC_vertex_list = []
            for anchor_ver in almost_cliques.keys():
                AC_vertex_list.append(anchor_ver)
                for ac_ver in almost_cliques[anchor_ver]:
                    AC_vertex_list.append(ac_ver)
            AC_vertex_list = list(set(AC_vertex_list))
            recovered_vertex = np.concatenate((AC_vertex_list, current_sparse_vertex_list))
            print('===============================')
            print(np.setdiff1d(all_vertex_list,recovered_vertex))
            print(len(current_sparse_vertex_list))
            print(len(anchor_vertex_dict.keys()))
            print('******************************')
            print('The number of almost-cliques is ', len(almost_cliques))
            print('---------------------')
            SDD_cost = correlation_clustering_value(adjacency_list, SDD_clustering)
            pivot_cost = correlation_clustering_value(adjacency_list, pivot_clustering)
            singleton_cost = correlation_clustering_value(adjacency_list, singleton_clustering)
            print('SDD clustering cost is', SDD_cost, 'and the running time is', end_SDD-start_SDD)
            print('Pivot clustering cost is', pivot_cost, 'and the running time is', end_pivot-start_pivot)
            print('Singleton clustering cost is', singleton_cost)
            print('The correct optimal clustering cost should be', sbm_graph_stream.cc_cost)
            print('^^^^^^^^^^^^^^^^^^^^^^^')
            for anchor in almost_cliques.keys():
                print(almost_cliques[anchor])
            break
        
#     else:
#         # We have run out of edges to insert
#         edge_to_delete = np.random.choice(current_edge_list)
        
#         u = edge_to_delete[0]
#         v = edge_to_delete[1]
#         current_graph[u].remove(v)
#         current_graph[v].remove(u)
        
#         available_edge_list.extend(edge_to_delete)
#         current_edge_list.remove(edge_to_delete)
    
        
    
    
#     if np.random.binomial(1,prob_del):
#         # Deletion
#         print(current_edge_list)
#         edge_to_delete = np.random.choice(current_edge_list)
        
#         u = edge_to_delete[0]
#         v = edge_to_delete[1]
#         current_graph[u].remove(v)
#         current_graph[v].remove(u)
        
#         available_edge_list.extend(edge_to_delete)
#         current_edge_list.remove(edge_to_delete)

[175 253 316 425 966]
0
251
******************************
The number of almost-cliques is  9
---------------------
Houston we have a problem with (1, 175) and cluster ids : (0,None)


NameError: name 'exit' is not defined

For of clustering is dict[vertex-name]: cluster-name