In [1]:
import csv
import random
from utils import *
from collections import defaultdict
import time
import matplotlib.pyplot as plt
#import seaborn as sns
import numpy as np

## The functions that implements our algorithm

In [67]:
def test_sparse_vertex(current_graph, eps=0.2):
    # sample log n neighbors for each vertex
    current_num_vertex = len(current_graph)
    num_sample = max((int)(5*np.log(current_num_vertex)), 20)
    sample_dict = {}
    for this_vertex in current_graph:
        sample_vertex_set = current_graph[this_vertex].getRandom(i=num_sample)
        sample_dict[this_vertex] = sample_vertex_set
    sparse_vertex_list = []
    # test sparsity for each vertex
    for this_vertex in current_graph:
        num_neighbor_diff = 0
        neighbors_this = sample_dict[this_vertex]
        for comp_vertex in sample_dict[this_vertex]:
            neighbors_comp = sample_dict[comp_vertex]
            dif1 = np.setdiff1d(neighbors_this, neighbors_comp)
            dif2 = np.setdiff1d(neighbors_comp, neighbors_this)
            total_diff = len(np.concatenate((dif1, dif2)))
            max_degree_between_two = max(current_graph[this_vertex].degree, 
                                         current_graph[comp_vertex].degree)
            if (total_diff>=eps*max_degree_between_two):
                num_neighbor_diff = num_neighbor_diff + 1
        if num_neighbor_diff >=eps*len(neighbors_this):
            sparse_vertex_list.append(this_vertex)
    
    # return the list of sparse vertices
    return sparse_vertex_list

In [112]:
def sparse_dense_decop(current_graph, eps=0.2):
    current_num_vertex = len(current_graph)
    # check sparse vertices
    current_sparse_vertice = test_sparse_vertex(current_graph, eps=eps)
    # sample from the dense vertices
    dense_subgraph = {vertex: current_graph[vertex] 
                      for vertex in current_graph if vertex not in current_sparse_vertice}
    anchor_vertex_dict = {}
    for this_vertex in dense_subgraph:
        # rejection sampling
        dense_sample_prob = max((5*np.log(current_num_vertex))/dense_subgraph[this_vertex].degree, 0.01)
        if dense_sample_prob>=np.random.uniform(low=0.0, high=1.0):
            anchor_vertex_dict[this_vertex] = dense_subgraph[this_vertex]
    # recursively form almost-cliques
    num_sample = max((int)(5*np.log(current_num_vertex)), 20)
    AC_dict = {}
    while True:
        if anchor_vertex_dict:
            this_anchor_vertex = next(iter(anchor_vertex_dict))
            print('======================')
            print(this_anchor_vertex)
            this_anchor_neighbors = anchor_vertex_dict[this_anchor_vertex]
            anchor_neighbor_samples = anchor_vertex_dict[this_anchor_vertex].getRandom(i=num_sample)
            this_anchor_sparse_neighbor_list = []
            for candidate_vertex in this_anchor_neighbors:
                cand_neighbor_samples = dense_subgraph[candidate_vertex].getRandom(i=num_sample)
                # test whether their symmetric difference is large enough
                dif1 = np.setdiff1d(anchor_neighbor_samples, cand_neighbor_samples)
                dif2 = np.setdiff1d(cand_neighbor_samples, anchor_neighbor_samples)
                total_diff = len(np.concatenate((dif1, dif2)))
                max_degree_between_two = max(dense_subgraph[this_anchor_vertex].degree, 
                                             dense_subgraph[candidate_vertex].degree)
                if (total_diff<=2*eps*max_degree_between_two):
                    if this_anchor_vertex not in AC_dict.keys():
                        AC_dict[this_anchor_vertex] = [candidate_vertex]
                    else:
                        AC_dict[this_anchor_vertex].append(candidate_vertex)
                    if candidate_vertex in anchor_vertex_dict.keys():
                        del anchor_vertex_dict[candidate_vertex]
            # remove the anchor vertex from the dictionary to guarantee convergence
            try:
                del anchor_vertex_dict[this_anchor_vertex]
            except:
                print(anchor_vertex_dict)
                print(this_anchor_vertex)
        else:
            return current_sparse_vertice, AC_dict

## Read the edges and maintain clustering

In [113]:
adjacency_list, edge_list = create_graph_from_csv("../data/email-Eu-core.csv")

In [114]:
'''
TODO

See if functions are doable

'''

'\nTODO\n\nSee if functions are doable\n\n'

In [115]:
no_edges = len(edge_list)  # No. of edges

prob_del = 0.2      # Probability to delete edge
eps_param = 0.1

current_graph = {}
current_edge_list = []

available_edge_list = np.random.permutation(edge_list).tolist()

stream_length = 2*no_edges

track_update_num = {}
track_update_benckmark = {}

for i in range(stream_length):
    # Insertion
    if available_edge_list: #
        current_edge_list.append(available_edge_list[i])
        u = available_edge_list[i][0]
        v = available_edge_list[i][1]
        if u not in current_graph.keys():
            current_graph[u] = OptList()
        current_graph[u].insert(v)
        if v not in current_graph.keys():
            current_graph[v] = OptList()
        current_graph[v].insert(u)
        available_edge_list.pop(0)
        
        # keep track of the benchmark for the updates
        if u not in track_update_benckmark:
            track_update_benckmark[u] = current_graph[u].degree
        if v not in track_update_benckmark:
            track_update_benckmark[v] = current_graph[v].degree
        # update the tracking of the updates on u and v
        if u not in track_update_num:
            track_update_num[u] = 1
        else:
            track_update_num[u] = track_update_num[u] + 1
            
        if v not in track_update_num:
            track_update_num[v] = 1
        else:
            track_update_num[v] = track_update_num[v] + 1
        
        
        '''
        Code for SDD and PIVOT goes here
        '''
        if (track_update_num[u]>max(100, eps_param*track_update_benckmark[u])):
            # tests
            current_sparse_vertex_list, almost_cliques = sparse_dense_decop(current_graph)
            # clear the number of updates
            track_update_num[u] = 0
            track_update_benckmark = current_graph[u].degree
            print(current_sparse_vertex_list)
            print('==========')
            all_vertex_list = [v for v in current_graph]
            print(np.setdiff1d(all_vertex_list,current_sparse_vertex_list))
            break
        
    else:
        # We have run out of edges to insert
        edge_to_delete = np.random.choice(current_edge_list)
        
        u = edge_to_delete[0]
        v = edge_to_delete[1]
        current_graph[u].remove(v)
        current_graph[v].remove(u)
        
        available_edge_list.extend(edge_to_delete)
        current_edge_list.remove(edge_to_delete)
    
        
    
    
#     if np.random.binomial(1,prob_del):
#         # Deletion
#         print(current_edge_list)
#         edge_to_delete = np.random.choice(current_edge_list)
        
#         u = edge_to_delete[0]
#         v = edge_to_delete[1]
#         current_graph[u].remove(v)
#         current_graph[v].remove(u)
        
#         available_edge_list.extend(edge_to_delete)
#         current_edge_list.remove(edge_to_delete)
        



329


KeyError: 90