diff --git a/notebooks/statistical_guarantees_paper_experiments/statistical_guarantees_paper_COLGATE88_experiments.py b/notebooks/statistical_guarantees_paper_experiments/statistical_guarantees_paper_COLGATE88_experiments.py index 37cd858..f11c501 100644 --- a/notebooks/statistical_guarantees_paper_experiments/statistical_guarantees_paper_COLGATE88_experiments.py +++ b/notebooks/statistical_guarantees_paper_experiments/statistical_guarantees_paper_COLGATE88_experiments.py @@ -59,154 +59,154 @@ print("Cluster: ", counter, " conductance: ", cond, "Size: ", len(cluster), " Volume: ", np.sum(g.d[cluster])) all_clusters.append(cluster) -# ## Collect data for ACL (with rounding) +## Collect data for ACL (with rounding) -# nodes = {} -# external_best_cond_acl = {} -# external_best_pre_cond_acl = {} -# vol_best_cond_acl = {} -# vol_best_pre_acl = {} -# size_clust_best_cond_acl = {} -# size_clust_best_pre_acl = {} -# f1score_best_cond_acl = {} -# f1score_best_pre_acl = {} -# true_positives_best_cond_acl = {} -# true_positives_best_pre_acl = {} -# precision_best_cond_acl = {} -# precision_best_pre_acl = {} -# recall_best_cond_acl = {} -# recall_best_pre_acl = {} -# cuts_best_cond_acl = {} -# cuts_best_pre_acl = {} -# cuts_acl_ALL = {} +nodes = {} +external_best_cond_acl = {} +external_best_pre_cond_acl = {} +vol_best_cond_acl = {} +vol_best_pre_acl = {} +size_clust_best_cond_acl = {} +size_clust_best_pre_acl = {} +f1score_best_cond_acl = {} +f1score_best_pre_acl = {} +true_positives_best_cond_acl = {} +true_positives_best_pre_acl = {} +precision_best_cond_acl = {} +precision_best_pre_acl = {} +recall_best_cond_acl = {} +recall_best_pre_acl = {} +cuts_best_cond_acl = {} +cuts_best_pre_acl = {} +cuts_acl_ALL = {} -# ct_outer = 0 +ct_outer = 0 -# number_experiments = 0 +number_experiments = 0 -# for rr in all_clusters: +for rr in all_clusters: -# how_many = int(len(rr)) -# print(how_many) + how_many = int(len(rr)) + print(how_many) -# random.seed(4) + random.seed(4) -# nodes[ct_outer] = np.random.choice(rr, how_many, replace=False) + nodes[ct_outer] = np.random.choice(rr, how_many, replace=False) -# eigv, lambda_val = fiedler_local(g, rr) -# lambda_val = np.real(lambda_val) + eigv, lambda_val = fiedler_local(g, rr) + lambda_val = np.real(lambda_val) -# step = (2*lambda_val - lambda_val/2)/4 + step = (2*lambda_val - lambda_val/2)/4 -# a_list = np.arange(lambda_val/2,2*lambda_val,step) + a_list = np.arange(lambda_val/2,2*lambda_val,step) -# ct = 0 + ct = 0 -# start = time.time() + start = time.time() -# for node in nodes[ct_outer]: -# ref_node = [node] + for node in nodes[ct_outer]: + ref_node = [node] -# max_precision = -1 -# min_conduct = 100 + max_precision = -1 + min_conduct = 100 -# ct_inner = 0 -# for a in a_list: + ct_inner = 0 + for a in a_list: -# if ct_outer <= 1: -# rho = 0.15/np.sum(g.d[rr]) -# else: -# rho = 0.2/np.sum(g.d[rr]) + if ct_outer <= 1: + rho = 0.10/np.sum(g.d[rr]) + else: + rho = 0.15/np.sum(g.d[rr]) -# output_pr_clustering = approximate_PageRank(g,ref_node,method = "acl", rho=rho, alpha=a, cpp = True, normalize=True,normalized_objective=True) -# number_experiments += 1 + output_pr_clustering = approximate_PageRank(g,ref_node,method = "acl", rho=rho, alpha=a, cpp = True, normalize=True,normalized_objective=True) + number_experiments += 1 -# output_pr_sc = sweep_cut(g,output_pr_clustering,cpp=True) + output_pr_sc = sweep_cut(g,output_pr_clustering,cpp=True) -# S = output_pr_sc[0] + S = output_pr_sc[0] -# cuts_acl_ALL[ct_outer,node,ct_inner] = S + cuts_acl_ALL[ct_outer,node,ct_inner] = S -# size_clust_acl_ = len(S) + size_clust_acl_ = len(S) -# cond_val_l1pr = g.compute_conductance(S) + cond_val_l1pr = g.compute_conductance(S) -# vol_ = sum(g.d[S]) -# true_positives_acl_ = set(rr).intersection(S) -# if len(true_positives_acl_) == 0: -# true_positives_acl_ = set(ref_node) -# vol_ = g.d[ref_node][0,0] -# precision = sum(g.d[np.array(list(true_positives_acl_))])/vol_ -# recall = sum(g.d[np.array(list(true_positives_acl_))])/sum(g.d[rr]) -# f1_score_ = 2*(precision*recall)/(precision + recall) + vol_ = sum(g.d[S]) + true_positives_acl_ = set(rr).intersection(S) + if len(true_positives_acl_) == 0: + true_positives_acl_ = set(ref_node) + vol_ = g.d[ref_node][0,0] + precision = sum(g.d[np.array(list(true_positives_acl_))])/vol_ + recall = sum(g.d[np.array(list(true_positives_acl_))])/sum(g.d[rr]) + f1_score_ = 2*(precision*recall)/(precision + recall) -# if f1_score_ >= max_precision: + if f1_score_ >= max_precision: -# max_precision = f1_score_ + max_precision = f1_score_ -# external_best_pre_cond_acl[ct_outer,node] = cond_val_l1pr -# vol_best_pre_acl[ct_outer,node] = vol_ + external_best_pre_cond_acl[ct_outer,node] = cond_val_l1pr + vol_best_pre_acl[ct_outer,node] = vol_ -# size_clust_best_pre_acl[ct_outer,node] = size_clust_acl_ -# true_positives_best_pre_acl[ct_outer,node] = true_positives_acl_ -# precision_best_pre_acl[ct_outer,node] = precision -# recall_best_pre_acl[ct_outer,node] = recall -# f1score_best_pre_acl[ct_outer,node] = f1_score_ + size_clust_best_pre_acl[ct_outer,node] = size_clust_acl_ + true_positives_best_pre_acl[ct_outer,node] = true_positives_acl_ + precision_best_pre_acl[ct_outer,node] = precision + recall_best_pre_acl[ct_outer,node] = recall + f1score_best_pre_acl[ct_outer,node] = f1_score_ -# cuts_best_pre_acl[ct_outer,node] = S + cuts_best_pre_acl[ct_outer,node] = S -# if cond_val_l1pr <= min_conduct: + if cond_val_l1pr <= min_conduct: -# min_conduct = cond_val_l1pr + min_conduct = cond_val_l1pr -# external_best_cond_acl[ct_outer,node] = cond_val_l1pr -# vol_best_cond_acl[ct_outer,node] = vol_ + external_best_cond_acl[ct_outer,node] = cond_val_l1pr + vol_best_cond_acl[ct_outer,node] = vol_ -# size_clust_best_cond_acl[ct_outer,node] = size_clust_acl_ -# true_positives_best_cond_acl[ct_outer,node] = true_positives_acl_ -# precision_best_cond_acl[ct_outer,node] = precision -# recall_best_cond_acl[ct_outer,node] = recall -# f1score_best_cond_acl[ct_outer,node] = f1_score_ + size_clust_best_cond_acl[ct_outer,node] = size_clust_acl_ + true_positives_best_cond_acl[ct_outer,node] = true_positives_acl_ + precision_best_cond_acl[ct_outer,node] = precision + recall_best_cond_acl[ct_outer,node] = recall + f1score_best_cond_acl[ct_outer,node] = f1_score_ -# cuts_best_cond_acl[ct_outer,node] = S + cuts_best_cond_acl[ct_outer,node] = S -# print('outer:', ct_outer, 'number of node: ',node, ' completed: ', ct/how_many, ' degree: ', g.d[node]) -# print('conductance: ', external_best_cond_acl[ct_outer,node], 'f1score: ', f1score_best_cond_acl[ct_outer,node], 'precision: ', precision_best_cond_acl[ct_outer,node], 'recall: ', recall_best_cond_acl[ct_outer,node]) -# ct += 1 -# end = time.time() -# print(" ") -# print("Outer: ", ct_outer," Elapsed time ACL with rounding: ", end - start) -# print("Outer: ", ct_outer," Number of experiments: ", number_experiments) -# print(" ") -# ct_outer += 1 + print('outer:', ct_outer, 'number of node: ',node, ' completed: ', ct/how_many, ' degree: ', g.d[node]) + print('conductance: ', external_best_cond_acl[ct_outer,node], 'f1score: ', f1score_best_cond_acl[ct_outer,node], 'precision: ', precision_best_cond_acl[ct_outer,node], 'recall: ', recall_best_cond_acl[ct_outer,node]) + ct += 1 + end = time.time() + print(" ") + print("Outer: ", ct_outer," Elapsed time ACL with rounding: ", end - start) + print("Outer: ", ct_outer," Number of experiments: ", number_experiments) + print(" ") + ct_outer += 1 -# ## Performance of ACL (with rounding). +## Performance of ACL (with rounding). -# all_data = [] -# xlabels_ = [] +all_data = [] +xlabels_ = [] -# print('Results for ACL with rounding') -# sum_precision = 0 -# sum_recall = 0 -# sum_f1 = 0 -# sum_conductance = 0 +print('Results for ACL with rounding') +sum_precision = 0 +sum_recall = 0 +sum_f1 = 0 +sum_conductance = 0 -# info_ref_nodes = all_clusters -# l_info_ref_nodes = len(info_ref_nodes) +info_ref_nodes = all_clusters +l_info_ref_nodes = len(info_ref_nodes) -# for i in range(l_info_ref_nodes): -# temp_pre = [] -# temp_rec = [] -# temp_f1 = [] -# temp_conductance = [] +for i in range(l_info_ref_nodes): + temp_pre = [] + temp_rec = [] + temp_f1 = [] + temp_conductance = [] -# for j in all_clusters[i]: -# temp_pre.append(precision_best_cond_acl[i,j]) -# temp_rec.append(recall_best_cond_acl[i,j]) -# temp_f1.append(f1score_best_cond_acl[i,j]) -# temp_conductance.append(external_best_cond_acl[i,j]) + for j in all_clusters[i]: + temp_pre.append(precision_best_cond_acl[i,j]) + temp_rec.append(recall_best_cond_acl[i,j]) + temp_f1.append(f1score_best_cond_acl[i,j]) + temp_conductance.append(external_best_cond_acl[i,j]) -# print('Feature:', i,'Precision', stat_.mean(temp_pre), 'Recall', stat_.mean(temp_rec), 'F1', stat_.mean(temp_f1), 'Cond.', stat_.mean(temp_conductance)) + print('Feature:', i,'Precision', stat_.mean(temp_pre), 'Recall', stat_.mean(temp_rec), 'F1', stat_.mean(temp_f1), 'Cond.', stat_.mean(temp_conductance)) @@ -617,9 +617,9 @@ def seed_grow_bfs_steps(g,seeds,steps,vol_target,target_cluster): for a in a_list: if ct_outer <= 1: - rho = 0.15/np.sum(g.d[rr]) + rho = 0.10/np.sum(g.d[rr]) else: - rho = 0.2/np.sum(g.d[rr]) + rho = 0.15/np.sum(g.d[rr]) output_pr_clustering = approximate_PageRank(g,ref_node,method = "acl", rho=rho, alpha=a, cpp = True, normalize=True,normalized_objective=True) number_experiments += 1 @@ -783,9 +783,9 @@ def seed_grow_bfs_steps(g,seeds,steps,vol_target,target_cluster): for a in a_list: if ct_outer <= 1: - rho = 0.15/np.sum(g.d[rr]) + rho = 0.10/np.sum(g.d[rr]) else: - rho = 0.2/np.sum(g.d[rr]) + rho = 0.15/np.sum(g.d[rr]) output_pr_clustering = approximate_PageRank(g,ref_node,method = "l1reg-rand", epsilon=1.0e-2, rho=rho, alpha=a, cpp = True, normalize=True,normalized_objective=True,iterations=1000000) number_experiments += 1