## Legacy Data Gen | No shuffling
- This notebook generates bipartite graph instances and saves them to disk (a slightly modified version of the original code)
- The original code is kept, commented out, at the end of the notebook

In [27]:
# Star import stays first so that the explicit imports below take precedence
# over anything it may pull into the namespace (the code uses the bare `Graph`).
from igraph import *
import igraph

import os
import random

import numpy as np
import pandas as pd
from sklearn.metrics.cluster import adjusted_rand_score

# Figure 5 config (lower=upper=30, p_l=p_u=0.5, between_c=10, 200 edges)
# Output directory for the generated files; created here if it does not exist.
path = './_temp/data_1_nonshuffled/'
os.makedirs(path, exist_ok=True)

# Set up bipartite graph test data (should replace by proper generators e.g. from igraph package eventually)

# Vertex counts of the two levels and the fractions p_l / p_u used to split
# each level into its two index ranges.
lower = 30
upper = 30
p_l = 0.5  # lower * p_l needs to be an integer
p_u = 0.5  # upper * p_u needs to be an integer
n_var = lower + upper

# Numeric level marker per vertex: 0.0 for every lower vertex, followed by
# 1.0 for every upper vertex (plain Python floats after .tolist()).
lv = np.zeros(lower)
uv = np.ones(upper)
vertices = np.concatenate((lv, uv)).tolist()


In [31]:
# Generate 30 different graph instances with the same connectivity stats
#
# Every instance draws 200 distinct lower-upper edges, is reduced to its
# largest connected component, and is written to `path` as three files:
#   Graph<it>.dat           edge list of the reduced bipartite graph
#   Graph<it>.truth.dat     community label (0/1) of every kept vertex
#   Graph<it>.vertices.dat  level marker of every kept vertex (from `vertices`)
#
# NOTE(review): no random.seed() is set, so each run produces different
# instances -- seed the generator if reproducible data is required.

between_c = 10 # 20 # Specify percentage cross-community edges

# Community-0 sizes as real ints. random.randint() rejects float bounds such
# as lower*p_l-1 == 14.0 with a TypeError on Python >= 3.12.
n_lower_c0 = round(lower * p_l)
n_upper_c0 = round(upper * p_u)
if abs(lower * p_l - n_lower_c0) > 1e-9 or abs(upper * p_u - n_upper_c0) > 1e-9:
    raise ValueError("lower*p_l and upper*p_u must both be integers")

# Inclusive (first, last) vertex index range of each community on each level
lower_c0 = (0, n_lower_c0 - 1)
lower_c1 = (n_lower_c0, lower - 1)
upper_c0 = (lower, lower + n_upper_c0 - 1)
upper_c1 = (lower + n_upper_c0, lower + upper - 1)

for it in range(0,30):

    edges = []       # accepted edges, in the order they were drawn
    seen = set()     # the same edges, for constant-time duplicate checks
    block_fill = {}  # community block -> number of edges already placed in it

    for i in range(0,200): #200

        # Stochastically pick a within-community or between-community block.
        # dice is uniform on 0..100 (101 outcomes), so for between_c = 10 the
        # four branches fire with probability 46/101, 5/101, 5/101 and 45/101;
        # this is kept exactly as in the legacy generator.
        dice = random.randint(0,100)
        if dice <= 50-between_c/2:
            block = (lower_c0, upper_c0)  # within community 0
        elif dice <= 50:
            block = (lower_c0, upper_c1)  # between communities
        elif dice <= 50+between_c/2:
            block = (lower_c1, upper_c0)  # between communities
        else:
            block = (lower_c1, upper_c1)  # within community 1

        (lo_first, lo_last), (up_first, up_last) = block

        # The block is fixed for this edge, so a block that is already full
        # would make the retry loop below spin forever; fail loudly instead.
        capacity = (lo_last - lo_first + 1) * (up_last - up_first + 1)
        if block_fill.get(block, 0) >= capacity:
            raise RuntimeError("no free edge left in the selected community block")

        # Find an edge that does not yet exist and add it
        while True:
            newedge = (random.randint(lo_first, lo_last),
                       random.randint(up_first, up_last))
            if newedge not in seen:
                seen.add(newedge)
                edges.append(newedge)
                block_fill[block] = block_fill.get(block, 0) + 1
                break

    # Specify original ground truth: community 0 then community 1 on the lower
    # level, followed by the same layout on the upper level. The second size
    # is taken by subtraction so the list always has lower+upper entries.
    groundtruth = (
        [0]*n_lower_c0 + [1]*(lower - n_lower_c0)
        + [0]*n_upper_c0 + [1]*(upper - n_upper_c0)
    )

    # Reduce to giant component (components are computed once and reused)
    g_i = Graph.Bipartite(vertices, edges)
    components = g_i.components()
    index_max = np.argmax(components.sizes())
    members = components[index_max]  # original vertex ids of the giant component

    groundtruth = [groundtruth[v] for v in members]
    vT = [vertices[v] for v in members]
    g = components.giant()

    # Create actual bipartite instance
    g_i = Graph.Bipartite(vT, g.get_edgelist())

    # Store instance and associated ground truth
    prefix = path+"Graph"+str(it)
    g_i.write_edgelist(prefix+".dat")
    pd.DataFrame(groundtruth).to_csv(prefix+".truth.dat", sep=',', header=False)
    pd.DataFrame(vT).to_csv(prefix+".vertices.dat", sep=',', header=False)




In [None]:
# from igraph import *
# import igraph
# import random
# import numpy as np
# import pandas as pd
# from sklearn.metrics.cluster import adjusted_rand_score

# path = "/Users/mcdicjh2/Desktop/gecco22/"

# # Set up bipartite graph test data (should replace by proper generators e.g. from igraph package eventually)

# # Define number of vertices in both levels and numerically mark as lower / upper vertex
# lower = 20  
# upper = 100
# p_l = 0.4 # Needs to result in integer when multiplied by lower  
# p_u = 0.4 # Needs to result in integer when multiplied by upper

# n_var = lower+upper
# lv = np.empty(lower)
# lv.fill(0)
# uv = np.empty(upper)
# uv.fill(1)
# vertices = np.concatenate((lv, uv), axis=None)
# vertices = vertices.tolist()

In [28]:
# # Generate 30 different graph instances with the same connectivity stats

# between_c = 20 # Specify percentage cross-community edges

# for it in range(0,30):

#     edges = []
#     connected = [0]*(lower+upper)
#     #print(connected)
#     for i in range(0,200):
    
#         dice = random.randint(0,100)
#         added = False
    
#         # Find an edge that does not yet exist and add
#         # Stochastically generate within community or between community edges
#         while added == False:
#             if dice <= 50-between_c/2:
#                 index1 = random.randint(0,lower*p_l-1)
#                 index2 = random.randint(lower,lower+upper*p_u-1)
#             elif dice <= 50:
#                 index1 = random.randint(0,lower*p_l-1)
#                 index2 = random.randint(lower+upper*p_u,lower+upper-1)
#             elif dice <= 50+between_c/2:
#                 index1 = random.randint(lower*p_l,lower-1)
#                 index2 = random.randint(lower,lower+upper*p_u-1)
#             elif dice >= 50+between_c/2:
#                 index1 = random.randint(lower*p_l,lower-1)
#                 index2 = random.randint(lower+upper*p_u,lower+upper-1)


#             newedge = [index1,index2]

#             if not (newedge in edges):
#                 edges.append(newedge)
#                 connected[index1]=1
#                 connected[index2]=1
#                 added = True

   
#     # Specify original ground truth 
#     groundtruth=[0]*int(lower*p_l)+[1]*int(lower*(1-p_l))+[0]*int(upper*p_u)+[1]*int(upper*(1-p_u))
#     shapes = ["rectangle"] * int(lower*p_l) + ["circle"] * int(lower*(1-p_l)) + ["rectangle"] * int(upper*p_u) + ["circle"] * int(upper*(1-p_u))


#     # Reduce to giant component
#     g_i = Graph.Bipartite(vertices, edges)
  
#     index_max = np.argmax(g_i.components().sizes())

#     T = [groundtruth[i] for i in g_i.clusters()[index_max]]
#     vT = [vertices[i] for i in g_i.clusters()[index_max]]
#     g=g_i.clusters().giant()
#     groundtruth=T

#     # Create actual bipartite instance
#     g_i = Graph.Bipartite(vT,g.get_edgelist())

#     # Store instance and associated ground truth
#     g_i.write_edgelist(path+"Graph"+str(it)+".dat")
#     p = pd.DataFrame(groundtruth)
#     p.to_csv(path+"Graph"+str(it)+".truth.dat", sep=',',header=None)
#     p = pd.DataFrame(vT)
#     p.to_csv(path+"Graph"+str(it)+".vertices.dat", sep=',',header=None)




