## Legacy Data Gen | Shuffling
- This notebook generates and saves data (slight modifications of original code)
- Original code is commented at the end of the notebook

In [16]:
from igraph import *
import igraph
import random
import numpy as np
import pandas as pd
from sklearn.metrics.cluster import adjusted_rand_score

# Figure 5 config (lower=upper=30, p_l=p_u=0.5, between_c=10, 200 edges)

# path = "/Users/mcdicjh2/Desktop/gecco22/"
path = './_temp/data_2_shuffled/'
os.makedirs(path, exist_ok=True)
# Set up bipartite graph test data (should replace by proper generators e.g. from igraph package eventually)

# Define number of vertices in both levels and numerically mark as lower / upper vertex
lower = 30  
upper = 30
p_l = 0.5 # Needs to result in integer when multiplied by lower  #0.4
p_u = 0.5 # Needs to result in integer when multiplied by upper  #0.4

n_var = lower+upper
lv = np.empty(lower)
lv.fill(0)
uv = np.empty(upper)
uv.fill(1)
vertices = np.concatenate((lv, uv), axis=None)
vertices = vertices.tolist()

In [19]:
# Generate 30 different graph instances with the same connectivity stats

between_c = 10 # Specify percentage cross-community edges  #20

random.seed(20)

for it in range(0,30):

    edges = []
    connected = [0]*(lower+upper)
    #print(connected)
    for i in range(0,200): #1000
    
        dice = random.randint(0,100)
        added = False
    
        # Find an edge that does not yet exist and add
        # Stochastically generate within community or between community edges
        while added == False:
            if dice <= 50-between_c/2:
                index1 = random.randint(0,lower*p_l-1)
                index2 = random.randint(lower,lower+upper*p_u-1)
            elif dice <= 50:
                index1 = random.randint(0,lower*p_l-1)
                index2 = random.randint(lower+upper*p_u,lower+upper-1)
            elif dice <= 50+between_c/2:
                index1 = random.randint(lower*p_l,lower-1)
                index2 = random.randint(lower,lower+upper*p_u-1)
            elif dice >= 50+between_c/2:
                index1 = random.randint(lower*p_l,lower-1)
                index2 = random.randint(lower+upper*p_u,lower+upper-1)


            newedge = [index1,index2]

            if not (newedge in edges):
                edges.append(newedge)
                connected[index1]=1
                connected[index2]=1
                added = True

   
    # Specify original ground truth 
    groundtruth=[0]*int(lower*p_l)+[1]*int(lower*(1-p_l))+[0]*int(upper*p_u)+[1]*int(upper*(1-p_u))
    shapes = ["rectangle"] * int(lower*p_l) + ["circle"] * int(lower*(1-p_l)) + ["rectangle"] * int(upper*p_u) + ["circle"] * int(upper*(1-p_u))


    # Reduce to giant component
    g_i = Graph.Bipartite(vertices, edges)
  
    index_max = np.argmax(g_i.components().sizes())

    
    #T = [groundtruth[i] for i in g_i.clusters()[index_max]]
    #vT = [vertices[i] for i in g_i.clusters()[index_max]]
    #g=g_i.clusters().giant()
    #print(g)
 
    oldelist = g_i.get_edgelist()
    oldorder= g_i.clusters()[index_max]#list(range(0,len(T)))
    order=oldorder.copy()
    #print(order)
    random.shuffle(order)
    #print(groundtruth)
    #print(vT)
    print(oldelist)
    #
    #print(oldorder)
    #print(order)
    selected = g_i.clusters()[index_max]
    
    elist=[]
    
    # Create reduced edge list
    for i in range(0,len(oldelist)):
        try :
            # Retrieve old idnex
            ti1 = oldorder.index(oldelist[i][0])
            ti2 = oldorder.index(oldelist[i][1])
            # Map vertices to new ids
            i1 = order.index(oldelist[i][0])
            i2 = order.index(oldelist[i][1])
            elist.append([i1,i2])
            #print(ti1,ti2)
            #print(i1,i2)
    
        except ValueError :
            res = "Element not in list !"
            print(res)
            

       # print(elist[i])
        #print(i1,i2)
    print(elist) 
    
   # print(order)
   # print(len(elist))
   # print(len(oldelist))
   # print(T)
   # print(vT)


    labels = [groundtruth[i] for i in order]
    topbottom = [vertices[i] for i in order]
    print(topbottom)
    #print(labels)
    #print(topbottom)
    #print(groundtruth)
    #print(vT)
    #print(len(oldelist))
    #print(len(elist))

    
    # Create actual bipartite instance
    g = Graph.Bipartite(topbottom,elist)
    #g_i = Graph.Bipartite(vT,g.get_edgelist())

    # Store instance and associated ground truth
    g.write_edgelist(path+"Graph"+str(it)+".dat")
    p = pd.DataFrame(labels)
    p.to_csv(path+"Graph"+str(it)+".truth.dat", sep=',',header=None)
    p = pd.DataFrame(topbottom)
    p.to_csv(path+"Graph"+str(it)+".vertices.dat", sep=',',header=None)






[(25, 57), (29, 59), (4, 40), (28, 59), (13, 35), (29, 47), (6, 36), (1, 32), (7, 39), (21, 48), (5, 40), (20, 50), (16, 40), (27, 58), (21, 46), (9, 33), (3, 31), (3, 34), (27, 49), (11, 34), (9, 31), (4, 41), (13, 43), (0, 30), (24, 49), (2, 41), (20, 54), (22, 59), (14, 40), (8, 43), (17, 32), (29, 41), (11, 32), (24, 51), (14, 39), (8, 33), (28, 31), (12, 44), (0, 42), (27, 50), (26, 51), (23, 57), (26, 45), (17, 46), (23, 50), (1, 39), (16, 46), (1, 40), (19, 55), (12, 30), (3, 38), (17, 45), (26, 53), (14, 34), (6, 34), (6, 31), (3, 33), (1, 34), (8, 36), (10, 31), (22, 45), (3, 43), (19, 51), (14, 43), (3, 35), (19, 47), (26, 46), (4, 30), (5, 56), (6, 43), (27, 46), (9, 37), (14, 30), (16, 59), (23, 54), (11, 30), (4, 34), (28, 46), (0, 33), (18, 54), (26, 39), (9, 43), (12, 49), (15, 57), (17, 59), (21, 59), (21, 51), (7, 31), (28, 52), (3, 42), (4, 37), (6, 33), (15, 50), (1, 41), (0, 57), (7, 33), (17, 49), (17, 55), (5, 53), (6, 38), (18, 59), (6, 56), (28, 56), (8, 39), (2

In [None]:
# from igraph import *
# import igraph
# import random
# import numpy as np
# import pandas as pd
# from sklearn.metrics.cluster import adjusted_rand_score

# path = "/Users/mcdicjh2/Desktop/gecco22/"
# # Set up bipartite graph test data (should replace by proper generators e.g. from igraph package eventually)

# # Define number of vertices in both levels and numerically mark as lower / upper vertex
# lower = 100#100  
# upper = 500#500
# p_l = 0.4 # Needs to result in integer when multiplied by lower  #0.4
# p_u = 0.4 # Needs to result in integer when multiplied by upper  #0.4

# n_var = lower+upper
# lv = np.empty(lower)
# lv.fill(0)
# uv = np.empty(upper)
# uv.fill(1)
# vertices = np.concatenate((lv, uv), axis=None)
# vertices = vertices.tolist()

In [17]:
# # Generate 30 different graph instances with the same connectivity stats

# between_c = 10 # Specify percentage cross-community edges  #20

# random.seed(20)

# for it in range(0,30):

#     edges = []
#     connected = [0]*(lower+upper)
#     #print(connected)
#     for i in range(0,1000): #1000
    
#         dice = random.randint(0,100)
#         added = False
    
#         # Find an edge that does not yet exist and add
#         # Stochastically generate within community or between community edges
#         while added == False:
#             if dice <= 50-between_c/2:
#                 index1 = random.randint(0,lower*p_l-1)
#                 index2 = random.randint(lower,lower+upper*p_u-1)
#             elif dice <= 50:
#                 index1 = random.randint(0,lower*p_l-1)
#                 index2 = random.randint(lower+upper*p_u,lower+upper-1)
#             elif dice <= 50+between_c/2:
#                 index1 = random.randint(lower*p_l,lower-1)
#                 index2 = random.randint(lower,lower+upper*p_u-1)
#             elif dice >= 50+between_c/2:
#                 index1 = random.randint(lower*p_l,lower-1)
#                 index2 = random.randint(lower+upper*p_u,lower+upper-1)


#             newedge = [index1,index2]

#             if not (newedge in edges):
#                 edges.append(newedge)
#                 connected[index1]=1
#                 connected[index2]=1
#                 added = True

   
#     # Specify original ground truth 
#     groundtruth=[0]*int(lower*p_l)+[1]*int(lower*(1-p_l))+[0]*int(upper*p_u)+[1]*int(upper*(1-p_u))
#     shapes = ["rectangle"] * int(lower*p_l) + ["circle"] * int(lower*(1-p_l)) + ["rectangle"] * int(upper*p_u) + ["circle"] * int(upper*(1-p_u))


#     # Reduce to giant component
#     g_i = Graph.Bipartite(vertices, edges)
  
#     index_max = np.argmax(g_i.components().sizes())

    
#     #T = [groundtruth[i] for i in g_i.clusters()[index_max]]
#     #vT = [vertices[i] for i in g_i.clusters()[index_max]]
#     #g=g_i.clusters().giant()
#     #print(g)
 
#     oldelist = g_i.get_edgelist()
#     oldorder= g_i.clusters()[index_max]#list(range(0,len(T)))
#     order=oldorder.copy()
#     #print(order)
#     random.shuffle(order)
#     #print(groundtruth)
#     #print(vT)
#     print(oldelist)
#     #
#     #print(oldorder)
#     #print(order)
#     selected = g_i.clusters()[index_max]
    
#     elist=[]
    
#     # Create reduced edge list
#     for i in range(0,len(oldelist)):
#         try :
#             # Retrieve old idnex
#             ti1 = oldorder.index(oldelist[i][0])
#             ti2 = oldorder.index(oldelist[i][1])
#             # Map vertices to new ids
#             i1 = order.index(oldelist[i][0])
#             i2 = order.index(oldelist[i][1])
#             elist.append([i1,i2])
#             #print(ti1,ti2)
#             #print(i1,i2)
    
#         except ValueError :
#             res = "Element not in list !"
#             print(res)
            

#        # print(elist[i])
#         #print(i1,i2)
#     print(elist) 
    
#    # print(order)
#    # print(len(elist))
#    # print(len(oldelist))
#    # print(T)
#    # print(vT)


#     labels = [groundtruth[i] for i in order]
#     topbottom = [vertices[i] for i in order]
#     print(topbottom)
#     #print(labels)
#     #print(topbottom)
#     #print(groundtruth)
#     #print(vT)
#     #print(len(oldelist))
#     #print(len(elist))

    
#     # Create actual bipartite instance
#     g = Graph.Bipartite(topbottom,elist)
#     #g_i = Graph.Bipartite(vT,g.get_edgelist())

#     # Store instance and associated ground truth
#     g.write_edgelist(path+"Graph"+str(it)+".dat")
#     p = pd.DataFrame(labels)
#     p.to_csv(path+"Graph"+str(it)+".truth.dat", sep=',',header=None)
#     p = pd.DataFrame(topbottom)
#     p.to_csv(path+"Graph"+str(it)+".vertices.dat", sep=',',header=None)




