In [1]:
import networkx as nx
from tqdm.auto import tqdm

def get_edges_in_undirected_graph(graph):
    edges = set()
    for edge in graph.edges():
        if (edge[0] != edge[1]) and (edge[0], edge[1]) not in edges and (edge[1], edge[0]) not in edges:
            edges.add(edge)
    
    return edges

def get_coupling(graph, partition):
    clusters = list(set(partition.values()))
    edges_between_clusters = list()
    for i in range(len(clusters)):
        for j in range(i+1, len(clusters)):
            cluster1, cluster2 = clusters[i], clusters[j]
            sub_graph1 = graph.subgraph([node for node in partition if partition[node] == cluster1])
            sub_graph2 = graph.subgraph([node for node in partition if partition[node] == cluster2])
            edges_between_clusters += list(nx.edge_boundary(graph, sub_graph1, sub_graph2))

    coupling = len(edges_between_clusters) /  graph.number_of_edges()
    return coupling


def get_cohesion(graph, partition):
    clusters = set(partition.values())
    # print(len(clusters))
    cohesion = 0
    for cluster in clusters:
        sub_graph = graph.subgraph([node for node in partition if partition[node] == cluster])
        max_edges = sub_graph.number_of_nodes() * (sub_graph.number_of_nodes() - 1) / 2
        edges = get_edges_in_undirected_graph(sub_graph)
        cohesion += len(edges) / max_edges if max_edges != 0 else 0

    return cohesion / len(clusters)


def get_modularization_scores(graph, partition):
    scores = {
        'cohesion': get_cohesion(graph, partition),
        'coupling': get_coupling(graph, partition),
    }
    return scores

In [2]:
import pandas as pd

ga_results = pd.read_excel('results/GA2.xlsx')

In [3]:
import pickle
import networkx as nx

with open('dataset/ecore_non_dup_models.pkl', 'rb') as f:
    non_duplicate_models = pickle.load(f)

non_duplicate_numbered_graphs = [(a, nx.convert_node_labels_to_integers(b)) for a, b in non_duplicate_models if list(nx.isolates(b)) == []]

In [24]:
top50_graphs = sorted(non_duplicate_numbered_graphs, key=lambda x: x[1].number_of_edges(), reverse=True)[:50]
file_name, graph = non_duplicate_numbered_graphs[0]
graph.number_of_nodes(), graph.number_of_edges()

# graph = remove_isolated_nodes(nxg)

(31, 91)

In [7]:
prefix = '../datasets/ModelClassification/modelset/raw-data/repo-ecore-all/data/'

def get_file_name(f):
    return f.replace(prefix, '').replace('.ecore', '').replace('/', '_')

file_name = get_file_name(file_name)

file_df = ga_results[ga_results['Name'] == file_name]
max_cohesion_row = dict(file_df.loc[file_df['Cohesion'].idxmax()])
min_coupling_row = dict(file_df.loc[file_df['Coupling'].idxmin()])

print(f"Max cohesion row: {max_cohesion_row['Cohesion']}, {max_cohesion_row['Coupling']}")
print(f"Min coupling row: {min_coupling_row['Cohesion']}, {min_coupling_row['Coupling']}")

Max cohesion row: 0.604444444444444, 0.0
Min coupling row: 0.503703703703703, 0.0


In [8]:
file_name

'paulofpimenta_b-reactive_org.cirad.dsl.behaviormetamodel_model_behaviormetamodel'

In [30]:
import torch
import networkx as nx

def get_nx_adj(nxg) -> torch.Tensor:
    adj = nx.adjacency_matrix(nxg).todense()
    return torch.tensor(adj, dtype=torch.long)

In [31]:
def get_edge_index(nxg):
    edge_index = torch.tensor(list(nxg.edges)).t().contiguous()
    return edge_index

edge_index = get_edge_index(graph)

In [13]:
from tqdm.auto import tqdm
import torch
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
from torch_geometric.nn import Node2Vec

NODE2VEC_EPOCHS = 1
NODE2VEC_WALK_LENGTH = 10
NODE2VEC_CONTEXT_SIZE = 10
NODE2VEC_DIM = 128
NODE2VEC_NEG_SAMPLES = 4
NODE2VEC_BATCH_SIZE = 128
NODE2VEC_LR = 0.01
NODE2VEC_WALKS_PER_NODE = 10
NODE2VEC_NUM_WORKERS = 4
NODE2VEC_P = 0.8
NODE2VEC_Q = 1.2

def get_node_embeddings(g):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    edge_index = get_edge_index(g)
    node2vec = Node2Vec(
        edge_index,
        embedding_dim=NODE2VEC_DIM,
        walk_length=NODE2VEC_WALK_LENGTH,
        context_size=NODE2VEC_CONTEXT_SIZE,
        walks_per_node=NODE2VEC_WALKS_PER_NODE,
        num_negative_samples=NODE2VEC_NEG_SAMPLES,
        p=NODE2VEC_P,
        q=NODE2VEC_Q,
        sparse=True,
    ).to(device)

    loader = node2vec.loader(batch_size=NODE2VEC_BATCH_SIZE, shuffle=True, num_workers=0)
    optimizer = torch.optim.SparseAdam(list(node2vec.parameters()), lr=NODE2VEC_LR)
    node2vec.train()
    total_loss = 0
    for epoch in tqdm(range(1, NODE2VEC_EPOCHS + 1), desc='Training Node2Vec For Node Embeddings'):
        total_loss = 0
        for pos_rw, neg_rw in loader:
            optimizer.zero_grad()
            loss = node2vec.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        loss = total_loss / len(loader)
        if epoch % 20 == 0:
            print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

    return node2vec.embedding.weight

In [26]:
import torch_geometric
import torch
import torch.nn as nn


class GNNModel(torch.nn.Module):
    """GNN Model with multiple layers"""
    def __init__(self, model_name, input_dim, hidden_dim, out_dim, num_layers, num_heads=None, residual=False, l_norm=False, dropout=0.1):
        super(GNNModel, self).__init__()
        gnn_model = getattr(torch_geometric.nn, model_name)
        self.conv_layers = nn.ModuleList()
        if model_name == 'GINConv':
            input_layer = gnn_model(nn.Sequential(nn.Linear(input_dim, hidden_dim), nn.ReLU()), train_eps=True)
        elif num_heads is None:
            input_layer = gnn_model(input_dim, hidden_dim, aggr='SumAggregation')
        else:
            input_layer = gnn_model(input_dim, hidden_dim, heads=num_heads, aggr='SumAggregation')
        self.conv_layers.append(input_layer)

        for _ in range(num_layers - 2):
            if model_name == 'GINConv':
                self.conv_layers.append(gnn_model(nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU()), train_eps=True))
            elif num_heads is None:
                self.conv_layers.append(gnn_model(hidden_dim, hidden_dim, aggr='SumAggregation'))
            else:
                self.conv_layers.append(gnn_model(num_heads*hidden_dim, hidden_dim, heads=num_heads, aggr='SumAggregation'))

        if model_name == 'GINConv':
            self.conv_layers.append(gnn_model(nn.Sequential(nn.Linear(hidden_dim, out_dim), nn.ReLU()), train_eps=True))
        else:
            self.conv_layers.append(gnn_model(hidden_dim if num_heads is None else num_heads*hidden_dim, out_dim, aggr='SumAggregation'))
            
        self.activation = nn.ReLU()
        self.layer_norm = nn.LayerNorm(hidden_dim if num_heads is None else num_heads*hidden_dim) if l_norm else None
        self.residual = residual
        self.dropout = nn.Dropout(dropout)


    def forward(self, in_feat, edge_index):
        h = in_feat
        h = self.conv_layers[0](h, edge_index)
        h = self.activation(h)
        if self.layer_norm is not None:
            h = self.layer_norm(h)
        h = self.dropout(h)

        for conv in self.conv_layers[1:-1]:
            h = conv(h, edge_index) if not self.residual else conv(h, edge_index) + h
            h = self.activation(h)
            if self.layer_norm is not None:
                h = self.layer_norm(h)
            h = self.dropout(h)
        
        h = self.conv_layers[-1](h, edge_index)
        return h
# Y = model(X, edge_index)

In [27]:
import torch.nn as nn
import torch.nn.functional as F

class GraphClusteringLoss(nn.Module):
    def __init__(self):
        super(GraphClusteringLoss, self).__init__()
        # Initialize lambda as a learnable parameter
        self.lambda_param = nn.Parameter(torch.tensor(0.5))  # Initial value of lambda


    def forward(self, A: torch.Tensor, Y: torch.Tensor):
        n, C = Y.shape

        Y = F.gumbel_softmax(Y, dim=1)  # Shape: n x C
        
        # Compute cohesion
        cohesion = 0
        for c in range(C):
            Y_c = Y[:, c].unsqueeze(1)  # Shape: n x 1
            A_c = Y_c @ Y_c.T  # Shape: n x n
            A_c = A_c * A  # Mask with adjacency matrix to get within-cluster edges
            total_edges_within_cluster = A_c.sum()
            
            n_c = Y[:, c].sum()
            max_possible_edges_within_cluster = n_c * (n_c - 1) / 2 + 1e-9  # Avoid division by zero
            cohesion += total_edges_within_cluster / max_possible_edges_within_cluster
        
        # cohesion /= C  # Average cohesion across all clusters
        print(f'Cohesion with loops: {cohesion.item()}')

        # Compute cohesion
        Y_YT = torch.einsum('ni, nj->nij', Y, Y)  # Shape: n x n x C
        A_expanded = A.unsqueeze(2)  # Shape: n x n x 1
        print(A_expanded.shape, Y_YT.shape)

        # Mask with adjacency matrix to get within-cluster edges
        A_Y_YT = A_expanded * Y_YT  # Shape: n x n x C
        total_edges_within_cluster = A_Y_YT.sum(dim=(0, 1))  # Shape: C

        # Number of nodes in each cluster
        n_c = Y.sum(dim=0)  # Shape: C

        # Max possible edges within each cluster
        max_possible_edges_within_cluster = n_c * (n_c - 1) / 2 + 1e-9  # Shape: C

        # Cohesion for each cluster
        cohesion_per_cluster = total_edges_within_cluster / max_possible_edges_within_cluster  # Shape: C

        # Average cohesion across all clusters
        cohesion = cohesion_per_cluster.sum()

        print(f'Cohesion without loops: {cohesion.item()}')

        total_edges = A.sum()

        inter_cluster_edges = torch.zeros(1).to(A.device)
        
        for i in range(C):
            for j in range(C):
                if i != j:
                    Y_i = Y[:, i].unsqueeze(1)
                    Y_j = Y[:, j].unsqueeze(1)
                    A_ij = Y_i @ Y_j.T
                    inter_cluster_edges += (A_ij * A).sum()


        assert inter_cluster_edges <= total_edges, f'Inter-cluster edges cannot be greater than total edges in the graph. Inter-cluster edges: {inter_cluster_edges}, Total edges: {total_edges}'
        coupling = inter_cluster_edges / (total_edges + 1e-9)  # Avoid division by zero

        print(f'Coupling with loops: {coupling.item()}')

        # Calculate the cluster probabilities matrix product
        Y_YT = Y @ Y.T  # Shape: n x n

        # Mask the intra-cluster edges
        intra_cluster_mask = torch.eye(n, device=A.device).bool()
        Y_YT[intra_cluster_mask] = 0

        # Compute inter-cluster edges
        inter_cluster_edges = (Y_YT * A).sum() / 2  # Divide by 2 to avoid double counting
        coupling = inter_cluster_edges / (total_edges + 1e-9)  # Avoid division by zero

        print(f'Coupling without loops: {coupling.item()}')

        # Calculate loss
        # loss = -cohesion
        loss = -cohesion * self.lambda_param + (1 - self.lambda_param) * coupling

        node_clusters = torch.argmax(Y, dim=1).cpu().numpy()
        clusters = {i: c.item() for i, c in enumerate(node_clusters)}
        g = nx.from_numpy_array(A.detach().cpu().numpy())
        # metrics = get_modularization_scores(g, clusters)

        # if settings.verbose:
        print(f'Loss: {loss.item():.4f}, Cohesion: {cohesion.item()/C:.4f}, Coupling: {coupling.item()/total_edges:.4f}, Lambda: {self.lambda_param.item():.4f}')
        # print(f'Loss: {loss.item():.4f}, Cohesion: {cohesion.item()/C:.4f}')
        # print(f'Actual Cohesion: {metrics["cohesion"]:.4f}, Actual Coupling: {metrics["coupling"]:.4f}, Clusters: {len(set(clusters.values()))}')

        return loss

In [28]:
def get_input_data(g):
    X = get_node_embeddings(g).float()
    A = get_nx_adj(g).float()
    E = get_edge_index(g)
    print(f"Node embeddings shape: {X.shape}")
    print(f"Adjacency matrix shape: {A.shape}")
    print(f"Edge index shape: {E.shape}")
    return X, E, A

In [18]:
X = get_node_embeddings(graph).float()
A = get_nx_adj(graph).float()
E = get_edge_index(graph)

Training Node2Vec For Node Embeddings: 100%|██████████| 1/1 [00:00<00:00,  4.29it/s]


In [25]:
sum([1 for n in graph.nodes if graph.degree(n) == 0])

1

In [37]:
GNN_NUM_EPOCHS = 1
GNN_MODEL_NAME = 'SAGEConv'
GNN_INPUT_DIM = NODE2VEC_DIM
GNN_HIDDEN_DIM = 128
GNN_NUM_LAYERS = 3
GNN_RESIDUAL = True
GNN_LNORM = True
GNN_DROPOUT = 0.1
GNN_LR = 0.01
NODES_PER_CLUSTER = 7


def train_gnn(g):
    loss_fn = GraphClusteringLoss()
    X, edge_index, A = get_input_data(g)
    model = GNNModel(
        model_name=GNN_MODEL_NAME, 
        input_dim=GNN_INPUT_DIM, 
        hidden_dim=GNN_HIDDEN_DIM, 
        out_dim=g.number_of_nodes() // NODES_PER_CLUSTER, 
        num_layers=GNN_NUM_LAYERS, 
        residual=GNN_RESIDUAL, 
        l_norm=GNN_LNORM, 
        dropout=GNN_DROPOUT
    ).to(device)

    ## loss_fn.lambda_param + model.parameters
    train_params = list(model.parameters()) + [loss_fn.lambda_param]

    model.train()
    optimizer = torch.optim.Adam(train_params, lr=GNN_LR)
    all_metrics = list()
    for epoch in tqdm(range(1, GNN_NUM_EPOCHS + 1), desc='GNN Epochs'):
        optimizer.zero_grad()
        Y = model(X, edge_index)
        loss = loss_fn(A, Y)
        loss.backward()
        optimizer.step()
        node_clusters = torch.argmax(Y, dim=1).cpu()
        node_to_cluster_map = {i: c.item() for i, c in enumerate(node_clusters)}
        metrics = get_modularization_scores(graph, node_to_cluster_map)
        all_metrics.append(metrics)
        if epoch % 10 == 0:
            print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Cohesion: {metrics["cohesion"]:.4f}, Coupling: {metrics["coupling"]:.4f}')
            print(f"Lambda Value: {loss_fn.lambda_param.item()}")
    
    return all_metrics

In [None]:
train_gnn(graph)

In [16]:
from gnn_models.dmon import Single

DMON_EPOCHS = 1
DMON_LR = 0.001


def train_dmon(g):
    X, _, A = get_input_data(g)
    ips = (X.unsqueeze(0), A.unsqueeze(0))
    model = Single(
        X.shape[1], 
        len(g.nodes)//NODES_PER_CLUSTER, 
        skip_conn=False, 
        collapse_regularization=0.1
    )
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=DMON_LR)

    model.train()
    all_metrics = list()
    losses_list, mod_scores = list(), list()
    for _ in tqdm(range(DMON_EPOCHS), desc='DMoN Epochs'):
        optimizer.zero_grad()
        _, pred, _, losses = model(ips)
        loss = torch.FloatTensor([0]).to(device)
        for loss_val in losses.values():
            if loss_val is not None:
                loss += loss_val

        loss.backward()
        optimizer.step()
        # if epoch % 1000 == 0:
        #     print(f"Epoch: {epoch}, Loss: {loss.item()}\n")
        losses_list.append(loss.item())
        dmon_clusters = {node: cluster for node, cluster in zip(
        g.nodes(), pred[0].detach().cpu().numpy().argmax(axis=-1))}
        
        metrics = get_modularization_scores(g, dmon_clusters)
        all_metrics.append(metrics)
    
    return all_metrics

In [17]:
train_dmon(graph)

Training Node2Vec For Node Embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

DMoN Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

[{'cohesion': 0.07680152112583719, 'coupling': 0.6771907216494846}]

In [19]:
from gnn_models.DGI import DeepGraphInfomax, Encoder, Summarizer, corruption, cluster_net

DGI_EPOCHS = 1
DGI_LR = 0.001
DGI_HIDDEN_DIM = 128

def make_modularity_matrix(adj):
    adj = adj*(torch.ones(adj.shape[0], adj.shape[0]).to(device) - torch.eye(adj.shape[0]).to(device))
    degrees = adj.sum(dim=0).unsqueeze(1)
    mod = adj - degrees@degrees.t()/adj.sum()
    return mod


def train_dgi(g):
    X = get_node_embeddings(g)
    edge_index = get_edge_index(g)
    adj = get_nx_adj(g)
     
    model = DeepGraphInfomax(
        hidden_channels=DGI_HIDDEN_DIM, 
        encoder=Encoder(X.shape[1], DGI_HIDDEN_DIM),
        out_channels=len(g.nodes)//NODES_PER_CLUSTER,
        summary=Summarizer(),
        corruption=corruption,
        cluster=cluster_net)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=DGI_LR, weight_decay=5e-3)

    adj = adj.float().to(device)
    mod = make_modularity_matrix(adj)
    model.train()
    epoch_losses = list()
    all_metrics = list()

    for _ in tqdm(range(DGI_EPOCHS), desc='DGI Epochs'):
        optimizer.zero_grad()
        pos_z, neg_z, summary, mu, r, _ = model(X, edge_index)
        dgi_loss = model.loss(pos_z, neg_z, summary)
        modularity_loss = model.modularity(r, adj, mod)
        comm_loss = model.comm_loss(pos_z, mu)
        # loss = -modularity_loss
        loss = 5*dgi_loss - modularity_loss + comm_loss

        # print(f"Epoch: {epoch}, Loss: {loss.item()}")
        epoch_losses.append(loss.item())
        
        commdgi_clusters = {node: cluster for node, cluster in zip(
        g.nodes(), r.detach().cpu().numpy().argmax(axis=-1))}

        metrics = get_modularization_scores(g, commdgi_clusters)
        all_metrics.append(metrics)        

        loss.backward()
        optimizer.step()
    
    return all_metrics

In [20]:
train_dgi(graph)

Training Node2Vec For Node Embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

DGI Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

[{'cohesion': 0.3125866964576641, 'coupling': 0.8582474226804123}]

### Calculating Results

In [49]:
import pandas as pd
import ast
raw_ga = pd.read_excel('results/GA2.xlsx')
raw_ga.shape

(42504, 8)

In [50]:
na_cols = raw_ga['Coupling'].isna()
### remove rows with na_cols

raw_ga = raw_ga[~na_cols]
raw_ga.shape

(42294, 8)

In [51]:
gnn = pd.read_excel('results/all_gnn_results.xlsx')
gnn['file_name'] = gnn['file_name'].apply(lambda x: x.replace("../datasets/ModelClassification/modelset/raw-data/repo-ecore-all/data/", "").replace('.ecore', "").replace("/", '_'))
gnn['metrics'] = gnn['metrics'].apply(lambda x: ast.literal_eval(x))

In [52]:
ga_vs_gnn_files = list(set(gnn['file_name']).intersection(set(raw_ga['Name'])))
len(ga_vs_gnn_files)

263

In [53]:
import ast
from tqdm.auto import tqdm

ga_rows = list()
for ga_vs_gnn_file in tqdm(ga_vs_gnn_files):
    d = {'file_name': ga_vs_gnn_file}
    solution_paretos = list()
    ga_results = raw_ga[raw_ga['Name'] == ga_vs_gnn_file]
    paretos = list(set(ga_results['Pareto'].values))
    for pareto in paretos:
        solutions = list()
        instances = ga_results[ga_results['Pareto'] == pareto]
        for i, r in instances.iterrows():
            solutions.append({
                'cohesion': r['Cohesion'],
                'coupling': r['Coupling'],
                # 'clusters': ast.literal_eval(r['String'])
            })
    
        solution_paretos.append(solutions)
    d['metrics'] = solution_paretos
    ga_rows.append(d)

pd.DataFrame(ga_rows).to_excel('results/all_ga_results.xlsx', index=False)
ga = pd.read_excel('results/all_ga_results.xlsx')
ga['metrics'] = ga['metrics'].apply(lambda x: ast.literal_eval(x.replace('nan', '\"nan\"')))
ga_f2metrics = {r['file_name']: r['metrics'] for i, r in ga.iterrows()}

  0%|          | 0/263 [00:00<?, ?it/s]

In [54]:
ga_f2metrics['47hm4n3_idmDrone_plugin_sar.drone_model_generated_Drn'][0]

[{'cohesion': 0.549842638204707, 'coupling': 0.214285714285714},
 {'cohesion': 0.0586712138436276, 'coupling': 0.0},
 {'cohesion': 0.374623078071354, 'coupling': 0.0714285714285714},
 {'cohesion': 0.389520824003582, 'coupling': 0.142857142857142}]

In [55]:
def get_combined_results_dict(gnn_results, ga_file_map):
    combined_results = dict()
    for _, r in gnn_results.iterrows():
        file_name = r['file_name']
        if file_name not in ga_file_map:
            continue
        ga_metrics = ga_file_map[file_name]
        gnn_metrics = r['metrics']
        gnn_metrics['ga'] = ga_metrics
        combined_results[file_name] = gnn_metrics
    
    print(f"Combined results length: {len(combined_results)}")
    return combined_results

In [56]:
def get_pareto_area(pareto_result):
    def get_area(pareto_points):
        area = 0.0
        for i in range(len(pareto_points)):
            x1, y1 = pareto_points[i]
            if i  == 0:
                a = x1*y1
            else:
                x2, _ = pareto_points[i-1]
                a = (x1 - x2) * y1
            area += a
        return area
    points = list()
    for d in pareto_result:
        cohesion = d['cohesion'] if isinstance(d['cohesion'], float) else 0
        coupling = d['coupling'] if isinstance(d['coupling'], float) else 1
        points.append((cohesion, 1 - coupling))
    points = sorted(points, key=lambda x: x[0])
    return get_area(points)

In [57]:
def get_gnn_area(gnn_result):
    max_cohesion = gnn_result['max_cohesion']
    min_coupling = gnn_result['min_coupling']
    max_cohesion_area =  max_cohesion['cohesion'] * (1 - max_cohesion['coupling'])
    min_coupling_area = min_coupling['cohesion'] * (1 - min_coupling['coupling'])
    # print(max_cohesion['cohesion'], max_cohesion['coupling'])
    # print(min_coupling['cohesion'], min_coupling['coupling'])
    # print('---')
    return max(max_cohesion_area, min_coupling_area)

In [58]:
def get_gnn_max_cohesion(gnn_result):
    score = gnn_result['max_cohesion']['cohesion'] 
    return score if score != 'nan' else 0


def get_gnn_min_coupling(gnn_result):
    score = gnn_result['max_cohesion']['coupling']
    return score if score != 'nan' else 1

def get_max_area_pareto(ga_result):
    max_area = 0
    best_instance = None
    for pareto in ga_result:
        for instance in pareto:
            if instance['cohesion'] == 'nan' or instance['coupling'] == 'nan':
                area = 0
            else:
                area = instance['cohesion'] * (1 - instance['coupling'])
            if area > max_area:
                max_area = area
                best_instance = instance

    return best_instance

def get_ga_max_cohesion(ga_result):
    best_instance = get_max_area_pareto(ga_result)
    return best_instance['cohesion']


def get_ga_min_coupling(ga_result):
    best_instance = get_max_area_pareto(ga_result)
    return best_instance['coupling']


def get_overall_max_cohesion(result):
    max_gnn_cohesion = max([(get_gnn_max_cohesion(result[k]), k) for k in ['dgi', 'dmon', 'gnn']])
    max_ga_cohesion = (get_ga_max_cohesion(result['ga']), 'ga')
    max_overall_cohesion, max_overall_cohesion_algo = max(max_gnn_cohesion, max_ga_cohesion)
    max_gnn_cohesion_score, max_gnn_cohesion_algo = max_gnn_cohesion
    return max_overall_cohesion, max_overall_cohesion_algo, max_gnn_cohesion_score, max_gnn_cohesion_algo

def get_overall_min_coupling(result):
    min_gnn_coupling = min([(get_gnn_min_coupling(result[k]), k) for k in ['dgi', 'dmon', 'gnn']])
    min_ga_coupling = (get_ga_min_coupling(result['ga']), 'ga')
    min_overall_coupling, min_overall_coupling_algo = min(min_gnn_coupling, min_ga_coupling)
    min_gnn_coupling_score, min_gnn_coupling_algo = min_gnn_coupling
    return min_overall_coupling, min_overall_coupling_algo, min_gnn_coupling_score, min_gnn_coupling_algo

In [74]:
import numpy as np


def get_all_results_metrics(all_results):
    rows = list()
    for file_name, model_results in all_results.items():
        ga_areas = [get_pareto_area(p) for p in model_results['ga']]
        mean_ga = (sum(ga_areas)/len(ga_areas), 'ga')
        max_ga = (max(ga_areas), 'ga')
        max_ga_area = max_ga[0]

        dgi_area = (get_gnn_area(model_results['dgi']), 'dgi')
        dmon_area = (get_gnn_area(model_results['dmon']), 'dmon')
        gnn_area = (get_gnn_area(model_results['gnn']), 'gnn')

        dgi_hp_score = (np.sum([dgi_area[0] >= ga_area for ga_area in ga_areas]), 'dgi')
        dmon_hp_score = (np.sum([dmon_area[0] >= ga_area for ga_area in ga_areas]), 'dmon')
        gnn_hp_score = (np.sum([gnn_area[0] >= ga_area for ga_area in ga_areas]), 'gnn')
        best_hp_score, best_hp_algo = max([dgi_hp_score, dmon_hp_score, gnn_hp_score])
        best_hp_algo = best_hp_algo if best_hp_score / len(ga_areas) >= 0.5 else 'ga'
        
        
        max_gnn = max([dgi_area, dmon_area, gnn_area])
        max_gnn_area, max_gnn_area_algo = max_gnn
        max_overall_area, max_overall_area_aglo = max(max_ga, max_gnn)
        mean_overall_area, mean_overall_area_algo = max(mean_ga, max_gnn)
        
        max_overall_cohesion, max_overall_cohesion_algo, max_gnn_cohesion, max_gnn_cohesion_algo = get_overall_max_cohesion(model_results)
        min_overall_coupling, min_overall_coupling_algo, min_gnn_coupling, min_gnn_coupling_algo = get_overall_min_coupling(model_results)

        rows.append({
            'file_name': file_name,
            'max_ga_area': max(ga_areas),
            'dgi_area': dgi_area[0],
            'dmon_area': dmon_area[0],
            'gnn_area': gnn_area[0],
            'Max Hypervolume GNN': max_gnn_area,
            'Max Hypervolume GNN Model': max_gnn_area_algo,

            'max_ga_area': max_ga_area,
            
            'Max Hypervolume': max_overall_area,
            'Max Hypervolume Model': max_overall_area_aglo,
            'Avg. Hypervolume': mean_overall_area,
            'Avg. Hypervolume Model': mean_overall_area_algo,

            'Max Cohesion': max_overall_cohesion,
            'Max Cohesion Model': max_overall_cohesion_algo,
            'Max Cohesion (GNNs)': max_gnn_cohesion,
            'Max Cohesion (GNNs) Model': max_gnn_cohesion_algo,

            'Min Coupling': min_overall_coupling,
            'Min Coupling Model': min_overall_coupling_algo,
            'Min Coupling (GNNs)': min_gnn_coupling,
            'Min Coupling (GNNs) Model': min_gnn_coupling_algo,

            'Best Hypervolume Score': best_hp_score,
            'Best Hypervolume Score (Norm)': best_hp_score / len(ga_areas),
            'Best Hypervolume Model': best_hp_algo,
        })
    
    final_result = pd.DataFrame(rows)
    final_result.to_excel('results/all_results.xlsx', index=False)
    return final_result


In [60]:
import pandas as pd
import ast

def get_results_df(f):
    df = pd.read_excel(f)
    df['file_name'] = df['file_name'].apply(lambda x: x.replace("../datasets/ModelClassification/modelset/raw-data/repo-ecore-all/data/", "").replace('.ecore', "").replace("/", '_'))
    df['metrics'] = df['metrics'].apply(lambda x: ast.literal_eval(x))
    if isinstance(df.iloc[0]['metrics'], list):
        df['metrics'] = [r['metrics'][r['run']] for i, r in df.iterrows()]
    return df

large_models = get_results_df('results/allmax_-1_min_101_results.xlsx')
medium_models = get_results_df('results/allmax_100_min_50_results.xlsx')
small_models = get_results_df('results/allmax_50_min_-1_results.xlsx')

In [61]:
import os

dmon_results = dict()
for root, _, files in os.walk('results'):
    for f in files:
        if f.endswith('.xlsx') and f.startswith('dmon__max'):
            df = get_results_df(os.path.join(root, f))
            for _, r in df.iterrows():
                run = r['run']
                file_name = r['file_name']
                dmon_results[(file_name, run)] = r['metrics']['dmon']

def set_dmon_results(df: pd.DataFrame):
    for i, r in df.iterrows():
        file_name = r['file_name']
        run = r['run']
        if (file_name, run) in dmon_results:
            df.at[i, 'metrics']['dmon'] = dmon_results[(file_name, run)]

set_dmon_results(small_models)
set_dmon_results(medium_models)
set_dmon_results(large_models)

In [62]:
from collections import defaultdict

def get_updated_df(df):

    large_models_results = defaultdict(list)
    for _, r in df.iterrows():
        file_name = r['file_name']
        metrics = r['metrics']
        large_models_results[file_name].append(metrics)

    updated_result_rows = list()
    for file_name, result in large_models_results.items():
        gnn_models_results = defaultdict(list)
        for r in result:
            for k, v in r.items():
                gnn_models_results[k].append(v)

        for k, v in gnn_models_results.items():
            gnn_models_results[k] = max(v, key=lambda x: x['max_cohesion']['cohesion'] * (1 - x['max_cohesion']['coupling']))

        updated_result_rows.append({
            'file_name': file_name,
            'metrics': gnn_models_results
        })

    updated_df = pd.DataFrame(updated_result_rows) 
    return updated_df

large_models_updated = get_updated_df(large_models)
medium_models_updated = get_updated_df(medium_models)
small_models_updated = get_updated_df(small_models)

In [63]:
all_results = get_combined_results_dict(large_models_updated, ga_f2metrics)
all_results.update(get_combined_results_dict(medium_models_updated, ga_f2metrics))
all_results.update(get_combined_results_dict(small_models_updated, ga_f2metrics))

Combined results length: 25
Combined results length: 104
Combined results length: 136


In [75]:
final_result = get_all_results_metrics(all_results)

In [65]:
all_results['Buggaboo_j2sw_org.j2sw.parent_org.j2sw_model_generated_Dsl'].keys()

dict_keys(['dgi', 'gnn', 'dmon', 'ga'])

In [76]:
f2data = {r['file_name']: (r['num_nodes'], r['num_edges']) for _, r in gnn.iterrows()}

for i, r in final_result.iterrows():
    file_name = r['file_name']
    if file_name in f2data:
        num_nodes, num_edges = f2data[file_name]
        final_result.at[i, 'num_nodes'] = num_nodes
        final_result.at[i, 'num_edges'] = num_edges
    else:
        final_result.at[i, 'num_nodes'] = 0
        final_result.at[i, 'num_edges'] = 0

In [77]:
final_result.to_excel('results/final_results.xlsx', index=False)

In [20]:
## 43/136 => 31.6% GNN better than GA on average area for models with nodes <= 50

In [68]:
area_keys = [
    'Max Hypervolume Model', 
    'Max Hypervolume GNN Model', 
    'Avg. Hypervolume Model',
    'Best Hypervolume Model'
]
cohesion_keys = [
    'Max Cohesion Model', 
    'Max Cohesion (GNNs) Model'
]
coupling_keys = [
    'Min Coupling Model', 
    'Min Coupling (GNNs) Model'
]

all_keys = area_keys + cohesion_keys + coupling_keys

def get_results_for_key(df, key, min_nodes, max_nodes):
    
    conf_result = dict()
    if key == 'combined' and 'combined' not in df.columns:
        df['combined'] = df['num_nodes'] + df['num_edges']

    t = df.loc[(df[key] > min_nodes) & (df[key] <= max_nodes)]
    info = f"Results for models with {min_nodes} < {key} <= {max_nodes} with {len(t)} out of {len(df)} models"
    conf_result['info'] = info
    print(info)
    for key in all_keys:
        print(f"Results for {key}")
        d = dict(t[key].value_counts())
        result = {k: f"{v / sum(d.values())*100:.3f}%" for k, v in d.items()}
        if 'ga' in result:
            result['Non GA'] = f"{(sum(d.values()) - d['ga']) / sum(d.values())*100:.3f}%"
        print(result)
        conf_result[key] = result
    
    print('---')
    return conf_result

In [69]:
configs = ['num_nodes', 'num_edges', 'combined']

In [70]:
import numpy as np

def get_ranges(df, key):
    data = df[key] if key != 'combined' else df['num_nodes'] + df['num_edges']
    p_50 = int(np.percentile(data, 50))
    p_90 = int(np.percentile(data, 90))
    r1 = (0, p_50)
    r2 = (p_50, p_90)
    r3 = (p_90, int(max(data)))
    d = {
        'Small': r1,
        'Medium': r2,
        'Large': r3,
        'Overall': (0, int(max(data)))
    }
    return d

get_ranges(final_result, 'combined')

{'Small': (0, 163),
 'Medium': (163, 424),
 'Large': (424, 1778),
 'Overall': (0, 1778)}

In [71]:
config_results = {config_key: {k: dict() for k in get_ranges(final_result, config_key).keys()} for config_key in configs}

for config_key in configs:
    print(f"Results for {config_key}")
    ranges = get_ranges(final_result, config_key)
    for k, v in ranges.items():
        print(f"Results for {k} sized models")
        config_result = get_results_for_key(final_result, config_key, v[0], v[1])
        config_results[config_key][k] = config_result
    
    print('---\n')

Results for num_nodes
Results for Small sized models
Results for models with 0 < num_nodes <= 49 with 134 out of 263 models
Results for Max Hypervolume Model
{'ga': '88.806%', 'gnn': '7.463%', 'dgi': '2.239%', 'dmon': '1.493%', 'Non GA': '11.194%'}
Results for Max Hypervolume GNN Model
{'gnn': '61.940%', 'dmon': '23.881%', 'dgi': '14.179%'}
Results for Avg. Hypervolume Model
{'ga': '65.672%', 'gnn': '20.149%', 'dmon': '8.955%', 'dgi': '5.224%', 'Non GA': '34.328%'}
Results for Best Hypervolume Model
{'ga': '62.687%', 'gnn': '26.866%', 'dmon': '5.970%', 'dgi': '4.478%', 'Non GA': '37.313%'}
Results for Max Cohesion Model
{'ga': '67.164%', 'gnn': '17.164%', 'dgi': '12.687%', 'dmon': '2.985%', 'Non GA': '32.836%'}
Results for Max Cohesion (GNNs) Model
{'gnn': '50.000%', 'dgi': '32.836%', 'dmon': '17.164%'}
Results for Min Coupling Model
{'ga': '40.299%', 'gnn': '30.597%', 'dmon': '26.866%', 'dgi': '2.239%', 'Non GA': '59.701%'}
Results for Min Coupling (GNNs) Model
{'gnn': '50.746%', 'dmo

In [73]:
size_type_keys = list(config_results.keys())
size_keys = list(config_results[size_type_keys[0]].keys())
metric_keys = list(config_results[size_type_keys[0]][size_keys[0]].keys())

for metric_key in metric_keys:
    if metric_key == 'info':
        continue
    print('-'*100)
    print(f"Results for {metric_key}")
    print('-'*100)
    for size_type_key in size_type_keys:
        print(f"Results for considering model size as: {size_type_key}")
        for size_key in size_keys:
            # print('-'*100)
            print(f"Results for {size_key}")
            # print('-'*100)
            print(config_results[size_type_key][size_key]['info'])
            print(config_results[size_type_key][size_key][metric_key])
        print('-'*100)
    print('-'*100)

----------------------------------------------------------------------------------------------------
Results for Max Hypervolume Model
----------------------------------------------------------------------------------------------------
Results for considering model size as: num_nodes
Results for Small
Results for models with 0 < num_nodes <= 49 with 134 out of 263 models
{'ga': '88.806%', 'gnn': '7.463%', 'dgi': '2.239%', 'dmon': '1.493%', 'Non GA': '11.194%'}
Results for Medium
Results for models with 49 < num_nodes <= 97 with 102 out of 263 models
{'ga': '98.039%', 'gnn': '1.961%', 'Non GA': '1.961%'}
Results for Large
Results for models with 97 < num_nodes <= 226 with 27 out of 263 models
{'ga': '92.593%', 'gnn': '7.407%', 'Non GA': '7.407%'}
Results for Overall
Results for models with 0 < num_nodes <= 226 with 263 out of 263 models
{'ga': '92.776%', 'gnn': '5.323%', 'dgi': '1.141%', 'dmon': '0.760%', 'Non GA': '7.224%'}
--------------------------------------------------------------

In [30]:
import scipy.stats as stats

def get_ttest(values):
    t_statistic, p_value = stats.ttest_1samp(values, 0, alternative='greater')

    # Print the p-value
    print(f"P-value: {p_value}")
    if p_value < 0.05:
        print("Reject the null hypothesis")
    else:
        if t_statistic < 0:
            print("GA is better than GNN")
        else:
            print("GNN is better than GA")

final_result['GA GNN Area Diff'] = final_result['max_ga_area'] - final_result['Max Hypervolume GNN']
final_result['GA GNN Cohesion Diff'] = final_result['Max Cohesion'] - final_result['Max Cohesion (GNNs)']
final_result['GA GNN Coupling Diff'] = final_result['Min Coupling'] - final_result['Min Coupling (GNNs)']

diff_keys = ['GA GNN Area Diff', 'GA GNN Cohesion Diff', 'GA GNN Coupling Diff']
for diff_key in diff_keys:
    print(f"Results for {diff_key}")
    get_ttest(final_result[diff_key].values)

Results for GA GNN Area Diff
P-value: 1.6072338294258834e-67
Reject the null hypothesis
Results for GA GNN Cohesion Diff
P-value: 3.6901174855714833e-59
Reject the null hypothesis
Results for GA GNN Coupling Diff
P-value: 1.6293203211154507e-19
Reject the null hypothesis
