#### Dataset Creation

In [10]:
import json
import os

# Root folder that holds every raw dataset file.
datasets_dir = 'datasets'

# Each path is the dataset's file (given relative to ``datasets_dir``) joined
# onto that root; the order of the targets matches the order of the files.
(
    ecore_json_path,
    mar_json_path,
    modelsets_uml_json_path,
    modelsets_ecore_json_path,
) = (
    os.path.join(datasets_dir, relative_file)
    for relative_file in (
        'ecore_555/ecore_555.jsonl',
        'mar-ecore-github/ecore-github.jsonl',
        'modelset/uml.jsonl',
        'modelset/ecore.jsonl',
    )
)

In [11]:
import networkx as nx
from tqdm.auto import tqdm
import pickle


class GenericGraph(nx.DiGraph):
    """Directed graph built from one serialized dataset record.

    The record is a mapping whose ``'graph'`` entry holds a node-link style
    structure with a ``'nodes'`` list (each node has an ``'id'``) and a
    ``'links'`` list (each link has a ``'source'`` and a ``'target'``).
    The optional record entries ``'ids'``, ``'model_type'``, ``'labels'``,
    ``'is_duplicated'`` and ``'directed'`` are copied onto the instance as
    ``graph_id``, ``graph_type``, ``label``, ``is_duplicated`` and
    ``directed``; each is ``None`` when the entry is missing.
    """

    def __init__(self, json_obj=None):
        """Create the graph, optionally populating it from ``json_obj``.

        Args:
            json_obj: Record mapping as described on the class, or ``None``
                to create an empty graph whose metadata attributes are all
                ``None``.

        Raises:
            KeyError: If ``json_obj`` is given but lacks ``'graph'``, or the
                decoded graph lacks ``'nodes'``/``'links'``, or a node/link
                lacks ``'id'``/``'source'``/``'target'``.
        """
        super().__init__()
        # networkx instantiates ``self.__class__()`` with no arguments when it
        # derives graphs (copy(), reverse(), subgraph views), so the record
        # must be optional or those operations raise TypeError.
        record = {} if json_obj is None else json_obj
        self.graph_id = record.get('ids', None)
        self.graph_type = record.get('model_type', None)
        self.label = record.get('labels', None)
        self.is_duplicated = record.get('is_duplicated', None)
        self.directed = record.get('directed', None)
        if json_obj is not None:
            self.create_graph(json_obj)

    def create_graph(self, json_obj):
        """Add the nodes and edges described by ``json_obj['graph']``.

        Args:
            json_obj: Record mapping; its ``'graph'`` entry is either a JSON
                string or the already-decoded mapping.
        """
        payload = json_obj['graph']
        # The payload is normally a JSON string; an already-decoded mapping
        # is used as-is instead of being passed to json.loads.
        if isinstance(payload, (str, bytes, bytearray)):
            payload = json.loads(payload)
        for node in payload['nodes']:
            # Every key of the node dict (including 'id') becomes a node attribute.
            self.add_node(node['id'], **node)
        for edge in payload['links']:
            # Every key of the link dict (including 'source'/'target') becomes
            # an edge attribute.
            self.add_edge(edge['source'], edge['target'], **edge)

    def __repr__(self):
        """Return a short summary with the graph id and node/edge counts."""
        return f'Graph({self.graph_id}, nodes={self.number_of_nodes()}, edges={self.number_of_edges()})'


class Dataset:
    """Named list of :class:`GenericGraph` objects with an on-disk pickle cache.

    On construction the graphs are either parsed from ``file_name`` inside
    ``dataset_dir`` and then cached as ``<save_dir>/<name>.pkl``, or restored
    from that cache when it already exists and ``reload`` is false.
    """

    def __init__(
            self, 
            file_name, 
            name, 
            dataset_dir = datasets_dir,
            save_dir = 'datasets/pickles',
            reload=False
        ):
        """Load the dataset from its cache or from the raw JSON file.

        Args:
            file_name: Path of the raw file, relative to ``dataset_dir``.
            name: Dataset name; also the stem of the cache file.
            dataset_dir: Directory that contains ``file_name``.
            save_dir: Directory for the pickle cache (created if missing).
            reload: When true, re-parse the raw file even if a cache exists.
        """
        self.name = name
        self.dataset_dir = dataset_dir
        self.save_dir = save_dir
        os.makedirs(save_dir, exist_ok=True)

        dataset_exists = os.path.exists(self._pickle_path())
        if reload or not dataset_exists:
            json_objects = self._read_json_objects(os.path.join(dataset_dir, file_name))
            self.graphs = [GenericGraph(g) for g in tqdm(json_objects, desc=f'Loading {name.title()}')]
            self.save()
        else:
            self.load()

    def _pickle_path(self):
        """Return the path of this dataset's pickle cache file."""
        return os.path.join(self.save_dir, f'{self.name}.pkl')

    @staticmethod
    def _read_json_objects(path):
        """Read the records stored at ``path``.

        The file is first parsed as one JSON document. If that fails it is
        parsed as JSON Lines (one JSON value per non-blank line).

        Raises:
            json.JSONDecodeError: If neither interpretation parses.
        """
        # The context manager closes the handle even when parsing fails.
        with open(path, encoding='utf-8') as f:
            text = f.read()
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            # Split on '\n' only: str.splitlines() would also break on
            # separators that may legally appear inside JSON strings.
            return [json.loads(line) for line in text.split('\n') if line.strip()]

    def remove_duplicates(self):
        """Replace ``self.graphs`` with the de-duplicated list from :meth:`dedup`."""
        self.graphs = self.dedup()

    def dedup(self):
        """Return the graphs with duplicates removed, without modifying ``self``.

        Two graphs count as duplicates when ``str(g.edges)`` is equal; for
        each such key the last graph encountered is the one kept.
        NOTE(review): graphs without edges presumably all share one key and
        collapse into a single entry -- confirm this is intended.
        """
        return list({str(g.edges): g for g in self.graphs}.values())

    def __repr__(self):
        """Return a short summary with the dataset name and graph count."""
        return f'Dataset({self.name}, graphs={len(self.graphs)})'
    
    def __getitem__(self, key):
        """Return ``self.graphs[key]`` (index or slice)."""
        return self.graphs[key]
    
    def __iter__(self):
        """Iterate over the graphs in list order."""
        return iter(self.graphs)
    
    def __len__(self):
        """Return the number of graphs."""
        return len(self.graphs)
    
    def save(self):
        """Pickle this whole ``Dataset`` object to ``<save_dir>/<name>.pkl``."""
        print(f'Saving {self.name} to pickle')
        with open(self._pickle_path(), 'wb') as f:
            pickle.dump(self, f)
        print(f'Saved {self.name} to pickle')

    def load(self):
        """Restore ``self.graphs`` from ``<save_dir>/<name>.pkl``.

        ``name``, ``dataset_dir`` and ``save_dir`` keep the values given to
        the constructor; only the graph list is taken from the cache.
        """
        print(f'Loading {self.name} from pickle')
        # NOTE: unpickling can execute arbitrary code -- only load cache files
        # that were written by save() on a trusted machine.
        with open(self._pickle_path(), 'rb') as f:
            cached = pickle.load(f)
        # Assigning to ``self`` would only rebind the local name and leave the
        # real instance without ``graphs``; copy the data onto it instead.
        self.graphs = cached.graphs

        print(f'Loaded {self.name} from pickle')
        print(f'Graphs: {len(self.graphs)}')
        

# Set to True to re-parse the raw files instead of using the pickle caches.
reload = False

# Build every dataset in a fixed order, keyed by dataset name; each entry is
# (dataset name, raw file path relative to the datasets directory).
datasets = {
    dataset_name: Dataset(relative_file, dataset_name, reload=reload)
    for dataset_name, relative_file in (
        ('ecore', 'ecore_555/ecore_555.jsonl'),
        ('mar', 'mar-ecore-github/ecore-github.jsonl'),
        ('modelsets_uml', 'modelset/uml.jsonl'),
        ('modelsets_ecore', 'modelset/ecore.jsonl'),
    )
}

# Expose each dataset under its own module-level name as well.
ecore = datasets['ecore']
mar = datasets['mar']
modelsets_uml = datasets['modelsets_uml']
modelsets_ecore = datasets['modelsets_ecore']

Loading ecore from pickle
Loaded ecore from pickle
Graphs: 548
Loading mar from pickle
Loaded mar from pickle
Graphs: 18110
Loading modelsets_uml from pickle
Loaded modelsets_uml from pickle
Graphs: 3720
Loading modelsets_ecore from pickle
Loaded modelsets_ecore from pickle
Graphs: 4127
