___
# edge category extraction
___

## procedure
- run `netctrl` on every network and save the result as a GML (Graph Modelling Language) file
- parse each GML file and save the edge category information in JSON format
- extract the source genes of the critical edges

In [1]:
# import modules
from pathlib import Path
import os, re, json, subprocess
import pandas as pd

# set path
# NOTE(review): DEFAULT_NETCTRL is an absolute, machine-specific path — adjust it to the local netctrl build.
DEFAULT_NETCTRL = '/home/Program/netctrl/build/src/ui/netctrl' # netctrl executable
NOTEBOOK_DIR = Path.cwd() # current working directory of the kernel (presumably the notebook's folder — verify)
BASE_DIR = NOTEBOOK_DIR.parents[1] # working directory: two levels above NOTEBOOK_DIR
INFERRED_DIR = BASE_DIR / 'data' / 'inferred_grn' # input: phase* sub-directories holding the *.ncol network files
RESULT_DIR = BASE_DIR / 'data' / 'result' # result file saving path
GML_ROOT = RESULT_DIR / 'gml' # gml file path (one sub-directory per phase)
RESULT_JSON = RESULT_DIR / 'edge_classes_liu.json' # edge categories of every network, written as JSON
RESULT_CRITICAL_TF = RESULT_DIR / 'critical_TF.csv' # critical-edge source genes per network, written as CSV

# create the output directories up front; exist_ok makes re-running this cell harmless
RESULT_DIR.mkdir(parents=True, exist_ok=True)
GML_ROOT.mkdir(parents=True, exist_ok=True)

## function define
- `run_netctrl_to_gml`: run `netctrl` on an ncol file and write the result as a GML file
- `parse_gml_edge_classes`: parse a GML file into a dictionary of edge categories (saved as JSON later)

In [2]:
# function that generates a GML file
def run_netctrl_to_gml(ncol_path: Path, gml_path: Path, netctrl_path=None, *, netctrl_paht=None):
    """Run netctrl on an ncol network file and write its output graph as GML.

    Parameters
    ----------
    ncol_path : Path
        Input network file passed to netctrl as its last argument.
    gml_path : Path
        Output file passed to netctrl with ``-o``; its parent directory is
        created when missing.
    netctrl_path : str or Path, optional
        netctrl executable to run. Falls back to the module-level
        ``DEFAULT_NETCTRL`` when omitted.
    netctrl_paht : str or Path, optional
        Deprecated misspelling of ``netctrl_path``, kept only so existing
        keyword calls keep working.

    Raises
    ------
    RuntimeError
        If netctrl exits with a non-zero return code; the message carries the
        captured stdout and stderr.
    """
    # Resolve the executable at call time so the argument is actually honoured
    # (it used to be ignored in favour of the global constant).
    executable = netctrl_path if netctrl_path is not None else netctrl_paht
    if executable is None:
        executable = DEFAULT_NETCTRL

    gml_path.parent.mkdir(parents=True, exist_ok=True)
    # '-m liu' / '-M graph' / '-F gml' presumably select the Liu controllability
    # model, annotated-graph output and GML format — confirm against `netctrl --help`.
    cmd = [str(executable), '-m', 'liu', '-M', 'graph', '-F', 'gml', '-o', str(gml_path), str(ncol_path)]
    res = subprocess.run(cmd, capture_output=True, text=True)
    if res.returncode != 0:
        raise RuntimeError(f'netctrl failed ({res.returncode}) for {ncol_path}\nSTDOUT:\n{res.stdout}\nSTDERR:\n{res.stderr}')

# compile regular expression pattern
NAME_RE = re.compile(r'^name\s+\"(.*)\"\s*$')
EDGECLASS_RE = re.compile(r'^edgeclass\s+\"(.*)\"\s*$')
# `node` / `edge` keyword, alone on its line or followed by its opening bracket
_RECORD_START_RE = re.compile(r'^(node|edge)\s*(\[)?$')
# any key that opens a nested list on the same line, e.g. `graphics [`
_LIST_OPEN_RE = re.compile(r'^\w+\s*\[$')


def _gml_int(line):
    """Return the integer value of a `key value` line, or None when it is not an integer."""
    try:
        return int(line.split()[1])
    except (IndexError, ValueError):
        return None


# function that parses the edge categories out of a GML file
def parse_gml_edge_classes(gml_path: Path):
    """Collect the edges of a GML file grouped by their ``edgeclass`` attribute.

    The file is read line by line. Inside every ``node`` record the ``id`` and
    ``name`` attributes are read; inside every ``edge`` record the ``source``,
    ``target`` and ``edgeclass`` attributes are read. Lists nested inside a
    record (for example ``graphics [ ... ]``) are skipped without ending the
    record.

    Parameters
    ----------
    gml_path : Path
        GML file to read (opened as UTF-8 text).

    Returns
    -------
    dict
        ``{'critical': [...], 'ordinary': [...], 'redundant': [...]}`` where
        every entry is a ``[source_name, target_name]`` pair in file order.
        The edge class is compared case-insensitively; edges with any other
        class, or without an integer source/target, are dropped. A node
        without a ``name`` attribute is reported by its id as a string.
    """
    categories = {'critical': [], 'ordinary': [], 'redundant': []}
    id_to_name = {}
    edges = []     # (source id, target id, class); names are resolved once all nodes are known

    kind = None    # 'node' or 'edge' while inside a record, otherwise None
    attrs = {}     # attributes collected for the current record
    depth = 0      # number of currently open '[' belonging to the record

    with open(gml_path, 'r', encoding='utf-8') as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue

            # block start point; everything else outside a record is irrelevant
            if kind is None:
                m = _RECORD_START_RE.match(line)
                if m:
                    kind = m.group(1)
                    attrs = {}
                    depth = 1 if m.group(2) else 0
                continue

            # the record's own opening bracket, or the start of a nested list
            if line == '[' or _LIST_OPEN_RE.match(line):
                depth += 1
                continue

            # block end point: only the bracket that closes the record itself commits it
            if line == ']':
                depth -= 1
                if depth <= 0:
                    if kind == 'node':
                        node_id = attrs.get('id')
                        if node_id is not None:
                            id_to_name[node_id] = attrs.get('name', str(node_id))
                    else:
                        src = attrs.get('source')
                        tgt = attrs.get('target')
                        edge_class = attrs.get('edgeclass', '').lower()
                        if src is not None and tgt is not None and edge_class in categories:
                            edges.append((src, tgt, edge_class))
                    kind = None
                continue

            # attributes of a nested list do not belong to the record
            if depth > 1:
                continue

            key = line.split(None, 1)[0]
            if kind == 'node':
                # inside of node block
                if key == 'id':
                    value = _gml_int(line)
                    if value is not None:   # best effort: an unparsable id is ignored
                        attrs['id'] = value
                else:
                    m = NAME_RE.match(line)
                    if m:
                        attrs['name'] = m.group(1)
            elif key in ('source', 'target'):
                # inside of edge block
                value = _gml_int(line)
                if value is not None:
                    attrs[key] = value
            else:
                m = EDGECLASS_RE.match(line)
                if m:
                    attrs['edgeclass'] = m.group(1)

    # translate ids to names; ids without a node record fall back to the id itself
    for src, tgt, edge_class in edges:
        categories[edge_class].append([id_to_name.get(src, str(src)), id_to_name.get(tgt, str(tgt))])

    return categories

## select specific phase or network (optional)
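- if `PHASE_FILTER` is empty (`[]`), every directory whose name starts with `phase` is processed; otherwise only the listed phases are processed
- if `NETWORK_LIMIT` is set to N, only the first N networks (sorted by file name) of each phase are processed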

In [3]:
PHASE_FILTER = []  # ex: ['phase4'] processes only the listed phase directories; [] processes all phases
NETWORK_LIMIT = None  # ex: 3 keeps only the first 3 .ncol files (sorted by name) of each phase; None keeps all

## run and generate GML

In [4]:
# result[phase][network_name] -> {'critical': [...], 'ordinary': [...], 'redundant': [...]}
result = {}

# visit every phase* directory in name order
phase_dirs = sorted(
    entry for entry in INFERRED_DIR.iterdir()
    if entry.is_dir() and entry.name.startswith('phase')
)
for phase_dir in phase_dirs:
    phase = phase_dir.name
    if PHASE_FILTER and phase not in PHASE_FILTER:
        continue
    phase_result = result.setdefault(phase, {})

    # ncol files of this phase in name order; slicing with None keeps all of them
    ncols = sorted(phase_dir.glob('*.ncol'))[:NETWORK_LIMIT]

    for ncol_path in ncols:
        network_name = ncol_path.stem
        gml_path = GML_ROOT / phase / f'{network_name}.gml'

        # 1) let netctrl write the GML file
        run_netctrl_to_gml(ncol_path, gml_path, DEFAULT_NETCTRL)

        # 2) read the edge categories back from it
        c = parse_gml_edge_classes(gml_path)
        phase_result[network_name] = c

        print(f"{phase}/{network_name}: critical={len(c['critical'])}, ordinary={len(c['ordinary'])}, redundant={len(c['redundant'])}")

phase1/Blastomeres: critical=11, ordinary=861, redundant=398
phase1/Enveloping_Layer: critical=1, ordinary=2651, redundant=59
phase1/Primordial_Germ_cells: critical=5, ordinary=920, redundant=29
phase2/Ectoderm: critical=9, ordinary=769, redundant=352
phase2/Enveloping_Layer: critical=26, ordinary=1094, redundant=623
phase2/Other_Axial_Mesoderm: critical=10, ordinary=796, redundant=451
phase2/Other_Mesendoderm: critical=12, ordinary=651, redundant=180
phase2/Primordial_Germ_cells: critical=13, ordinary=488, redundant=140
phase3/Ectoderm: critical=12, ordinary=960, redundant=266
phase3/Enveloping_Layer: critical=16, ordinary=836, redundant=304
phase3/Notochord: critical=14, ordinary=551, redundant=184
phase3/Other_Mesendoderm: critical=16, ordinary=712, redundant=281
phase3/Paraxial_Mesoderm: critical=12, ordinary=682, redundant=120
phase3/Prechordal_Plate: critical=5, ordinary=546, redundant=106
phase3/Primordial_Germ_cells: critical=14, ordinary=911, redundant=283
phase4/Adaxial_Cells

## save result into JSON

In [5]:
# write the nested phase -> network -> edge-category dictionary as indented UTF-8 JSON
with RESULT_JSON.open('w', encoding='utf-8') as json_file:
    json.dump(result, json_file, ensure_ascii=False, indent=2)

In [6]:
# show the critical TFs of every network as a dataframe
with open(RESULT_JSON, 'r', encoding='utf-8') as f:
    data = json.load(f)

# one row per (phase, network); critical_TF is the comma-joined list of the
# source genes of that network's critical edges ("" when there are none)
rows = [
    {
        "phase": phase,
        "network": network,
        "critical_TF": ",".join(str(pair[0]) for pair in categories.get('critical', [])),
    }
    for phase, nets in data.items()
    for network, categories in nets.items()
]

df = pd.DataFrame(rows).sort_values(["phase", "network"]).reset_index(drop=True)
df


Unnamed: 0,phase,network,critical_TF
0,phase1,Blastomeres,"FOXB1A,FOXD3,FOXD5,HOPX,KLF6A,POU5F3,SI:DKEY-2..."
1,phase1,Enveloping_Layer,FOXD3
2,phase1,Primordial_Germ_cells,"FOXB1A,ID3,RARGA,VED,ZNF503"
3,phase2,Ectoderm,"BLF,EVE1,HER7,ID1,MSX1B,OTX1B,TFAP2C,VOX,ZEB1A"
4,phase2,Enveloping_Layer,"BLF,CABZ01070258.1,CEBPD,CTH1,ETV4,FOXA,FOXD3,..."
5,phase2,Other_Axial_Mesoderm,"CABZ01070258.1,DRL,HER7,HMGB1A,HOPX,MYT1A,NOTO..."
6,phase2,Other_Mesendoderm,"CDX4,E2F7,EVE1,FOSAB,FOXA2,HER5,MSGN1,OSR1,OTX..."
7,phase2,Primordial_Germ_cells,"CDX4,HER3,HER5,HER7,ID2A,ID3,IRX7,MYCN,OTX1B,T..."
8,phase3,Ectoderm,"DLX3B,GATA3,HER5,HMGB1A,IRX1B,OLIG3,OTX2,PAX2A..."
9,phase3,Enveloping_Layer,"ATF3,FOXB1A,GBX1,ID1,ID3,MEIS2B,OTX1A,RARGA,SI..."


In [7]:
# save critical TF
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column — confirm this is intended (index=False would omit it).
df.to_csv(RESULT_CRITICAL_TF)