In [3]:
import pandas as pd
from pathlib import Path

# File locations of the two critical-TF tables that are compared below.
# Type A rows look like: "0,phase1_Blastomeres,FOXB1A,FOXD3,..."
TYPE_A_CSV = Path('../../data/result/critical_TF.csv')          # example path
# Type B has the header ",phase,network,critical_TF" (TFs as one comma-separated string)
TYPE_B_CSV = Path('./critical_TF.csv')                  # example path

# Fail fast with an explicit exception: a bare `assert` is removed when Python
# runs with -O, which would let a missing file slip through to the parsers.
for _csv_path in (TYPE_A_CSV, TYPE_B_CSV):
    if not _csv_path.exists():
        raise FileNotFoundError(f"Not found: {_csv_path}")

def parse_type_a(csv_path: Path):
    """Parse a headerless "type A" CSV into ``{phase: {network: set_of_TFs}}``.

    Each row is read as: column 0 = index, column 1 = a ``"phaseX_network"``
    label, columns 2 onward = one TF name per cell. Rows sharing the same
    label are merged into a single set.

    Args:
        csv_path: Location of the CSV file to read.

    Returns:
        A nested dict mapping phase -> network -> set of TF names. A label
        that contains no underscore is stored under the empty phase ``''``.
    """
    # keep_default_na=False: even with dtype=str pandas converts cells such as
    # "NA", "NULL" or "nan" to NaN, and fillna('') would then silently drop
    # those names from the TF sets.
    df = pd.read_csv(
        csv_path, header=None, dtype=str, keep_default_na=False
    ).fillna('')
    data = {}
    for row in df.itertuples(index=False, name=None):
        label = str(row[1]).strip()
        # Split on the first '_' only: network names may themselves contain
        # underscores (e.g. "Enveloping_Layer").
        phase, sep, network = label.partition('_')
        if not sep:
            # Irregular label without a phase prefix: leave the phase empty.
            phase, network = '', label
        cells = (str(cell).strip() for cell in row[2:])
        tfs = {cell for cell in cells if cell}
        data.setdefault(phase, {}).setdefault(network, set()).update(tfs)
    return data

def parse_type_b(csv_path: Path):
    """Parse a "type B" CSV into ``{phase: {network: set_of_TFs}}``.

    The file must have a header row with the columns ``phase``, ``network``
    and ``critical_TF``. ``critical_TF`` holds the TF names as one
    comma-separated string, optionally wrapped in ``[...]`` with quoted items
    (a stringified list). Rows sharing the same phase and network are merged.

    Args:
        csv_path: Location of the CSV file to read.

    Returns:
        A nested dict mapping phase -> network -> set of TF names.

    Raises:
        KeyError: If one of the three required columns is missing.
    """
    # keep_default_na=False: even with dtype=str pandas converts cells such as
    # "NA", "NULL" or "nan" to NaN, and fillna('') would then erase them.
    df = pd.read_csv(csv_path, dtype=str, keep_default_na=False).fillna('')
    data = {}
    for phase, network, raw in zip(df['phase'], df['network'], df['critical_TF']):
        phase = str(phase).strip()
        network = str(network).strip()
        raw = str(raw).strip()
        if raw.startswith('[') and raw.endswith(']'):
            # The value may have been saved as a stringified list.
            raw = raw.strip('[]')
        tokens = (
            piece.strip().strip('"').strip("'") for piece in raw.split(',')
        )
        # Filter after unquoting so that items such as '' or "" do not end up
        # in the set as an empty TF name.
        tfs = {token for token in tokens if token}
        data.setdefault(phase, {}).setdefault(network, set()).update(tfs)
    return data

# Load both tables as {phase: {network: set_of_TFs}} mappings.
a = parse_type_a(TYPE_A_CSV)  # my approach
b = parse_type_b(TYPE_B_CSV)  # format organised from netctrl (or the format you built)

# Union of every (phase, network) pair that occurs in either mapping.
keys = {
    (p, n)
    for mapping in (a, b)
    for p, networks in mapping.items()
    for n in networks
}

# One summary row per (phase, network): set sizes, overlap and subset flags.
rows = []
for phase, network in sorted(keys):
    # A pair missing from one side is treated as an empty TF set.
    my_set = a.get(phase, {}).get(network, set())
    our_set = b.get(phase, {}).get(network, set())
    shared = my_set & our_set
    my_extra = my_set - our_set
    our_extra = our_set - my_set
    summary = {
        'phase': phase,
        'network': network,
        'my_count': len(my_set),
        'our_count': len(our_set),
        'intersection': len(shared),
        'only_in_my': len(my_extra),
        'only_in_our': len(our_extra),
        'is_my_subset_of_our': my_set <= our_set,
        'is_our_subset_of_my': our_set <= my_set,
    }
    # To see the detailed member lists, uncomment the two lines below.
    # summary['only_in_my_list'] = ','.join(sorted(my_extra))
    # summary['only_in_our_list'] = ','.join(sorted(our_extra))
    rows.append(summary)

cmp_df = pd.DataFrame(rows)
cmp_df = cmp_df.sort_values(by=['phase', 'network']).reset_index(drop=True)
cmp_df


Unnamed: 0,phase,network,my_count,our_count,intersection,only_in_my,only_in_our,is_my_subset_of_our,is_our_subset_of_my
0,phase1,Blastomeres,10,11,10,0,1,True,False
1,phase1,Enveloping_Layer,0,1,0,0,1,True,False
2,phase1,Primordial_Germ_cells,0,5,0,0,5,True,False
3,phase2,Ectoderm,9,9,9,0,0,True,True
4,phase2,Enveloping_Layer,21,26,21,0,5,True,False
5,phase2,Other_Axial_Mesoderm,8,10,8,0,2,True,False
6,phase2,Other_Mesendoderm,7,12,7,0,5,True,False
7,phase2,Primordial_Germ_cells,11,13,11,0,2,True,False
8,phase3,Ectoderm,9,12,9,0,3,True,False
9,phase3,Enveloping_Layer,14,16,14,0,2,True,False
