In [1]:
from pathlib import Path
import re
import pandas as pd

# The two directory trees that are scanned for batch files.
DIR_A, DIR_B = (
    Path("/project/def-nahee/kbas/POM_Response"),
    Path("/project/def-nahee/kbas/POM_Response_GZ"),
)

# Captures the run of digits following "pom_response_batch_" (case-insensitive).
BATCH_RE = re.compile(
    r"pom_response_batch_(\d+)",
    flags=re.IGNORECASE,
)

In [2]:
def scan_batches(root: Path) -> pd.DataFrame:
    """Recursively collect the batch files found under ``root``.

    Every non-directory entry whose name matches the glob
    ``pom_response_batch_*`` and from which ``BATCH_RE`` can extract a numeric
    batch id contributes one row. Several files may share a batch id; all of
    them are kept.

    Parameters
    ----------
    root : Path
        Directory to search, including its sub-directories.

    Returns
    -------
    pandas.DataFrame
        One row per file, sorted by ``batch`` then ``name``, with the columns
        ``batch`` (int), ``file`` (full path as str), ``name``, ``suffixes``
        (all suffixes joined, e.g. ".i3.gz" or ".i3") and ``size_bytes``
        (``None`` when the entry cannot be stat-ed). When nothing matches, the
        frame is empty but still carries these columns, so column access on
        the result never raises ``KeyError``.
    """
    columns = ["batch", "file", "name", "suffixes", "size_bytes"]
    rows = []
    # The glob can be adjusted if desired, e.g. "*.i3*", "*.i3", "*.i3.gz".
    for p in root.rglob("pom_response_batch_*"):
        if p.is_dir():
            continue
        m = BATCH_RE.search(p.name)
        if m is None:
            continue
        try:
            size_bytes = p.stat().st_size
        except OSError:
            # Broken symlink, or the file disappeared between listing and stat().
            size_bytes = None
        rows.append({
            "batch": int(m.group(1)),
            "file": str(p),
            "name": p.name,
            "suffixes": "".join(p.suffixes),   # e.g. ".i3.gz" or ".i3"
            "size_bytes": size_bytes,
        })
    # Passing `columns` keeps the schema stable even when no file matched.
    df = pd.DataFrame(rows, columns=columns)
    if df.empty:
        return df
    # A batch id may map to more than one file -> every file is kept.
    return df.sort_values(["batch", "name"]).reset_index(drop=True)

In [3]:
# Scan both directory trees (DIR_A first, then DIR_B).
dfA, dfB = (scan_batches(scan_root) for scan_root in (DIR_A, DIR_B))

# Distinct batch ids per directory; an empty scan yields an empty set.
setA = set() if dfA.empty else set(dfA["batch"].unique())
setB = set() if dfB.empty else set(dfB["batch"].unique())

# Sorted batch ids present in both directories / only in A / only in B.
both = sorted(setA.intersection(setB))
onlyA = sorted(setA.difference(setB))
onlyB = sorted(setB.difference(setA))

In [4]:
# Report the number of distinct batch ids per directory, then the overlap sizes.
print("=== SUMMARY ===")
for dir_label, dir_path, batch_ids in (
    ("DIR_A:", DIR_A, setA),
    ("DIR_B:", DIR_B, setB),
):
    print(dir_label, dir_path, "unique batches:", len(batch_ids))
for group_label, group in (
    ("Both:", both),
    ("Only in DIR_A:", onlyA),
    ("Only in DIR_B:", onlyB),
):
    print(group_label, len(group))

=== SUMMARY ===
DIR_A: /project/def-nahee/kbas/POM_Response unique batches: 4996
DIR_B: /project/def-nahee/kbas/POM_Response_GZ unique batches: 4420
Both: 4420
Only in DIR_A: 576
Only in DIR_B: 0


In [5]:
# One row per batch id seen in either directory, with a presence flag per directory.
all_batches = sorted(setA.union(setB))
summary = pd.DataFrame({
    "batch": all_batches,
    "in_POM_Response": [batch_id in setA for batch_id in all_batches],
    "in_POM_Response_GZ": [batch_id in setB for batch_id in all_batches],
})

# Files per batch id in each directory (an empty Series when that scan found nothing).
if dfA.empty:
    cntA = pd.Series(dtype=int)
else:
    cntA = dfA.groupby("batch").size()
if dfB.empty:
    cntB = pd.Series(dtype=int)
else:
    cntB = dfB.groupby("batch").size()

# Batch ids absent from a directory map to NaN; store them as 0 files.
for count_column, per_batch in (
    ("n_files_in_POM_Response", cntA),
    ("n_files_in_POM_Response_GZ", cntB),
):
    summary[count_column] = summary["batch"].map(per_batch).fillna(0).astype(int)

# Preview the first 20 batch ids of each group.
print("\nFirst 20 both:", both[:20])
print("First 20 onlyA:", onlyA[:20])
print("First 20 onlyB:", onlyB[:20])

summary



First 20 both: [100, 101, 102, 103, 104, 105, 106, 108, 109, 110, 112, 113, 114, 115, 117, 118, 120, 121, 123, 125]
First 20 onlyA: [506, 507, 508, 510, 512, 513, 514, 516, 518, 520, 521, 522, 524, 526, 527, 528, 529, 530, 531, 532]
First 20 onlyB: []


Unnamed: 0,batch,in_POM_Response,in_POM_Response_GZ,n_files_in_POM_Response,n_files_in_POM_Response_GZ
0,100,True,True,1,1
1,101,True,True,1,1
2,102,True,True,1,1
3,103,True,True,1,1
4,104,True,True,1,1
...,...,...,...,...,...
4991,5584,True,False,1,0
4992,5585,True,False,1,0
4993,5586,True,False,1,0
4994,5587,True,False,1,0
