In [10]:
#!/usr/bin/env python3

import pandas as pd
import glob
import os

# ---- CONFIG ----
GRIST_DIR = "."   # change if needed
OUTPUT_FILE = "grist_summary.csv"
PATTERN = "*.summary.csv"
# Filename suffix removed when the sample id has to be taken from the path.
SUMMARY_SUFFIX = ".summary.csv"
# Column order of the aggregated table. Passing it to the DataFrame
# constructor keeps the header stable and lets the final sort succeed even
# when no input file contributed a row.
SUMMARY_COLUMNS = [
    "sample_id",
    "n_match_genomes",
    "avg_f_covered_bp",
    "avg_effective_coverage",
    "n_mapped_reads",
]
# ----------------


def summarize_grist_frame(df, grist_path):
    """Collapse one per-genome summary table into a single per-sample record.

    Parameters
    ----------
    df : pandas.DataFrame
        Non-empty table read from one summary CSV. It must provide the
        columns ``genome_id``, ``f_covered_bp``, ``effective_coverage`` and
        ``n_mapped_reads``; ``sample_id`` is optional.
    grist_path : str
        Path the table was read from. Only its base name is used, as the
        fallback sample id.

    Returns
    -------
    dict
        One value for each entry of ``SUMMARY_COLUMNS``.

    Raises
    ------
    KeyError
        If one of the required metric columns is missing.
    """
    # Prefer the id stored in the first row; a missing column is treated the
    # same way as a missing value instead of aborting the whole run.
    sample_id = df["sample_id"].iloc[0] if "sample_id" in df.columns else None
    if pd.isna(sample_id):
        sample_id = os.path.basename(grist_path)
        # Strip the suffix only at the end of the name, never in the middle.
        if sample_id.endswith(SUMMARY_SUFFIX):
            sample_id = sample_id[: -len(SUMMARY_SUFFIX)]

    return {
        "sample_id": sample_id,
        "n_match_genomes": df["genome_id"].nunique(),
        "avg_f_covered_bp": df["f_covered_bp"].mean(),
        "avg_effective_coverage": df["effective_coverage"].mean(),
        "n_mapped_reads": df["n_mapped_reads"].sum(),
    }


rows = []

# NOTE: `df` stays bound to the last file read once the loop finishes.
for grist_path in sorted(glob.glob(os.path.join(GRIST_DIR, PATTERN))):
    df = pd.read_csv(grist_path)

    # A file with a header but no data rows has nothing to aggregate.
    if df.empty:
        continue

    summary = summarize_grist_frame(df, grist_path)
    rows.append(summary)

# Explicit columns: with zero rows a bare pd.DataFrame(rows) has no
# "sample_id" column and sort_values would raise KeyError.
summary_df = pd.DataFrame(rows, columns=SUMMARY_COLUMNS).sort_values("sample_id")
summary_df.to_csv(OUTPUT_FILE, index=False)

print(f"Wrote {OUTPUT_FILE} with {len(summary_df)} samples")


Wrote grist_summary.csv with 83 samples


In [11]:
# Show the per-genome rows recorded for one sample.
# NOTE(review): `df` is not loaded here; it is whatever frame the preceding
# aggregation loop read last, so this relies on that file being the one
# that holds SRR975540.
is_target_sample = df["sample_id"].eq("SRR975540")
df.loc[is_target_sample]

Unnamed: 0,index,n_chrom,n_snps,n_genome_bp,n_missed_bp,f_missed_bp,n_covered_bp,f_covered_bp,avg_coverage,effective_coverage,genome_id,sample_id,n_mapped_reads
0,GCF_037901485,2,768,2968938,42728,0.014392,2926210,0.985608,20.295682,20.592036,GCF_037901485,SRR975540,821666
1,GCA_003476345,54,669,2158847,66994,0.031032,2091853,0.968968,36.722827,37.898918,GCA_003476345,SRR975540,1083321
2,GCF_000219535,3,1181,5153859,2527329,0.490376,2626530,0.509624,3.961225,7.772838,GCF_000219535,SRR975540,279064
3,GCF_004376875,76,1121,5175933,3698713,0.714598,1477220,0.285402,0.965632,3.383413,GCF_004376875,SRR975540,69141
4,GCF_000742975,3,394,3048131,2205277,0.723485,842854,0.276515,1.303318,4.71337,GCF_000742975,SRR975540,54377
5,GCF_027662945,22,181,1970717,1697236,0.861228,273481,0.138772,0.656471,4.730563,GCF_027662945,SRR975540,17931
6,GCA_020093245,9,479,3384012,3067810,0.90656,316202,0.09344,0.248734,2.661963,GCA_020093245,SRR975540,11564
7,GCF_025122195,22,405,1890550,1677193,0.887146,213357,0.112854,0.47478,4.20701,GCF_025122195,SRR975540,12324


In [12]:
# Read back the aggregated table through OUTPUT_FILE so this cell keeps
# following the config if the output path is changed.
# NOTE: this rebinds `summary`, which the aggregation loop used for its
# per-file dict; from here on it is the DataFrame.
summary = pd.read_csv(OUTPUT_FILE)
summary

Unnamed: 0,sample_id,n_match_genomes,avg_f_covered_bp,avg_effective_coverage,n_mapped_reads
0,DRR014782,735,0.283430,5.906585,75612507
1,DRR014785,892,0.272454,5.303160,83163638
2,DRR121397,234,0.131079,1.655980,2121643
3,DRR121401,175,0.121921,1.409490,1311236
4,ERR10695318,57,0.265579,126.762150,123175020
...,...,...,...,...,...
78,SRR7299214,351,0.276998,3.056625,6705137
79,SRR7610133,13,0.817984,18.349528,3058996
80,SRR9109404,90,0.415485,15.212256,25075735
81,SRR9182859,739,0.317031,17.608389,104629074
