In [1]:
import pandas as pd
import glob

In [5]:
# Columns to keep from every per-genome result table.
cols = [
    "contig", "gene_id", "position",
    "A_frequency_p_value_tTest_abs", "A_frequency_p_value_MannWhitney_abs",
    "T_frequency_p_value_tTest_abs", "T_frequency_p_value_MannWhitney_abs",
    "G_frequency_p_value_tTest_abs", "G_frequency_p_value_MannWhitney_abs",
    "C_frequency_p_value_tTest_abs", "C_frequency_p_value_MannWhitney_abs",
    "num_samples_UC", "num_samples_control", "notes"
]

input_suffix = ".tsv.gz"
subset_suffix = "_subset.tsv.gz"

# Find the raw input tables. The subset files written below also end in
# ".tsv.gz" and live in the same directory, so they must be excluded here;
# otherwise re-running this cell would re-subset them and every row would be
# duplicated in the merged table. Sorting makes the merged row order
# reproducible (glob returns files in arbitrary order).
files = sorted(
    f
    for f in glob.glob("two_sample_unpaired_initial_second-UC_control/*.tsv.gz")
    if not f.endswith(subset_suffix)
)
if not files:
    # Fail with a clear message instead of the opaque error pd.concat raises
    # on an empty list.
    raise FileNotFoundError(
        "No input *.tsv.gz files found in "
        "two_sample_unpaired_initial_second-UC_control/"
    )

subset_files = []
dfs = []

for f in files:
    print("Reading:", f)
    # usecols avoids loading columns that are dropped anyway; the trailing
    # [cols] restores the requested column order, which usecols alone does
    # not guarantee.
    df_subset = pd.read_csv(f, sep="\t", compression="gzip", usecols=cols)[cols]

    # Swap only the trailing extension; str.replace would also rewrite any
    # earlier ".tsv.gz" occurrence in the path.
    out = f[: -len(input_suffix)] + subset_suffix
    df_subset.to_csv(out, sep="\t", compression="gzip", index=False)
    subset_files.append(out)
    dfs.append(df_subset)

    print("Wrote:", out)

# --- concatenate all subset files ---
print("\nConcatenating all subset files...")

# The subset frames are still in memory, so there is no need to read the
# files that were just written back from disk.
merged = pd.concat(dfs, ignore_index=True)

merged.to_csv("all_subsets_merged.tsv.gz", sep="\t", compression="gzip", index=False)

print("Done! → all_subsets_merged.tsv.gz")

Reading: two_sample_unpaired_initial_second-UC_control/MGYG000002545_two_sample_unpaired.tsv.gz
Wrote: two_sample_unpaired_initial_second-UC_control/MGYG000002545_two_sample_unpaired_subset.tsv.gz
Reading: two_sample_unpaired_initial_second-UC_control/MGYG000002549_two_sample_unpaired.tsv.gz
Wrote: two_sample_unpaired_initial_second-UC_control/MGYG000002549_two_sample_unpaired_subset.tsv.gz
Reading: two_sample_unpaired_initial_second-UC_control/MGYG000001378_two_sample_unpaired.tsv.gz
Wrote: two_sample_unpaired_initial_second-UC_control/MGYG000001378_two_sample_unpaired_subset.tsv.gz
Reading: two_sample_unpaired_initial_second-UC_control/MGYG000001346_two_sample_unpaired.tsv.gz
Wrote: two_sample_unpaired_initial_second-UC_control/MGYG000001346_two_sample_unpaired_subset.tsv.gz
Reading: two_sample_unpaired_initial_second-UC_control/MGYG000000196_two_sample_unpaired.tsv.gz
Wrote: two_sample_unpaired_initial_second-UC_control/MGYG000000196_two_sample_unpaired_subset.tsv.gz
Reading: two_sa

In [9]:
from statsmodels.stats.multitest import multipletests
import pandas as pd

# Load the merged table written by the subset/merge step.
df = pd.read_csv("all_subsets_merged.tsv.gz", sep="\t", compression="gzip")

# Raw p-value columns that each receive their own Benjamini-Hochberg correction.
pval_cols = [
    "A_frequency_p_value_tTest_abs",
    "A_frequency_p_value_MannWhitney_abs",
    "T_frequency_p_value_tTest_abs",
    "T_frequency_p_value_MannWhitney_abs",
    "G_frequency_p_value_tTest_abs",
    "G_frequency_p_value_MannWhitney_abs",
    "C_frequency_p_value_tTest_abs",
    "C_frequency_p_value_MannWhitney_abs",
]
pvals = df[pval_cols]


def _fdr_bh_ignore_nan(col):
    """Return BH-adjusted p-values for one column, leaving missing values as NaN.

    multipletests has no NaN handling: missing p-values are counted as tests
    and propagate through the step-up cumulative minimum, which corrupts the
    adjusted values of the whole column. Only the non-missing entries are
    therefore passed to it, and rows without a p-value stay NaN.
    """
    adjusted = pd.Series(float("nan"), index=col.index, dtype=float)
    tested = col.notna()
    if tested.any():
        adjusted.loc[tested] = multipletests(
            col[tested].to_numpy(), method="fdr_bh"
        )[1]
    return adjusted


# Each column is corrected independently (one family of tests per column).
corrected = pvals.apply(_fdr_bh_ignore_nan)

corrected.columns = [c + "_FDR" for c in corrected.columns]
df_corrected = pd.concat([df, corrected], axis=1)

df_corrected.to_csv("all_subsets_with_FDR.tsv.gz", sep="\t", compression="gzip", index=False)

In [11]:
# Also write an uncompressed copy of the FDR-annotated table.
uncompressed_path = "all_subsets_with_FDR.tsv"
df_corrected.to_csv(uncompressed_path, index=False, sep="\t")