# In [28]:
import pandas as pd
import pathlib as pl
import collections as col

# Root directory of the per-sample inputs: the karyotype table and the
# per-assembler folder of "*tsv.gz" tables are both resolved relative to it.
top_folder = pl.Path("/home/ebertp/work/projects/hgsvc/2024_busco/per_sample")


def read_karyo_file(file_path):
    """Load a tab-separated karyotype table into a lookup dictionary.

    The first line of the input is skipped and the line after it is used as
    the column header. The table must provide the columns ``sample``,
    ``asm_unit`` and ``karyotype``.

    Args:
        file_path: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        dict mapping ``(sample, asm_unit)`` to ``karyotype``. If a key occurs
        in several rows, the last row wins.
    """
    table = pd.read_csv(file_path, sep="\t", header=0, skiprows=1)
    return {
        (record.sample, record.asm_unit): record.karyotype
        for record in table.itertuples(index=False)
    }


# Genes grouped by the sex they are judged specific to; the sets are shared by
# every assembler handled in the loop below.
sex_specific = col.defaultdict(set)
# For a gene missing on one sex's haplotype, the sex it may be specific to.
opposite_sex = {"male": "female", "female": "male"}

for assembler in ["verkko"]:  # "hifiasm" is currently switched off
    karyo_file = top_folder / f"karyo-est.hgsvc3-{assembler}.tsv"
    karyo_lut = read_karyo_file(karyo_file)

    # n_samples counts only the samples that pass the karyotype filter;
    # gene_status counts (gene, haplotype sex, status label) observations.
    n_samples = 0
    gene_status = col.Counter()
    table_dir = top_folder / assembler
    for table_file in table_dir.glob("*tsv.gz"):
        # sample name: file name with up to five trailing dot-separated
        # parts cut off
        sample = table_file.name.rsplit(".", 5)[0]
        df = pd.read_csv(table_file, sep="\t", header=0)
        if assembler == "verkko":
            df = df.drop(columns="asm-unassigned_label")

        # The text before the first "_" of the 2nd/3rd column name is the
        # assembly unit used as key in the karyotype lookup.
        sex1 = karyo_lut[sample, df.columns[1].partition("_")[0]]
        sex2 = karyo_lut[sample, df.columns[2].partition("_")[0]]
        if sex1 == sex2 or "any" in (sex1, sex2):
            # Skip samples whose two haplotypes share a karyotype or where
            # either one is labelled "any".
            continue
        n_samples += 1

        # Keep only genes reported as "Missing" on at least one haplotype.
        is_missing = df["asm-hap1_label"].eq("Missing") | df["asm-hap2_label"].eq(
            "Missing"
        )
        df = df.loc[is_missing, :].copy()
        # Relabel the status columns by karyotype so they can be read by sex.
        df.columns = ["gene", sex1, sex2]
        for row in df.itertuples():
            gene_status.update(
                (row.gene, hap_sex, getattr(row, hap_sex)) for hap_sex in (sex1, sex2)
            )

    assert n_samples == 30
    for (gene, sex, status), count in gene_status.items():
        if status != "Missing" or sex not in opposite_sex:
            continue
        other = opposite_sex[sex]
        # Ratios are rounded to one decimal before the comparison, so the
        # thresholds effectively demand a rounded value of 1.0 (missing on
        # this sex) and 0.0 (missing on the other sex).
        own_ratio = round(count / n_samples, 1)
        other_ratio = round(gene_status[(gene, other, "Missing")] / n_samples, 1)
        if own_ratio > 0.9 and other_ratio < 0.1:
            # (Almost) always missing here and (almost) never missing on the
            # other sex: record the gene as specific to the other sex.
            sex_specific[other].add(gene)

# Template path for the gene lists; the per-sex file names are derived from it.
out_list = pl.Path(
    "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/annotations",
    "autogen",
    "odb10_primates.sex-specific-genes.txt",
)

# One file per sex, holding the sorted gene names, one per line.
for sex, genes in sex_specific.items():
    # with_suffix swaps the trailing ".txt" for ".<sex>.txt"
    out_file = out_list.with_suffix(f".{sex}.txt")
    listing = "\n".join(sorted(genes)) + "\n"
    _ = out_file.write_text(listing)

                
                
                
            
            
    