In [4]:
import pandas as pd
import pathlib as pl
import collections as col

import matplotlib as mpl
import matplotlib.pyplot as plt

input_folder = pl.Path("/home/ebertp/work/projects/hgsvc/2023_assm_gene/asmgene_raw")

dups = col.Counter()
dups_iv = dict()
misses = col.Counter()
misses_iv = dict()
total_haps = 0
for txt_file in input_folder.glob("*asm-hap*.txt"):
    
    with open(txt_file, "r") as listing:
        total_haps += 1
        for line in listing:
            parts = line.strip().split()
            if parts[0] in ["0", "M", "D"]:
                
                bucket = dups if parts[0] == "D" else misses
                regions = dups_iv if parts[0] == "D" else misses_iv
                
                id_parts = parts[2].strip("|").split("|")
                biotype = id_parts[-1]
                length = int(id_parts[-2])
                gene_name = id_parts[-3]
                if gene_name.startswith("MT"):
                    continue
                if parts[-3] in ["chrX", "chrY", "chrM", "chrMT"]:
                    continue
                region = parts[-3], int(parts[-2]), int(parts[-1]), gene_name, length
                if "protein_coding" in biotype:
                    bucket[gene_name] += 1
                    regions[gene_name] = region
                elif ("TR_" in biotype or "IG_" in biotype) and "pseudogene" not in biotype:
                    bucket[gene_name] += 1
                    regions[gene_name] = region
                else:
                    bucket["other"] += 1                
            else:
                continue

bar_heights = []
bar_labels = []

with open("missing_genes.tsv", "w") as dump:
    for name in sorted(misses.keys()):
        dump.write(f"{name}\t{misses[name]}\n")
    
raise
boxes_by_chrom = col.defaultdict(list)
for gene_name, missed_haps in misses.most_common():
    if gene_name == "other":
        continue
    pct_missed_haps = round(missed_haps / total_haps * 100, 2)
    region = misses_iv[gene_name]
    boxes_by_chrom[region[0]].append(pct_missed_haps)
    
    bar_heights.append(pct_missed_haps)
    bar_labels.append(gene_name)
    
fig, ax = plt.subplots(figsize=(12,8))

_ = ax.bar(
    list(range(1, len(bar_heights)+1)),
    bar_heights,
    align="center",
    width=1
)

ax.set_xlabel("Distinct coding genes (unlabeled)", fontsize=14)
ax.set_ylabel("Missing in haplotype assembly (%)", fontsize=14)

fig, ax = plt.subplots(figsize=(12,8))

_ = ax.bar(
    list(range(1, 51)),
    bar_heights[:50],
    align="center",
    width=1
)

ax.set_xticks(list(range(1, 51)))
ax.set_xticklabels(bar_labels[:50], rotation=90, fontsize=12)

ax.set_xlabel("Distinct coding genes (first 50)", fontsize=14)
ax.set_ylabel("Missing in haplotype assembly (%)", fontsize=14)

fig, ax = plt.subplots(figsize=(12,8))

boxes = [[]] * 22
box_labels = [[]] * 22
for chrom, values in boxes_by_chrom.items():
    idx = int(chrom.strip("chr")) - 1
    boxes[idx] = values
    
    pct_of_genes = round(len(values) / len(bar_heights) * 100, 1)
    box_label = f"{pct_of_genes}% - {chrom.strip('chr')}"
    box_labels[idx] = box_label

_ = ax.boxplot(
    boxes,
    positions=list(range(1, 23)),
)

ax.set_xticks(list(range(1, 23)))
ax.set_xticklabels(box_labels, rotation=90)

ax.set_xlabel("Missing genes per chromosome (% of all distinct missing)", fontsize=14)
ax.set_ylabel("Missing in haplotype assembly (%)", fontsize=14)


RuntimeError: No active exception to reraise