In [1]:
import csv

import pandas as pd

In [15]:
def get_ginfo(s):
    """Extract the gene name and gene biotype from an attributes string.

    The string is read as ``;``-separated ``key=value`` pairs. Only the
    ``gene_name`` and ``gene_biotype`` keys are consulted; when a key occurs
    more than once, the last occurrence wins.

    Parameters
    ----------
    s : str
        Attributes text, e.g. ``"ID=g1;gene_name=ABC;gene_biotype=lncRNA"``.

    Returns
    -------
    tuple
        ``(gene_name, gene_biotype)``. An element is ``None`` when its key
        does not appear in ``s``.
    """
    gname = None
    gtype = None
    for field in s.strip().split(';'):
        # Split on the FIRST '=' only, so a value that itself contains '='
        # is kept whole instead of the entire pair being thrown away.
        key, sep, value = field.strip().partition('=')
        if not sep:
            # No '=' at all, e.g. the empty piece left by a trailing ';'.
            continue
        if key == 'gene_name':
            gname = value
        elif key == 'gene_biotype':
            gtype = value
    return gname, gtype

def load_gan(fn):
    """Count ``gene`` records per gene name and collect each gene's biotype.

    ``fn`` is read as a tab-separated table without a header row and must
    have exactly nine columns (assigning the column names fails otherwise).
    Only rows whose ``type`` column equals ``'gene'`` are used.

    Parameters
    ----------
    fn : str or path-like
        Path of the annotation file to read.

    Returns
    -------
    tuple of (dict, dict) or None
        ``(n_genes, gtype_tbl)`` where ``n_genes`` maps a gene name to the
        number of ``gene`` rows carrying it and ``gtype_tbl`` maps a gene
        name to its biotype. ``None`` is returned, after printing the
        offending row, as soon as a ``gene`` row lacks a non-empty
        ``gene_name`` or ``gene_biotype``; callers that unpack the result
        directly will get a ``TypeError`` in that case.

    Raises
    ------
    AssertionError
        If one gene name is seen with two different biotypes.
    """
    hdr = ['ctg', 'src', 'type', 'start', 'end', 'score', 'strand', 'frame', 'attributes']
    # NOTE(review): with comment='#', pandas also stops parsing a data line at
    # the first '#' inside it, so an attributes field containing '#' would be
    # truncated. Confirm the input never has '#' outside header lines.
    df = pd.read_csv(fn, sep='\t', header=None, comment='#')
    df.columns = hdr

    # Select the gene records up front so the Python-level loop only visits
    # the rows it needs instead of every record in the file.
    gene_rows = df[df['type'] == 'gene']

    n_genes = dict()
    gtype_tbl = dict()
    for _, row in gene_rows.iterrows():
        gname, gtype = get_ginfo(row['attributes'])
        if not gtype or not gname:
            print(f'error while loading line\n{row}\n')
            return None
        n_genes[gname] = n_genes.get(gname, 0) + 1
        # The first biotype seen for a name is recorded; later rows must agree.
        known_gtype = gtype_tbl.setdefault(gname, gtype)
        assert known_gtype == gtype, (
            f'conflicting gene_biotype for {gname}: {known_gtype} vs {gtype}'
        )
    return n_genes, gtype_tbl

def write_gtype2file(fn, gtype_tbl) -> None:
    """Write a gene-name -> biotype mapping to ``fn`` as a two-column CSV.

    The file starts with the header ``gene_name,gene_biotype`` and then has
    one row per entry of ``gtype_tbl``, in the mapping's iteration order.
    An existing file at ``fn`` is overwritten.

    Parameters
    ----------
    fn : str or path-like
        Destination path.
    gtype_tbl : dict
        Mapping of gene name to gene biotype.
    """
    with open(fn, 'w') as fh:
        # csv.writer quotes a field that contains a comma or a quote, so such
        # a name no longer yields a row with the wrong number of columns.
        # lineterminator='\n' keeps plain rows identical to the old output.
        writer = csv.writer(fh, lineterminator='\n')
        writer.writerow(['gene_name', 'gene_biotype'])
        writer.writerows(gtype_tbl.items())

def write_n_genes2file(fn, n_genes) -> None:
    """Write a gene-name -> count mapping to ``fn`` as a two-column CSV.

    The file starts with the header ``gene_name,n`` and then has one row per
    entry of ``n_genes``, in the mapping's iteration order. An existing file
    at ``fn`` is overwritten.

    Parameters
    ----------
    fn : str or path-like
        Destination path.
    n_genes : dict
        Mapping of gene name to its count.
    """
    with open(fn, 'w') as fh:
        # csv.writer quotes a field that contains a comma or a quote, so such
        # a name no longer yields a row with the wrong number of columns.
        # lineterminator='\n' keeps plain rows identical to the old output.
        writer = csv.writer(fh, lineterminator='\n')
        writer.writerow(['gene_name', 'n'])
        writer.writerows(n_genes.items())

def count_mat_and_pat(mat_n_genes, pat_n_genes) -> dict:
    """Merge two gene-count tables, keeping the larger count for each name.

    A name missing from one table counts as 0 there. The result lists the
    names of ``mat_n_genes`` first, in their original order, followed by the
    names found only in ``pat_n_genes``. This makes the order (and anything
    written out from it) reproducible between runs, which iterating over a
    set of strings is not.

    Parameters
    ----------
    mat_n_genes : dict
        Gene name -> count for the first ("mat") table.
    pat_n_genes : dict
        Gene name -> count for the second ("pat") table.

    Returns
    -------
    dict
        Gene name -> ``max(mat count, pat count)`` over the union of names.
    """
    n_genes = {
        gname: max(mat_n, pat_n_genes.get(gname, 0))
        for gname, mat_n in mat_n_genes.items()
    }
    for gname, pat_n in pat_n_genes.items():
        if gname not in mat_n_genes:
            # Absent from the first table, so its count there is taken as 0.
            n_genes[gname] = max(0, pat_n)
    return n_genes

In [4]:
# Parse the "mat_loff" annotation into a gene-name -> count table and a
# gene-name -> biotype table. load_gan returns None when a gene record is
# missing its name or biotype, in which case this unpacking raises TypeError.
mat_n_genes, mat_gtype_tbl = load_gan("/ccb/salz4-3/hji20/hg002-q100-annotation/results/6_add_copies/mat_loff/fmted.modified.gff")

In [6]:
# Save both "mat" tables as CSV files in the same directory as the input GFF.
write_gtype2file("/ccb/salz4-3/hji20/hg002-q100-annotation/results/6_add_copies/mat_loff/gtypes.csv", mat_gtype_tbl)
write_n_genes2file("/ccb/salz4-3/hji20/hg002-q100-annotation/results/6_add_copies/mat_loff/n_genes.csv", mat_n_genes)

In [7]:
# Parse the "pat_loff" annotation into a gene-name -> count table and a
# gene-name -> biotype table. load_gan returns None when a gene record is
# missing its name or biotype, in which case this unpacking raises TypeError.
pat_n_genes, pat_gtype_tbl = load_gan("/ccb/salz4-3/hji20/hg002-q100-annotation/results/6_add_copies/pat_loff/fmted.modified.gff")

In [8]:
# Save both "pat" tables as CSV files in the same directory as the input GFF.
write_gtype2file("/ccb/salz4-3/hji20/hg002-q100-annotation/results/6_add_copies/pat_loff/gtypes.csv", pat_gtype_tbl)
write_n_genes2file("/ccb/salz4-3/hji20/hg002-q100-annotation/results/6_add_copies/pat_loff/n_genes.csv", pat_n_genes)

In [None]:
# Merge the two count tables: every gene name present in either one is kept
# with the larger of its "mat" and "pat" counts.
union_n_genes = count_mat_and_pat(mat_n_genes, pat_n_genes)

In [14]:
# Total number of genes in the merged table (sum of the per-name counts).
sum(union_n_genes.values())

60964

In [16]:
# Roll the merged per-gene counts up into one total per gene biotype.
n_genes_by_gtype = dict()
for gname in union_n_genes:
    # The "mat" table is consulted first; the "pat" table is the fallback
    # (None when the name is in neither table).
    gtype = mat_gtype_tbl[gname] if gname in mat_gtype_tbl else pat_gtype_tbl.get(gname)
    if not gtype:
        print(f"error while looking up gene biotype for gene {gname}")
    # The count is added even after the message above, under whatever
    # (falsy) biotype key was found.
    n_genes_by_gtype[gtype] = n_genes_by_gtype.get(gtype, 0) + union_n_genes[gname]

In [18]:
# Write the per-biotype totals as a two-column CSV (header, then one row
# per biotype in the table's iteration order).
fn = "/ccb/salz4-3/hji20/hg002-q100-annotation/results/6_add_copies/mat+pat_n_genes_by_gtype.csv"
with open(fn, 'w') as fh:
    fh.write('gene_biotype,n\n')
    for gtype, gtype_total in n_genes_by_gtype.items():
        fh.write(f'{gtype},{gtype_total}\n')