In [11]:
import pandas as pd
from lcastar import LcaStar, Lineage
from local.utils import regex

In [12]:
# Load the tab-separated hit table (no header row). Each row is unpacked below
# as (query, subject, annotation, pident, bitscore).
_df = pd.read_csv("../../data/beaver_nr.tsv", header=None, sep="\t")
print(_df.shape)

hits = {}  # query -> list of (subject, pident, bitscore)
meta = {}  # subject -> annotation from the first row that mentions the subject

# itertuples(name=None) yields plain tuples, avoiding the per-row Series that
# iterrows() builds, and it does not bind `_` (IPython's last-output variable).
for query, subject, annotation, pident, bitscore in _df.itertuples(index=False, name=None):
    # Append in place rather than rebuilding the whole list for every hit.
    hits.setdefault(query, []).append((subject, pident, bitscore))
    # Only the first annotation seen for a subject is kept.
    meta.setdefault(subject, annotation)

# Order each query's hits from highest to lowest bitscore. list.sort is stable
# (also with reverse=True), so tied hits keep their order from the file.
for ranked in hits.values():
    ranked.sort(key=lambda hit: hit[2], reverse=True)

len(hits), len(meta)

(102312, 5)


(4263, 90728)

In [50]:
# Build one row per query from its first (top-ranked) hit:
# (query, taxon name parsed from the annotation, pident, bitscore).
_rows = []
for q, ranked_hits in hits.items():
    # The first entry is the best hit (lists were sorted by bitscore earlier).
    subject, pident, bitscore = ranked_hits[0]

    ann = meta[subject]
    # The annotation is searched in reverse, so the first "]...[" match is the
    # LAST bracketed group of the original text.
    # NOTE(review): `regex` is a project helper; it is used here as an iterator
    # of matched substrings -- confirm against local.utils.
    reversed_match = next(regex(r"\].+?\[", ann[::-1]))
    # Trim the enclosing bracket characters, then restore reading order.
    tax = reversed_match[1:-1][::-1]
    # Remove any bracket characters left inside the extracted name.
    tax = tax.replace("[", "").replace("]", "")

    _rows.append((q, tax, pident, bitscore))

_df2 = pd.DataFrame(_rows, columns=["query", "tax", "pident", "bitscore"])
print(_df2.shape)
_df2.head()

(4263, 4)


Unnamed: 0,query,tax,pident,bitscore
0,CEC_513.1,Muribaculaceae bacterium,55.4,455.0
1,CEC_513.2,Muribaculaceae bacterium,63.5,338.0
2,CEC_513.3,Muribaculaceae bacterium,96.0,439.0
3,CEC_513.4,Muribaculaceae bacterium,63.5,1073.0
4,CEC_513.5,Muribaculaceae bacterium,67.4,424.0


In [52]:
from lcastar.algorithm import ResultNode

# One LcaStar tree per entry, where the entry ID is the part of the query ID
# before the first "." (e.g. "CEC_513.1" -> "CEC_513").
trees: dict[str, LcaStar] = {}
ranks = {}
for query, sci_name in zip(_df2["query"], _df2["tax"]):
    entry = query.split(".")[0]
    lineage = Lineage.FromSciName(sci_name)
    # Stop immediately when a scientific name cannot be resolved to a lineage.
    assert lineage is not None, (lineage, sci_name)
    tree = trees.get(entry)
    if tree is None:
        tree = trees[entry] = LcaStar()
    # Each top-hit taxon is recorded as one observation on the entry's tree.
    tree.NewObservation(lineage)

# Best lineage per entry, as returned by LcaStar.BestLineage().
# For ad-hoc inspection, the nodes expose: level, name, cumulative_votes,
# fraction_votes, entropy and p_value.
lineages: dict[str, list[ResultNode]] = {
    fosmid: tree.BestLineage() for fosmid, tree in trees.items()
}

In [57]:
import json
from pathlib import Path

# Ranks reported in the summary table, in the order their columns appear.
tax_levels = "superkingdom, phylum, class, order, family, genus, species".split(", ")


def _per_level(nodes, getter):
    """Apply `getter` to every node, keeping None for ranks missing from a lineage."""
    return [getter(n) if n is not None else None for n in nodes]


# One row per entry: the ID, then for every rank its name, p-value, cumulative
# votes, vote fraction and entropy, and finally the full lineage as compact JSON.
_rows = []
for k, lin in lineages.items():
    # Index the lineage by rank; ranks outside tax_levels are only kept in "raw".
    by_level = {n.level: n for n in lin}
    nodes = [by_level.get(l) for l in tax_levels]

    tax = _per_level(nodes, lambda n: n.name)
    pvalues = _per_level(nodes, lambda n: round(n.p_value, 5))
    votes = _per_level(nodes, lambda n: n.cumulative_votes)
    frac_votes = _per_level(nodes, lambda n: round(n.fraction_votes, 3))
    entropy = _per_level(nodes, lambda n: round(n.entropy, 3))

    raw = json.dumps([n.__dict__ for n in lin], separators=(",", ":"))
    entry = [k] + tax + pvalues + votes + frac_votes + entropy + [raw]
    _rows.append(entry)

# Column order must mirror the row layout above: the statistic is the outer
# loop and the rank the inner loop (all names, then all p-values, ...).
stats = [None, "pvalue", "votes", "frac_votes", "entropy"]
tax_cols = [f"{l}_{s}" if s is not None else l for s in stats for l in tax_levels]
cols = ["fosmid"] + tax_cols + ["raw"]
df = pd.DataFrame(_rows, columns=cols)
print(df.shape)

# Create the cache directory if needed so the export also works on a fresh checkout.
out_path = Path("./cache/beaver_nr_lca_star.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)
df.head()

(166, 37)


Unnamed: 0,fosmid,superkingdom,phylum,class,order,family,genus,species,superkingdom_pvalue,phylum_pvalue,...,genus_frac_votes,species_frac_votes,superkingdom_entropy,phylum_entropy,class_entropy,order_entropy,family_entropy,genus_entropy,species_entropy,raw
0,CEC_513,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Muribaculaceae,,Muribaculaceae bacterium,0.0,0.0,...,,0.743,-0.452,-0.452,-0.452,-0.452,-0.14,,-0.096,"[{""name"":""Bacteria"",""level"":""superkingdom"",""en..."
1,CEC_2113,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Muribaculaceae,,Muribaculaceae bacterium,0.0001,0.00073,...,,0.64,-0.627,-0.571,-0.571,-0.571,-0.292,,-0.124,"[{""name"":""Bacteria"",""level"":""superkingdom"",""en..."
2,CEC_709,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Muribaculaceae,,Muribaculaceae bacterium,4e-05,0.00029,...,,0.607,-0.7,-0.648,-0.648,-0.648,-0.287,,-0.132,"[{""name"":""Bacteria"",""level"":""superkingdom"",""en..."
3,CEC_657,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Muribaculaceae,,Muribaculaceae bacterium,0.00138,0.00138,...,,0.294,-0.677,-0.677,-0.677,-0.677,-0.301,,-0.156,"[{""name"":""Bacteria"",""level"":""superkingdom"",""en..."
4,CEC_3600,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Rikenellaceae,Alistipes,Alistipes timonensis,0.00014,0.00014,...,0.042,0.042,-0.15,-0.15,-0.15,-0.15,-0.058,-0.058,-0.058,"[{""name"":""Bacteria"",""level"":""superkingdom"",""en..."
