In [16]:
import pandas as pd
import numpy as np
import pathlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
from typing import List, Union, Optional, Callable
import pickle
from Bio import AlignIO, SeqIO
from ete3 import Tree, TreeNode
from gctree import CollapsedTree

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import umap
from ete3 import Tree, faces, TreeStyle, NodeStyle, TextFace, SequenceFace, COLOR_SCHEMES
from GCTree_preparation import *
import warnings
warnings.filterwarnings("ignore")

# Root folder that holds every dataset.
# NOTE(review): machine-specific absolute path; adjust it when running on another host.
path_to_storage = "/media/hieunguyen/HNSD01/storage/all_BSimons_datasets"

PROJECT = "220701_etc_biopsies"

output_type = "mouse_based_output"

path_to_trees = os.path.join(path_to_storage, PROJECT, "GCtrees/v0.2", output_type)

# Keep only the entries under path_to_trees that contain
# 02_dnapars/gctree.out.inference.1.nk. The result is sorted because glob
# yields entries in filesystem order, which would otherwise make the
# processing order (and the row order of anything built from it) vary
# between runs and machines.
all_tree_folder = sorted(
    item
    for item in pathlib.Path(path_to_trees).glob("*")
    if os.path.isfile(f"{str(item)}/02_dnapars/gctree.out.inference.1.nk")
)

# Every matching newick file two directory levels below path_to_trees, sorted
# for the same reproducibility reason.
all_nk_files = sorted(pathlib.Path(path_to_trees).glob("*/*/*gctree.out.inference.1.nk"))
print(f"Number of trees: {len(all_tree_folder)}")

Number of trees: 7618


In [None]:
# Loop through all trees and collect one summary record per clone folder.
# Records are accumulated in a plain list and converted to a DataFrame once
# after the loop: concatenating inside the loop re-copies every previously
# collected row on each iteration, which is quadratic in the number of trees.
records = []
for treedir in tqdm(all_tree_folder):
    cloneid = treedir.name
    treedir = str(treedir)

    # The first four "_"-separated fields of the folder name are read as
    # mouse ID, V gene, J gene and CDR3 length (kept as strings).
    name_fields = cloneid.split("_")
    mouseID = name_fields[0]
    V_gene = name_fields[1]
    J_gene = name_fields[2]
    CDR3_len = name_fields[3]

    nk_path = f"{treedir}/02_dnapars/gctree.out.inference.1.nk"
    ab_dict_path = os.path.join(treedir, "01_deduplicate", f"{cloneid}.abundance.csv")
    abund_df = pd.read_csv(ab_dict_path, index_col=0, names=['val'])
    # Mapping from the CSV's index column to its 'val' column.
    ab_dict = abund_df['val'].to_dict()
    tree_path = treedir
    tree = Tree(newick=nk_path, format=1)
    # Annotate every node with its abundance; names absent from the CSV get 0.
    for node in tree.traverse():
        node.add_feature('abundance', ab_dict.get(node.name, 0))
    treeobj = GCtree(tree = tree, path = tree_path)

    num_nodes = len(treeobj.nodes)
    num_leaves = len(treeobj.leaves)
    num_internal_nodes = len(treeobj.internal_nodes)
    num_passthrough_nodes = len(treeobj.passthrough_nodes)
    num_split_nodes = len(treeobj.split_nodes)
    num_observed_nodes = len(treeobj.observed_nodes)
    num_inferred_nodes = len(treeobj.inferred_nodes)

    path_to_orig_fasta= f"{treedir}/01_deduplicate/{cloneid}.fasta"
    input_idmaps = f"{treedir}/01_deduplicate/{cloneid}.id_map_seq.csv"
    # The first line of the id-map file is skipped and the columns are named here.
    idmapseqdf = pd.read_csv(input_idmaps, skiprows=1, header=None)
    path_to_gctree_inference = nk_path

    idmapseqdf.columns = ["seqid", "seq"]
    with open(path_to_orig_fasta) as fasta_file:  # handle is closed on exit
        identifiers = []
        seqs = []
        for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # (generator)
            identifiers.append(seq_record.id)
            seqs.append(str(seq_record.seq))

    seqdf = pd.DataFrame(data = identifiers, columns = ["ID"])
    seqdf["seq"] = seqs
    # Explicit copy so the column assignments below write to an independent
    # frame instead of a filtered slice (avoids SettingWithCopyWarning).
    seqdf = seqdf[seqdf["ID"] != "GL"].copy()
    # Record IDs are "|"-separated: the last field carries "Abundance:<int>",
    # the first field carries "Sample:<MID>".
    seqdf["abundance"] = seqdf["ID"].apply(lambda x: int(x.split("|")[-1].replace("Abundance:", "")))
    seqdf["MID"] = seqdf["ID"].apply(lambda x: str(x.split("|")[0].replace("Sample:", "")))
    seqdf_orig = seqdf.copy()
    # Total abundance per distinct sequence, then attach the seqid from the id map.
    seqdf = seqdf.groupby("seq")["abundance"].sum().reset_index()

    seqdf = seqdf.merge(idmapseqdf, on = "seq")

    records.append(
        {
            "cloneid": cloneid,
            "mouseID": mouseID,
            "V_gene": V_gene,
            "J_gene": J_gene,
            "CDR3_len": CDR3_len,
            "num_nodes": num_nodes,
            "num_leaves": num_leaves,
            "num_internal_nodes": num_internal_nodes,
            "num_passthrough_nodes": num_passthrough_nodes,
            "num_split_nodes": num_split_nodes,
            "num_observed_nodes": num_observed_nodes,
            "num_inferred_nodes": num_inferred_nodes,
            "num_MID": seqdf_orig["MID"].nunique(),
            # Counts every FASTA record, including any "GL" record that is
            # filtered out of seqdf above.
            "num_seq_fasta": len(seqs),
        }
    )

# Build the summary table once. Columns follow the record key order, and the
# frame gets a unique 0..n-1 index rather than every row carrying index 0.
maindf = pd.DataFrame(records)

  0%|          | 0/7618 [00:00<?, ?it/s]

  8%|▊         | 590/7618 [00:05<01:05, 107.45it/s]