In [1]:
import pandas as pd
import numpy as np
import pathlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
from typing import List, Union, Optional, Callable
import pickle
from Bio import AlignIO, SeqIO
from ete3 import Tree, TreeNode
from gctree import CollapsedTree

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import umap
from ete3 import Tree, faces, TreeStyle, NodeStyle, TextFace, SequenceFace, COLOR_SCHEMES, CircleFace
from GCTree_preparation import *
import warnings
import math
warnings.filterwarnings("ignore")

# ---------------------------------------------------------------------------
# Configuration: input/output locations, tree discovery, and sample metadata.
# ---------------------------------------------------------------------------
path_to_storage = "/media/hieunguyen/HNSD01/storage/all_BSimons_datasets"
outdir = "/media/hieunguyen/HNSD_mini/outdir/sc_bulk_BCR_data_analysis_v0.1"

PROJECT = "220701_etc_biopsies"
path_to_main_output = f"{outdir}/tree_analysis/{PROJECT}"
path_to_01_output = os.path.join(path_to_main_output, "01_output")
# Create the output directory without going through a shell: this is safe for
# paths containing spaces/metacharacters and raises if the directory cannot
# be created instead of failing silently.
os.makedirs(path_to_01_output, exist_ok=True)

output_type = "mouse_based_output"

path_to_trees = os.path.join(path_to_storage, PROJECT, "GCtrees/v0.2", output_type)

# Keep only the per-clone folders that contain an inferred tree in Newick format.
all_tree_folder = [
    item
    for item in pathlib.Path(path_to_trees).glob("*")
    if os.path.isfile(f"{str(item)}/02_dnapars/gctree.out.inference.1.nk")
]

# Every inferred-tree Newick file found two directory levels below path_to_trees.
all_nk_files = list(pathlib.Path(path_to_trees).glob("*/*/*gctree.out.inference.1.nk"))
print(f"Number of trees: {len(all_tree_folder)}")

# Sample metadata table (semicolon-separated).
path_to_metadata = "/media/hieunguyen/HNSD01/src/sc_bulk_BCR_data_analysis/preprocessing/220701_etc_biopsies/metadata.csv"
mid_metadata = pd.read_csv(path_to_metadata, sep =";")

  from .autonotebook import tqdm as notebook_tqdm


Number of trees: 7618


In [None]:
# Build (or reload from cache) one summary row per clone tree, together with a
# dictionary of the tree objects keyed by clone id.
rerun = True  # True forces a rebuild even when cached outputs already exist

summary_csv_path = f"{path_to_01_output}/tree_summarydf.csv"
tree_pickle_path = f"{path_to_01_output}/saveTreeobj.pkl"

# Column order of the summary table; passing it explicitly keeps the frame
# well-formed (and sortable) even when no tree folder was found.
summary_columns = [
    "cloneid",
    "mouseID",
    "V_gene",
    "J_gene",
    "CDR3_len",
    "num_nodes",
    "num_leaves",
    "num_internal_nodes",
    "num_passthrough_nodes",
    "num_split_nodes",
    "num_observed_nodes",
    "num_inferred_nodes",
    "num_MID",
    "available_population",
    "num_seq_fasta",
    "num_single_node",
    "num_mix_node",
]

# The cached branch needs BOTH files, so rebuild when either one is missing.
cache_is_complete = os.path.isfile(summary_csv_path) and os.path.isfile(tree_pickle_path)

if rerun or not cache_is_complete:
    saveTreeobj = dict()
    # Collect one record per tree and build the DataFrame once at the end;
    # concatenating inside the loop copies the whole frame on every iteration.
    summary_records = []
    for treedir in tqdm(all_tree_folder):
        cloneid = treedir.name
        treedir = str(treedir)

        # The folder name is parsed as <mouse>_<V gene>_<J gene>_<CDR3 length>_...
        cloneid_parts = cloneid.split("_")
        mouseID = cloneid_parts[0]
        V_gene = cloneid_parts[1]
        J_gene = cloneid_parts[2]
        CDR3_len = cloneid_parts[3]

        nk_path = f"{treedir}/02_dnapars/gctree.out.inference.1.nk"
        ab_dict_path = os.path.join(treedir, "01_deduplicate", f"{cloneid}.abundance.csv")
        path_to_orig_fasta= f"{treedir}/01_deduplicate/{cloneid}.fasta"
        input_idmaps = f"{treedir}/01_deduplicate/{cloneid}.id_map_seq.csv"

        # NOTE(review): GCtree is presumably provided by the star import from
        # GCTree_preparation -- confirm.
        treeobj = GCtree(
            nk_path = nk_path,
            ab_dict_path = ab_dict_path,
            origin_fasta = path_to_orig_fasta,
            idmap_seq = input_idmaps
        )
        saveTreeobj[cloneid] = treeobj

        seqdf_orig = treeobj.seqdf
        all_MIDs = seqdf_orig["MID"].unique()
        # First population recorded in the metadata for each MID of this tree.
        # Presumably the "Unnamed: 0" column holds the MID identifiers -- verify.
        # A MID that is absent from the metadata raises IndexError here.
        all_groups = [
            mid_metadata.loc[mid_metadata["Unnamed: 0"] == mid, "population"].unique()[0]
            for mid in all_MIDs
        ]

        summary_records.append(
            {
                "cloneid": cloneid,
                "mouseID": mouseID,
                "V_gene": V_gene,
                "J_gene": J_gene,
                "CDR3_len": CDR3_len,
                "num_nodes": len(treeobj.nodes),
                "num_leaves": len(treeobj.leaves),
                "num_internal_nodes": len(treeobj.internal_nodes),
                "num_passthrough_nodes": len(treeobj.passthrough_nodes),
                "num_split_nodes": len(treeobj.split_nodes),
                "num_observed_nodes": len(treeobj.observed_nodes),
                "num_inferred_nodes": len(treeobj.inferred_nodes),
                "num_MID": len(all_MIDs),
                "available_population": ",".join(all_groups),
                "num_seq_fasta": len(treeobj.seqs),
                "num_single_node": treeobj.count_single_node,
                "num_mix_node": treeobj.count_mix_node,
            }
        )

    maindf = pd.DataFrame(summary_records, columns = summary_columns)
    maindf = maindf.sort_values(by = "num_MID", ascending = False)
    maindf.to_csv(summary_csv_path, index = False)
    with open(tree_pickle_path, "wb") as f:
        pickle.dump(saveTreeobj, f)
else:
    maindf = pd.read_csv(summary_csv_path)
    # Reload the dictionary of tree objects from the pickle file.
    # SECURITY: unpickling executes code stored in the file; only load a cache
    # that was written by this notebook.
    with open(tree_pickle_path, "rb") as f:
        saveTreeobj = pickle.load(f)

 28%|██▊       | 2130/7618 [00:47<01:42, 53.29it/s]

In [None]:
# Render the tree of one example clone directly in the notebook output.
color_path = "./hex_color.csv"
example_cloneid = "m13_IGHV9-3-01_IGHJ2-01_39_5.aln"
treeobj = saveTreeobj[example_cloneid]
# Build the tree style from the colour table at color_path.
ts = treeobj.generate_tree_style(color_path = color_path)
# "%%inline" is passed as the render target instead of a file name.
treeobj.tree.render("%%inline", tree_style = ts)

In [None]:
# Export every tree as an SVG file, grouped into one sub-folder per mouse, with
# a legend listing the MIDs present in that tree.
color_path = "./hex_color.csv"
# The MID -> hex colour table does not change between trees: read it once
# instead of re-parsing the CSV on every iteration.
mid_color_pal = pd.read_csv(color_path, index_col = [0]).to_dict()["hex color"]

for cloneid, treeobj in tqdm(saveTreeobj.items()):
    # The mouse id is the first "_"-separated field of the clone id.
    mouseid = cloneid.split("_")[0]
    path_to_save_tree_svg = os.path.join(path_to_01_output, mouseid)
    # Shell-free directory creation: safe for unusual path characters and
    # raises on failure instead of silently continuing.
    os.makedirs(path_to_save_tree_svg, exist_ok=True)

    avai_mids = treeobj.seqdf["MID"].unique()
    ts = treeobj.generate_tree_style(color_path = color_path)

    # One legend entry (coloured circle + label) per MID found in this tree.
    for input_mid in avai_mids:
        # "GL" has no palette lookup and is always drawn in gray.
        if input_mid == "GL":
            input_mid_col = "gray"
        else:
            input_mid_col = mid_color_pal[input_mid]
        ts.legend.add_face(CircleFace(10, input_mid_col), column = 0)
        ts.legend.add_face(TextFace(input_mid), column = 0)

    # Assign the return value so the notebook does not echo it.
    _ = treeobj.tree.render(f"{path_to_save_tree_svg}/{cloneid}.svg", tree_style=ts)
