In [18]:
import pandas as pd
import numpy as np
import pathlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
from typing import List, Union, Optional, Callable
import pickle
from Bio import AlignIO, SeqIO
from ete3 import Tree, TreeNode
from gctree import CollapsedTree

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import umap
from ete3 import Tree, faces, TreeStyle, NodeStyle, TextFace, SequenceFace, COLOR_SCHEMES, CircleFace
from GCTree_preparation import *
import warnings
import math
import nltk
from matplotlib.patches import Rectangle
warnings.filterwarnings("ignore")

path_to_storage = "/media/hieunguyen/HNHD01/storage/all_BSimons_datasets"
outdir = "/media/hieunguyen/GSHD_HN01/outdir/sc_bulk_BCR_data_analysis_v0.1"

bulk_project = "240826_BSimons"
sc_project = "240805_BSimons"
PROJECTS = f"{bulk_project}_{sc_project}"
mouseid = "m3"

dist_type = "full_seq"
path_to_01_output = f"{outdir}/tree_analysis/{bulk_project}/01_output"
path_to_06_output = f"{outdir}/tree_analysis/06_output/{PROJECTS}/{mouseid}/{dist_type}"
path_to_10_output = f"{outdir}/tree_analysis/10_output/{PROJECTS}/{mouseid}/{dist_type}"
os.system(f"mkdir -p {path_to_10_output}")

with open(f"{path_to_01_output}/saveTreeobj.pkl", "rb") as f:
    saveTreeobj = pickle.load(f)

maindf = pd.read_csv(f"{path_to_01_output}/tree_summarydf.csv")
maindf = maindf[maindf["mouseID"] == mouseid]

In [None]:
cloneid = "m3_IGHV1-12-01_IGHJ3-01_27_1"
os.system(f"mkdir -p {path_to_06_output}/{cloneid}")
V_gene = cloneid.split("_")[1]
J_gene = cloneid.split("_")[2]
cdr3_len = cloneid.split("_")[3]
mouseid = cloneid.split("_")[0]

treeobj = saveTreeobj[cloneid]
seqdf = treeobj.seqdf.copy()
idmapdf = treeobj.idmapseqdf.copy()
seqdf = seqdf.merge(idmapdf, right_on = "seq", left_on = "seq")

df = pd.DataFrame(data = seqdf.seqid.unique(), columns = ["seqid"])
df["mouseid"] = mouseid
df["cloneID"] = cloneid

df["dist_to_root"] = df["seqid"].apply(lambda x: treeobj.node_depth(node = [item for item in treeobj.nodes if item.name == x][0], topo = False))
df["topo_dist_to_root"] = df["seqid"].apply(lambda x: treeobj.node_depth(node = [item for item in treeobj.nodes if item.name == x][0], topo = True))

deepest_node = df[df["dist_to_root"] == df["dist_to_root"].max()].seqid.unique()[0]

df["dist_to_deepest"] = df["seqid"].apply(lambda x: treeobj.tree.get_distance(x, deepest_node) if x != deepest_node else 0)
df["topo_dist_to_deepest"] = df["seqid"].apply(lambda x: treeobj.tree.get_distance(x, deepest_node, topology_only= True) if x != deepest_node else 0)

def get_node_furthest_child(x, topo):
    output = treeobj.tree.search_nodes(name=x)[0].get_farthest_leaf(topology_only= topo)
    return output[0].name, output[1]
df[['topo_furthest_child_node', 'topo_dist_to_furthest_child_node']] = df['seqid'].apply(
    lambda x: pd.Series(
        get_node_furthest_child(x, True)))
df[['furthest_child_node', 'dist_to_furthest_child_node']] = df['seqid'].apply(
    lambda x: pd.Series(
        get_node_furthest_child(x, False)))

change_name = {
    "dist_to_root": "rootness",
    "topo_dist_to_root": "topo_rootness",
    "dist_to_furthest_child_node": "leafness",
    "topo_dist_to_furthest_child_node": "topo_leafness"
}

df.columns = [change_name[item] if item in change_name.keys() else item for item in df.columns]
dist2sc_org = pd.read_csv(os.path.join(path_to_06_output, cloneid, f"tree_Seqdf_{cloneid}.csv"), index_col = [0])

dist2sc = dist2sc_org[dist2sc_org.columns[4:]].drop(["aaSeqCDR3", "cloneid"], axis = 1)
df = df.merge(dist2sc, right_on = "seqid", left_on = "seqid")

In [52]:
df

Unnamed: 0,seqid,mouseid,cloneID,rootness,topo_rootness,dist_to_deepest,topo_dist_to_deepest,topo_furthest_child_node,topo_leafness,furthest_child_node,...,M3_CGATTGAAGAACTCGG-1_y,M3_CGCTGGACAATGGTCT-1_y,M3_GACGTGCCAGGTGCCT-1_y,M3_GGACAAGAGATGCCAG-1_y,M3_GGTGCGTAGAGCAATT-1_y,P3_AGAGCTTGTGAAAGAG-1_y,P3_CGCTTCAAGCGTCTAT-1_y,P3_CTTCTCTCATTATCTC-1_y,P3_GTACGTAGTGGTTTCA-1_y,min_dist_to_a_cell_y
0,seq1,m3,m3_IGHV1-12-01_IGHJ3-01_27_1,32.0,5.0,15.0,4.0,seq2,0.0,seq2,...,0.384824,0.116531,0.119241,0.127371,0.387534,0.108401,0.127371,0.121951,0.092141,0.092141
1,seq1,m3,m3_IGHV1-12-01_IGHJ3-01_27_1,32.0,5.0,15.0,4.0,seq2,0.0,seq2,...,0.384824,0.116531,0.119241,0.127371,0.387534,0.108401,0.127371,0.121951,0.092141,0.092141
2,seq1,m3,m3_IGHV1-12-01_IGHJ3-01_27_1,32.0,5.0,15.0,4.0,seq2,0.0,seq2,...,0.384824,0.116531,0.119241,0.127371,0.387534,0.108401,0.127371,0.121951,0.092141,0.092141
3,seq1,m3,m3_IGHV1-12-01_IGHJ3-01_27_1,32.0,5.0,15.0,4.0,seq2,0.0,seq2,...,0.384824,0.116531,0.119241,0.127371,0.387534,0.108401,0.127371,0.121951,0.092141,0.092141
4,seq2,m3,m3_IGHV1-12-01_IGHJ3-01_27_1,33.0,6.0,16.0,5.0,seq2,0.0,seq2,...,0.384824,0.119241,0.121951,0.130081,0.387534,0.111111,0.130081,0.124661,0.094851,0.094851
5,seq3,m3,m3_IGHV1-12-01_IGHJ3-01_27_1,31.0,4.0,14.0,3.0,seq2,1.0,seq4,...,0.384824,0.113821,0.116531,0.124661,0.387534,0.105691,0.124661,0.119241,0.089431,0.089431
6,seq4,m3,m3_IGHV1-12-01_IGHJ3-01_27_1,35.0,5.0,18.0,4.0,seq4,0.0,seq4,...,0.392954,0.121951,0.124661,0.130081,0.395664,0.111111,0.127371,0.127371,0.094851,0.094851
7,seq5,m3,m3_IGHV1-12-01_IGHJ3-01_27_1,35.0,4.0,18.0,3.0,seq5,0.0,seq5,...,0.395664,0.124661,0.127371,0.135501,0.398374,0.119241,0.135501,0.130081,0.102981,0.102981
8,seq6,m3,m3_IGHV1-12-01_IGHJ3-01_27_1,41.0,6.0,0.0,0.0,seq6,0.0,seq6,...,0.398374,0.127371,0.130081,0.138211,0.401084,0.124661,0.092141,0.132791,0.119241,0.092141
9,seq7,m3,m3_IGHV1-12-01_IGHJ3-01_27_1,41.0,6.0,2.0,1.0,seq7,0.0,seq7,...,0.398374,0.127371,0.130081,0.132791,0.401084,0.121951,0.086721,0.132791,0.116531,0.086721
