# Preparation

## Packages and paths

In [1]:
import pandas as pd
import numpy as np
import pathlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
from typing import List, Union, Optional, Callable
import pickle
from Bio import AlignIO, SeqIO
from ete3 import Tree, TreeNode
from gctree import CollapsedTree

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import umap
from ete3 import Tree, faces, TreeStyle, NodeStyle, TextFace, SequenceFace, COLOR_SCHEMES, CircleFace
from GCTree_preparation import *
import warnings
import math
import matplotlib.image as mpimg
from PIL import Image

# Silence every warning for the rest of the notebook.
# NOTE(review): this is a blanket filter, so it also hides pandas/ete3 deprecation notices.
warnings.filterwarnings("ignore")

#####------------------------------------------------------------------------#####
##### PATHS AND PROJECT CONFIGURATION
#####------------------------------------------------------------------------#####
path_to_storage = "/media/hieunguyen/HNSD01/storage/all_BSimons_datasets"
outdir = "/media/hieunguyen/HNSD_mini/outdir/sc_bulk_BCR_data_analysis_v0.1"

PROJECT = "220701_etc_biopsies"
path_to_main_output = f"{outdir}/tree_analysis/{PROJECT}"
path_to_02_output = os.path.join(path_to_main_output, "02_output")
# Create the output folder in-process instead of shelling out to `mkdir -p`:
# no shell quoting problems with unusual paths, and failures raise instead of being ignored.
os.makedirs(path_to_02_output, exist_ok=True)

output_type = "custom_group_output"

path_to_trees = os.path.join(path_to_storage, PROJECT, "GCtrees/v0.2", output_type)

# Keep only the tree folders that contain the first gctree inference Newick file.
all_tree_folder = [
    item for item in pathlib.Path(path_to_trees).glob("*")
    if os.path.isfile(f"{str(item)}/02_dnapars/gctree.out.inference.1.nk")
]

# Every Newick file found two directory levels below `path_to_trees`.
all_nk_files = list(pathlib.Path(path_to_trees).glob("*/*/*gctree.out.inference.1.nk"))
print(f"Number of trees: {len(all_tree_folder)}")

# Sample metadata; the file is ';'-separated.
path_to_metadata = "/media/hieunguyen/HNSD01/src/sc_bulk_BCR_data_analysis/preprocessing/220701_etc_biopsies/metadata.csv"
mid_metadata = pd.read_csv(path_to_metadata, sep=";")
path_to_04_output = os.path.join(outdir, "VDJ_output", "04_output")
thres = 0.85

all_clone_files = list(pathlib.Path(outdir).glob(f"VDJ_output/*/VDJ_output_{thres}/preprocessed_files/clonesets*.split_clones.xlsx"))

# Clone table: drop rows flagged "region_not_covered-skip" in `num_mutation`
# and keep only the rows belonging to the current project.
clonedf = pd.read_csv(os.path.join(path_to_04_output, "full_clonedf_with_mutation_rate.csv"), index_col=[0])
clonedf = clonedf[clonedf["num_mutation"] != "region_not_covered-skip"]
clonedf = clonedf[clonedf["dataset.name"] == PROJECT]

##### Set `rerun = True` to redo the summary analysis of all trees and re-render the tree figures.
# rerun = True
rerun = False

  from .autonotebook import tqdm as notebook_tqdm


Number of trees: 13481


## Load the tree data 

In [2]:
#####------------------------------------------------------------------------#####
##### GENERATE TREE SUMMARY DATA FRAME
#####------------------------------------------------------------------------#####
# Two artefacts are cached in `path_to_02_output`:
#   - tree_summarydf.csv : one row of node statistics per tree
#   - saveTreeobj.pkl    : dict mapping cloneid -> GCtree object
# The summary is recomputed when `rerun` is set or when EITHER artefact is missing
# (loading needs both files, so checking only the CSV would crash on a missing pickle).
tree_summary_csv = f"{path_to_02_output}/tree_summarydf.csv"
tree_objects_pkl = f"{path_to_02_output}/saveTreeobj.pkl"
cache_is_complete = os.path.isfile(tree_summary_csv) and os.path.isfile(tree_objects_pkl)

if rerun or not cache_is_complete:
    saveTreeobj = dict()
    # Collect one plain dict per tree and build the DataFrame once at the end;
    # concatenating inside the loop copies the whole frame on every iteration.
    summary_records = []
    # for treedir in tqdm(all_tree_folder[0:1]):
    for treedir in tqdm(all_tree_folder):
        cloneid = treedir.name
        treedir = str(treedir)

        # The folder name is "_"-separated; fields are read by position:
        # [0] mouse, [1:3] group, [3] V gene, [4] J gene, [5] CDR3 length.
        cloneid_fields = cloneid.split("_")
        mouseID = cloneid_fields[0]
        group = "_".join(cloneid_fields[1:3]).replace("biopsy_YFP", "biopsy")
        V_gene = cloneid_fields[3]
        J_gene = cloneid_fields[4]
        CDR3_len = cloneid_fields[5]

        nk_path = f"{treedir}/02_dnapars/gctree.out.inference.1.nk"
        ab_dict_path = os.path.join(treedir, "01_deduplicate", f"{cloneid}.abundance.csv")
        path_to_orig_fasta = f"{treedir}/01_deduplicate/{cloneid}.aln.fasta"
        input_idmaps = f"{treedir}/01_deduplicate/{cloneid}.id_map_seq.csv"

        treeobj = GCtree(
            nk_path = nk_path,
            ab_dict_path = ab_dict_path,
            origin_fasta = path_to_orig_fasta,
            idmap_seq = input_idmaps
        )
        saveTreeobj[cloneid] = treeobj

        # MIDs present in this tree and the population each one maps to in the metadata.
        all_MIDs = treeobj.seqdf["MID"].unique()
        all_groups = [mid_metadata[mid_metadata["Unnamed: 0"] == item]["population"].unique()[0] for item in all_MIDs]

        summary_records.append(
            {
                "cloneid": cloneid,
                "mouseID": mouseID,
                "group": group,
                "V_gene": V_gene,
                "J_gene": J_gene,
                "CDR3_len": CDR3_len,
                "num_nodes": len(treeobj.nodes),
                "num_leaves": len(treeobj.leaves),
                "num_internal_nodes": len(treeobj.internal_nodes),
                "num_passthrough_nodes": len(treeobj.passthrough_nodes),
                "num_split_nodes": len(treeobj.split_nodes),
                "num_observed_nodes": len(treeobj.observed_nodes),
                "num_inferred_nodes": len(treeobj.inferred_nodes),
                "num_MID": len(all_MIDs),
                "available_population": ",".join(all_groups),
                "num_seq_fasta": len(treeobj.seqs),
                "num_single_node": treeobj.count_single_node,
                "num_mix_node": treeobj.count_mix_node
            }
        )

    maindf = pd.DataFrame(summary_records)
    # Trees spanning the most MIDs come first; reset the index so a fresh run has the
    # same 0..n-1 index as the frame reloaded from the CSV in the cached branch.
    maindf = maindf.sort_values(by = "num_MID", ascending = False).reset_index(drop = True)

    maindf.to_csv(tree_summary_csv, index = False)
    # Save the dictionary to a pickle file
    with open(tree_objects_pkl, "wb") as f:
        pickle.dump(saveTreeobj, f)
else:
    maindf = pd.read_csv(tree_summary_csv)
    # Reload the dictionary from the pickle file.
    # NOTE: unpickling can execute arbitrary code; only load a file written by this notebook.
    with open(tree_objects_pkl, "rb") as f:
        saveTreeobj = pickle.load(f)

## Visualize some trees

In [3]:

# Render one SVG per tree into <path_to_02_output>/<mouse>/<group>/<cloneid>.svg.
# A marker file is written after the loop so the rendering is skipped on later runs
# unless `rerun` is set.
rendering_done_marker = f"{path_to_02_output}/finished_tree_rendering.csv"

if rerun or not os.path.isfile(rendering_done_marker):
    color_path = "./hex_color.csv"
    # MID -> hex colour lookup; the file does not change between trees, so read it once.
    mid_color_pal = pd.read_csv(color_path, index_col = [0]).to_dict()["hex color"]

    for cloneid, treeobj in tqdm(saveTreeobj.items()):
        # Same positional cloneid layout as in the summary cell: [0] mouse, [1:3] group.
        cloneid_fields = cloneid.split("_")
        mouseid = cloneid_fields[0]
        group = "_".join(cloneid_fields[1:3]).replace("biopsy_YFP", "biopsy")
        path_to_save_tree_svg = os.path.join(path_to_02_output, mouseid, group)
        # In-process directory creation: no shell involved, errors are raised.
        os.makedirs(path_to_save_tree_svg, exist_ok=True)

        avai_mids = treeobj.seqdf["MID"].unique()

        ts = treeobj.generate_tree_style(color_path = color_path)
        # treeobj.tree.render("%%inline", tree_style=ts)

        # Legend: one coloured circle plus label per MID found in this tree.
        for input_mid in avai_mids:
            # "GL" has no entry in the colour table and is drawn in gray
            # (presumably the germline sequence -- TODO confirm).
            input_mid_col = "gray" if input_mid == "GL" else mid_color_pal[input_mid]
            ts.legend.add_face(CircleFace(10, input_mid_col), column = 0)
            ts.legend.add_face(TextFace(input_mid), column = 0)

        _ = treeobj.tree.render(f"{path_to_save_tree_svg}/{cloneid}.svg", tree_style=ts)

    pd.DataFrame(data = ["finished_tree_rendering"]).to_csv(rendering_done_marker, index = False, header = False)


# Main analysis

In [4]:
# Keep only the trees with a positive `num_mix_node` count, then list the distinct
# population combinations found among them.
has_mix_node = maindf["num_mix_node"] > 0
maindf_mixtree = maindf.loc[has_mix_node]
maindf_mixtree["available_population"].unique()

array(['Ly6c+YFP+,Ly6c+YFP-,Ly6c-YFP+,Ly6c-YFP-',
       'Ly6c+YFP+,Ly6c+YFP-,Ly6c-YFP-', 'Ly6c+YFP+,Ly6c-YFP+,Ly6c-YFP-',
       'Ly6c+YFP-,Ly6c-YFP+,Ly6c-YFP-', 'Ly6c+YFP+,Ly6c+YFP-,Ly6c-YFP+',
       'Ly6c+YFP-,Ly6c-YFP-', 'Ly6c+YFP+,Ly6c-YFP+',
       'Ly6c+YFP+,Ly6c+YFP-', 'Ly6c+YFP+,Ly6c-YFP-',
       'Ly6c-YFP+,Ly6c-YFP-', 'Ly6c+YFP-,Ly6c-YFP+'], dtype=object)

In [None]:
# Mouse and YFP case analysed in the cells below.
mouseid = "m11"
yfp_case = "all"

# Metadata rows of the selected mouse; the sample ID (MID) is stored in "Unnamed: 0".
is_selected_mouse = mid_metadata["mouse"] == mouseid

##### list of biopsy samples from the given mouse, MID
biopsy_samples = mid_metadata[is_selected_mouse &
                              (mid_metadata["population"] == "biopsy")]["Unnamed: 0"].unique()

print(f"List of biopsy samples: {biopsy_samples}")

# clonedf: all clone information in all mice in the project.
# maindf: tree information, all trees
# All MIDs of the selected mouse (the printed label says "mouse", the values are MIDs).
mouse_mids = mid_metadata[is_selected_mouse]["Unnamed: 0"].unique()
print(f"List of all mouse: {mouse_mids}")

# Clone rows restricted to this mouse, and to its biopsy samples only.
mouse_clonedf = clonedf[clonedf["id"].isin(mouse_mids)]
biopsy_mouse_clonedf = clonedf[clonedf["id"].isin(biopsy_samples)]

# Tree rows of this mouse, split into non-biopsy and biopsy groups.
is_selected_mouse_tree = maindf["mouseID"] == mouseid
mouse_treedf = maindf[is_selected_mouse_tree]
yfp_mouse_treedf = maindf[is_selected_mouse_tree & (maindf["group"] != "biopsy")]
biopsy_mouse_treedf = maindf[is_selected_mouse_tree & (maindf["group"] == "biopsy")]

path_to_tree_fasta = f"{outdir}/VDJ_output/05_output/{PROJECT}/FASTA/{mouseid}/{yfp_case}"

biopsy_mouse_clonedf

List of biopsy samples: ['MID4']
List of all mouse: ['MID4' 'MID55' 'MID12' 'MID57' 'MID58']


Unnamed: 0,id,VJseq.combi,V.gene,J.gene,D.gene,nSeqFR1,nSeqCDR1,nSeqFR2,nSeqCDR2,nSeqFR3,...,aaSeqFR4,VJ.len.combi,barcode,targetSequences,uniqueMoleculeCount,sampletype,len_aaSeqCDR3,len_nSeqCDR3,dataset.name,num_mutation
16057,MID4,IGHV1-26_IGHJ2_CARAFTTVVATDYW_TGTGCAAGAGCCTTTA...,IGHV1-26,IGHJ2,IGHD1-1*01,GAGGTCCAGCTGCAACAATCTGGACCTGAGCTGGTGAAGCCTGGGG...,GGATACACGTTCACTGACTACTAC,ATGAACTGGTTGAAGCAGAGCCATGGAAAGAGCCTTGAGTGGATTG...,ATTAATCCTAACAATGGTGGTACT,AGCTACAACCAGAAGTTCAAGGCCAAGGCCACATTGACTGTAGACA...,...,GQGTTLTVSS_,IGHV1-26_IGHJ2_42,220701_etc_biopsies_396,GAGGTCCAGCTGCAACAATCTGGACCTGAGCTGGTGAAGCCTGGGG...,7.0,bulk,14,42,220701_etc_biopsies,4
16058,MID4,IGHV1-26_IGHJ2_CARSVTTIITTDYW_TGTGCAAGATCGGTTA...,IGHV1-26,IGHJ2,IGHD2-5*01,GAGGTCCAGCTGCAACAATCTGGACCTGAGCTGGTGAAGCCTGGGG...,GGATACACGTTCACTGACTACTAC,ATGAACTGGGTGAAGCAGAGCCATGGAAAGAGCCTTGAGTGGATTG...,ATTAATCCTAACAATGGTGGTACT,AGCTACAACCAGAAGTTCAAGGGCAAGGCCACATTGACTGTAGACA...,...,GQGTTLTVSS_,IGHV1-26_IGHJ2_42,220701_etc_biopsies_397,GAGGTCCAGCTGCAACAATCTGGACCTGAGCTGGTGAAGCCTGGGG...,1.0,bulk,14,42,220701_etc_biopsies,2
16739,MID4,IGHV1-26_IGHJ2_CASDYGSSLGYFDYW_TGTGCAAGTGACTAC...,IGHV1-26,IGHJ2,IGHD1-1*01,GAGGTCCAGCTGCAACAATCTGGACCTGAGCTGGTGAAGCCTGGGG...,GGATACACGTTCACTGACTACTAC,ATGAACTGGGTGAAGCAGAGCCATGGAAAGAGCCTTGAGTGGATTG...,ATTAATCCTAACAATGGTGGTACT,ACCTACACCCAGAACTTCAAGGGCAAGGCCACATTGACTGTAGACA...,...,GQGTTLTVSS_,IGHV1-26_IGHJ2_45,220701_etc_biopsies_1078,GAGGTCCAGCTGCAACAATCTGGACCTGAGCTGGTGAAGCCTGGGG...,8.0,bulk,15,45,220701_etc_biopsies,4
16740,MID4,IGHV1-26_IGHJ2_CASDYGSSLGYFDYW_TGTGCAAGTGACTAC...,IGHV1-26,IGHJ2,IGHD1-1*01,GAGGTCCAGCTGCAACAATCTGGACCTGAGCTGGTGAAGCCTGGGG...,GGATACACGTTCACTGACTACTAC,ATGAACTGGGTGAAGCAGAGCCATGGAAAGAGCCTTGAGTGGATTG...,ATTAATCCTAACAATGGTGGTACT,ACCTACAAACAGAACTTCAAGGGCAAGGCCACATTGACTGTAGACA...,...,GQGTTLTVSS_,IGHV1-26_IGHJ2_45,220701_etc_biopsies_1079,GAGGTCCAGCTGCAACAATCTGGACCTGAGCTGGTGAAGCCTGGGG...,6.0,bulk,15,45,220701_etc_biopsies,4
16741,MID4,IGHV1-26_IGHJ2_CASDYGSSLGYFDYW_TGTGCAAGTGACTAC...,IGHV1-26,IGHJ2,IGHD1-1*01,GAGGTCCAGCTGCAACAATCTGGACCTGAGCTGGTGAAGCCTGGGG...,GGATACACGTTCACTGACTACTAC,ATGAACTGGGTGAAGCAGAGCCATGGAAAGAGCCTTGAGTGGATTG...,ATTAATCCTAACAATGGTGGTACT,AGCTACAACCAGAAGTTCAAGGGCAAGGCCACATTGACTGTAGACA...,...,GQGTTLTVSS_,IGHV1-26_IGHJ2_45,220701_etc_biopsies_1080,GAGGTCCAGCTGCAACAATCTGGACCTGAGCTGGTGAAGCCTGGGG...,3.0,bulk,15,45,220701_etc_biopsies,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98394,MID4,IGHV7-3_IGHJ4_CARYWGSAYAMDYW_TGTGCAAGATATTGGGG...,IGHV7-3,IGHJ4,NA-NA,GAGGTGAAGCTGGTGGAGTCTGGAGGAGGCTTGGTACAGCCTGGGG...,GGATTCACCTTCACTGATTACTAC,ATGAGCTGGGTCCGTCAGCCTCCAGGGAAGGCACTTGAGTGGTTGG...,ATTAGAAACAAAGCTAATGATTACACAACA,GAGTACAGTGCATCTGTGAAGGGTCGGTTCACCATCTCCAGAGATA...,...,GQGTSVTVSS_,IGHV7-3_IGHJ4_42,220701_etc_biopsies_82733,GAGGTGAAGCTGGTGGAGTCTGGAGGAGGCTTGGTACAGCCTGGGG...,1.0,bulk,14,42,220701_etc_biopsies,7
98396,MID4,IGHV2-6_IGHJ4_CARHSHDDWMDYW_TGTGCCAGACATAGTCAC...,IGHV2-6,IGHJ4,IGHD2-12*01,CAGGTGCAGCTGAAGGAGTCAGGACCTGGCCTGGTGGCGCCCTCAC...,GGGTTCTCATTAACCAGCTATGGT,GTACACTGGGTTCGCCAGCCTCCAGGAAAGGGTCTGGAGTGGCTGG...,ATATGGAGTGATGGAAGCACA,ACCTATAATTCAGCTCTCAAATCCAGACTGAGCATCAGCAAGGACA...,...,GQGTSVTVSS_,IGHV2-6_IGHJ4_39,220701_etc_biopsies_82735,CAGGTGCAGCTGAAGGAGTCAGGACCTGGCCTGGTGGCGCCCTCAC...,1.0,bulk,13,39,220701_etc_biopsies,19
98402,MID4,IGHV1-34_IGHJ2_CALITTVIAPSYLDYW_TGTGCCCTTATTAC...,IGHV1-34,IGHJ2,IGHD1-1*01,GAGGTCCAGCTGCAACAGTCTGGACCTGAGTTGGTGAAGCCTGGGG...,GGCTACACATTCACTGACTACTAC,ATGCACTGGGTGAAGCAGAGCCATGGAAAGAGTCTTGAGTGGATTG...,ATTTATCCTAACAATGGTGGTTAT,GGCTACAACCAGAAGTTCAAGGGCAAGGCCACATTGACTGTAGACA...,...,GQGTTLTVSS_,IGHV1-34_IGHJ2_48,220701_etc_biopsies_82741,GAGGTCCAGCTGCAACAGTCTGGACCTGAGTTGGTGAAGCCTGGGG...,1.0,bulk,16,48,220701_etc_biopsies,5
98403,MID4,IGHV5-12_IGHJ2_CARPRYYGSRGEFYFDYW_TGTGCAAGACCG...,IGHV5-12,IGHJ2,IGHD1-1*01,GAAGTGAAGCTGGTGGAGTCTGGGGGAGGCTTAGTGCAGCCTGGAG...,GGATTCACTTTCAGTGACTATTAC,ATGTATTGGGTTCGCCAGACTCCAGAGAAGAGGCTGGAGTGGGTCG...,ATTAGTAATGGTGGTGGTAACACC,TATTATCCAGACACTATAAAGGGCCGATTCACCATCTCCAGAGACA...,...,GQGTTLTVSS_,IGHV5-12_IGHJ2_54,220701_etc_biopsies_82742,GAAGTGAAGCTGGTGGAGTCTGGGGGAGGCTTAGTGCAGCCTGGAG...,1.0,bulk,18,54,220701_etc_biopsies,5


In [None]:
# Take one tree of mouse m11 and expose its ID-map table.
tree = saveTreeobj["m11_all_YFP_IGHV1-12-01_IGHJ2-01_27_1"]
idmap = tree.idmapseqdf
# Nodes whose name contains "seq" (presumably the observed sequences -- TODO confirm).
real_nodes = list(filter(lambda node: "seq" in node.name, tree.nodes))

# Target sequence of the i-th biopsy clone (positional row index).
i = 0
bs_targetseq = biopsy_mouse_clonedf["targetSequences"].iloc[i]

In [8]:
# Display the target sequence of the biopsy clone selected in the previous cell.
bs_targetseq

'GAGGTCCAGCTGCAACAATCTGGACCTGAGCTGGTGAAGCCTGGGGCTTCAGTGAAGATATCCTGTAAGGCTTCTGGATACACGTTCACTGACTACTACATGAACTGGTTGAAGCAGAGCCATGGAAAGAGCCTTGAGTGGATTGGAGATATTAATCCTAACAATGGTGGTACTAGCTACAACCAGAAGTTCAAGGCCAAGGCCACATTGACTGTAGACAAGTCCTCCAGCACAGCCTACATGGAGCTCCGCAGCCTGACATCTGAGGACTCTGCAGTCTATTACTGTGCAAGAGCCTTTACTACGGTAGTAGCTACGGACTACTGGGGCCAAGGCACCACTCTCACAGTCTCCTCAG'