In [1]:
import pandas as pd
import numpy as np
import pathlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
from typing import List, Union, Optional, Callable
import pickle
from Bio import AlignIO, SeqIO
from ete3 import Tree, TreeNode
from gctree import CollapsedTree

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import umap
from ete3 import Tree, faces, TreeStyle, NodeStyle, TextFace, SequenceFace, COLOR_SCHEMES, CircleFace
from GCTree_preparation import *
import warnings
import math
import nltk
from matplotlib.patches import Rectangle
warnings.filterwarnings("ignore")

path_to_storage = "/media/hieunguyen/HNHD01/storage/all_BSimons_datasets"
outdir = "/media/hieunguyen/GSHD_HN01/outdir/sc_bulk_BCR_data_analysis_v0.1"

bulk_project = "241031_BSimons"
sc_project = ["240411_BSimons", "241002_BSimons"]

PROJECTS = f"{bulk_project}_{'_'.join(sc_project)}"

path_to_01_output = f"{outdir}/tree_analysis/{bulk_project}/01_output"
path_to_07_output = f"{outdir}/tree_analysis/07_output/{PROJECTS}"
os.system(f"mkdir -p {path_to_07_output}")

with open(f"{path_to_01_output}/saveTreeobj.pkl", "rb") as f:
    saveTreeobj = pickle.load(f)

mouseid = "m3"

bulk_metadata = pd.read_excel("/media/hieunguyen/HNSD01/src/sc_bulk_BCR_data_analysis/preprocessing/241031_BSimons/241031_sample_sheet.xlsx")
bulk_metadata["MID"] = bulk_metadata["MID"].apply(lambda x: f"MID{x}")
bulk_metadata.columns = ["MID", "mouseID", "organ", "YFP", "population"]

maindf = pd.read_csv(f"{path_to_01_output}/tree_summarydf.csv")
maindf = maindf[maindf["mouseID"] == mouseid]

path_to_04_output = os.path.join(outdir, "VDJ_output", "04_output")
thres = 0.85
clonedf = pd.read_csv(os.path.join(path_to_04_output, "full_clonedf_with_mutation_rate.csv"), index_col= [0])
clonedf = clonedf[clonedf['num_mutation'] != "region_not_covered-skip"]

sc_clonedf = clonedf[clonedf['dataset.name'].isin(sc_project)][["barcode", "id", 'V.gene', 'J.gene', 'D.gene', "aaSeqCDR3", "nSeqCDR3"]].reset_index().drop("index", axis = 1)

sc_clonedf["mouseID"] = sc_clonedf["id"].apply(lambda x: "m" + x.replace("M", "").replace("P", ""))
sc_clonedf = sc_clonedf[sc_clonedf["mouseID"] == mouseid]


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
color_path = f"{bulk_project}_color.csv"
tree_dist_seqdf = pd.DataFrame()

for cloneid in tqdm(maindf.cloneid.unique()):
    os.system(f"mkdir -p {path_to_07_output}/{cloneid}")
    V_gene = cloneid.split("_")[1]
    J_gene = cloneid.split("_")[2]
    cdr3_len = cloneid.split("_")[3]
    mouseid = cloneid.split("_")[0]

    treeobj = saveTreeobj[cloneid]
    seqdf = treeobj.seqdf.copy()
    idmapdf = treeobj.idmapseqdf.copy()
    bulkdf = seqdf.merge(idmapdf, right_on = "seq", left_on = "seq")
    bulkdf["aaSeqCDR3"] = bulkdf["ID"].apply(lambda x: x.split("|")[2].split(":")[1])
    
    scdf = sc_clonedf[(sc_clonedf["V.gene"] == V_gene.split("-0")[0]) & 
                    (sc_clonedf["J.gene"] == J_gene.split("-0")[0])]

    scdf["len"] = scdf["nSeqCDR3"].apply(lambda x: len(x))
    scdf = scdf[scdf["len"] == int(cdr3_len)]

    if scdf.shape[0] > 0:
        for i in range(0, scdf.shape[0]):
            barcode = scdf.iloc[i].barcode
            sampleid = scdf.iloc[i].id
            sc_seq = scdf[(scdf["barcode"] == barcode) & (scdf["id"] == sampleid)]["aaSeqCDR3"].values[0] 

            bulkdf[f"{sampleid}_{barcode}"] = bulkdf["aaSeqCDR3"].apply(lambda x: nltk.edit_distance(x, sc_seq)/len(x))
        bulkdf["min_dist_to_a_cell"] = bulkdf[[item for item in bulkdf.columns if sampleid in item]].apply(lambda x: min(x), axis = 1)
        bulkdf["cloneid"] = cloneid
        bulkdf.to_csv(os.path.join(path_to_07_output, cloneid, f"tree_Seqdf_{cloneid}.csv"))
        bulkdf_simplified = bulkdf[["ID", "seq", "abundance", "MID", "seqid", "min_dist_to_a_cell"]]
        
        tree_dist_seqdf = pd.concat([tree_dist_seqdf, bulkdf_simplified], axis = 0)
        
        ##### Generate heatmap and save

        heatmap_plotdf = bulkdf[["seqid"] + [item for item in bulkdf.columns if "_" in item and item != "min_dist_to_a_cell"]].set_index("seqid")
        plt.figure(figsize= (10, 10))
        sns.heatmap(heatmap_plotdf,
                    cmap="coolwarm", cbar=-1, linewidths=0.5, linecolor='black')
        for tick_label in plt.gca().get_xticklabels():
            tick_text = tick_label.get_text()
            if "M" in tick_text:
                tick_label.set_color('blue')
            elif "P" in tick_text:
                tick_label.set_color('red')
        plt.tight_layout()
        plt.savefig(os.path.join(path_to_07_output, cloneid, f"heatmap_{cloneid}.svg"))
        plt.close()

        plt.figure(figsize= (10, 10))
        coordinates = list(zip(*np.where(heatmap_plotdf.values == 0)))
        ax = sns.heatmap(heatmap_plotdf,
                    cmap="coolwarm", cbar=-1, linewidths=0.5, linecolor='black')
        for tick_label in plt.gca().get_xticklabels():
            tick_text = tick_label.get_text()
            if "M" in tick_text:
                tick_label.set_color('blue')
            elif "P" in tick_text:
                tick_label.set_color('red')

        for i in range(len(coordinates)):
            ax.add_patch(Rectangle((coordinates[i][1], coordinates[i][0]),1,1, fill=False, edgecolor='red', lw=3))
        plt.tight_layout()
        plt.savefig(os.path.join(path_to_07_output, cloneid, f"heatmap_{cloneid}.annotated.svg"))
        plt.close()

        ##### generate tree and save
        treeobj = saveTreeobj[cloneid] 
        avai_mids = treeobj.seqdf["MID"].unique()
        mid_color_pal = pd.read_csv(color_path, index_col = [0]).to_dict()["hex color"]

        ts = treeobj.generate_tree_style(color_path = color_path)

        for input_mid in avai_mids:
            if input_mid == "GL":
                input_mid_col = "gray"
            else:
                input_mid_col = mid_color_pal[input_mid]
            ts.legend.add_face(CircleFace(10, input_mid_col), column = 0)
            ts.legend.add_face(TextFace(bulk_metadata[bulk_metadata["MID"]==input_mid]["population"].values[0]), column = 0)

        treeobj.tree.render(f"{path_to_07_output}/{cloneid}/{cloneid}.tree.svg", tree_style = ts)
    else:
        bulkdf.to_csv(os.path.join(path_to_07_output, f"tree_Seqdf_{cloneid}.csv"))

100%|██████████| 267/267 [01:25<00:00,  3.11it/s]


In [22]:
scdistdf = pd.DataFrame()
for cloneid in tqdm(maindf.cloneid.unique()):
    V_gene = cloneid.split("_")[1]
    J_gene = cloneid.split("_")[2]
    cdr3_len = cloneid.split("_")[3]
    mouseid = cloneid.split("_")[0]

    treeobj = saveTreeobj[cloneid]
    seqdf = treeobj.seqdf.copy()
    idmapdf = treeobj.idmapseqdf.copy()
    bulkdf = seqdf.merge(idmapdf, right_on = "seq", left_on = "seq")
    bulkdf["aaSeqCDR3"] = bulkdf["ID"].apply(lambda x: x.split("|")[2].split(":")[1])

    scdf = sc_clonedf[(sc_clonedf["V.gene"] == V_gene.split("-0")[0]) & 
                    (sc_clonedf["J.gene"] == J_gene.split("-0")[0])]

    scdf["len"] = scdf["nSeqCDR3"].apply(lambda x: len(x))
    scdf = scdf[scdf["len"] == int(cdr3_len)]

    all_barcodes = [f"{scdf['id'].values[i]}_{scdf['barcode'].values[i]}" for i in range(scdf.shape[0])]

    if scdf.shape[0] > 0:
        for i in range(0, scdf.shape[0]):
            barcode = scdf.iloc[i].barcode
            sampleid = scdf.iloc[i].id
            sc_seq = scdf[(scdf["barcode"] == barcode) & (scdf["id"] == sampleid)]["aaSeqCDR3"].values[0] 

            bulkdf[f"{sampleid}_{barcode}"] = bulkdf["aaSeqCDR3"].apply(lambda x: nltk.edit_distance(x, sc_seq)/len(x))
            
    tmp_scdistdf = bulkdf[[item for item in bulkdf.columns if item in all_barcodes]].min().reset_index()
    tmp_scdistdf.columns = ["barcode", "min_dist_to_a_tree"]
    tmp_scdistdf["treeID"] = cloneid
    scdistdf = pd.concat([scdistdf, tmp_scdistdf], axis = 0)
    print(scdistdf.shape)
scdistdf.to_csv(os.path.join(path_to_07_output, "scdistdf.csv"))

  7%|▋         | 19/267 [00:00<00:02, 93.73it/s]

(1, 3)
(2, 3)
(3, 3)
(4, 3)
(5, 3)
(6, 3)
(7, 3)
(8, 3)
(9, 3)
(10, 3)
(11, 3)
(12, 3)
(13, 3)
(14, 3)
(15, 3)
(16, 3)
(17, 3)
(18, 3)
(19, 3)
(20, 3)
(21, 3)
(22, 3)
(23, 3)
(24, 3)
(25, 3)
(26, 3)
(27, 3)
(28, 3)
(29, 3)
(30, 3)
(31, 3)


 21%|██        | 55/267 [00:00<00:01, 111.27it/s]

(32, 3)
(33, 3)
(34, 3)
(35, 3)
(36, 3)
(37, 3)
(38, 3)
(39, 3)
(40, 3)
(41, 3)
(42, 3)
(43, 3)
(44, 3)
(45, 3)
(46, 3)
(47, 3)
(48, 3)
(49, 3)
(50, 3)
(51, 3)
(52, 3)
(53, 3)
(54, 3)
(55, 3)
(56, 3)
(57, 3)
(58, 3)
(59, 3)


 31%|███       | 83/267 [00:00<00:01, 124.69it/s]

(60, 3)
(61, 3)
(62, 3)
(63, 3)
(64, 3)
(65, 3)
(66, 3)
(67, 3)
(68, 3)
(69, 3)
(70, 3)
(71, 3)
(72, 3)
(73, 3)
(74, 3)
(75, 3)
(76, 3)
(77, 3)
(78, 3)
(79, 3)
(80, 3)
(81, 3)
(82, 3)
(83, 3)
(84, 3)
(85, 3)
(86, 3)
(87, 3)
(88, 3)
(89, 3)
(90, 3)


 42%|████▏     | 112/267 [00:00<00:01, 130.41it/s]

(91, 3)
(92, 3)
(93, 3)
(94, 3)
(95, 3)
(96, 3)
(97, 3)
(98, 3)
(99, 3)
(100, 3)
(101, 3)
(102, 3)
(103, 3)
(104, 3)
(105, 3)
(106, 3)
(107, 3)
(108, 3)
(109, 3)
(110, 3)
(111, 3)
(112, 3)
(113, 3)
(114, 3)
(115, 3)
(116, 3)
(117, 3)


 57%|█████▋    | 152/267 [00:01<00:00, 120.55it/s]

(118, 3)
(119, 3)
(120, 3)
(121, 3)
(122, 3)
(123, 3)
(124, 3)
(125, 3)
(126, 3)
(127, 3)
(128, 3)
(129, 3)
(130, 3)
(131, 3)
(132, 3)
(133, 3)
(134, 3)
(135, 3)
(136, 3)
(137, 3)
(138, 3)
(139, 3)
(140, 3)
(141, 3)
(142, 3)
(143, 3)
(144, 3)
(145, 3)
(146, 3)
(147, 3)
(148, 3)
(149, 3)
(150, 3)
(151, 3)
(152, 3)


 67%|██████▋   | 180/267 [00:01<00:00, 128.01it/s]

(153, 3)
(154, 3)
(155, 3)
(156, 3)
(157, 3)
(158, 3)
(159, 3)
(160, 3)
(161, 3)
(162, 3)
(163, 3)
(164, 3)
(165, 3)
(166, 3)
(167, 3)
(168, 3)
(169, 3)
(170, 3)
(171, 3)
(172, 3)
(173, 3)
(174, 3)
(175, 3)
(176, 3)
(177, 3)
(178, 3)
(179, 3)
(180, 3)
(181, 3)
(182, 3)
(183, 3)
(184, 3)
(185, 3)
(186, 3)
(187, 3)
(188, 3)
(189, 3)
(190, 3)
(191, 3)
(192, 3)
(193, 3)
(194, 3)


 83%|████████▎ | 222/267 [00:01<00:00, 133.17it/s]

(195, 3)
(196, 3)
(197, 3)
(198, 3)
(199, 3)
(200, 3)
(201, 3)
(202, 3)
(203, 3)
(204, 3)
(205, 3)
(206, 3)
(207, 3)
(208, 3)
(209, 3)
(210, 3)
(211, 3)
(212, 3)
(213, 3)
(214, 3)
(215, 3)
(216, 3)
(217, 3)
(218, 3)
(219, 3)
(220, 3)
(221, 3)
(222, 3)
(223, 3)
(224, 3)
(225, 3)
(226, 3)
(227, 3)
(228, 3)


 94%|█████████▍| 251/267 [00:02<00:00, 135.33it/s]

(229, 3)
(230, 3)
(231, 3)
(232, 3)
(233, 3)
(234, 3)
(235, 3)
(236, 3)
(237, 3)
(238, 3)
(239, 3)
(240, 3)
(241, 3)
(242, 3)
(243, 3)
(244, 3)
(245, 3)
(246, 3)
(247, 3)
(248, 3)
(249, 3)
(250, 3)
(251, 3)
(252, 3)
(253, 3)
(254, 3)
(255, 3)
(256, 3)
(257, 3)
(258, 3)
(259, 3)
(260, 3)
(261, 3)
(262, 3)
(263, 3)
(264, 3)


100%|██████████| 267/267 [00:02<00:00, 124.35it/s]

(265, 3)
(266, 3)
(267, 3)





In [23]:
scdistdf

Unnamed: 0,barcode,min_dist_to_a_tree,treeID
0,PP3_GACTACATCAACACCA-1,0.25,m3_IGHV1-26-01_IGHJ1-03_36_1
0,PP3_GACTACATCAACACCA-1,0.25,m3_IGHV1-26-01_IGHJ1-03_36_1
0,PP3_GACTACATCAACACCA-1,0.25,m3_IGHV1-26-01_IGHJ1-03_36_1
0,PP3_GACTACATCAACACCA-1,0.25,m3_IGHV1-26-01_IGHJ1-03_36_1
0,PP3_GACTACATCAACACCA-1,0.25,m3_IGHV1-26-01_IGHJ1-03_36_1
...,...,...,...
0,PP3_GACTACATCAACACCA-1,0.25,m3_IGHV1-26-01_IGHJ1-03_36_1
0,PP3_GACTACATCAACACCA-1,0.25,m3_IGHV1-26-01_IGHJ1-03_36_1
0,PP3_GACTACATCAACACCA-1,0.25,m3_IGHV1-26-01_IGHJ1-03_36_1
0,PP3_GACTACATCAACACCA-1,0.25,m3_IGHV1-26-01_IGHJ1-03_36_1


In [21]:
scdistdf

Unnamed: 0,barcode,min_dist_to_a_tree,treeID
0,PP3_GACTACATCAACACCA-1,0.25,m3_IGHV1-26-01_IGHJ1-03_36_1


In [16]:
cloneid = "m3_IGHV1-26-01_IGHJ1-03_36_1"