In [1]:
import pandas as pd
import numpy as np
import pathlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
from typing import List, Union, Optional, Callable
import pickle

from ete3 import Tree, TreeNode
from gctree import CollapsedTree

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import umap

import warnings
warnings.filterwarnings("ignore")

from preparation import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ---- Paths -----------------------------------------------------------------
# Root of the raw data and the per-project input/output folders.
storagedir = "/home/hieunguyen/CRC1382/storage"
inputdir = os.path.join(storagedir, "BCR_bulk_data_trees")
outdir = "/home/hieunguyen/CRC1382/outdir"

path_to_01_output = os.path.join(outdir, "BCR_bulk_data_trees", "01_output")
# Create the output folder in-process instead of shelling out to `mkdir -p`:
# this works on any OS, is safe for paths containing spaces or shell
# metacharacters, and raises an OSError if the folder cannot be created
# (os.system only returned a non-zero status that was being ignored).
os.makedirs(path_to_01_output, exist_ok=True)

# ---- Sample discovery ------------------------------------------------------
# Every entry of `inputdir` whose name matches "m*_full" is treated as a sample.
files = list(pathlib.Path(inputdir).glob("m*_full"))
list_of_samples = [item.name for item in files]

print("List of samples in this dataset: {}".format(list_of_samples))
print("***** ***** ***** ***** ***** ***** ***** ***** ***** *****")
print("Number of samples in this dataset: {}".format(len(list_of_samples)))

List of samples in this dataset: ['m53_full', 'm14_full', 'm42_full', 'm43_full', 'm37_full', 'm39_full', 'm30_full', 'm32_full', 'm29_full', 'm38_full', 'm13_full', 'm31_full', 'm28_full', 'm36_full', 'm11_full', 'm12_full']
***** ***** ***** ***** ***** ***** ***** ***** ***** *****
Number of samples in this dataset: 16


A good example of a sample/tree pair to inspect:
```
samplename = "m30_full"
treename = "gctree_IGHV1-12_IGHJ3_27_62.1"
```

In [3]:
# Collect the MID labels present in every sample.
#
# For each sample folder, every sub-directory is a candidate tree directory.
# A tree directory is only processed when it contains the gctree Newick file
# "gctree.out.inference.1.nk"; in that case the tree is loaded, annotated with
# the abundances from "abund.csv", wrapped in a GCtree object, and the MID
# labels found in the "sample" column of "tree.csv" are recorded.
#
# Results:
#   all_MIDs        -- list of unique MID labels of the last processed sample
#   all_MID_samples -- concatenation of the per-sample unique MID lists
all_MID_samples = []
for samplename in list_of_samples:
    sample_MIDs = set()
    path_to_sample = os.path.join(inputdir, samplename)
    all_trees = [item for item in pathlib.Path(path_to_sample).glob("*") if os.path.isdir(item)]
    for treedir in all_trees:
        treedir = str(treedir)
        nk_path = os.path.join(treedir, "gctree.out.inference.1.nk")
        if not os.path.exists(nk_path):
            # No inferred tree in this directory: skip it entirely. Reading
            # MIDs here would reuse the `treecsv` left over from the previous
            # tree (possibly from the previous sample) or raise NameError if
            # no tree has been loaded yet.
            continue

        ab_dict_path = os.path.join(treedir, "abund.csv")
        abund_df = pd.read_csv(ab_dict_path, index_col=0, names=['val'])
        ab_dict = abund_df.to_dict().get('val')
        tree_path = treedir
        tree = Tree(newick=nk_path, format=1)
        if ab_dict is not None:
            # Nodes without an entry in abund.csv get abundance 0.
            for node in tree.traverse():
                node.add_feature('abundance', ab_dict.get(node.name, 0))
        treeobj = GCtree(tree = tree, path = tree_path)

        ##### from this file we can calculate the "abundance" of each individual mice at a node.
        treecsv = pd.read_csv(os.path.join(treedir, "tree.csv"), sep = "\t")

        # A "sample" cell holds either one MID or several comma-separated
        # MIDs; split(",") covers both cases. Missing values are skipped
        # because they are not strings.
        for item in treecsv["sample"].dropna().unique():
            sample_MIDs.update(item.split(","))

    all_MIDs = list(sample_MIDs)
    print("Sample: {}".format(samplename))
    print("All MID samples: {}".format(sorted(all_MIDs)))
    all_MID_samples += all_MIDs

Sample: m53_full
All MID samples: ['MID17', 'MID31', 'MID32', 'MID33', 'MID34']
Sample: m14_full
All MID samples: ['MID28', 'MID29', 'MID3', 'MID30']
Sample: m42_full
All MID samples: ['MID43', 'MID44', 'MID45', 'MID46', 'MID5']
Sample: m43_full
All MID samples: ['MID47', 'MID48', 'MID49', 'MID50', 'MID6']
Sample: m37_full
All MID samples: ['MID63', 'MID64', 'MID65', 'MID66', 'MID8']
Sample: m39_full
All MID samples: ['MID10', 'MID18', 'MID67', 'MID68', 'MID69', 'MID8']
Sample: m30_full
All MID samples: ['MID14', 'MID51', 'MID52', 'MID53', 'MID54']
Sample: m32_full
All MID samples: ['MID15', 'MID39', 'MID40', 'MID42']
Sample: m29_full
All MID samples: ['MID13']
Sample: m38_full
All MID samples: ['MID9']
Sample: m13_full
All MID samples: ['MID2', 'MID23', 'MID24', 'MID25', 'MID26']
Sample: m31_full
All MID samples: ['MID16', 'MID35', 'MID36', 'MID38']
Sample: m28_full
All MID samples: ['MID11']
Sample: m36_full
All MID samples: ['MID7']
Sample: m11_full
All MID samples: ['MID12', 'MID4'

In [6]:
# Display the GCtree object that is still bound to `treeobj` after the
# MID-collection loop, i.e. the last tree that loop constructed.
# NOTE(review): this relies on leftover loop state, so which tree is shown
# depends on the directory iteration order -- consider loading a specific
# sample/tree explicitly instead.
treeobj


         /-seq1
      /-|
     |  |   /-seq2
     |   \-|
-- /-|      \-seq5
     |
     |   /-seq3
     |  |
      \-|--seq6
        |
         \-seq7