In [1]:
import pandas as pd
import numpy as np
import pathlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
from typing import List, Union, Optional, Callable
import pickle

from ete3 import Tree, TreeNode
from gctree import CollapsedTree

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import umap

import warnings
warnings.filterwarnings("ignore")

from preparation import *

sample_base = "mouse_based_trees"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
storagedir = "/home/hieunguyen/CRC1382/storage"
inputdir = os.path.join(storagedir, "BCR_bulk_trees", sample_base)
outdir = "/home/hieunguyen/CRC1382/outdir"

path_to_01_output = os.path.join(outdir, "BCR_bulk_trees", "01_output")
os.system("mkdir -p {}".format(path_to_01_output))

files = [item for item in pathlib.Path(inputdir).glob("m*_full")]

list_of_samples = [item.name for item in files]

mid_metadata = pd.read_csv(os.path.join(storagedir, "BCR_bulk_trees", "mid_labels.csv"), sep=";", index_col = [0]) 
mid_metadata = mid_metadata.reset_index()
mid_metadata.columns = ["MID"] + list(mid_metadata.columns)[1:]
print("List of samples in this dataset: {}".format(list_of_samples))
print("***** ***** ***** ***** ***** ***** ***** ***** ***** *****")
print("Number of samples in this dataset: {}".format(len(list_of_samples)))

List of samples in this dataset: ['m53_full', 'm14_full', 'm42_full', 'm43_full', 'm37_full', 'm39_full', 'm30_full', 'm32_full', 'm29_full', 'm38_full', 'm13_full', 'm31_full', 'm28_full', 'm36_full', 'm11_full', 'm12_full']
***** ***** ***** ***** ***** ***** ***** ***** ***** *****
Number of samples in this dataset: 16


In [3]:
mid_metadata.head()

Unnamed: 0,MID,mouse,age,day,population,label,Unnamed: 6,Unnamed: 7,hex color
0,MID4,m11,8w,d0,biopsy,m11_biopsy_8w_d0,,,#0ea122
1,MID55,m11,8w,d0,Ly6c+YFP+,m11_Ly6c+YFP+_8w_d0,,,#fcbb2d
2,MID12,m11,8w,d0,Ly6c+YFP-,m11_Ly6c+YFP-_8w_d0,,,#0919ad
3,MID57,m11,8w,d0,Ly6c-YFP+,m11_Ly6c-YFP+_8w_d0,,,#e80707
4,MID58,m11,8w,d0,Ly6c-YFP-,m11_Ly6c-YFP-_8w_d0,,,#1ba8e0


In [4]:
sampleid = list_of_samples[0]
tree_paths = [item for item in pathlib.Path(os.path.join(inputdir, sampleid)).glob("*") if os.path.isdir(item) == True]

list_of_trees = [read_tree_from_path(treedir) for treedir in tree_paths]
list_of_trees = [item for item in list_of_trees if item is not None]

forest = LabForest(list_of_trees)
forest_featuredf = lab_forest_features(forest, forest_name = sampleid)