# 5.1 Generate Overview of Embryos
This notebook builds a downsampled, cell-type-by-developmental-stage view of the data. The cells from each embryo are downsampled to the cell-type level. All cells from a given embryo share a single developmental stage.

In [2]:
from clustergrammer2 import net
# NOTE(review): `df` is not referenced by any other visible cell -- presumably a
# holder for DataFrames keyed by name; confirm it is still needed.
df = {}

In [32]:
import pandas as pd
from glob import glob
import os
from copy import deepcopy

### Cell Type Distributions

In [53]:
# Collect the cell metadata table of every embryo and a flat list holding the
# main cell type label of every cell.
list_cell_types = []
meta_list = []
# sorted() makes the sample order deterministic (glob order is arbitrary)
new_samples = sorted(glob('../data/big_data/cao_2million-cell_2019_61-embryo_parquet_files_binder/*'))
for inst_sample in new_samples:

    df_meta = pd.read_parquet(inst_sample + '/meta_cell.parquet')
    meta_list.append(df_meta)

    # Series.get_values() was deprecated in pandas 0.25 and removed in 1.0;
    # tolist() produces the same flat list of labels.
    list_cell_types.extend(df_meta['Main_cell_type'].tolist())

print(len(list_cell_types))

1386587


In [64]:
# Tally how many cells carry each main cell type label across all embryos.
ser_cell_types = pd.Series(list_cell_types)
cell_type_counts = ser_cell_types.value_counts()

# value_counts() sorts by frequency, so this list runs from the most to the
# least abundant cell type.
all_cell_types = cell_type_counts.index.tolist()

print('there are', len(all_cell_types), 'cell types')
cell_type_counts

there are 38 cell types


Chondrocytes & osteoblasts       104698
Connective tissue progenitors     98964
Intermediate Mesoderm             89518
Jaw and tooth progenitors         82289
Early mesenchyme                  71949
Excitatory neurons                68567
Epithelial cells                  66209
Radial glia                       65428
Neural progenitor cells           58332
Postmitotic premature neurons     56033
Oligodendrocyte Progenitors       54606
Isthmic organizer cells           48498
Neural Tube                       45985
Inhibitory neurons                44658
Myocytes                          43197
Definitive erythroid lineage      34205
Chondroctye progenitors           33539
Inhibitory neuron progenitors     31214
Premature oligodendrocyte         29538
Limb mesenchyme                   26559
Sensory neurons                   26477
Endothelial cells                 26431
Stromal cells                     23259
Osteoblasts                       23223
Schwann cell precursor            23145


# Main Cell Types Times Developmental Stage

In [79]:
# 38 main cell types x 5 developmental stages (9.5, 10.5, 11.5, 12.5, 13.5):
# number of (cell type, stage) combinations if every type occurred at every stage
38 * 5

190

In [62]:
# Stack the per-embryo metadata tables row-wise into a single table of all cells
df_meta_all = pd.concat(meta_list, axis=0)
df_meta_all.shape

(1386587, 36)

In [77]:
# For every main cell type (most abundant first) report the number of cells and
# how those cells are spread across developmental stages.
cell_type_col = df_meta_all['Main_cell_type']
for inst_cell_type in all_cell_types:
    inst_df = df_meta_all.loc[cell_type_col == inst_cell_type]
    print('\n')
    print(inst_cell_type, inst_df.shape)
    stage_counts = inst_df['development_stage'].value_counts()
    print(stage_counts)



Chondrocytes & osteoblasts (104698, 36)
11.5    56283
12.5    26746
10.5    11627
13.5     9987
9.5        55
Name: development_stage, dtype: int64


Connective tissue progenitors (98964, 36)
13.5    45056
12.5    35095
11.5    17506
10.5     1287
9.5        20
Name: development_stage, dtype: int64


Intermediate Mesoderm (89518, 36)
11.5    29423
10.5    22742
12.5    15482
13.5    11697
9.5     10174
Name: development_stage, dtype: int64


Jaw and tooth progenitors (82289, 36)
11.5    41259
12.5    17454
10.5    15327
13.5     6237
9.5      2012
Name: development_stage, dtype: int64


Early mesenchyme (71949, 36)
10.5    35092
9.5     26168
11.5     9887
12.5      741
13.5       61
Name: development_stage, dtype: int64


Excitatory neurons (68567, 36)
13.5    35560
12.5    22626
11.5     9127
10.5     1214
9.5        40
Name: development_stage, dtype: int64


Epithelial cells (66209, 36)
11.5    19020
10.5    13764
12.5    11593
13.5    11464
9.5     10368
Name: development_stage, 

In [69]:
# Trajectory breakdown of the most abundant main cell type.
# The selection is made explicitly here instead of reusing the loop variable
# `inst_df` left over from another cell: that leftover depends on execution
# order (after a top-to-bottom run it would hold the *last* cell type).
top_cell_type = all_cell_types[0]
df_top_cell_type = df_meta_all[df_meta_all['Main_cell_type'] == top_cell_type]
df_top_cell_type['Main_trajectory'].value_counts()

Mesenchymal trajectory                  104336
Epithelial trajectory                      134
Haematopoiesis trajectory                   95
Lens trajectory                             43
Neural tube and notochord trajectory        38
Endothelial trajectory                      19
Neural crest 2                              12
Neural crest 1                              10
Hepatocyte trajectory                        9
Neural crest 3                               2
Name: Main_trajectory, dtype: int64

In [23]:
# Load the expression matrix and the cell metadata of the first sample only
# (new_samples[:1]) to prototype the category labelling on a single embryo.
# The stray `df_gex.shape` expression that used to sit inside the loop was
# removed: inside a loop body it is never displayed and has no effect.
for inst_sample in new_samples[:1]:
    df_gex = pd.read_parquet(inst_sample + '/gex.parquet')
    df_meta = pd.read_parquet(inst_sample + '/meta_cell.parquet')

In [24]:
# (rows, columns) of the expression matrix: rows are genes, columns are cell barcodes
df_gex.shape

(5000, 15666)

In [26]:
# (rows, columns) of the metadata table: one row per cell barcode, one column per metadata field
df_meta.shape

(15666, 36)

### Working on adding categories from metadata

In [11]:
def add_cats_from_meta(barcodes, df_meta, add_cat_list):
    '''
    Build category-annotated column labels for a set of cells.

    Parameters
    ----------
    barcodes : list
        Cell barcodes to label; they are looked up in the index of df_meta.
    df_meta : pandas.DataFrame
        Per-cell metadata indexed by barcode.
    add_cat_list : list of str
        Names of the df_meta columns to attach as categories.

    Returns
    -------
    list of tuple
        One tuple per barcode, in the order of `barcodes`:
        (barcode, '<category>: <value>', ...) with one titled entry per name
        in add_cat_list.

    Raises
    ------
    ValueError
        If the lookup does not yield exactly one metadata row per barcode
        (e.g. df_meta has duplicated index entries), because the labels
        would otherwise be silently paired with the wrong cells.
    '''
    barcodes = list(barcodes)

    # get metadata of interest (add_cat_list) for the barcodes of interest;
    # a single .loc call avoids materialising every metadata column first
    df_cats = df_meta.loc[barcodes, add_cat_list]

    if len(df_cats) != len(barcodes):
        raise ValueError(
            'expected one metadata row per barcode, got '
            + str(len(df_cats)) + ' rows for ' + str(len(barcodes)) + ' barcodes'
        )

    # cast to object before taking .values: on an all-numeric selection
    # .values would upcast integer columns to float (9 -> '9.0' in the label)
    cat_values = df_cats.astype(object).values

    new_cols = []
    for inst_barcode, inst_values in zip(barcodes, cat_values):
        # prefix every value with its category title, e.g. 'Main_cell_type: Myocytes'
        inst_titles = [str(cat) + ': ' + str(val) for cat, val in zip(add_cat_list, inst_values)]

        # the barcode goes first, followed by the titled categories
        new_cols.append(tuple([inst_barcode] + inst_titles))

    return new_cols

In [13]:
# preview the per-cell metadata of the loaded sample (indexed by cell barcode)
df_meta.head()

Unnamed: 0_level_0,Unnamed: 0,all_exon_count,all_intron_count,all_read_count,intergenic_rate,embryo_id,embryo_sex,nuclei_extraction_date,development_stage,Total_mRNAs,...,Main_trajectory_umap_3,Main_trajectory_refined_by_cluster,Main_trajectory_refined_umap_1,Main_trajectory_refined_umap_2,Main_trajectory_refined_umap_3,Sub_trajectory_name,Sub_trajectory_umap_1,Sub_trajectory_umap_2,Sub_trajectory_louvain_component,Sub_trajectory_Pseudotime
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sci3-me-001.ATTAGTCTGTGTATAATACG,0,1612.0,382.0,2442.0,0.183456,9,M,2,11.5,1989.0,...,1.723381,Endothelial trajectory,1.208097,1.13821,1.053613,Venous and capillary endothelial trajectory,1.130985,1.567624,1.0,0.068362
sci3-me-002.GTACCTCTTATTCGACCAA,12,138.0,416.0,672.0,0.175595,9,M,2,11.5,554.0,...,0.864672,Neural tube and notochord trajectory,1.81137,1.316749,1.021694,Neural epithelial trajectory,2.394788,1.07892,1.0,1.570391
sci3-me-002.ACCGTAGCTAGGAGAGAAC,13,181.0,682.0,997.0,0.134403,9,M,2,11.5,858.0,...,0.721429,Neural tube and notochord trajectory,1.557839,0.811976,0.521807,Neuron progenitor trajectory,1.782221,0.887197,1.0,0.168541
sci3-me-002.ATAACTTCCTTATGAGTTAA,15,359.0,935.0,1470.0,0.119728,9,M,2,11.5,1293.0,...,1.093316,Neural tube and notochord trajectory,1.524517,1.654862,1.773734,Oligodendrocyte trajectory,0.304404,0.886391,1.0,1.583377
sci3-me-002.TCTCTCCATAATGCCGCTT,17,160.0,367.0,602.0,0.124585,9,M,2,11.5,527.0,...,1.089299,Neural tube and notochord trajectory,1.586869,1.665207,1.784211,Oligodendrocyte trajectory,0.293331,0.843669,1.0,1.580017


In [27]:
# Label every column of the expression matrix (one cell barcode each) with its
# cell type, trajectory and developmental stage.
cat_fields = ['Main_cell_type', 'Main_trajectory', 'development_stage']
gex_barcodes = df_gex.columns.tolist()
new_cols = add_cats_from_meta(gex_barcodes, df_meta, cat_fields)

In [28]:
# one label tuple per barcode passed in (i.e. per column of df_gex)
len(new_cols)

15666

In [29]:
# inspect one label: (barcode, 'Main_cell_type: ...', 'Main_trajectory: ...', 'development_stage: ...')
new_cols[0]

('sci3-me-002.AGATCGGATGCGTTGGAGC',
 'Main_cell_type: Early mesenchyme',
 'Main_trajectory: Mesenchymal trajectory',
 'development_stage: 9.5')

In [30]:
# shape check before relabelling: the number of columns has to equal len(new_cols)
df_gex.shape

(5000, 15666)

In [31]:
# shape check: one metadata row per cell barcode
df_meta.shape

(15666, 36)

In [33]:
# Relabel an independent copy so that df_gex keeps its plain barcode columns
df_cat = df_gex.copy(deep=True)
df_cat.columns = new_cols

In [38]:
# Count the cells of this sample per main cell type. Element 1 of every column
# tuple is the 'Main_cell_type: ...' label (so, despite its name, ser_traj holds
# cell type labels here).
ser_traj = pd.Series([col[1] for col in df_cat.columns.tolist()])
cell_type_label_counts = ser_traj.value_counts()

# labels ordered from most to least frequent
cell_types = cell_type_label_counts.index.tolist()
cell_type_label_counts

Main_cell_type: Early mesenchyme                 3623
Main_cell_type: Neural Tube                      2448
Main_cell_type: Intermediate Mesoderm            1560
Main_cell_type: Epithelial cells                 1319
Main_cell_type: Isthmic organizer cells           925
Main_cell_type: Oligodendrocyte Progenitors       784
Main_cell_type: Radial glia                       696
Main_cell_type: Endothelial cells                 585
Main_cell_type: Stromal cells                     527
Main_cell_type: Primitive erythroid lineage       495
Main_cell_type: Neural progenitor cells           439
Main_cell_type: Chondroctye progenitors           388
Main_cell_type: Notochord cells                   370
Main_cell_type: Jaw and tooth progenitors         344
Main_cell_type: Schwann cell precursor            307
Main_cell_type: Cardiac muscle lineages           263
Main_cell_type: Sensory neurons                   142
Main_cell_type: Myocytes                           80
Main_cell_type: Cholinergic 

In [80]:
# For each cell type label, print how many distinct developmental stage labels
# (element 3 of the column tuples) occur among its cells.
for inst_cell_type in cell_types:
    print(inst_cell_type)
    stage_labels = [col[3] for col in df_cat.columns.tolist() if col[1] == inst_cell_type]
    ser_traj = pd.Series(stage_labels)
    print(ser_traj.nunique())

Main_cell_type: Early mesenchyme
1
Main_cell_type: Neural Tube
1
Main_cell_type: Intermediate Mesoderm
1
Main_cell_type: Epithelial cells
1
Main_cell_type: Isthmic organizer cells
1
Main_cell_type: Oligodendrocyte Progenitors
1
Main_cell_type: Radial glia
1
Main_cell_type: Endothelial cells
1
Main_cell_type: Stromal cells
1
Main_cell_type: Primitive erythroid lineage
1
Main_cell_type: Neural progenitor cells
1
Main_cell_type: Chondroctye progenitors
1
Main_cell_type: Notochord cells
1
Main_cell_type: Jaw and tooth progenitors
1
Main_cell_type: Schwann cell precursor
1
Main_cell_type: Cardiac muscle lineages
1
Main_cell_type: Sensory neurons
1
Main_cell_type: Myocytes
1
Main_cell_type: Cholinergic neurons
1
Main_cell_type: Osteoblasts
1
Main_cell_type: Ependymal cell
1
Main_cell_type: Megakaryocytes
1
Main_cell_type: Hepatocytes
1
Main_cell_type: Melanocytes
1
Main_cell_type: Postmitotic premature neurons
1
Main_cell_type: Premature oligodendrocyte
1
Main_cell_type: White blood cells
1


In [35]:
# Number of cells per developmental stage label over all columns of df_cat
stage_labels = [col[3] for col in df_cat.columns.tolist()]
ser_traj = pd.Series(stage_labels)
ser_traj.value_counts()

development_stage: 9.5    15666
dtype: int64

In [82]:
# relabelling only changes the column labels, so the shape matches df_gex
df_cat.shape

(5000, 15666)

In [83]:
# preview: genes as rows, columns labelled with (barcode, cell type, trajectory, stage) tuples
df_cat.head()

Unnamed: 0,"(sci3-me-002.AGATCGGATGCGTTGGAGC, Main_cell_type: Early mesenchyme, Main_trajectory: Mesenchymal trajectory, development_stage: 9.5)","(sci3-me-002.TAATACCAGTGCGTTGGAGC, Main_cell_type: Radial glia, Main_trajectory: Neural tube and notochord trajectory, development_stage: 9.5)","(sci3-me-002.AGATTCAACTGCGTTGGAGC, Main_cell_type: Oligodendrocyte Progenitors, Main_trajectory: Neural tube and notochord trajectory, development_stage: 9.5)","(sci3-me-002.GTCATCTGTAGAGCTATAA, Main_cell_type: Schwann cell precursor, Main_trajectory: Neural crest 2, development_stage: 9.5)","(sci3-me-002.TCAGTTGGTGCGTTGGAGC, Main_cell_type: Early mesenchyme, Main_trajectory: Mesenchymal trajectory, development_stage: 9.5)","(sci3-me-002.AAAGCTGATTGCGTTGGAGC, Main_cell_type: Epithelial cells, Main_trajectory: Epithelial trajectory, development_stage: 9.5)","(sci3-me-002.TTGAGTCCTGCGTTGGAGC, Main_cell_type: Early mesenchyme, Main_trajectory: Mesenchymal trajectory, development_stage: 9.5)","(sci3-me-002.GTTCGCTGTGCGTTGGAGC, Main_cell_type: Early mesenchyme, Main_trajectory: Mesenchymal trajectory, development_stage: 9.5)","(sci3-me-002.TGTCCTTATTGCGTTGGAGC, Main_cell_type: Early mesenchyme, Main_trajectory: Mesenchymal trajectory, development_stage: 9.5)","(sci3-me-002.TTATCCGCTGCGTTGGAGC, Main_cell_type: Neural progenitor cells, Main_trajectory: Neural tube and notochord trajectory, development_stage: 9.5)",...,"(sci3-me-760.TTGGAGCTTAGAGCTATAA, Main_cell_type: Early mesenchyme, Main_trajectory: Mesenchymal trajectory, development_stage: 9.5)","(sci3-me-760.CGTACCTATAGAGCTATAA, Main_cell_type: Chondroctye progenitors, Main_trajectory: Mesenchymal trajectory, development_stage: 9.5)","(sci3-me-760.TATGCGATTTAGAGCTATAA, Main_cell_type: Endothelial cells, Main_trajectory: Endothelial trajectory, development_stage: 9.5)","(sci3-me-760.AACGTAATCTAGAGCTATAA, Main_cell_type: Intermediate Mesoderm, Main_trajectory: Mesenchymal trajectory, development_stage: 
9.5)","(sci3-me-760.GGTCAGTTTGCGTTGGAGC, Main_cell_type: Early mesenchyme, Main_trajectory: Mesenchymal trajectory, development_stage: 9.5)","(sci3-me-760.GTTCGCTGTAGAGCTATAA, Main_cell_type: Intermediate Mesoderm, Main_trajectory: Neural tube and notochord trajectory, development_stage: 9.5)","(sci3-me-760.GATAAGCGTGCGTTGGAGC, Main_cell_type: Epithelial cells, Main_trajectory: Epithelial trajectory, development_stage: 9.5)","(sci3-me-760.AAATTCCTCTAGAGCTATAA, Main_cell_type: Neural Tube, Main_trajectory: Neural tube and notochord trajectory, development_stage: 9.5)","(sci3-me-760.AGTAGCCATGCGTTGGAGC, Main_cell_type: Cholinergic neurons, Main_trajectory: Neural tube and notochord trajectory, development_stage: 9.5)","(sci3-me-760.GAACGATGTAGAGCTATAA, Main_cell_type: Epithelial cells, Main_trajectory: Epithelial trajectory, development_stage: 9.5)"
Gm42418,5,4,21,7,10,15,13,5,3,5,...,6,13,5,9,4,15,7,26,40,10
Gpc6,1,16,10,15,3,0,6,4,5,1,...,69,11,0,2,3,10,7,8,19,4
Dcc,0,1,2,10,8,0,0,0,16,11,...,0,0,0,0,0,0,0,0,0,0
mt-Rnr1,2,2,4,1,1,1,2,1,1,0,...,5,4,2,2,0,4,0,12,12,0
mt-Rnr2,1,0,4,2,1,1,2,1,0,0,...,5,4,0,0,0,3,1,6,11,2


In [87]:
# NOTE(review): `df_sig` is not defined in any visible cell of this notebook, so
# this cell depends on hidden kernel state and will raise NameError on
# Restart & Run All -- define/load df_sig above or remove this cell.
df_sig.shape

(248, 36)

In [89]:
# Put cells on the rows, then let clustergrammer2 convert the tuple row labels
# into a MultiIndex (judging by the displayed output: levels Name,
# Main_cell_type, Main_trajectory, development_stage).
cells_by_gene = df_cat.transpose()
df_mi = net.row_tuple_to_multiindex(cells_by_gene)
df_mi.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Gm42418,Gpc6,Dcc,mt-Rnr1,mt-Rnr2,Hbb-y,Gpc3,Hba-x,Auts2,Slc8a1,...,Cfap20,Ackr3,Srfbp1,Dcp1b,Snrpd1,Zfp606,Pou6f2,Cc2d2a,Spg7,Psma7
Name,Main_cell_type,Main_trajectory,development_stage,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
sci3-me-002.AGATCGGATGCGTTGGAGC,Early mesenchyme,Mesenchymal trajectory,9.5,5,1,0,2,1,0,2,0,3,2,...,0,0,0,0,0,0,0,0,0,0
sci3-me-002.TAATACCAGTGCGTTGGAGC,Radial glia,Neural tube and notochord trajectory,9.5,4,16,1,2,0,1,1,0,1,8,...,0,0,0,0,0,0,1,0,0,0
sci3-me-002.AGATTCAACTGCGTTGGAGC,Oligodendrocyte Progenitors,Neural tube and notochord trajectory,9.5,21,10,2,4,4,0,4,0,3,1,...,0,0,0,0,0,0,0,0,1,0
sci3-me-002.GTCATCTGTAGAGCTATAA,Schwann cell precursor,Neural crest 2,9.5,7,15,10,1,2,0,0,0,11,0,...,0,0,0,0,0,0,0,0,0,0
sci3-me-002.TCAGTTGGTGCGTTGGAGC,Early mesenchyme,Mesenchymal trajectory,9.5,10,3,8,1,1,0,13,0,20,2,...,0,0,0,0,0,0,0,0,0,0


In [90]:
# Average the expression of all cells sharing a main cell type, then flip the
# result back to genes (rows) x cell types (columns).
mean_by_cell_type = df_mi.groupby(level='Main_cell_type').mean()
df_mean = mean_by_cell_type.T
df_mean.shape

(5000, 36)