# 3.0 10X Genomics PBMC 2,700 Dataset

In [1]:
from clustergrammer2 import net

# Placeholder binding for `df`; it is rebound to the result of
# gene_exp_10x.load_gene_exp_to_df in the "Load Data" cell below.
df = {}
import clustergrammer_groupby as cby
import gene_exp_10x

clustergrammer2 backend version 0.2.9


In [2]:
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
from copy import deepcopy

import matplotlib.pyplot as plt
%matplotlib inline 

In [3]:
def calc_mean_var_disp(df_inst):
    """Return the per-row mean and variance of a DataFrame.

    Parameters
    ----------
    df_inst : pandas.DataFrame
        Table whose rows are summarised; statistics are taken across the
        columns of each row.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(ser_mean, ser_var)``, both indexed by the row labels of
        ``df_inst`` in their original order. The variance uses pandas'
        default (sample variance, ``ddof=1``).

    Notes
    -----
    Despite the function name, no dispersion value is computed or returned.
    The statistics are computed with vectorised row-wise reductions rather
    than a per-row ``.loc`` lookup, which also keeps the result well-formed
    when row labels are duplicated (a label lookup would return several rows
    at once in that case).
    """
    ser_mean = df_inst.mean(axis=1)
    ser_var = df_inst.var(axis=1)
    return ser_mean, ser_var

In [4]:
def cell_umi_count(df):
    """Return the sum of every column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are summed (one total per column).

    Returns
    -------
    pandas.Series
        Column totals, indexed by the column labels of ``df`` in their
        original order.

    Notes
    -----
    Uses a single vectorised column reduction instead of looping over column
    labels; this also gives one total per column when column labels are
    duplicated (a label lookup would return several columns at once).
    """
    return df.sum(axis=0)

### Load Data

In [5]:
# Read the 10x matrix directory into `df` through the project helper module.
# Later cells treat the index as gene names and the columns as cell barcodes.
df = gene_exp_10x.load_gene_exp_to_df('../data/pbmc3k_filtered_gene_bc_matrices/hg19/')
df.shape

(32738, 2700)

### Remove Ribosomal and Mitochondrial Genes

In [6]:
# --- Ribosomal genes ---
# Drop every gene whose name contains 'RPL' or 'RPS'.
# NOTE(review): this is a substring match, so names that merely contain
# 'RPL'/'RPS' (not only those starting with it) are dropped too — confirm
# that this is intended.
all_genes = df.index.tolist()
print(len(all_genes))
keep_genes = [gene for gene in all_genes if not ('RPL' in gene or 'RPS' in gene)]
print(len(keep_genes))

df = df.loc[keep_genes]

# --- Mitochondrial genes ---
# A gene is dropped when its name starts with 'MT-' or when the part of its
# name before the first underscore is in the explicit list below.
list_mito_genes = [
    'MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L7',
    'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5', 'MTRNR2L1', 'MTRNR2L3', 'MTRNR2L4',
]

all_genes = df.index.tolist()
mito_genes = [
    gene for gene in all_genes
    if gene.startswith('MT-') or gene.split('_')[0] in list_mito_genes
]
print(mito_genes)

keep_genes = [gene for gene in all_genes if gene not in mito_genes]
df = df.loc[keep_genes]

32738
32546
['MTRNR2L11', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L10', 'MTRNR2L7', 'MTRNR2L5', 'MTRNR2L8', 'MTRF1', 'MTRNR2L4', 'MTRNR2L1', 'MTRNR2L3', 'MT-ND1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP8', 'MT-ATP6', 'MT-CO3', 'MT-ND3', 'MT-ND4L', 'MT-ND4', 'MT-ND5', 'MT-ND6', 'MT-CYB']


### UMI Normalize GEX Data

In [7]:
# Per-column totals of `df` (one value per cell barcode).
ser_sum = cell_umi_count(df)

# Divide each column by its own total so that every column sums to 1.
df = df.div(ser_sum, axis='columns')

# Sanity check: shape is reported and the first few column sums are shown.
print(df.shape)
print(df.sum(axis=0).head())

(32520, 2700)
AAACATACAACCAC    1.0
AAACATTGAGCTAC    1.0
AAACATTGATCAGC    1.0
AAACCGTGCTTCCG    1.0
AAACCGTGTATGCG    1.0
dtype: float64


### Find top expressing genes 

In [8]:
# Row-wise mean and variance of the normalised matrix.
ser_mean, ser_var = calc_mean_var_disp(df)

# How many genes to keep by mean, and how many of those to keep by variance.
num_keep_umi = 15000
num_top_var = 250

# Keep the `num_keep_umi` genes with the highest mean.
keep_mean = (
    ser_mean
    .sort_values(ascending=False)
    .head(num_keep_umi)
    .index
    .tolist()
)

df = df.loc[keep_mean]

# Among the kept genes, take the `num_top_var` with the highest variance.
# Note the variance used here was computed before `df` was subset above.
ser_keep_var = ser_var.loc[keep_mean]
keep_var = ser_keep_var.sort_values(ascending=False).index[:num_top_var].tolist()

### ArcSinh Transform and Z-score GEX Data

In [9]:
# ArcSinh transform
# Applied element-wise after dividing every value by 5.
df = np.arcsinh(df/5)

# Z-score genes
# Round-trip through the shared `net` object: load the frame, normalise along
# rows (the rows of `df` are genes), then export the result back into `df`.
# NOTE(review): presumably load_df replaces whatever `net` held before — confirm.
net.load_df(df)
net.normalize(axis='row', norm_type='zscore')
df = net.export_df()

# Report the shape of the transformed frame.
print(df.shape)

(15000, 2700)


# Unlabeled Cells 

In [10]:
# Load only the rows listed in `keep_var` (the top-variance genes chosen earlier).
net.load_df(df.loc[keep_var])
# Clip values to the range [-5, 5] before building the widget.
net.clip(lower=-5, upper=5)
net.widget()

ExampleWidget(network='{"row_nodes": [{"name": "MALAT1", "ini": 250, "clust": 143, "rank": 237, "rankvar": 237…

### Load CIBERSORT gene signatures

In [11]:
# Load the signature table, z-score it along rows, and pull it out as a DataFrame.
net.load_file('../data/cell_type_signatures/nm3337_narrow_cell_type_sigs.txt')
net.normalize(axis='row', norm_type='zscore')
df_sig = net.export_df()
print(df_sig.shape)

# Relabel rows with only the text before the first underscore of each row name.
rows = df_sig.index.tolist()
new_rows = [label.partition('_')[0] for label in rows]
df_sig.index = new_rows

(523, 22)


In [12]:
# Colour assigned to each cell-type name. The trailing empty-string entry is a
# placeholder that maps the empty name to an empty colour.
ct_color = {
    'T cells CD8': 'red',
    'T cells CD4 naive': 'blue',
    'T cells CD4 memory activated': 'blue',
    'T cells CD4 memory resting': '#87cefa',  # sky blue
    'B cells naive': 'purple',
    'B cells memory': '#DA70D6',  # orchid
    'NK cells activated': 'yellow',
    'NK cells resting': '#FCD116',  # sign yellow
    'Monocytes': '#98ff98',  # mint green
    'Macrophages M0': '#D3D3D3',  # light grey
    'Macrophages M1': '#C0C0C0',  # silver
    'Macrophages M2': '#A9A9A9',  # dark grey
    '': '',
}

In [13]:
# Register one colour per cell type on the column category at index 1.
# Colours are read from the `ct_color` mapping so the palette is defined in a
# single place instead of being repeated as one hard-coded call per cell type.
for _ct_name, _ct_hex in ct_color.items():
    if not _ct_name:
        # `ct_color` holds an empty-string placeholder entry; it names no category.
        continue
    net.set_cat_color(axis='col', cat_index=1, cat_name=_ct_name, inst_color=_ct_hex)

In [14]:
# Show the signature matrix `df_sig`, with values clipped to the range [-5, 5].
net.load_df(df_sig)
net.clip(lower=-5, upper=5)
net.widget()

ExampleWidget(network='{"row_nodes": [{"name": "ABCB4", "ini": 523, "clust": 318, "rank": 341, "rankvar": 8, "…

# Predict Cell Types using CIBERSORT Signatures

In [15]:
# Predict a category for each column of `df` from the signature matrix `df_sig`.
# NOTE(review): the meaning of `predict_level` and `unknown_thresh` is defined
# inside clustergrammer_groupby and is not visible here — confirm before tuning.
df_pred_cat, df_sig_sim, y_info = cby.predict_cats_from_sigs(
    df,
    df_sig,
    predict_level='Cell Type',
    unknown_thresh=0.05,
)

# Copy the column labels of the prediction frame onto `df`.
df.columns = df_pred_cat.columns.tolist()
print(df_pred_cat.shape)

(403, 2700)


### Cell Type Similarity

In [22]:
# Give the similarity matrix the same column labels as the prediction frame,
# then display it.
# NOTE(review): this cell's execution count (22) is out of sequence with its
# neighbours (15 and 16) — restart the kernel and run all cells to confirm it
# still works in this position.
df_sig_sim.columns = df_pred_cat.columns.tolist()
net.load_df(df_sig_sim)
net.widget()

ExampleWidget(network='{"row_nodes": [{"name": "B cells naive", "ini": 22, "clust": 10, "rank": 0, "rankvar": …

In [16]:
net.load_df(df_pred_cat)

# Register one colour per predicted category on the column category at index 1.
# The category names used here are the `ct_color` keys prefixed with
# 'Cell Type: '; reading the colours from `ct_color` keeps the palette in one
# place instead of repeating a hard-coded call per cell type.
for _ct_name, _ct_hex in ct_color.items():
    if not _ct_name:
        # `ct_color` holds an empty-string placeholder entry; it names no category.
        continue
    net.set_cat_color(axis='col', cat_index=1, cat_name='Cell Type: ' + _ct_name, inst_color=_ct_hex)

# Cells in CIBERSORT GEX Space

In [17]:
# Show the prediction frame returned by cby.predict_cats_from_sigs, with values
# clipped to the range [-5, 5].
net.load_df(df_pred_cat)
net.clip(lower=-5, upper=5)
net.widget()

ExampleWidget(network='{"row_nodes": [{"name": "P2RY14", "ini": 403, "clust": 372, "rank": 75, "rankvar": 121,…

# Cells with CIBERSORT Predictions, Top Genes Based on Variance

In [18]:
# Same `keep_var` gene selection as the unlabeled view. By this point the
# columns of `df` have been replaced with those of `df_pred_cat`.
net.load_df(df.loc[keep_var])
net.clip(lower=-5, upper=5)
net.widget()

ExampleWidget(network='{"row_nodes": [{"name": "MALAT1", "ini": 250, "clust": 143, "rank": 237, "rankvar": 237…