# 3.1 10X Genomics 2,700 PBMCs: Cell-Type-Specific Analysis

In [1]:
from clustergrammer2 import net

df = {}
import clustergrammer_groupby as cby
import gene_exp_10x

clustergrammer2 backend version 0.2.9


In [2]:
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
from copy import deepcopy

import matplotlib.pyplot as plt
%matplotlib inline 

In [3]:
def calc_mean_var_disp(df_inst):
    """Compute the mean and variance of every row of a DataFrame.

    Despite the name, no dispersion value is computed; only the mean and
    the variance are returned.

    Parameters
    ----------
    df_inst : pandas.DataFrame
        Matrix whose rows are summarised across their columns.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(ser_mean, ser_var)``, each indexed by the row labels of
        ``df_inst`` in their original order. Variance uses the pandas
        default (``ddof=1``); NaN values are skipped, as with ``Series.mean``
        and ``Series.var``.
    """
    # Vectorized row-wise reductions replace the former per-label ``.loc``
    # loop. Besides being much faster on large matrices, this is positional,
    # so duplicated row labels still yield exactly one value per row
    # (``df.loc[label]`` returns a DataFrame when the label is duplicated).
    ser_mean = df_inst.mean(axis=1)
    ser_var = df_inst.var(axis=1)
    return ser_mean, ser_var

In [4]:
def cell_umi_count(df):
    """Return the sum of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Matrix whose columns are summed (one column per cell in this
        notebook).

    Returns
    -------
    pandas.Series
        Column totals, indexed by the column labels of ``df`` in their
        original order.
    """
    # One vectorized reduction instead of a Python loop over columns; it is
    # positional, so duplicated column labels still give one total per
    # column (``df[label]`` returns a DataFrame when the label is duplicated).
    ser_sum = df.sum(axis=0)
    return ser_sum

### Load Data

In [5]:
# Load the expression matrix from the hg19 matrix directory via the project
# loader. Later cells treat the index as gene names and the columns as cell
# barcodes.
df = gene_exp_10x.load_gene_exp_to_df('../data/pbmc3k_filtered_gene_bc_matrices/hg19/')
df.shape

(32738, 2700)

### Remove Ribosomal and Mitochondrial Genes

In [6]:
all_genes = df.index.tolist()
print(len(all_genes))

# Drop every gene whose name contains 'RPL' or 'RPS'. This is a substring
# match anywhere in the name, not a prefix match, so names that merely
# contain those letters (for example an 'MRPL...' name) are removed as well.
keep_genes = [
    gene for gene in all_genes
    if 'RPL' not in gene and 'RPS' not in gene
]
print(len(keep_genes))

df = df.loc[keep_genes]

# Removing Mitochondrial Genes
list_mito_genes = ['MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L7',
                'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5', 'MTRNR2L1', 'MTRNR2L3', 'MTRNR2L4']

# A gene is treated as mitochondrial when its name starts with 'MT-' or when
# the part before the first underscore is in the explicit list above.
all_genes = df.index.tolist()
mito_genes = [
    gene for gene in all_genes
    if gene.startswith('MT-') or gene.split('_')[0] in list_mito_genes
]
print(mito_genes)

# Keep everything that was not flagged as mitochondrial.
keep_genes = [gene for gene in all_genes if gene not in mito_genes]
df = df.loc[keep_genes]

32738
32546
['MTRNR2L11', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L10', 'MTRNR2L7', 'MTRNR2L5', 'MTRNR2L8', 'MTRF1', 'MTRNR2L4', 'MTRNR2L1', 'MTRNR2L3', 'MT-ND1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP8', 'MT-ATP6', 'MT-CO3', 'MT-ND3', 'MT-ND4L', 'MT-ND4', 'MT-ND5', 'MT-ND6', 'MT-CYB']


### UMI Normalize GEX Data

In [7]:
# Per-cell totals (column sums) of the gene-filtered matrix.
ser_sum = cell_umi_count(df)
# Divide each column by its own total; DataFrame.div aligns the Series index
# with the column labels by default, so every cell becomes a set of fractions.
df = df.div(ser_sum)
print(df.shape)
# Sanity check: after normalization each column should sum to 1.
print(df.sum().head())

(32520, 2700)
AAACATACAACCAC    1.0
AAACATTGAGCTAC    1.0
AAACATTGATCAGC    1.0
AAACCGTGCTTCCG    1.0
AAACCGTGTATGCG    1.0
dtype: float64


### Add CIBERSORT Cell Type Predictions

In [8]:
# load json to dict
def load_to_dict(filename):
  """Read a JSON file and return the parsed object.

  Parameters
  ----------
  filename : str
      Path of the JSON file to read.

  Returns
  -------
  object
      Whatever ``json.load`` produces for the file (a dict when the top-level
      JSON value is an object).

  Raises
  ------
  OSError
      If the file cannot be opened.
  json.JSONDecodeError
      If the file does not contain valid JSON.
  """
  import json
  # The context manager closes the file even when json.load raises; the
  # previous open()/close() pair leaked the handle on a parse error.
  with open(filename, 'r') as f:
    inst_dict = json.load(f)
  return inst_dict

In [9]:
# Mapping that the next cell indexes by cell barcode (the column labels of df)
# to obtain each cell's predicted cell-type label.
ct_type = load_to_dict('../data/pbmc3k_cibersort_ct_predict.json')

In [10]:
# Replace each barcode column label with a (barcode, cell-type label) tuple.
# NOTE: this cell is not re-runnable without reloading df. After it has run,
# the column labels are tuples, and looking a tuple up in ct_type (loaded from
# JSON, so its keys are strings) raises KeyError. A barcode missing from
# ct_type raises KeyError as well.
cols = df.columns.tolist()
new_cols = [(x, ct_type[x]) for x in cols]
df.columns = new_cols

In [11]:
# Preview the first rows; column labels are now (barcode, cell type) tuples.
df.head()

Unnamed: 0,"(AAACATACAACCAC, Cell Type: T cells CD8)","(AAACATTGAGCTAC, Cell Type: B cells memory)","(AAACATTGATCAGC, Cell Type: T cells follicular helper)","(AAACCGTGCTTCCG, Cell Type: Monocytes)","(AAACCGTGTATGCG, Cell Type: NK cells activated)","(AAACGCACTGGTAC, Cell Type: T cells regulatory (Tregs))","(AAACGCTGACCAGT, Cell Type: Dendritic cells resting)","(AAACGCTGGTTCTT, Cell Type: T cells gamma delta)","(AAACGCTGTAGCCA, Cell Type: T cells CD4 memory resting)","(AAACGCTGTTTCTG, Cell Type: Neutrophils)",...,"(TTTCAGTGTCACGA, Cell Type: B cells naive)","(TTTCAGTGTCTATC, Cell Type: Macrophages M2)","(TTTCAGTGTGCAGT, Cell Type: B cells memory)","(TTTCCAGAGGTGAG, Cell Type: T cells follicular helper)","(TTTCGAACACCTGA, Cell Type: T cells regulatory (Tregs))","(TTTCGAACTCTCAT, Cell Type: Monocytes)","(TTTCTACTGAGGCA, Cell Type: Plasma cells)","(TTTCTACTTCCTCG, Cell Type: B cells naive)","(TTTGCATGAGAGGC, Cell Type: B cells naive)","(TTTGCATGCCTCAC, Cell Type: T cells CD4 naive)"
MIR1302-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAM138A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OR4F5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RP11-34P13.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RP11-34P13.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Filter for Specific Cell Type

In [12]:
# Substring searched for in each column's cell-type label by the next cell,
# so every label containing 'NK' is kept.
specific_cell_type = 'NK'

In [13]:
# Keep only the cells whose cell-type label (second element of the column
# tuple) contains specific_cell_type. This is a substring match.
cols = df.columns.tolist()
print(len(cols))
keep_cols = [x for x in cols if specific_cell_type in x[1]]
print(len(keep_cols))
# df is overwritten with the subset, so later cells see only these columns.
df = df[keep_cols]

2700
257


### Find Top-Expressed and Most Variable Genes

In [14]:
# Per-gene mean and variance across the cells currently held in df.
ser_mean, ser_var = calc_mean_var_disp(df)

# Selection sizes: first keep the genes with the highest mean, then pick the
# most variable genes among them.
num_keep_umi = 10000
num_top_var = 250

# Rank genes by mean (highest first) and keep the leading num_keep_umi.
keep_mean = (
    ser_mean
    .sort_values(ascending=False)
    .head(num_keep_umi)
    .index
    .tolist()
)

# Restrict the matrix to those genes, in mean-ranked order.
df = df.loc[keep_mean]

# Among the retained genes, rank by variance and keep the leading num_top_var.
ser_keep_var = ser_var.loc[keep_mean]
keep_var = (
    ser_keep_var
    .sort_values(ascending=False)
    .index[:num_top_var]
    .tolist()
)

In [15]:
# Shape of the top-variance subset: (number of selected genes, number of cells).
df.loc[keep_var].shape

(250, 257)

In [16]:
# Hand the top-variance gene rows to the clustergrammer2 network object.
net.load_df(df.loc[keep_var])
# Z-score along the row axis; rows are genes here.
net.normalize(axis='row', norm_type='zscore')
# Presumably limits the z-scored values to the range [-5, 5] so outliers do
# not dominate the display. TODO confirm against the clustergrammer2 docs.
net.clip(-5, 5)
# Last expression of the cell, so the widget is rendered as the cell output.
net.widget()

ExampleWidget(network='{"row_nodes": [{"name": "MALAT1", "ini": 250, "clust": 37, "rank": 195, "rankvar": 195,…