# 5.0 10X Genomics PBMC 3K Dataset

In [32]:
# Notebook-wide setup.
# `Network` and `clustergrammer_widget` are presumably provided by the star import
# below — TODO confirm; an explicit import would make their origin visible.
from clustergrammer_widget import *
# Single Network instance that the later cells reload with different DataFrames.
net = Network(clustergrammer_widget)
# Registry of named DataFrames (e.g. 'ge-ini', 'ge', 'ge-z') filled in by later cells.
df = {}

import clustergrammer_groupby as cby
import gene_exp_10x

In [33]:
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
from copy import deepcopy

import matplotlib.pyplot as plt
%matplotlib inline 

### Load Data

In [34]:
# Read the 10X filtered gene/barcode matrix (hg19) into a DataFrame kept as df['ge-ini'].
# Later cells treat the index as gene labels; the recorded run shows 32738 rows x 2700 columns.
df['ge-ini'] = gene_exp_10x.load_gene_exp_to_df('../data/pbmc3k_filtered_gene_bc_matrices/hg19/')
df['ge-ini'].shape

(32738, 2700)

In [35]:
# Filter the raw matrix and normalize it per barcode. The result is left in df['ge'].
#   1. drop ribosomal genes (row labels containing 'RPL' or 'RPS'),
#   2. drop mitochondrial genes ('MT-' prefix, or symbol listed in list_mito_genes),
#   3. divide every column by its column sum (UMI-count normalization).
# The cell always starts from df['ge-ini'], so re-running it gives the same df['ge'].
all_genes = df['ge-ini'].index.tolist()
print(len(all_genes))

# Substring test: any label that merely contains 'RPL'/'RPS' (for example a name
# starting with 'MRPL') is removed as well. NOTE(review): confirm that is intended.
keep_genes = [x for x in all_genes if 'RPL' not in x and 'RPS' not in x]
print(len(keep_genes))

df['ge'] = df['ge-ini'].loc[keep_genes]

# Removing Mitochondrial Genes
list_mito_genes = ['MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L7',
                'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5', 'MTRNR2L1', 'MTRNR2L3', 'MTRNR2L4']

all_genes = df['ge'].index.tolist()
# Row labels have the form '<symbol>_<number>' (e.g. 'MTRNR2L11_3066'), so the symbol
# is the part before the first underscore.
mito_genes = [x for x in all_genes if 'MT-' == x[:3] or
              x.split('_')[0] in list_mito_genes]
print(mito_genes)

# Set membership avoids rescanning the mito_genes list once per remaining gene.
mito_gene_set = set(mito_genes)
keep_genes = [x for x in all_genes if x not in mito_gene_set]
# Label-based selection with .loc; DataFrame.ix is deprecated and absent from pandas >= 1.0.
df['ge'] = df['ge'].loc[keep_genes]

# normalize by UMI count
# barcode_umi_sum holds one total per column (barcode); a later cell reuses it for labels.
barcode_umi_sum = df['ge'].sum()
df['ge'] = df['ge'].div(barcode_umi_sum)

32738
32546
['MTRNR2L11_3066', 'MTRNR2L12_6165', 'MTRNR2L13_7998', 'MTRF1L_11630', 'MTRNR2L6_13036', 'MTRNR2L10_13646', 'MTRNR2L7_17194', 'MTRNR2L5_17355', 'MTRNR2L8_18439', 'MTRF1_21974', 'MTRNR2L4_24777', 'MTRNR2L1_26599', 'MTRNR2L3_29240', 'MT-ND1_32696', 'MT-ND2_32697', 'MT-CO1_32698', 'MT-CO2_32699', 'MT-ATP8_32700', 'MT-ATP6_32701', 'MT-CO3_32702', 'MT-ND3_32703', 'MT-ND4L_32704', 'MT-ND4_32705', 'MT-ND5_32706', 'MT-ND6_32707', 'MT-CYB_32708']


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [36]:
# Produce a row-wise z-scored version of the UMI-normalized matrix and keep it as
# df['ge-z']; a later cell passes it to cby.predict_cats_from_sigs.
net.load_df(df['ge'])
net.normalize(axis='row', norm_type='zscore')
# presumably replaces NaN entries left by the normalization with 0 — TODO confirm
net.swap_nan_for_zero()
df['ge-z'] = net.export_df()
df['ge-z'].shape

(32520, 2700)

### Visualize Original Dataset

In [37]:
# Heatmap preview of the UMI-normalized matrix, reduced to a size the widget can show.
net.load_df(df['ge'])
# keep 250 rows ranked by 'var' (presumably variance — TODO confirm)
net.filter_N_top(inst_rc='row', N_top=250, rank_type='var')
net.normalize(axis='row', norm_type='zscore')
# sample 250 columns; random_state=99 is passed so the sample is presumably repeatable
net.random_sample(axis='col', num_samples=250, random_state=99)
# bound the z-scores to [-5, 5] before clustering
net.clip(lower=-5, upper=5)
net.cluster()
net.widget()

### Load NM3337 gene signatures

In [47]:
# Load the broad and the narrow NM3337 cell-type signature files through `net`,
# storing each exported DataFrame in `df` and printing its shape.
for _sig_key, _sig_path in [
    ('bct-sig', '../data/cell_type_signatures/nm3337_broad_cell_type_sigs.txt'),
    ('nct-sig', '../data/cell_type_signatures/nm3337_narrow_cell_type_sigs.txt'),
]:
    net.load_file(_sig_path)
    df[_sig_key] = net.export_df()
    print(df[_sig_key].shape)

(523, 9)
(523, 22)


In [52]:
# Compare the number of signature row labels with the number of distinct labels that
# remain once everything from the first underscore onward is removed.
sig_rows = df['bct-sig'].index.tolist()
clean_sig_rows = [label.partition('_')[0] for label in sig_rows]
n_unique_sig = len(set(clean_sig_rows))
print(len(clean_sig_rows), n_unique_sig)

523 523


In [54]:
# Same comparison for the raw expression matrix: total row labels versus distinct
# labels after removing everything from the first underscore onward.
ge_rows = df['ge-ini'].index.tolist()
clean_ge_rows = [label.partition('_')[0] for label in ge_rows]
n_unique_ge = len(set(clean_ge_rows))
print(len(ge_rows), n_unique_ge)

32738 32641


In [43]:
# Number of row labels present, with identical spelling, in both the expression
# matrix and the signature table.
len(set(ge_rows) & set(sig_rows))

2

# Predict Cell Types using NM3337 Signatures

In [29]:
# rows = df['nct-sig'].index.tolist()
# new_rows = [x.split('_')[0] for x in rows]
# df['nct-sig'].index = new_rows

In [30]:
# rows = df['ge-z'].index.tolist()
# new_rows = [x.split('_')[0] for x in rows]
# df['ge-z'].index = new_rows

In [31]:
# Predict a 'Cell Type' category per column of the z-scored matrix from the narrow
# signature table; the three returned values are kept as df['pred_cat'], df['sig_sim']
# and y_info.
# NOTE(review): the recorded run of this cell raised
# "ValueError: Incompatible dimension for X and Y matrices" (507 vs 509); the cause is
# not visible from this cell and needs to be resolved before Restart & Run All works.
df['pred_cat'], df['sig_sim'], y_info = cby.predict_cats_from_sigs(df['ge-z'], df['nct-sig'], 
                                                                   predict_level='Cell Type')

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 507 while Y.shape[1] == 509

In [19]:
# Preview the prediction frame; in the recorded output its columns are
# (barcode, 'Cell Type: ...') tuples.
df['pred_cat'].head()

Unnamed: 0,"(AAACATACAACCAC, Cell Type: T cells CD4 memory activated)","(AAACATTGAGCTAC, Cell Type: T cells CD4 memory activated)","(AAACATTGATCAGC, Cell Type: T cells CD4 memory activated)","(AAACCGTGCTTCCG, Cell Type: T cells CD4 memory activated)","(AAACCGTGTATGCG, Cell Type: T cells CD4 memory activated)","(AAACGCACTGGTAC, Cell Type: T cells CD4 memory activated)","(AAACGCTGACCAGT, Cell Type: T cells CD4 memory activated)","(AAACGCTGGTTCTT, Cell Type: T cells CD4 memory activated)","(AAACGCTGTAGCCA, Cell Type: T cells CD4 memory activated)","(AAACGCTGTTTCTG, Cell Type: T cells CD4 memory activated)",...,"(TTTCAGTGTCACGA, Cell Type: T cells CD4 memory activated)","(TTTCAGTGTCTATC, Cell Type: T cells CD4 memory activated)","(TTTCAGTGTGCAGT, Cell Type: T cells CD4 memory activated)","(TTTCCAGAGGTGAG, Cell Type: T cells CD4 memory activated)","(TTTCGAACACCTGA, Cell Type: T cells CD4 memory activated)","(TTTCGAACTCTCAT, Cell Type: T cells CD4 memory activated)","(TTTCTACTGAGGCA, Cell Type: T cells CD4 memory activated)","(TTTCTACTTCCTCG, Cell Type: T cells CD4 memory activated)","(TTTGCATGAGAGGC, Cell Type: T cells CD4 memory activated)","(TTTGCATGCCTCAC, Cell Type: T cells CD4 memory activated)"
PKD2L2_9573,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,...,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926
IL9_9553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Independent deep copy of the UMI-normalized matrix; a later cell overwrites its
# column labels, so df['ge'] itself must stay untouched.
df['ge-cat'] = df['ge'].copy(deep=True)
df['ge-cat'].shape

(32520, 2700)

In [25]:
# Display the complete prediction frame (the same frame an earlier cell previews with .head()).
df['pred_cat']

Unnamed: 0,"(AAACATACAACCAC, Cell Type: T cells CD4 memory activated)","(AAACATTGAGCTAC, Cell Type: T cells CD4 memory activated)","(AAACATTGATCAGC, Cell Type: T cells CD4 memory activated)","(AAACCGTGCTTCCG, Cell Type: T cells CD4 memory activated)","(AAACCGTGTATGCG, Cell Type: T cells CD4 memory activated)","(AAACGCACTGGTAC, Cell Type: T cells CD4 memory activated)","(AAACGCTGACCAGT, Cell Type: T cells CD4 memory activated)","(AAACGCTGGTTCTT, Cell Type: T cells CD4 memory activated)","(AAACGCTGTAGCCA, Cell Type: T cells CD4 memory activated)","(AAACGCTGTTTCTG, Cell Type: T cells CD4 memory activated)",...,"(TTTCAGTGTCACGA, Cell Type: T cells CD4 memory activated)","(TTTCAGTGTCTATC, Cell Type: T cells CD4 memory activated)","(TTTCAGTGTGCAGT, Cell Type: T cells CD4 memory activated)","(TTTCCAGAGGTGAG, Cell Type: T cells CD4 memory activated)","(TTTCGAACACCTGA, Cell Type: T cells CD4 memory activated)","(TTTCGAACTCTCAT, Cell Type: T cells CD4 memory activated)","(TTTCTACTGAGGCA, Cell Type: T cells CD4 memory activated)","(TTTCTACTTCCTCG, Cell Type: T cells CD4 memory activated)","(TTTGCATGAGAGGC, Cell Type: T cells CD4 memory activated)","(TTTGCATGCCTCAC, Cell Type: T cells CD4 memory activated)"
PKD2L2_9573,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,...,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926,-0.023926
IL9_9553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Relabel the copied matrix with the predicted column tuples, then create a second
# copy whose column tuples additionally carry each barcode's UMI total.
cat_cols = df['pred_cat'].columns.tolist()
df['ge-cat'].columns = cat_cols

# Snapshot taken after the relabeling above so it starts from the category labels.
df['ge-cat-umi'] = deepcopy(df['ge-cat'])

# Each column tuple is extended with 'UMI: <total>', looked up by its first element.
new_cols = [
    (col[0], col[1], 'UMI: ' + str(barcode_umi_sum[col[0]]))
    for col in cat_cols
]
df['ge-cat-umi'].columns = new_cols
print(df['ge-cat-umi'].shape)

(32520, 2700)


In [24]:
# Heatmap of the matrix whose columns carry predicted cell type and UMI total.
net.load_df(df['ge-cat-umi'])
# colour the 'Cell Type: T cells CD8' value of column category 1 red
net.set_cat_color(axis='col', cat_index=1, cat_name='Cell Type: T cells CD8', inst_color='red')
net.filter_N_top(inst_rc='row', N_top=250, rank_type='var')
# NOTE(review): the 250 columns are sampled BEFORE the row z-score here, whereas the
# earlier "Visualize Original Dataset" cell normalizes first and samples afterwards —
# confirm which order is intended, since the two views are otherwise not comparable.
net.random_sample(axis='col', num_samples=250, random_state=99)
net.normalize(axis='row', norm_type='zscore')
net.clip(lower=-5, upper=5)
net.cluster()
net.widget()