This notebook follows 05_make_rentina_subsets_for_teaching.

Inputs:

1. cell_metadata was output from 03-make-cell-metadata
2. mouse_gene_metadata was output from 04-make-mouse-gene-metadata
3. GSM1626793_P14Retina_1.digital_expression.txt was downloaded from [here](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63472).


This notebook outputs the following files:
1. big_clusters_expression.csv
2. big_clusters_cell_metadata.csv
3. big_clusters_gene_metadata.csv
4. amacrine_expression.csv
5. amacrine_cell_metadata.csv
6. amacrine_gene_metadata.csv
7. differential_clusters_expression.csv
8. differential_clusters_cell_metadata.csv
9. differential_clusters_gene_metadata.csv

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Master switch for the `if to_save:` cells below: leave False to run the
# notebook without writing any CSV files.
to_save = False

# Table1

In [7]:
# Read the tab-separated expression table and transpose it straight away,
# so the `gene` column of the file becomes a `gene` row.
table1 = pd.read_csv(
    "data/downloads/GSM1626793_P14Retina_1.digital_expression.txt",
    sep='\t',
).T

# Preprocessing: promote the `gene` row to the column labels, then remove
# that row from the data.
table1.columns = table1.loc['gene']
table1 = table1.drop(index=['gene'])

In [8]:
# Column labels that end in "Nav1" (the labels still carry their
# chromosome:start-end prefix at this point).
nav1_rows = [label for label in table1.columns if label.endswith('Nav1')]
nav1_rows

['1:135434580-135585355:Nav1', '1:135606447-135688105:Nav1']

In [9]:
def _gene_symbol(column_label):
    """Return the upper-cased text after the last ':' of a column label."""
    return column_label.rsplit(':', 1)[-1].upper()


gene_symbols = table1.columns.map(_gene_symbol)
gene_symbols

Index(['KITL', 'TMTC3', 'CEP290', '4930430F08RIK', '1700017N19RIK', 'MGAT4C',
       'RASSF9', 'LRRIQ1', 'ADGB', 'SLC6A15',
       ...
       'EFNB1', 'PJA1', 'TMEM28', 'EDA', 'EIF2S3Y', 'GM20775', 'UTY', 'DDX3Y',
       'KDM5D', 'N-R5S1'],
      dtype='object', name='gene', length=20478)

In [10]:
# Relabel the columns with the upper-cased gene symbols; symbols that occur
# more than once now appear as duplicate column labels.
table1.columns = gene_symbols
table1.head()

gene,KITL,TMTC3,CEP290,4930430F08RIK,1700017N19RIK,MGAT4C,RASSF9,LRRIQ1,ADGB,SLC6A15,...,EFNB1,PJA1,TMEM28,EDA,EIF2S3Y,GM20775,UTY,DDX3Y,KDM5D,N-R5S1
GGCCGCAGTCCG,0,3,1,2,0,0,0,0,0,4,...,0,8,1,0,0,0,0,0,0,0
CTTGTGCGGGAA,0,0,3,1,0,0,0,0,0,1,...,0,9,0,0,2,0,1,5,0,0
GCGCAACTGCTC,1,0,0,2,0,4,0,0,0,3,...,1,11,0,0,0,0,0,0,0,0
GATTGGGAGGCA,0,0,2,0,0,1,0,0,0,2,...,1,2,0,0,3,0,1,0,0,0
CCTCCTAGTTGG,0,2,1,1,0,2,0,0,0,1,...,0,3,0,0,0,0,0,0,0,0


In [11]:
# Sum gene symbols with same name.
# `groupby(axis=1, ...)` is deprecated in pandas 2.1 and removed in pandas 3;
# the documented replacement is to group the transposed frame by its index
# labels and transpose the result back, which yields the same table
# (cells as rows, one column per unique gene symbol).
table1 = table1.T.groupby(level=0).sum().T
print(table1.shape)
table1.head()

(6600, 20426)


gene,0610005C13RIK,0610007N19RIK,0610007P14RIK,0610009B14RIK,0610009B22RIK,0610009D07RIK,0610009E02RIK,0610009L18RIK,0610009O20RIK,0610010F05RIK,...,ZUFSP,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
GGCCGCAGTCCG,0.0,0.0,3.0,0.0,2.0,7.0,0.0,1.0,1.0,6.0,...,2.0,0.0,0.0,53.0,0.0,1.0,4.0,0.0,1.0,0.0
CTTGTGCGGGAA,0.0,1.0,1.0,0.0,1.0,5.0,0.0,0.0,1.0,5.0,...,0.0,0.0,0.0,65.0,0.0,1.0,6.0,1.0,3.0,2.0
GCGCAACTGCTC,0.0,0.0,1.0,0.0,3.0,7.0,0.0,0.0,1.0,9.0,...,1.0,0.0,0.0,38.0,0.0,0.0,2.0,0.0,0.0,0.0
GATTGGGAGGCA,0.0,0.0,2.0,2.0,1.0,1.0,0.0,0.0,2.0,2.0,...,0.0,0.0,0.0,17.0,0.0,0.0,2.0,0.0,2.0,0.0
CCTCCTAGTTGG,0.0,0.0,1.0,0.0,1.0,2.0,0.0,0.0,1.0,8.0,...,1.0,0.0,0.0,38.0,0.0,0.0,5.0,0.0,0.0,0.0


In [12]:
# Write the per-symbol summed expression matrix (only when saving is enabled).
if to_save:
    retina_batch1_path = 'data/retina_batch1_expression.csv'
    table1.to_csv(retina_batch1_path, index=True)

# Clusters

In [14]:
# Per-cell annotations; the first CSV column is used as the row index.
cell_metadata = pd.read_csv(
    "data/metadata/cell_metadata.csv",
    index_col=0,
)
print(cell_metadata.shape)
cell_metadata.head()

(44808, 4)


Unnamed: 0_level_0,cluster_n,cluster_id,celltype,cluster_celltype_with_id
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
r1_GGCCGCAGTCCG,2,cluster_02,Retinal ganglion cells,Retinal ganglion cells (cluster_02)
r1_CTTGTGCGGGAA,2,cluster_02,Retinal ganglion cells,Retinal ganglion cells (cluster_02)
r1_GCGCAACTGCTC,2,cluster_02,Retinal ganglion cells,Retinal ganglion cells (cluster_02)
r1_GATTGGGAGGCA,2,cluster_02,Retinal ganglion cells,Retinal ganglion cells (cluster_02)
r1_GTGCCGCCTCTC,25,cluster_25,Cones,Cones (cluster_25)


In [15]:
# Add `r1_` prefix to barcodes to indicate the first run.
# Guarded so that re-running this cell does not stack a second prefix
# (`r1_r1_...`), which would make every later lookup against
# `cell_metadata` come back empty.
if not table1.index.str.startswith('r1_').all():
    table1.index = 'r1_' + table1.index

# Count the table1 cells assigned to each cluster. Grouping is by row
# (the default axis); the explicit `axis=0` argument was dropped because
# the `axis` keyword of `groupby` is deprecated in recent pandas.
cluster_sizes_table1 = table1.groupby(cell_metadata['cluster_id']).size()
cluster_sizes_table1

cluster_id
cluster_01      27
cluster_02      70
cluster_03      44
cluster_04      13
cluster_05      18
cluster_06      42
cluster_07      54
cluster_08      23
cluster_09      42
cluster_10      29
cluster_11      31
cluster_12      47
cluster_13      11
cluster_14      10
cluster_15      13
cluster_16      36
cluster_17      72
cluster_18      14
cluster_19      19
cluster_20      64
cluster_21      47
cluster_22      46
cluster_23      39
cluster_24    3746
cluster_25     241
cluster_26     317
cluster_27     126
cluster_28      56
cluster_29      85
cluster_30      87
cluster_31      80
cluster_32      54
cluster_33     114
cluster_34     244
cluster_35       4
cluster_36      13
cluster_37      24
cluster_38       9
cluster_39       9
dtype: int64

# Big Clusters

In [16]:
# "Big" clusters: those with more than 100 cells in table1.
is_big_cluster = cluster_sizes_table1 > 100
big_clusters = cluster_sizes_table1[is_big_cluster]
big_clusters

cluster_id
cluster_24    3746
cluster_25     241
cluster_26     317
cluster_27     126
cluster_33     114
cluster_34     244
dtype: int64

In [17]:
# Flag every cell in `cell_metadata` whose cluster is one of the big
# clusters, then keep only the True entries. This covers all cells in the
# metadata, not just those present in table1.
in_big_cluster = cell_metadata['cluster_id'].isin(big_clusters.index)
cells_in_big_clusters = in_big_cluster[in_big_cluster]
cells_in_big_clusters.sum()

36622

In [18]:
# Inner-align on the row index: keep only the barcodes present in both
# table1 and `cells_in_big_clusters`. `y` is the aligned boolean Series.
table1_big_clusters, y = table1.align(cells_in_big_clusters, axis=0, join='inner')
print(table1_big_clusters.shape)
print(y.shape)

(4788, 20426)
(4788,)


### Take 50 random cells from each cluster

In [19]:
# Fixed seed so the same cells are drawn on every run.
np.random.seed(2017)

n_cells = 50


def _draw_random_cells(cluster_cells):
    """Return `n_cells` rows of one cluster, drawn without replacement."""
    chosen = np.random.choice(cluster_cells.index, size=n_cells, replace=False)
    return cluster_cells.loc[chosen]


table1_big_clusters_subset = table1_big_clusters.groupby(
    cell_metadata['cluster_id'], as_index=False, group_keys=False
).apply(_draw_random_cells)
print(table1_big_clusters_subset.shape)
table1_big_clusters_subset.head()

(300, 20426)


gene,0610005C13RIK,0610007N19RIK,0610007P14RIK,0610009B14RIK,0610009B22RIK,0610009D07RIK,0610009E02RIK,0610009L18RIK,0610009O20RIK,0610010F05RIK,...,ZUFSP,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
r1_CGCACTAGAAGG,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r1_CATAGAGACCGC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r1_GTCAAACTTAGG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r1_ATCCAATGCGCC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r1_GAGGTTCATACA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Subset the cell metadata too

In [20]:
# Metadata rows for exactly the sampled cells, in the same order.
sampled_barcodes = table1_big_clusters_subset.index
cell_metadata_big_clusters = cell_metadata.loc[sampled_barcodes]
print(cell_metadata_big_clusters.shape)
cell_metadata_big_clusters.head()

(300, 4)


Unnamed: 0,cluster_n,cluster_id,celltype,cluster_celltype_with_id
r1_CGCACTAGAAGG,24,cluster_24,Rods,Rods (cluster_24)
r1_CATAGAGACCGC,24,cluster_24,Rods,Rods (cluster_24)
r1_GTCAAACTTAGG,24,cluster_24,Rods,Rods (cluster_24)
r1_ATCCAATGCGCC,24,cluster_24,Rods,Rods (cluster_24)
r1_GAGGTTCATACA,24,cluster_24,Rods,Rods (cluster_24)


### Subset genes by differential expression

In [22]:
gene_metadata = pd.read_csv(
    'data/metadata/mouse_gene_metadata.csv',
    index_col=0,
)

# Remove the index name because it causes problems
gene_metadata.index.name = None

# For the purposes of teaching, we'll rename "retina" to "cluster" so
# it's consistent with the cell metadata
renamed_columns = []
for column in gene_metadata:
    renamed_columns.append(column.replace('retina', 'cluster'))
gene_metadata.columns = renamed_columns
gene_metadata.head()

Unnamed: 0,cluster_01,cluster_02,cluster_03,cluster_04,cluster_05,cluster_06,cluster_07,cluster_08,cluster_09,cluster_10,...,cluster_37,cluster_38,cellcycle_01,cellcycle_02,cellcycle_03,cellcycle_04,cellcycle_05,cellcycle_06,cellcycle_07,cellcycle_08
1500015O10RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1500016L03RIK,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1700025G04RIK,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1810009A15RIK,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
1810037I17RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [23]:
# True for genes flagged in at least one of the big clusters.
big_cluster_flags = gene_metadata[big_clusters.index]
in_big_clusters = big_cluster_flags.any(axis=1)
print(in_big_clusters.sum())
in_big_clusters.head()

259


1500015O10RIK    False
1500016L03RIK    False
1700025G04RIK    False
1810009A15RIK    False
1810037I17RIK    False
dtype: bool

In [24]:
# Labels of the genes whose flag is True.
flagged_genes = in_big_clusters[in_big_clusters]
genes_in_big_clusters = flagged_genes.index
genes_in_big_clusters

Index(['2010107E04RIK', '4930447C04RIK', 'A930011O12RIK', 'ABCA8A', 'ABLIM1',
       'ACSL3', 'AIPL1', 'ALDOC', 'ANK3', 'APLP2',
       ...
       'VEGFA', 'VIM', 'VSTM2B', 'VSX1', 'VSX2', 'WIPI1', 'YWHAB', 'ZBTB20',
       'ZFP365', 'ZFP36L1'],
      dtype='object', length=259)

#### Perform the subset

In [25]:
# Restrict the sampled cells to the genes flagged for the big clusters
# (all rows, selected columns).
table1_big_clusters_subset_genes = table1_big_clusters_subset.loc[:, genes_in_big_clusters]
print(table1_big_clusters_subset_genes.shape)
table1_big_clusters_subset_genes.head()

(300, 259)


Unnamed: 0,2010107E04RIK,4930447C04RIK,A930011O12RIK,ABCA8A,ABLIM1,ACSL3,AIPL1,ALDOC,ANK3,APLP2,...,VEGFA,VIM,VSTM2B,VSX1,VSX2,WIPI1,YWHAB,ZBTB20,ZFP365,ZFP36L1
r1_CGCACTAGAAGG,0.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0
r1_CATAGAGACCGC,1.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
r1_GTCAAACTTAGG,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r1_ATCCAATGCGCC,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r1_GAGGTTCATACA,1.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Gene metadata limited to the selected genes (rows) and the big-cluster
# columns only.
gene_metadata_big_clusters = gene_metadata.loc[
    genes_in_big_clusters, big_clusters.index]
print(gene_metadata_big_clusters.shape)
gene_metadata_big_clusters.head()

(259, 6)


cluster_id,cluster_24,cluster_25,cluster_26,cluster_27,cluster_33,cluster_34
2010107E04RIK,False,False,True,False,False,False
4930447C04RIK,False,True,False,False,False,False
A930011O12RIK,False,False,False,False,False,True
ABCA8A,False,False,False,False,False,True
ABLIM1,False,False,True,False,False,False


In [None]:
# Write the three "big clusters" teaching files (only when saving is enabled).
if to_save:
    big_cluster_outputs = [
        ('data/big_clusters_expression.csv', table1_big_clusters_subset_genes),
        ('data/big_clusters_cell_metadata.csv', cell_metadata_big_clusters),
        ('data/big_clusters_gene_metadata.csv', gene_metadata_big_clusters),
    ]
    for csv_path, frame in big_cluster_outputs:
        frame.to_csv(csv_path, index=True)

# Amacrine Cells

In [22]:
# Cluster ids `cluster_03` through `cluster_23` (zero-padded to two digits),
# which this section treats as the amacrine clusters.
amacrine_clusters = []
for cluster_number in range(3, 24):
    amacrine_clusters.append('cluster_{:02d}'.format(cluster_number))
amacrine_clusters

['cluster_03',
 'cluster_04',
 'cluster_05',
 'cluster_06',
 'cluster_07',
 'cluster_08',
 'cluster_09',
 'cluster_10',
 'cluster_11',
 'cluster_12',
 'cluster_13',
 'cluster_14',
 'cluster_15',
 'cluster_16',
 'cluster_17',
 'cluster_18',
 'cluster_19',
 'cluster_20',
 'cluster_21',
 'cluster_22',
 'cluster_23']

In [23]:
# One boolean per cell in `cell_metadata`: True when the cell's cluster is
# in `amacrine_clusters`.
cluster_of_cell = cell_metadata['cluster_id']
amacrine_cells = cluster_of_cell.isin(amacrine_clusters)
print(amacrine_cells.sum())
amacrine_cells.head()

4426


cell
r1_GGCCGCAGTCCG    False
r1_CTTGTGCGGGAA    False
r1_GCGCAACTGCTC    False
r1_GATTGGGAGGCA    False
r1_GTGCCGCCTCTC    False
Name: cluster_id, dtype: bool

### Get amacrine cells in table 1

In [24]:
# Show the row labels of table1 (the `r1_`-prefixed barcodes) for reference.
table1.index

Index(['r1_GGCCGCAGTCCG', 'r1_CTTGTGCGGGAA', 'r1_GCGCAACTGCTC',
       'r1_GATTGGGAGGCA', 'r1_CCTCCTAGTTGG', 'r1_AGTCAAGCCCTC',
       'r1_GTGCCGCCTCTC', 'r1_CCTGTGACACAC', 'r1_AATCTCGTTAAT',
       'r1_GATTTCCTCTGA',
       ...
       'r1_GATTTAATGGTA', 'r1_TGTAAGGATCCG', 'r1_GAGTGGCTTGAT',
       'r1_GCATCTTTCAGG', 'r1_ACACGAGTTTGG', 'r1_CACCCAGTTTCG',
       'r1_CCTGGAGAGTTT', 'r1_TCTTCACTCTTA', 'r1_GCCGTCTTACTA',
       'r1_GACCAAACTAAT'],
      dtype='object', length=6600)

In [25]:
# Restrict the amacrine flags to the cells that are present in table1.
present_in_table1 = amacrine_cells.index.isin(table1.index)
amacrine_cells_table1 = amacrine_cells[present_in_table1]
print(amacrine_cells_table1.shape)
print(amacrine_cells_table1.sum())
amacrine_cells_table1.head()

(6020,)
714


cell
r1_GGCCGCAGTCCG    False
r1_CTTGTGCGGGAA    False
r1_GCGCAACTGCTC    False
r1_GATTGGGAGGCA    False
r1_GTGCCGCCTCTC    False
Name: cluster_id, dtype: bool

In [26]:
# Number of table1 cells flagged as amacrine (the boolean Series is used
# directly as its own mask).
len(amacrine_cells_table1[amacrine_cells_table1])

714

In [27]:
# Expression rows for the cells flagged as amacrine.
amacrine_barcodes = amacrine_cells_table1[amacrine_cells_table1].index
is_amacrine_row = table1.index.isin(amacrine_barcodes)
table1_amacrine_cells = table1[is_amacrine_row]
print(table1_amacrine_cells.shape)
table1_amacrine_cells.head()

(714, 20426)


gene,0610005C13RIK,0610007N19RIK,0610007P14RIK,0610009B14RIK,0610009B22RIK,0610009D07RIK,0610009E02RIK,0610009L18RIK,0610009O20RIK,0610010F05RIK,...,ZUFSP,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
r1_GGGTGTCAGTGG,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,0.0,0.0,1.0,0.0,0.0,0.0
r1_GTTTATATGCGC,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,8.0,0.0,0.0,1.0,0.0,1.0,2.0
r1_TCTTCACTGGCT,1.0,0.0,2.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,...,3.0,0.0,0.0,11.0,0.0,0.0,4.0,2.0,1.0,2.0
r1_TCATTTAGTCGA,0.0,0.0,0.0,0.0,2.0,2.0,0.0,2.0,1.0,2.0,...,0.0,0.0,0.0,6.0,0.0,1.0,6.0,0.0,1.0,2.0
r1_GTCTATTCGGTT,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,4.0,0.0,0.0,0.0


In [28]:
# Metadata rows for the same amacrine cells.
amacrine_barcodes = amacrine_cells_table1[amacrine_cells_table1].index
is_amacrine_cell = cell_metadata.index.isin(amacrine_barcodes)
cell_metadata_amacrine = cell_metadata[is_amacrine_cell]
print(cell_metadata_amacrine.shape)
cell_metadata_amacrine.head()

(714, 4)


Unnamed: 0_level_0,cluster_n,cluster_id,celltype,cluster_celltype_with_id
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
r1_GGGTGTCAGTGG,6,cluster_06,Amacrine cells,Amacrine cells (cluster_06)
r1_GTTTATATGCGC,5,cluster_05,Amacrine cells,Amacrine cells (cluster_05)
r1_TCTTCACTGGCT,13,cluster_13,Amacrine cells,Amacrine cells (cluster_13)
r1_TCATTTAGTCGA,8,cluster_08,Amacrine cells,Amacrine cells (cluster_08)
r1_GTCTATTCGGTT,10,cluster_10,Amacrine cells,Amacrine cells (cluster_10)


### Get amacrine cell genes

In [29]:
# Keep the amacrine cluster columns, then only the genes flagged in at
# least one of them.
amacrine_flags = gene_metadata[amacrine_clusters]
flagged_in_any = amacrine_flags.any(axis=1)
gene_metadata_amacrine = amacrine_flags.loc[flagged_in_any]
print(gene_metadata_amacrine.shape)
gene_metadata_amacrine.head()

(614, 21)


Unnamed: 0,cluster_03,cluster_04,cluster_05,cluster_06,cluster_07,cluster_08,cluster_09,cluster_10,cluster_11,cluster_12,...,cluster_14,cluster_15,cluster_16,cluster_17,cluster_18,cluster_19,cluster_20,cluster_21,cluster_22,cluster_23
1700025G04RIK,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2610017I09RIK,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,True
2900011O08RIK,False,False,True,True,True,True,False,True,True,True,...,False,True,False,False,False,False,False,False,True,False
4833424O15RIK,False,False,True,False,False,True,False,True,False,False,...,True,False,False,False,False,False,False,False,False,False
4930447C04RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [30]:
# Restrict the amacrine expression matrix to the genes kept in
# `gene_metadata_amacrine`.
table1_amacrine_cells_genes = table1_amacrine_cells[gene_metadata_amacrine.index]
print(table1_amacrine_cells_genes.shape)
table1_amacrine_cells_genes.head()

(714, 614)


gene,1700025G04RIK,2610017I09RIK,2900011O08RIK,4833424O15RIK,4930447C04RIK,6330403K07RIK,6430548M08RIK,8430419L09RIK,A030009H04RIK,A830010M20RIK,...,YWHAG,YWHAH,ZCCHC12,ZCCHC18,ZEB2,ZFHX3,ZFP804A,ZMAT4,ZWINT,ZYX
r1_GGGTGTCAGTGG,1.0,0.0,23.0,4.0,3.0,7.0,10.0,4.0,3.0,4.0,...,7.0,4.0,0.0,5.0,2.0,7.0,0.0,3.0,6.0,0.0
r1_GTTTATATGCGC,5.0,0.0,9.0,3.0,1.0,4.0,4.0,6.0,9.0,4.0,...,3.0,5.0,1.0,8.0,11.0,4.0,2.0,3.0,8.0,0.0
r1_TCTTCACTGGCT,5.0,0.0,6.0,0.0,4.0,5.0,7.0,0.0,3.0,9.0,...,7.0,5.0,1.0,4.0,3.0,0.0,3.0,3.0,11.0,2.0
r1_TCATTTAGTCGA,2.0,1.0,14.0,16.0,0.0,8.0,0.0,0.0,3.0,0.0,...,11.0,4.0,1.0,5.0,13.0,0.0,0.0,0.0,6.0,0.0
r1_GTCTATTCGGTT,0.0,0.0,11.0,19.0,0.0,2.0,13.0,2.0,1.0,1.0,...,2.0,10.0,1.0,2.0,1.0,0.0,1.0,2.0,1.0,0.0


In [31]:
# Spot-check the NAV1 column, one of the symbols that had more than one
# source column before the duplicate symbols were summed.
table1_amacrine_cells_genes['NAV1']

r1_GGGTGTCAGTGG    3.0
r1_GTTTATATGCGC    4.0
r1_TCTTCACTGGCT    3.0
r1_TCATTTAGTCGA    5.0
r1_GTCTATTCGGTT    5.0
                  ... 
r1_ATCGGCAAAAAC    0.0
r1_CGATACTATTCG    1.0
r1_GTGTGATAGCAA    0.0
r1_CTAATGCGCAGG    0.0
r1_CTAATCTCCAGA    0.0
Name: NAV1, Length: 714, dtype: float64

In [32]:
# Write the three amacrine teaching files (only when saving is enabled).
if to_save:
    amacrine_outputs = [
        ('data/amacrine_expression.csv', table1_amacrine_cells_genes),
        ('data/amacrine_cell_metadata.csv', cell_metadata_amacrine),
        ('data/amacrine_gene_metadata.csv', gene_metadata_amacrine),
    ]
    for csv_path, frame in amacrine_outputs:
        frame.to_csv(csv_path)

## Make a subset of all cells, but only differentially expressed genes

In [33]:
# Reminder of the full matrix size (cells x gene symbols) before subsetting.
table1.shape

(6600, 20426)

In [34]:
# Gene-metadata columns whose name contains "cluster".
retina_sets = [column for column in gene_metadata.columns
               if 'cluster' in column]
len(retina_sets)

38

In [35]:
# True for genes flagged in at least one cluster column.
cluster_flags = gene_metadata[retina_sets]
in_retina_clusters = cluster_flags.any(axis=1)
print(in_retina_clusters.sum())
in_retina_clusters.head()

1270


1500015O10RIK    True
1500016L03RIK    True
1700025G04RIK    True
1810009A15RIK    True
1810037I17RIK    True
dtype: bool

In [36]:
# Labels of the genes whose flag is True.
flagged_retina_genes = in_retina_clusters[in_retina_clusters]
genes_in_retina_clusters = flagged_retina_genes.index

In [37]:
# All table1 cells, restricted to the genes flagged in any cluster column.
table1_diff_genes = table1.loc[:, genes_in_retina_clusters]
print(table1_diff_genes.shape)
table1_diff_genes.head()

(6600, 1270)


Unnamed: 0,1500015O10RIK,1500016L03RIK,1700025G04RIK,1810009A15RIK,1810037I17RIK,2010107E04RIK,2410066E13RIK,2610017I09RIK,2900011O08RIK,4833424O15RIK,...,ZFHX3,ZFHX4,ZFP365,ZFP36L1,ZFP804A,ZIC1,ZIC4,ZMAT4,ZWINT,ZYX
r1_GGCCGCAGTCCG,0.0,0.0,5.0,4.0,7.0,28.0,5.0,0.0,33.0,1.0,...,3.0,3.0,7.0,0.0,0.0,0.0,0.0,7.0,53.0,0.0
r1_CTTGTGCGGGAA,0.0,0.0,9.0,4.0,4.0,33.0,4.0,0.0,43.0,0.0,...,6.0,8.0,4.0,0.0,0.0,0.0,0.0,5.0,65.0,1.0
r1_GCGCAACTGCTC,0.0,0.0,11.0,1.0,4.0,26.0,2.0,0.0,30.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,38.0,0.0
r1_GATTGGGAGGCA,0.0,0.0,8.0,2.0,5.0,14.0,6.0,0.0,20.0,0.0,...,8.0,1.0,1.0,0.0,0.0,1.0,0.0,3.0,17.0,0.0
r1_CCTCCTAGTTGG,0.0,0.0,2.0,2.0,2.0,9.0,3.0,0.0,26.0,2.0,...,8.0,0.0,3.0,0.0,0.0,0.0,0.0,7.0,38.0,0.0


### Make sure all the cell barcodes also have cluster assignments

If they don't, it's probably because they were dropped at an earlier stage of the analysis

In [38]:
# Cluster assignment of every metadata cell that also appears in the
# expression subset; missing assignments are dropped.
has_expression = cell_metadata.index.isin(table1_diff_genes.index)
cluster_ids = cell_metadata[has_expression]['cluster_id']
table1_cell_metadata = cluster_ids.dropna()
print(table1_cell_metadata.shape)
table1_cell_metadata.head()

(6020,)


cell
r1_GGCCGCAGTCCG    cluster_02
r1_CTTGTGCGGGAA    cluster_02
r1_GCGCAACTGCTC    cluster_02
r1_GATTGGGAGGCA    cluster_02
r1_GTGCCGCCTCTC    cluster_25
Name: cluster_id, dtype: object

In [39]:
# Keep only the expression rows whose barcode has a cluster assignment.
has_cluster = table1_diff_genes.index.isin(table1_cell_metadata.index)
table1_diff_genes_cells = table1_diff_genes[has_cluster]
print(table1_diff_genes_cells.shape)
table1_diff_genes_cells.head()

(6020, 1270)


Unnamed: 0,1500015O10RIK,1500016L03RIK,1700025G04RIK,1810009A15RIK,1810037I17RIK,2010107E04RIK,2410066E13RIK,2610017I09RIK,2900011O08RIK,4833424O15RIK,...,ZFHX3,ZFHX4,ZFP365,ZFP36L1,ZFP804A,ZIC1,ZIC4,ZMAT4,ZWINT,ZYX
r1_GGCCGCAGTCCG,0.0,0.0,5.0,4.0,7.0,28.0,5.0,0.0,33.0,1.0,...,3.0,3.0,7.0,0.0,0.0,0.0,0.0,7.0,53.0,0.0
r1_CTTGTGCGGGAA,0.0,0.0,9.0,4.0,4.0,33.0,4.0,0.0,43.0,0.0,...,6.0,8.0,4.0,0.0,0.0,0.0,0.0,5.0,65.0,1.0
r1_GCGCAACTGCTC,0.0,0.0,11.0,1.0,4.0,26.0,2.0,0.0,30.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,38.0,0.0
r1_GATTGGGAGGCA,0.0,0.0,8.0,2.0,5.0,14.0,6.0,0.0,20.0,0.0,...,8.0,1.0,1.0,0.0,0.0,1.0,0.0,3.0,17.0,0.0
r1_GTGCCGCCTCTC,0.0,0.0,1.0,19.0,1.0,13.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0


In [40]:
# Full metadata rows for the retained cells, in the same order as the
# expression matrix.
table1_diff_cell_metadata = cell_metadata.loc[table1_diff_genes_cells.index]
print(table1_diff_cell_metadata.shape)
table1_diff_cell_metadata.head()

(6020, 4)


Unnamed: 0,cluster_n,cluster_id,celltype,cluster_celltype_with_id
r1_GGCCGCAGTCCG,2,cluster_02,Retinal ganglion cells,Retinal ganglion cells (cluster_02)
r1_CTTGTGCGGGAA,2,cluster_02,Retinal ganglion cells,Retinal ganglion cells (cluster_02)
r1_GCGCAACTGCTC,2,cluster_02,Retinal ganglion cells,Retinal ganglion cells (cluster_02)
r1_GATTGGGAGGCA,2,cluster_02,Retinal ganglion cells,Retinal ganglion cells (cluster_02)
r1_GTGCCGCCTCTC,25,cluster_25,Cones,Cones (cluster_25)


In [41]:
# Gene metadata for the retained genes (rows), limited to the cluster
# columns listed in `retina_sets`.
table1_diff_gene_metadata = gene_metadata.loc[table1_diff_genes_cells.columns, retina_sets]
print(table1_diff_gene_metadata.shape)
table1_diff_gene_metadata.head()

(1270, 38)


Unnamed: 0,cluster_01,cluster_02,cluster_03,cluster_04,cluster_05,cluster_06,cluster_07,cluster_08,cluster_09,cluster_10,...,cluster_29,cluster_30,cluster_31,cluster_32,cluster_33,cluster_34,cluster_35,cluster_36,cluster_37,cluster_38
1500015O10RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1500016L03RIK,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1700025G04RIK,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1810009A15RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1810037I17RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [44]:
# Write the three "differential clusters" teaching files (only when saving
# is enabled).
if to_save:
    differential_outputs = [
        ('data/differential_clusters_expression.csv', table1_diff_genes_cells),
        ('data/differential_clusters_cell_metadata.csv', table1_diff_cell_metadata),
        ('data/differential_clusters_gene_metadata.csv', table1_diff_gene_metadata),
    ]
    for csv_path, frame in differential_outputs:
        frame.to_csv(csv_path)

# Robust PCA

Do not run this...

In [44]:
# Display the counts of six selected genes across all retained cells.
# NOTE(review): this displays the full frame rather than `.head()`, and the
# section is marked "Do not run this..." — confirm whether the cell is
# still needed.
table1_diff_genes_cells[['NRXN2', 'ATP1B1', 'PAX6', 'SLC32A1', 'SLC6A1', 'ELAVL3']]

Unnamed: 0,NRXN2,ATP1B1,PAX6,SLC32A1,SLC6A1,ELAVL3
r1_GGCCGCAGTCCG,16.0,149.0,8.0,0.0,1.0,13.0
r1_CTTGTGCGGGAA,16.0,182.0,21.0,0.0,10.0,5.0
r1_GCGCAACTGCTC,7.0,119.0,10.0,0.0,5.0,4.0
r1_GATTGGGAGGCA,12.0,93.0,7.0,0.0,3.0,7.0
r1_GTGCCGCCTCTC,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
r1_CACCCAGTTTCG,0.0,0.0,0.0,0.0,0.0,0.0
r1_CCTGGAGAGTTT,0.0,0.0,0.0,0.0,0.0,0.0
r1_TCTTCACTCTTA,0.0,0.0,0.0,0.0,0.0,0.0
r1_GCCGTCTTACTA,0.0,0.0,0.0,0.0,0.0,0.0
