This notebook follows 05_make_retina_subsets_for_teaching.

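Subset the retina 1 digital expression matrix (retina_1_digital_expression) into three smaller datasets: "big clusters" (50 randomly chosen cells from every cluster with more than 100 cells), amacrine cells (clusters 3–23), and all retina clusters (the "differential" set).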

Inputs:

1. retina_clusteridentities.txt
2. mouse_gene_metadata.csv, which is the output of 04-make-mouse-gene-metadata
3. GSM1626793_P14Retina_1.digital_expression.txt, which was downloaded from [here](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63472).


This notebook outputs the following files:
1. big_clusters_expression.csv
2. big_clusters_cell_metadata.csv
3. big_clusters_gene_metadata.csv
4. amacrine_expression.csv
5. amacrine_cell_metadata.csv
6. amacrine_gene_metadata.csv
7. differential_clusters_expression.csv
8. differential_clusters_cell_metadata.csv
9. differential_clusters_gene_metadata.csv

In [1]:
import os
import pandas as pd
import numpy as np

from functions.data import ClusterToCellType, preprocess_digital_expression
# Instantiate the project's ClusterToCellType helper (imported from functions.data).
# NOTE(review): this object is not referenced by any other cell shown in this notebook — confirm it is still needed.
cluster_to_celltype = ClusterToCellType()

In [2]:
# Master switch: when True, the "if to_save:" cells below write their CSV outputs to disk.
to_save = True

# Get Data

In [3]:
# Cell -> cluster assignments: tab-separated, no header row; the cell barcode becomes the index.
cluster_df = pd.read_csv(
    "data/downloads/retina_clusteridentities.txt",
    sep='\t',
    header=None,
    names=['cell', 'cluster_no'],
    index_col=0,
)


def _cluster_label(cluster_no):
    """Turn a cluster number into its zero-padded label, e.g. 2 -> 'cluster_02'."""
    return f'cluster_{str(cluster_no).zfill(2)}'


# The padded labels use the same "cluster_XX" scheme as the gene-metadata columns used later on.
cluster_df['cluster_name'] = cluster_df['cluster_no'].map(_cluster_label)
print(cluster_df.shape)
cluster_df.head()

(44808, 2)


Unnamed: 0_level_0,cluster_no,cluster_name
cell,Unnamed: 1_level_1,Unnamed: 2_level_1
r1_GGCCGCAGTCCG,2,cluster_02
r1_CTTGTGCGGGAA,2,cluster_02
r1_GCGCAACTGCTC,2,cluster_02
r1_GATTGGGAGGCA,2,cluster_02
r1_GTGCCGCCTCTC,25,cluster_25


In [4]:
# Load the gene metadata table; its first CSV column is used as the index.
mouse_gene_metadata = pd.read_csv('data/mouse_gene_metadata.csv', index_col=0)

# For teaching purposes, "retina" in the column names becomes "cluster" so the
# naming agrees with the cell metadata.
mouse_gene_metadata = mouse_gene_metadata.rename(
    columns=lambda column_name: column_name.replace('retina', 'cluster'))
mouse_gene_metadata

Unnamed: 0,cluster_01,cluster_02,cluster_03,cluster_04,cluster_05,cluster_06,cluster_07,cluster_08,cluster_09,cluster_10,...,cluster_38,cluster_39,cellcycle_01,cellcycle_02,cellcycle_03,cellcycle_04,cellcycle_05,cellcycle_06,cellcycle_07,cellcycle_08
1500015O10RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1500016L03RIK,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1700025G04RIK,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1810009A15RIK,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
1810037I17RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zfp36l2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
Zmym1,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
Zmynd19,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
Zrsr2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [5]:
# Read the tab-separated retina 1 digital expression download and hand it straight
# to the project's preprocessing helper.
# NOTE(review): judging by the displayed result the helper yields cells as rows and
# gene symbols as columns — confirm against functions.data.
digital_expression = preprocess_digital_expression(
    pd.read_csv(
        "data/downloads/GSM1626793_P14Retina_1.digital_expression.txt",
        sep='\t',
    )
)
digital_expression

gene_symbol,0610005C13RIK,0610007N19RIK,0610007P14RIK,0610009B14RIK,0610009B22RIK,0610009D07RIK,0610009E02RIK,0610009L18RIK,0610009O20RIK,0610010F05RIK,...,ZUFSP,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
r1_GGCCGCAGTCCG,0,0,3,0,2,7,0,1,1,6,...,2,0,0,53,0,1,4,0,1,0
r1_CTTGTGCGGGAA,0,1,1,0,1,5,0,0,1,5,...,0,0,0,65,0,1,6,1,3,2
r1_GCGCAACTGCTC,0,0,1,0,3,7,0,0,1,9,...,1,0,0,38,0,0,2,0,0,0
r1_GATTGGGAGGCA,0,0,2,2,1,1,0,0,2,2,...,0,0,0,17,0,0,2,0,2,0
r1_CCTCCTAGTTGG,0,0,1,0,1,2,0,0,1,8,...,1,0,0,38,0,0,5,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
r1_CACCCAGTTTCG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_CCTGGAGAGTTT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_TCTTCACTCTTA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_GCCGTCTTACTA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# No need for this
# digital_expression.to_csv('data/retina_1_digital_expression.csv', index=True)

# Big Clusters

In [7]:
# Count the cells of digital_expression per cluster. The cluster labels are matched
# to the expression rows through the shared cell index.
# The explicit `axis=0` argument was dropped: it is the default, and the `axis`
# keyword of DataFrame.groupby is deprecated in recent pandas releases
# (FutureWarning), so passing it makes the cell fragile on newer versions.
cluster_sizes = digital_expression.groupby(cluster_df['cluster_name']).size()

# A cluster counts as "big" when it holds more than this many cells.
min_cells_for_big_cluster = 100
big_clusters = cluster_sizes[cluster_sizes > min_cells_for_big_cluster].index.to_list()  # define big_clusters

# find cells in big_clusters
cells_in_big_clusters = cluster_df[cluster_df['cluster_name'].isin(big_clusters)].index.to_list()  # get cells from cluster_df

# find mouse_genes in big_clusters: genes flagged True for at least one big cluster
mouse_genes_in_big_clusters = mouse_gene_metadata[mouse_gene_metadata[big_clusters].any(axis=1)].index.to_list()

In [8]:
# Rows: cells that belong to a big cluster. Columns: genes selected for the big clusters.
in_big_cluster = digital_expression.index.isin(cells_in_big_clusters)
big_clusters_expression_subset = digital_expression.loc[in_big_cluster, mouse_genes_in_big_clusters]


def _draw_50_cells(cluster_cells):
    """Return 50 rows of ``cluster_cells`` picked without replacement via ``np.random.choice``."""
    picked = np.random.choice(cluster_cells.index, size=50, replace=False)
    return cluster_cells.loc[picked]


# Seed NumPy's global generator so the same 50 cells per cluster are drawn on every run.
np.random.seed(2017)
cells_grouped_by_cluster = big_clusters_expression_subset.groupby(
    cluster_df['cluster_name'], as_index=False, group_keys=False)
big_clusters_expression_random_subset = cells_grouped_by_cluster.apply(_draw_50_cells)

big_clusters_expression_random_subset

gene_symbol,2010107E04RIK,4930447C04RIK,A930011O12RIK,ABCA8A,ABLIM1,ACSL3,AIPL1,ALDOC,ANK3,APLP2,...,VEGFA,VIM,VSTM2B,VSX1,VSX2,WIPI1,YWHAB,ZBTB20,ZFP365,ZFP36L1
r1_CGCACTAGAAGG,0,0,3,0,0,0,1,0,0,2,...,0,0,0,0,0,0,1,2,0,0
r1_CATAGAGACCGC,1,0,3,0,0,0,1,0,0,0,...,0,0,0,0,0,0,3,0,0,0
r1_GTCAAACTTAGG,0,0,1,0,1,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_ATCCAATGCGCC,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_GAGGTTCATACA,1,0,0,0,1,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
r1_AAAATATGTTAC,0,0,0,4,0,13,0,10,0,5,...,3,2,0,0,2,7,2,1,0,18
r1_GCCGTATTAAGA,0,0,0,3,1,15,0,0,0,0,...,0,0,0,0,0,3,0,0,0,1
r1_AAACCAAGAAGG,0,0,0,2,0,10,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
r1_GAATTCATAACC,1,0,0,0,0,31,2,1,0,1,...,1,1,0,0,2,4,0,0,0,2


In [9]:
# Cell metadata for exactly the sampled cells, in the same order as the expression rows
sampled_cells = big_clusters_expression_random_subset.index
big_clusters_cell_metadata = cluster_df.loc[sampled_cells]

# Gene metadata limited to the selected genes (rows) and the big clusters (columns)
big_clusters_gene_metadata = mouse_gene_metadata.loc[
    mouse_genes_in_big_clusters,
    big_clusters,
]

In [10]:
# Write the big-cluster subset to disk. reset_index() turns each frame's index into
# a regular first column, which is why the files are written with index=False.
if to_save:
    os.makedirs("data/big_clusters", exist_ok=True)
    big_cluster_outputs = (
        (big_clusters_expression_random_subset, 'data/big_clusters/big_clusters_expression.csv'),
        (big_clusters_cell_metadata, 'data/big_clusters/big_clusters_cell_metadata.csv'),
        (big_clusters_gene_metadata, 'data/big_clusters/big_clusters_gene_metadata.csv'),
    )
    for frame, csv_path in big_cluster_outputs:
        frame.reset_index().to_csv(csv_path, index=False)

# Amacrine Cells

In [11]:
# Clusters 3 through 23 (inclusive) make up the amacrine set.
amacrine_clusters = [f'cluster_{str(cluster_no).zfill(2)}' for cluster_no in range(3, 24)]

# Cells assigned to one of the amacrine clusters (taken from cluster_df)
is_amacrine_cell = cluster_df['cluster_name'].isin(amacrine_clusters)
cells_in_amacrine_clusters = cluster_df[is_amacrine_cell].index.to_list()

# Genes flagged True for at least one amacrine cluster
is_amacrine_gene = mouse_gene_metadata[amacrine_clusters].any(axis=1)
mouse_genes_in_amacrine_clusters = mouse_gene_metadata[is_amacrine_gene].index.to_list()

In [12]:
# Rows: cells in the amacrine clusters. Columns: genes selected for those clusters.
in_amacrine_cluster = digital_expression.index.isin(cells_in_amacrine_clusters)
amacrine_clusters_expression_subset = digital_expression.loc[
    in_amacrine_cluster, mouse_genes_in_amacrine_clusters]

amacrine_clusters_expression_subset

gene_symbol,1700025G04RIK,2610017I09RIK,2900011O08RIK,4833424O15RIK,4930447C04RIK,6330403K07RIK,6430548M08RIK,8430419L09RIK,A030009H04RIK,A830010M20RIK,...,YWHAG,YWHAH,ZCCHC12,ZCCHC18,ZEB2,ZFHX3,ZFP804A,ZMAT4,ZWINT,ZYX
r1_GGGTGTCAGTGG,1,0,23,4,3,7,10,4,3,4,...,7,4,0,5,2,7,0,3,6,0
r1_GTTTATATGCGC,5,0,9,3,1,4,4,6,9,4,...,3,5,1,8,11,4,2,3,8,0
r1_TCTTCACTGGCT,5,0,6,0,4,5,7,0,3,9,...,7,5,1,4,3,0,3,3,11,2
r1_TCATTTAGTCGA,2,1,14,16,0,8,0,0,3,0,...,11,4,1,5,13,0,0,0,6,0
r1_GTCTATTCGGTT,0,0,11,19,0,2,13,2,1,1,...,2,10,1,2,1,0,1,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
r1_ATCGGCAAAAAC,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_CGATACTATTCG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_GTGTGATAGCAA,0,0,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_CTAATGCGCAGG,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Cell metadata for the cells kept in the amacrine expression subset
amacrine_cells = amacrine_clusters_expression_subset.index
amacrine_cell_metadata = cluster_df.loc[amacrine_cells]

# Gene metadata limited to the selected genes (rows) and the amacrine clusters (columns)
amacrine_gene_metadata = mouse_gene_metadata.loc[
    mouse_genes_in_amacrine_clusters,
    amacrine_clusters,
]

In [14]:
# Write the amacrine subset to disk; each frame's index is written as the first CSV
# column (to_csv writes the index by default).
if to_save:
    os.makedirs("data/amacrine", exist_ok=True)
    amacrine_outputs = (
        (amacrine_clusters_expression_subset, 'data/amacrine/amacrine_expression.csv'),
        (amacrine_cell_metadata, 'data/amacrine/amacrine_cell_metadata.csv'),
        (amacrine_gene_metadata, 'data/amacrine/amacrine_gene_metadata.csv'),
    )
    for frame, csv_path in amacrine_outputs:
        frame.to_csv(csv_path)

In [15]:
# check
# amacrine_clusters_expression_subset['NAV1']

# Retina (aka. Differential)

In [30]:
# Every gene-metadata column whose name contains "cluster"; the cellcycle_* columns are left out.
retina_clusters = [column for column in mouse_gene_metadata.columns if 'cluster' in column]

# Cells assigned to one of those clusters (taken from cluster_df)
is_retina_cell = cluster_df['cluster_name'].isin(retina_clusters)
cells_in_retina_clusters = cluster_df[is_retina_cell].index.to_list()

# Genes flagged True for at least one of those clusters
is_retina_gene = mouse_gene_metadata[retina_clusters].any(axis=1)
mouse_genes_in_retina_clusters = mouse_gene_metadata[is_retina_gene].index.to_list()

In [33]:
# Rows: cells in the retina clusters. Columns: genes selected for those clusters.
in_retina_cluster = digital_expression.index.isin(cells_in_retina_clusters)
retina_clusters_expression_subset = digital_expression.loc[
    in_retina_cluster, mouse_genes_in_retina_clusters]

retina_clusters_expression_subset

gene_symbol,1500015O10RIK,1500016L03RIK,1700025G04RIK,1810009A15RIK,1810037I17RIK,2010107E04RIK,2410066E13RIK,2610017I09RIK,2900011O08RIK,4632428N05RIK,...,ZFHX4,ZFP36,ZFP365,ZFP36L1,ZFP804A,ZIC1,ZIC4,ZMAT4,ZWINT,ZYX
r1_GGCCGCAGTCCG,0,0,5,4,7,28,5,0,33,0,...,3,0,7,0,0,0,0,7,53,0
r1_CTTGTGCGGGAA,0,0,9,4,4,33,4,0,43,0,...,8,0,4,0,0,0,0,5,65,1
r1_GCGCAACTGCTC,0,0,11,1,4,26,2,0,30,0,...,0,0,2,0,0,0,0,4,38,0
r1_GATTGGGAGGCA,0,0,8,2,5,14,6,0,20,0,...,1,0,1,0,0,1,0,3,17,0
r1_GTGCCGCCTCTC,0,0,1,19,1,13,0,0,0,0,...,0,0,3,0,0,0,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
r1_CACCCAGTTTCG,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_CCTGGAGAGTTT,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_TCTTCACTCTTA,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
r1_GCCGTCTTACTA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Cell metadata for the cells kept in the retina expression subset
retina_cells = retina_clusters_expression_subset.index
retina_cell_metadata = cluster_df.loc[retina_cells]

# Gene metadata limited to the selected genes (rows) and the retina clusters (columns)
retina_gene_metadata = mouse_gene_metadata.loc[
    mouse_genes_in_retina_clusters,
    retina_clusters,
]

In [20]:
# Write the full retina ("differential") subset to disk; each frame's index is
# written as the first CSV column (to_csv writes the index by default).
# Fix: the two metadata frames used to be saved under each other's filename
# (cell metadata in *_gene_metadata.csv and gene metadata in *_cell_metadata.csv).
# Each frame now goes to the file that names it.
if to_save:
    os.makedirs("data/retina", exist_ok=True)
    retina_clusters_expression_subset.to_csv('data/retina/differential_clusters_expression.csv')
    retina_cell_metadata.to_csv('data/retina/differential_clusters_cell_metadata.csv')
    retina_gene_metadata.to_csv('data/retina/differential_clusters_gene_metadata.csv')