# S2 Cell RNA-Seq

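To run the initial NetREX model, I need to give Yijie a set of S2 cell RNA-Seq data. This notebook collects raw gene counts for the hand-curated S2 cell RNA-Seq samples (excluding Lee's studies) and exports raw, RPKM, CPM, and TPM tables.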

In [1]:
# %load ../config/defaults.py
import os
import sys
from pathlib import Path
import re

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from dask.distributed import Client
import dask.dataframe as dd
from dask.delayed import delayed

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

# Setup notebook
# NOTE(review): Nb.setup_notebook() is project code; it presumably applies the
# project-wide notebook defaults and prints the provenance banner — confirm.
nbconfig = Nb.setup_notebook()

# Connect to data store
# Opened with mode='r', i.e. read-only: this notebook only reads from ../sra.h5.
store = pd.HDFStore('../sra.h5', mode='r')

# Start dask server
# No scheduler address is passed; the client is closed further down with client.close().
client = Client()

Please check output/fbgn2chrom.tsv. If it does not exist, run bin/fbgn2chrom.py
last updated: 2018-03-14 
Git hash: 6621fd9b0680c343b334ceb06ec2df78e512b587


In [3]:
# %load ../config/mongo.py
from pymongo import MongoClient
# Resolve the MongoDB host: use the name stored in ../output/.mongodb_host when
# that file exists, otherwise fall back to a server on this machine.
try:
    host_file = open('../output/.mongodb_host', 'r')
except FileNotFoundError:
    host = 'localhost'
else:
    with host_file:
        host = host_file.read().strip()

# Handles used by the rest of the notebook: the client, the 'sra' database and
# its 'ncbi' and 'biometa' collections.
mongoClient = MongoClient(host=host, port=27022)
db = mongoClient['sra']
ncbi, biometa = db['ncbi'], db['biometa']

## Get a list of SRXs with completed alignments

In [3]:
# Unique SRX accessions listed in the store's 'aln/complete' table
aln_complete = store['aln/complete']
srxs = aln_complete['srx'].unique().tolist()

## Get list of SRXs that are annotated as RNA-Seq

In [4]:
# RNA-Seq experiments among the SRXs in `srxs`, one row per (srx, biosample).
rnaseq = pd.DataFrame(
    list(ncbi.aggregate([
        # Filter first: neither condition touches `biosample`, so matching
        # before the $unwind selects the same documents while letting MongoDB
        # use the _id index and unwind only the experiments that are kept.
        {
            '$match': {
                '_id': {'$in': srxs},
                'sra.experiment.library_strategy': 'RNA-Seq',
            }
        },
        # One output document per biosample attached to an experiment.
        {'$unwind': '$biosample'},
        {
            '$project': {
                '_id': 0,
                'srx': '$_id',
                'biosample': '$biosample.biosample_accn',
            }
        },
    ])),
    # Explicit columns keep `rnaseq.srx` / `rnaseq.biosample` usable even when
    # the query returns no documents.
    columns=['srx', 'biosample'],
)

## Get list of SRXs that are hand curated as S2 cell

In [5]:
# Hand examined S2 cell datasets: unique biosample accessions from the curated table
s2_samples = pd.read_csv('../output/s2_samples.tsv', sep='\t')
biosamples = s2_samples['biosample'].unique().tolist()

## Get a list of SRXs that are RNA-Seq and annotated as S2 cells

In [6]:
# Keep the RNA-Seq rows whose biosample is in our curated S2 list
in_s2_list = rnaseq['biosample'].isin(biosamples)
s2_rnaseq = rnaseq[in_s2_list]

## Make sure to remove Lee's data

In [7]:
# Make sure I don't include Lee's study

# BioProject accessions to exclude.
lee = ['PRJNA353097', 'PRJNA320917']

# Every SRX whose record belongs to one of those BioProjects.
lee_srx = [x['srx'] for x in ncbi.aggregate([
    {
        '$match': {
            'bioproject.bioproject_accn': {'$in': lee}
        }
    },
    {
        '$project': {
            '_id': 0,
            'srx': '$_id',
        }
    }
])]

# s2_rnaseq can hold several rows for one SRX (one per biosample), so collapse
# to unique accessions; otherwise the same counts file is listed, and later
# read, more than once. unique() keeps first-appearance order.
non_lee = s2_rnaseq.loc[~s2_rnaseq.srx.isin(lee_srx), 'srx'].unique().tolist()

## Create list of counts tables

In [8]:
# Candidate gene-count tables, one per SRX; keep only the files that exist on disk.
candidates = (f'../output/aln-wf/gene_counts/{srx}.parquet' for srx in non_lee)
data = [fname for fname in candidates if Path(fname).exists()]

In [9]:
# Import and munge
# Load every per-SRX table through dask and materialize it as one long pandas frame.
counts_long = dd.read_parquet(data).compute()

# Reshape to a gene-by-sample matrix: FBgn rows, one column per SRX, taking the
# first 'count' value found for each (FBgn, srx) pair.
df = pd.pivot_table(counts_long, index='FBgn', columns='srx', values='count', aggfunc='first')

In [10]:
# Preview the first rows of the raw count matrix (FBgn rows, one column per SRX).
df.head()

srx,ERX173561,ERX173562,ERX173563,ERX173564,ERX173565,ERX173566,ERX173567,ERX173568,ERX173569,ERX173570,...,SRX981411,SRX981412,SRX981413,SRX981414,SRX981415,SRX982548,SRX982550,SRX982551,SRX982552,SRX982553
FBgn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FBgn0000003,40,4,3,9,2,6,5,7,25,11,...,0,0,0,0,0,20894,11895,4048,7363,13892
FBgn0000008,51,9,12,17,2,4,18,16,37,36,...,778,768,519,607,519,440,322,592,416,659
FBgn0000014,0,0,0,0,0,0,0,0,0,1,...,3,1,1,4,1,11,27,26,60,45
FBgn0000015,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,0,64,26,57,80,79
FBgn0000017,2160,507,303,694,86,296,712,353,1755,1480,...,4094,4084,6269,7124,6168,15861,12228,9626,6167,9100


In [11]:
# Save the raw count matrix as a tab-separated file.
df.to_csv('../output/notebook/2018-03-14_s2_rnaseq_raw_gene_counts.tsv', sep='\t')

In [13]:
# Read the raw counts back from the saved TSV, using its first column as the index.
# NOTE(review): presumably here so the cells below can start from the file
# without re-running the MongoDB/dask steps — confirm.
df = pd.read_csv('../output/notebook/2018-03-14_s2_rnaseq_raw_gene_counts.tsv', sep='\t', index_col=0)

In [5]:
# Close the dask client; no later cell in this notebook references `client` or `dd`.
client.close()

In [21]:
# Pull gene lengths
# The table is cached as a TSV; it is only rebuilt from the gffutils database
# when the cache file is missing, and is always loaded from the TSV afterwards.
glen = Path('../output/gene_ts_lengths.tsv')
if not glen.exists():
    # Imported here because gffutils is only needed on a cache miss.
    import gffutils
    gtf = Path(os.environ['REFERENCES_DIR'], 'dmel/r6-11/gtf/dmel_r6-11.gtf.db')
    # Named gtf_db rather than db so the MongoDB database handle `db` created
    # earlier in the notebook is not overwritten.
    gtf_db = gffutils.FeatureDB(gtf.as_posix())

    gene_ts_lengths = []
    for gene in gtf_db.features_of_type('gene'):
        # Merge the gene's exons so overlapping exons are counted once, then
        # add up the merged interval lengths.
        # NOTE(review): exons are requested start-sorted because merge appears
        # to walk its input in order — confirm against the installed gffutils.
        exons = gtf_db.children(gene, featuretype='exon', order_by='start')
        length = sum(len(exon) for exon in gtf_db.merge(exons))

        # Sanity check: merged exons cannot be longer than the gene itself.
        # Genes that fail are reported and left out of the table. An explicit
        # comparison is used so the check still runs under `python -O`.
        if len(gene) >= length:
            gene_ts_lengths.append([gene.id, length])
        else:
            print(gene.id, len(gene), length)

    gene_ts_lengths = pd.DataFrame(gene_ts_lengths, columns=['FBgn', 'gene_ts_length']).set_index('FBgn')
    gene_ts_lengths.to_csv(glen, sep='\t')

gene_ts_lengths = pd.read_csv(glen, sep='\t', index_col=0)

In [22]:
# Restrict the gene lengths to the genes present in the count matrix, as a Series.
# Written so the cell can be re-run: on the first run gene_ts_lengths is the
# DataFrame read from the TSV, afterwards it is already the Series.
if isinstance(gene_ts_lengths, pd.DataFrame):
    gene_ts_lengths = gene_ts_lengths['gene_ts_length']
gene_ts_lengths = gene_ts_lengths[gene_ts_lengths.index.isin(df.index)]

In [41]:
from ncbi_remap.normalization import rpkm, cpm, tpm

In [43]:
# Normalize the raw counts three ways. rpkm and tpm are given the per-gene
# lengths; cpm is given the counts only.
# NOTE(review): how these project helpers treat genes that are in df but have
# no entry in gene_ts_lengths is not visible here — confirm.
df_rpkm = rpkm(df, gene_ts_lengths)
df_cpm = cpm(df)
df_tpm = tpm(df, gene_ts_lengths)

In [44]:
# Preview the first rows of the RPKM table.
df_rpkm.head()

Unnamed: 0_level_0,ERX173561,ERX173562,ERX173563,ERX173564,ERX173565,ERX173566,ERX173567,ERX173568,ERX173569,ERX173570,...,SRX981411,SRX981412,SRX981413,SRX981414,SRX981415,SRX982548,SRX982550,SRX982551,SRX982552,SRX982553
FBgn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FBgn0000003,15.227652,6.83063,4.693087,8.147429,18.059926,12.59551,4.389867,4.6557,11.780542,3.502857,...,0.0,0.0,0.0,0.0,0.0,1776.653552,1509.384039,457.664859,975.877551,1324.676476
FBgn0000008,1.122204,0.888325,1.085044,0.88952,1.043866,0.485348,0.913445,0.615086,1.007757,0.662614,...,6.363668,5.845374,4.819883,4.641463,4.971601,2.162532,2.361674,3.868633,3.186857,3.632112
FBgn0000014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04534,...,0.060447,0.018749,0.022877,0.075344,0.023597,0.133176,0.48781,0.418536,1.132253,0.610955
FBgn0000015,0.0,0.531311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.049991,0.041161,0.0,1.693203,1.026494,2.005071,3.29897,2.343797
FBgn0000017,229.138557,241.257169,132.084378,175.068905,216.39932,173.15207,174.193849,65.423395,230.448472,131.32967,...,161.442737,149.857868,280.67918,262.623083,284.849848,375.822857,432.376621,303.266464,227.764451,241.800897


In [46]:
# Preview the first rows of the CPM table.
df_cpm.head()

Unnamed: 0_level_0,ERX173561,ERX173562,ERX173563,ERX173564,ERX173565,ERX173566,ERX173567,ERX173568,ERX173569,ERX173570,...,SRX981411,SRX981412,SRX981413,SRX981414,SRX981415,SRX982548,SRX982550,SRX982551,SRX982552,SRX982553
FBgn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FBgn0000003,4.553068,2.042359,1.403233,2.436081,5.399918,3.766058,1.31257,1.392054,3.522382,1.047354,...,0.0,0.0,0.0,0.0,0.0,531.219412,451.305828,136.841793,291.787388,396.078266
FBgn0000008,5.805162,4.595307,5.612932,4.601487,5.399918,2.510705,4.725253,3.181838,5.213125,3.427704,...,32.919255,30.238118,24.933257,24.01029,25.71809,11.186778,12.216938,20.012436,16.485611,18.788913
FBgn0000014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095214,...,0.126938,0.039373,0.048041,0.158223,0.049553,0.279669,1.024402,0.878925,2.377732,1.283006
FBgn0000015,0.0,0.51059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.048041,0.039556,0.0,1.627168,0.986461,1.926873,3.17031,2.252389
FBgn0000017,245.865672,258.868942,141.726538,187.848935,232.196471,185.792171,186.91,70.199302,247.271211,140.916736,...,173.228057,160.797493,301.16876,281.794568,305.643887,403.257925,463.940114,325.404915,244.391256,259.452363


In [45]:
# Preview the first rows of the TPM table.
df_tpm.head()

Unnamed: 0_level_0,ERX173561,ERX173562,ERX173563,ERX173564,ERX173565,ERX173566,ERX173567,ERX173568,ERX173569,ERX173570,...,SRX981411,SRX981412,SRX981413,SRX981414,SRX981415,SRX982548,SRX982550,SRX982551,SRX982552,SRX982553
FBgn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FBgn0000003,3.642083,1.554572,0.927102,2.098624,3.949533,2.731209,0.923677,1.21171,2.755027,0.762075,...,0.0,0.0,0.0,0.0,0.0,547.235002,464.914334,145.633368,282.422935,383.373703
FBgn0000008,0.268404,0.202172,0.214346,0.229124,0.228283,0.105243,0.192199,0.160085,0.235676,0.144157,...,1.768391,1.640118,1.418599,1.346621,1.466039,0.666091,0.727433,1.231036,0.922289,1.051167
FBgn0000014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009864,...,0.016797,0.005261,0.006733,0.021859,0.006958,0.04102,0.150253,0.133182,0.327679,0.176816
FBgn0000015,0.0,0.12092,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.014713,0.011942,0.0,0.521531,0.316177,0.638033,0.954735,0.678317
FBgn0000017,54.804351,54.907312,26.092777,45.094457,47.324456,37.546279,36.652329,17.027339,53.893252,28.571829,...,44.863112,42.047704,82.610143,76.194444,83.997309,115.758878,133.178889,96.502311,65.915959,69.979431


In [47]:
# Export each normalized matrix as its own tab-separated file, named after the method.
normalized_tables = (('rpkm', df_rpkm), ('cpm', df_cpm), ('tpm', df_tpm))
for method, table in normalized_tables:
    table.to_csv(f'../output/notebook/2018-03-14_s2_rnaseq_{method}_gene_counts.tsv', sep='\t')