# Quick Look at ERCC

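This notebook takes a quick look at the ERCC data to make sure I have everything I need to work with the ERCCs: the gene-level counts, the ERCC sequence lengths, and the ERCC assignment recorded for each sample in the sample table.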

In [51]:
import os
import sys
import re
from pathlib import Path

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
from larval_gonad.normalization import tpm

In [6]:
# Sample metadata for the bulk RNA-seq workflow; the first column is used as the index.
sample_table = pd.read_csv(
    '../bulk-rnaseq-wf/config/sampletable.tsv',
    sep='\t',
    index_col=0,
)

# Per-sample value of the `ercc` column, joined onto the ERCC tables later on.
sample_to_ercc = sample_table['ercc']

In [9]:
# Gene-level counts aggregated by the bulk RNA-seq workflow.
# NOTE(review): downstream cells index rows by feature ID (e.g. 'ERCC-...') and
# transpose to get samples as rows -- confirm that orientation against the parquet file.
cnts = pd.read_parquet(
    path='../output/bulk-rnaseq-wf/aggregation/gene_level_counts.parquet',
)

In [46]:
!head 

ERCC-00002	1061
ERCC-00003	1023
ERCC-00004	523
ERCC-00007	1135
ERCC-00009	984
ERCC-00012	994
ERCC-00013	808
ERCC-00014	1957
ERCC-00016	844
ERCC-00017	1136


In [49]:
# Gene lengths: the first column of the TSV is the index, and the first
# remaining column is taken as the length Series.
gene_lens = pd.read_csv('../output/gene_ts_lengths.tsv', sep='\t', index_col=0).iloc[:, 0]

# ERCC lengths from the chromsizes file. The `head` output earlier in this
# notebook appears to show this file starting directly with a data record
# ("ERCC-00002<tab>1061"), i.e. it has no header row. With the default
# `header=0`, pandas would consume that first record as column labels and
# silently drop ERCC-00002's length, so read it with `header=None`.
ercc_lens = pd.read_csv(
    '/data/LCDB/lcdb-references/ercc/srm2374/fasta/ercc_srm2374.chromsizes',
    sep='\t',
    header=None,
    index_col=0,
).iloc[:, 0]

# One combined length lookup covering both genes and ERCCs.
gene_ercc_lens = pd.concat([gene_lens, ercc_lens])

In [52]:
# Normalize the counts with the project's `tpm` helper using the combined
# gene + ERCC lengths, then drop entries that come back missing.
# NOTE(review): presumably features without a length end up NaN and are removed here -- confirm against `tpm`.
norm = (
    tpm(cnts, gene_ercc_lens)
    .dropna()
)

In [54]:
# Keep only rows whose index label starts with 'ERCC', transpose, attach each
# sample's `ercc` value, and move that value into a second index level so the
# table can be sorted and grouped by it.
ercc_norm = (
    norm.loc[norm.index.str.startswith('ERCC')]
    .T
    .join(sample_to_ercc)
    .set_index('ercc', append=True)
    .sort_index(level=1)
)

In [56]:
# `ercc_cnts` was not defined anywhere in this notebook, so this cell only ran
# off leftover kernel state and fails with NameError on Restart & Run All.
# Build it here from the raw counts, mirroring how `ercc_norm` is assembled:
# ERCC rows only, transposed, with each sample's `ercc` value appended as a
# second index level.
# NOTE(review): the displayed output below holds integer counts, which is why
# raw `cnts` is used rather than the TPM-normalized table -- confirm intent.
ercc_cnts = (
    cnts[cnts.index.str.startswith('ERCC')]
    .T
    .join(sample_to_ercc)
    .set_index('ercc', append=True)
    .sort_index(level=1)
)

# Select the samples whose `ercc` value is 'A'.
ercc_grp_a = ercc_cnts.groupby('ercc').get_group('A')

In [57]:
# Show the rows selected for `ercc` group 'A' (bare last expression -> rich display).
ercc_grp_a

Unnamed: 0_level_0,Unnamed: 1_level_0,ERCC-00002,ERCC-00003,ERCC-00004,ERCC-00007,ERCC-00009,ERCC-00012,ERCC-00013,ERCC-00014,ERCC-00016,ERCC-00017,...,ERCC-00157,ERCC-00158,ERCC-00160,ERCC-00162,ERCC-00163,ERCC-00164,ERCC-00165,ERCC-00168,ERCC-00170,ERCC-00171
Unnamed: 0_level_1,ercc,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A11_FT,A,121,0,1,0,9,27,0,0,0,5,...,21,26,48,4,0,1,0,1,1,6
A1_TF,A,86,0,3,1,3,27,0,0,0,6,...,25,21,43,0,0,1,1,1,2,6
A3_TF,A,183,1,4,0,22,45,1,0,0,13,...,47,37,60,0,0,1,0,1,0,11
A9_FT,A,253,0,3,0,18,76,0,0,0,18,...,51,62,98,2,0,1,0,5,2,5
B5_TCP,A,125,0,0,0,11,34,0,2,1,9,...,29,24,53,3,0,3,0,7,0,2
B7_TCP,A,40,0,0,0,8,19,0,0,0,3,...,7,13,21,3,0,0,0,2,0,0
C1_TDT,A,11,0,0,0,2,9,0,0,0,0,...,7,7,6,1,0,0,0,1,0,3
C3_TDT,A,49,0,1,0,5,13,0,0,0,3,...,5,8,25,1,0,0,0,0,0,0
F11_TDP,A,14,0,0,0,1,5,0,1,0,1,...,9,6,7,1,0,0,0,0,0,1
F9_TDP,A,436,4,1,3,35,101,0,4,6,21,...,91,101,148,19,1,6,3,16,1,20


In [60]:
# List the contents of the ERCC srm2374 GTF reference directory.
!ls /data/LCDB/lcdb-references/ercc/srm2374/gtf

ercc_srm2374.gtf  ercc_srm2374.gtf.gz.log
