<a href="https://colab.research.google.com/github/jgalazka/ERCC_analysis/blob/main/GLDS-235-Liver/GLDS_246_ERCC_PrepareDESeq2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GeneLab ERCC spike-in analysis notebook

This notebook contains an analysis of RNA-seq counts data generated from ERCC spike-ins.

Here GLDS-246 is analyzed.


## Setting up the notebook

In [1]:
# import python packages
import pandas as pd
import numpy as np
from urllib.request import urlopen, quote, urlretrieve
from json import loads
from re import search
import zipfile
import seaborn as sns
from scipy.stats import linregress
import matplotlib.pyplot as plt

In [2]:
# Function to pull metadata zip from GeneLab
# Credit to Kirill Grigorev
GENELAB_ROOT = "https://genelab-data.ndc.nasa.gov"
GLDS_URL_PREFIX = GENELAB_ROOT + "/genelab/data/study/data/"
FILELISTINGS_URL_PREFIX = GENELAB_ROOT + "/genelab/data/study/filelistings/"
ISA_ZIP_REGEX = r'.*_metadata_.*[_-]ISA\.zip$'

def read_json(url):
    with urlopen(url) as response:
        return loads(response.read().decode())

def get_isa(accession):
    glds_json = read_json(GLDS_URL_PREFIX + accession)
    try:
        _id = glds_json[0]["_id"]
    except (AssertionError, TypeError, KeyError, IndexError):
        raise ValueError("Malformed JSON?")
    isa_entries = [
        entry for entry in read_json(FILELISTINGS_URL_PREFIX + _id)
        if search(ISA_ZIP_REGEX, entry["file_name"])
    ]
    if len(isa_entries) == 0:
        raise ValueError("Unexpected: no ISAs found")
    elif len(isa_entries) > 1:
        raise ValueError("Unexpected: multiple files match the ISA regex")
    else:
        entry = isa_entries[0]
        version = entry["version"]
        url = GENELAB_ROOT + entry["remote_url"] + "?version={}".format(version)
        alt_url = (
            GENELAB_ROOT + "/genelab/static/media/dataset/" +
            quote(entry["file_name"]) + "?version={}".format(version)
        )
        return entry["file_name"], version, url, alt_url

In [3]:
# Function to pull unnormalized counts from GeneLab
# Credit to Kirill Grigorev

RAW_COUNTS_REGEX = r'.*_rna_seq_Unnormalized_Counts.csv'

def get_rawcounts(accession):
    glds_json = read_json(GLDS_URL_PREFIX + accession)
    try:
        _id = glds_json[0]["_id"]
    except (AssertionError, TypeError, KeyError, IndexError):
        raise ValueError("Malformed JSON?")
    raw_counts_entries = [
        entry for entry in read_json(FILELISTINGS_URL_PREFIX + _id)
        if search(RAW_COUNTS_REGEX, entry["file_name"])
    ]
    if len(raw_counts_entries) == 0:
        raise ValueError("Unexpected: no Raw Counts found")
    elif len(raw_counts_entries) > 1:
        raise ValueError("Unexpected: multiple files match the Raw Counts regex")
    else:
        entry = raw_counts_entries[0]
        version = entry["version"]
        url = GENELAB_ROOT + entry["remote_url"] + "?version={}".format(version)
        alt_url = (
            GENELAB_ROOT + "/genelab/static/media/dataset/" +
            quote(entry["file_name"]) + "?version={}".format(version)
        )
        return entry["file_name"], version, url, alt_url

## Get and parse data and metadata
Get ISA, counts, and ERCC data.

In [4]:
accession = 'GLDS-246' # Change this as necessary
isaurl = get_isa(accession)[3]
filehandle, _ = urlretrieve(isaurl)
zip_file_object = zipfile.ZipFile(filehandle, 'r')
zip_file_object.namelist() # Print contents of zip file. Pick relevant one from list

['.DS_Store',
 's_RR6_SPL.txt',
 'a_rr6_spl_transcription_profiling_RNA_Sequencing_(RNA-Seq).txt',
 'i_Investigation.txt']

In [5]:
sample_file = zip_file_object.namelist()[1]
file = zip_file_object.open(sample_file)
sample_table = pd.read_csv(zip_file_object.open(sample_file), sep='\t')

assay_file = zip_file_object.namelist()[2]
file = zip_file_object.open(assay_file)
assay_table = pd.read_csv(zip_file_object.open(assay_file), sep='\t')

In [6]:
sample_table.head(n=3)

Unnamed: 0,Source Name,Comment[LSDA Source Name],Comment[LSDA Biospecimen Subject ID],Comment[LSDA Biospecimen ID],Protocol REF,Sample Name,Characteristics[Organism],Term Source REF,Term Accession Number,Characteristics[Strain],Comment[Animal Source],"Characteristics[sex,http://purl.obolibrary.org/obo/PATO_0000047,EFO]",Term Source REF.1,Term Accession Number.1,Characteristics[Age at Launch],Unit,Term Source REF.2,Term Accession Number.2,"Characteristics[Diet,http://purl.bioontology.org/ontology/MESH/D004032,MESH]",Term Source REF.3,Term Accession Number.3,Comment[Feeding Schedule],Factor Value[Spaceflight],Term Source REF.4,Term Accession Number.4,Factor Value[Duration],Unit.1,Term Source REF.5,Term Accession Number.5,Factor Value[Euthanasia],Term Source REF.6,Term Accession Number.6,Factor Value[Dissection Condition],Term Source REF.7,Term Accession Number.7,Protocol REF.1,"Parameter Value[Euthanasia Method,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C90336,NCIT]",Term Source REF.8,Term Accession Number.8,Parameter Value[Carcass Preservation Method],Unit.2,Term Source REF.9,Term Accession Number.9,Parameter Value[Carcass Weight],Unit.3,Term Source REF.10,Term Accession Number.10,Comment[LSDA Sample Name],Material Type,Term Source REF.11,Term Accession Number.11,Comment[Launch Date],Comment[Euthanasia Date],Comment[Dissection Date],Parameter Value[Sample Preservation Method],Term Source REF.12,Term Accession Number.12,Parameter Value[Sample Storage Temperature],Unit.4,Term Source REF.13,Term Accession Number.13
0,LAR Baseline 3,3B18,598,5910,mouse habitation,Mmus_C57-6T_SPL_BSL_LAR_Rep1_B3,Mus musculus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,C57BL/6NTac,Taconic Biosciences,female,EFO,http://purl.obolibrary.org/obo/PATO_0000383,36,week,UO,http://purl.obolibrary.org/obo/UO_0000034,Nutrient Upgraded Rodent Food Bar (NuRFB),,,ad libitum,Basal Control,,,1,day,UO,http://purl.obolibrary.org/obo/UO_0000033,On Earth,,,Upon euthanasia,,,sample collection,"Isoflurane, Thoracotomy, Cardiac Puncture",,,,,,,30.31,gram,UO,http://purl.obolibrary.org/obo/UO_0000021,Spleen LAR-Base 3,spleen,UBERON,http://purl.obolibrary.org/obo/UBERON_0002106,09-Dec-2017,09-Dec-2017,09-Dec-2017,Liquid Nitrogen,NCIT,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...,-80,degree Celsius,UO,http://purl.obolibrary.org/obo/UO_0000027
1,LAR Baseline 4,7870,598,5910,mouse habitation,Mmus_C57-6T_SPL_BSL_LAR_Rep2_B4,Mus musculus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,C57BL/6NTac,Taconic Biosciences,female,EFO,http://purl.obolibrary.org/obo/PATO_0000383,36,week,UO,http://purl.obolibrary.org/obo/UO_0000034,Nutrient Upgraded Rodent Food Bar (NuRFB),,,ad libitum,Basal Control,,,1,day,UO,http://purl.obolibrary.org/obo/UO_0000033,On Earth,,,Upon euthanasia,,,sample collection,"Isoflurane, Thoracotomy, Cardiac Puncture",,,,,,,27.76,gram,UO,http://purl.obolibrary.org/obo/UO_0000021,Spleen LAR-Base 4,spleen,UBERON,http://purl.obolibrary.org/obo/UBERON_0002106,09-Dec-2017,09-Dec-2017,09-Dec-2017,Liquid Nitrogen,NCIT,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...,-80,degree Celsius,UO,http://purl.obolibrary.org/obo/UO_0000027
2,LAR Baseline 5,7318,598,5910,mouse habitation,Mmus_C57-6T_SPL_BSL_LAR_Rep3_B5,Mus musculus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,C57BL/6NTac,Taconic Biosciences,female,EFO,http://purl.obolibrary.org/obo/PATO_0000383,36,week,UO,http://purl.obolibrary.org/obo/UO_0000034,Nutrient Upgraded Rodent Food Bar (NuRFB),,,ad libitum,Basal Control,,,1,day,UO,http://purl.obolibrary.org/obo/UO_0000033,On Earth,,,Upon euthanasia,,,sample collection,"Isoflurane, Thoracotomy, Cardiac Puncture",,,,,,,28.04,gram,UO,http://purl.obolibrary.org/obo/UO_0000021,Spleen LAR-Base 5,spleen,UBERON,http://purl.obolibrary.org/obo/UBERON_0002106,09-Dec-2017,09-Dec-2017,09-Dec-2017,Liquid Nitrogen,NCIT,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...,-80,degree Celsius,UO,http://purl.obolibrary.org/obo/UO_0000027


In [7]:
assay_table.head(n=3)

Unnamed: 0,Sample Name,Protocol REF,Parameter Value[QA Instrument],Parameter Value[QA Assay],Parameter Value[QA Score],Unit,Term Source REF,Term Accession Number,Extract Name,Protocol REF.1,Parameter Value[Spike-in Quality Control],Term Source REF.1,Term Accession Number.1,Parameter Value[Spike-in Mix number],Term Source REF.2,Term Accession Number.2,Protocol REF.2,Parameter Value[library selection],Parameter Value[library layout],Parameter Value[stranded],Parameter Value[Library QA Instrument],Term Source REF.3,Term Accession Number.3,Parameter Value[Library QA Assay],Term Source REF.4,Term Accession Number.4,Parameter Value[Fragment Size],Unit.1,Term Source REF.5,Term Accession Number.5,Protocol REF.3,Parameter Value[sequencing instrument],Parameter Value[base caller],"Parameter Value[Read Length,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C153362,NCIT]",Unit.2,Term Source REF.6,Term Accession Number.6,Assay Name,Parameter Value[rRNA Contamination],Unit.3,...,Term Accession Number.7,"Parameter Value[Read Depth,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C155320,NCIT]",Unit.4,Term Source REF.8,Term Accession Number.8,Raw Data File,Parameter Value[Fastqc File Names],Term Source REF.9,Term Accession Number.9,Parameter Value[Multiqc File Names],Term Source REF.10,Term Accession Number.10,Protocol REF.4,"Parameter Value[trimmed sequence data,http://purl.obolibrary.org/obo/OBI_0002569,OBI]",Term Source REF.11,Term Accession Number.11,Parameter Value[Trimmed Sequence Data Fastqc File],Term Source REF.12,Term Accession Number.12,Parameter Value[Trimmed Sequence Data Multiqc File],Term Source REF.13,Term Accession Number.13,Parameter Value[Trimmed Report],Term Source REF.14,Term Accession Number.14,"Parameter Value[aligned sequence data,http://purl.obolibrary.org/obo/OBI_0002580,OBI]",Term Source REF.15,Term Accession Number.15,Parameter Value[Alignment Logs],Term Source REF.16,Term Accession Number.16,Parameter Value[Raw Counts Data File],Term Source REF.17,Term Accession Number.17,Parameter Value[Normalized Counts Data File],Term Source REF.18,Term Accession Number.18,"Parameter Value[differential expression analysis data transformation,http://purl.obolibrary.org/obo/OBI_0000650,OBI]",Term Source REF.19,Term Accession Number.19
0,Mmus_C57-6T_SPL_BSL_LAR_Rep1_B3,nucleic acid extraction,Agilent 2100 Bioanalyzer,RNA 6000 Nano Assay,5.9,RNA Integrity Number,NCIT,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...,Mmus_C57-6T_SPL_BSL_LAR_Rep1_B3,spike-in protocol,ERCC ExFold RNA Spike-In Mix,,,Mix 1,,,library construction,Ribo-depletion,PAIRED,STRANDED,Agilent 4200 TapeStation,,,D1000 ScreenTape,,,287,base pair,UO,http://purl.obolibrary.org/obo/UO_0000244,nucleic acid sequencing,Illumina NovaSeq 6000,bcl2fastq,149,base pair,UO,http://purl.obolibrary.org/obo/UO_0000244,rna-seq,0.85,percent,...,http://purl.obolibrary.org/obo/UO_0000187,106545145,read,SO,http://purl.obolibrary.org/obo/SO_0000150,GLDS-246_rna-seq_Mmus_C57-6T_SPL_BSL_LAR_Rep1_...,GLDS-246_rna-seq_Mmus_C57-6T_SPL_BSL_LAR_Rep1_...,,,"GLDS-246_rna-seq_raw_multiqc_report.html, GLDS...",,,GeneLab RNAseq data processing protocol,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep1_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep1_...,,,"GLDS-246_rna_seq_trimmed_multiqc_report.html, ...",,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep1_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep1_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep1_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep1_...,,,"GLDS-246_rna_seq_Normalized_Counts.csv, GLDS-2...",,,"GLDS-246_rna_seq_contrasts.csv, GLDS-246_rna_s...",,
1,Mmus_C57-6T_SPL_BSL_LAR_Rep2_B4,nucleic acid extraction,Agilent 2100 Bioanalyzer,RNA 6000 Nano Assay,6.6,RNA Integrity Number,NCIT,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...,Mmus_C57-6T_SPL_BSL_LAR_Rep2_B4,spike-in protocol,ERCC ExFold RNA Spike-In Mix,,,Mix 1,,,library construction,Ribo-depletion,PAIRED,STRANDED,Agilent 4200 TapeStation,,,D1000 ScreenTape,,,289,base pair,UO,http://purl.obolibrary.org/obo/UO_0000244,nucleic acid sequencing,Illumina NovaSeq 6000,bcl2fastq,149,base pair,UO,http://purl.obolibrary.org/obo/UO_0000244,rna-seq,0.35,percent,...,http://purl.obolibrary.org/obo/UO_0000187,111068823,read,SO,http://purl.obolibrary.org/obo/SO_0000150,GLDS-246_rna-seq_Mmus_C57-6T_SPL_BSL_LAR_Rep2_...,GLDS-246_rna-seq_Mmus_C57-6T_SPL_BSL_LAR_Rep2_...,,,"GLDS-246_rna-seq_raw_multiqc_report.html, GLDS...",,,GeneLab RNAseq data processing protocol,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep2_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep2_...,,,"GLDS-246_rna_seq_trimmed_multiqc_report.html, ...",,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep2_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep2_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep2_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep2_...,,,"GLDS-246_rna_seq_Normalized_Counts.csv, GLDS-2...",,,"GLDS-246_rna_seq_contrasts.csv, GLDS-246_rna_s...",,
2,Mmus_C57-6T_SPL_BSL_LAR_Rep3_B5,nucleic acid extraction,Agilent 2100 Bioanalyzer,RNA 6000 Nano Assay,8.7,RNA Integrity Number,NCIT,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...,Mmus_C57-6T_SPL_BSL_LAR_Rep3_B5,spike-in protocol,ERCC ExFold RNA Spike-In Mix,,,Mix 1,,,library construction,Ribo-depletion,PAIRED,STRANDED,Agilent 4200 TapeStation,,,D1000 ScreenTape,,,283,base pair,UO,http://purl.obolibrary.org/obo/UO_0000244,nucleic acid sequencing,Illumina NovaSeq 6000,bcl2fastq,149,base pair,UO,http://purl.obolibrary.org/obo/UO_0000244,rna-seq,1.04,percent,...,http://purl.obolibrary.org/obo/UO_0000187,110761283,read,SO,http://purl.obolibrary.org/obo/SO_0000150,GLDS-246_rna-seq_Mmus_C57-6T_SPL_BSL_LAR_Rep3_...,GLDS-246_rna-seq_Mmus_C57-6T_SPL_BSL_LAR_Rep3_...,,,"GLDS-246_rna-seq_raw_multiqc_report.html, GLDS...",,,GeneLab RNAseq data processing protocol,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep3_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep3_...,,,"GLDS-246_rna_seq_trimmed_multiqc_report.html, ...",,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep3_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep3_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep3_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep3_...,,,"GLDS-246_rna_seq_Normalized_Counts.csv, GLDS-2...",,,"GLDS-246_rna_seq_contrasts.csv, GLDS-246_rna_s...",,


In [8]:
# Get raw counts table
raw_counts_file = get_rawcounts('GLDS-246')[3]
raw_counts_table = pd.read_csv(raw_counts_file, index_col=0)
raw_counts_table.index.rename('Gene_ID', inplace=True)
raw_counts_table.head(n=3)

Unnamed: 0_level_0,Mmus_C57-6T_SPL_BSL_LAR_Rep1_B3,Mmus_C57-6T_SPL_BSL_LAR_Rep2_B4,Mmus_C57-6T_SPL_BSL_LAR_Rep3_B5,Mmus_C57-6T_SPL_BSL_LAR_Rep4_B6,Mmus_C57-6T_SPL_BSL_LAR_Rep5_B7,Mmus_C57-6T_SPL_BSL_LAR_Rep6_B8,Mmus_C57-6T_SPL_BSL_LAR_Rep7_B9,Mmus_C57-6T_SPL_BSL_LAR_Rep8_B10,Mmus_C57-6T_SPL_BSL_LAR_Rep9_B1,Mmus_C57-6T_SPL_BSL_LAR_Rep10_B2,Mmus_C57-6T_SPL_GC_LAR_Rep1_G3,Mmus_C57-6T_SPL_GC_LAR_Rep2_G4,Mmus_C57-6T_SPL_GC_LAR_Rep3_G5,Mmus_C57-6T_SPL_GC_LAR_Rep4_G6,Mmus_C57-6T_SPL_GC_LAR_Rep5_G7,Mmus_C57-6T_SPL_GC_LAR_Rep6_G8,Mmus_C57-6T_SPL_GC_LAR_Rep7_G9,Mmus_C57-6T_SPL_GC_LAR_Rep8_G10,Mmus_C57-6T_SPL_GC_LAR_Rep9_G2,Mmus_C57-6T_SPL_FLT_LAR_Rep1_F3,Mmus_C57-6T_SPL_FLT_LAR_Rep2_F4,Mmus_C57-6T_SPL_FLT_LAR_Rep3_F5,Mmus_C57-6T_SPL_FLT_LAR_Rep4_F6,Mmus_C57-6T_SPL_FLT_LAR_Rep5_F7,Mmus_C57-6T_SPL_FLT_LAR_Rep6_F8,Mmus_C57-6T_SPL_FLT_LAR_Rep7_F9,Mmus_C57-6T_SPL_FLT_LAR_Rep8_F10,Mmus_C57-6T_SPL_BSL_ISS-T_Rep1_B6,Mmus_C57-6T_SPL_BSL_ISS-T_Rep2_B7,Mmus_C57-6T_SPL_BSL_ISS-T_Rep3_B8,Mmus_C57-6T_SPL_GC_ISS-T_Rep1_G3,Mmus_C57-6T_SPL_GC_ISS-T_Rep2_G4,Mmus_C57-6T_SPL_GC_ISS-T_Rep3_G5,Mmus_C57-6T_SPL_GC_ISS-T_Rep4_G6,Mmus_C57-6T_SPL_GC_ISS-T_Rep5_G8,Mmus_C57-6T_SPL_GC_ISS-T_Rep6_G9,Mmus_C57-6T_SPL_GC_ISS-T_Rep7_G10,Mmus_C57-6T_SPL_FLT_ISS-T_Rep1_F3,Mmus_C57-6T_SPL_FLT_ISS-T_Rep2_F4,Mmus_C57-6T_SPL_FLT_ISS-T_Rep3_F5,Mmus_C57-6T_SPL_FLT_ISS-T_Rep4_F7,Mmus_C57-6T_SPL_FLT_ISS-T_Rep5_F8,Mmus_C57-6T_SPL_FLT_ISS-T_Rep6_F9,Mmus_C57-6T_SPL_FLT_ISS-T_Rep7_F10
Gene_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1
ENSMUSG00000000001,18.0,15.0,46.0,9.0,5.0,6.0,9.0,1.0,14.0,5.0,26.0,24.0,5.0,8.0,42.0,17.0,1.0,4.0,10.0,9.0,19.0,0.0,24.0,5.0,25.0,6.0,7.0,15.0,31.0,9.0,27.0,23.0,51.0,15.0,26.0,0.0,14.0,30.0,31.0,41.0,21.0,2.0,5.0,7.0
ENSMUSG00000000003,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000028,6.0,8.0,10.0,4.0,2.0,0.0,2.0,1.0,2.0,4.0,1.0,7.0,1.0,4.0,3.0,2.0,1.0,0.0,3.0,0.0,4.0,0.0,8.0,2.0,2.0,0.0,0.0,6.0,6.0,2.0,1.0,5.0,13.0,4.0,1.0,1.0,1.0,8.0,6.0,10.0,1.0,3.0,3.0,1.0


In [9]:
# Get ERCC counts
ercc_counts = raw_counts_table[raw_counts_table.index.str.contains('^ERCC-')]
ercc_counts.head(n=3)

Unnamed: 0_level_0,Mmus_C57-6T_SPL_BSL_LAR_Rep1_B3,Mmus_C57-6T_SPL_BSL_LAR_Rep2_B4,Mmus_C57-6T_SPL_BSL_LAR_Rep3_B5,Mmus_C57-6T_SPL_BSL_LAR_Rep4_B6,Mmus_C57-6T_SPL_BSL_LAR_Rep5_B7,Mmus_C57-6T_SPL_BSL_LAR_Rep6_B8,Mmus_C57-6T_SPL_BSL_LAR_Rep7_B9,Mmus_C57-6T_SPL_BSL_LAR_Rep8_B10,Mmus_C57-6T_SPL_BSL_LAR_Rep9_B1,Mmus_C57-6T_SPL_BSL_LAR_Rep10_B2,Mmus_C57-6T_SPL_GC_LAR_Rep1_G3,Mmus_C57-6T_SPL_GC_LAR_Rep2_G4,Mmus_C57-6T_SPL_GC_LAR_Rep3_G5,Mmus_C57-6T_SPL_GC_LAR_Rep4_G6,Mmus_C57-6T_SPL_GC_LAR_Rep5_G7,Mmus_C57-6T_SPL_GC_LAR_Rep6_G8,Mmus_C57-6T_SPL_GC_LAR_Rep7_G9,Mmus_C57-6T_SPL_GC_LAR_Rep8_G10,Mmus_C57-6T_SPL_GC_LAR_Rep9_G2,Mmus_C57-6T_SPL_FLT_LAR_Rep1_F3,Mmus_C57-6T_SPL_FLT_LAR_Rep2_F4,Mmus_C57-6T_SPL_FLT_LAR_Rep3_F5,Mmus_C57-6T_SPL_FLT_LAR_Rep4_F6,Mmus_C57-6T_SPL_FLT_LAR_Rep5_F7,Mmus_C57-6T_SPL_FLT_LAR_Rep6_F8,Mmus_C57-6T_SPL_FLT_LAR_Rep7_F9,Mmus_C57-6T_SPL_FLT_LAR_Rep8_F10,Mmus_C57-6T_SPL_BSL_ISS-T_Rep1_B6,Mmus_C57-6T_SPL_BSL_ISS-T_Rep2_B7,Mmus_C57-6T_SPL_BSL_ISS-T_Rep3_B8,Mmus_C57-6T_SPL_GC_ISS-T_Rep1_G3,Mmus_C57-6T_SPL_GC_ISS-T_Rep2_G4,Mmus_C57-6T_SPL_GC_ISS-T_Rep3_G5,Mmus_C57-6T_SPL_GC_ISS-T_Rep4_G6,Mmus_C57-6T_SPL_GC_ISS-T_Rep5_G8,Mmus_C57-6T_SPL_GC_ISS-T_Rep6_G9,Mmus_C57-6T_SPL_GC_ISS-T_Rep7_G10,Mmus_C57-6T_SPL_FLT_ISS-T_Rep1_F3,Mmus_C57-6T_SPL_FLT_ISS-T_Rep2_F4,Mmus_C57-6T_SPL_FLT_ISS-T_Rep3_F5,Mmus_C57-6T_SPL_FLT_ISS-T_Rep4_F7,Mmus_C57-6T_SPL_FLT_ISS-T_Rep5_F8,Mmus_C57-6T_SPL_FLT_ISS-T_Rep6_F9,Mmus_C57-6T_SPL_FLT_ISS-T_Rep7_F10
Gene_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1
ERCC-00002,769.0,533.0,1888.0,456.0,212.0,163.0,735.0,17.0,211.0,240.0,1244.0,1102.0,102.0,621.0,1209.0,1245.0,213.0,327.0,412.0,809.0,2688.0,367.0,4336.0,731.0,2421.0,430.0,396.0,1800.0,3571.0,832.0,3009.0,2708.0,6732.0,3034.0,4129.0,487.0,784.0,1615.0,1737.0,1842.0,1713.0,323.0,329.0,319.0
ERCC-00003,44.0,30.0,140.0,18.0,10.0,9.0,29.0,10.0,10.0,15.0,58.0,61.0,9.0,28.0,79.0,78.0,12.0,24.0,27.0,24.0,126.0,26.0,223.0,31.0,121.0,22.0,28.0,105.0,264.0,38.0,150.0,139.0,376.0,188.0,164.0,36.0,47.0,100.0,98.0,102.0,67.0,12.0,20.0,23.0
ERCC-00004,215.0,111.0,606.0,97.0,94.0,138.0,227.0,67.0,51.0,82.0,279.0,316.0,90.0,148.0,371.0,347.0,49.0,105.0,145.0,15.0,98.0,27.0,150.0,23.0,102.0,15.0,11.0,83.0,173.0,28.0,75.0,70.0,234.0,83.0,143.0,26.0,29.0,539.0,512.0,545.0,443.0,142.0,76.0,137.0


In [11]:
# Get ERCC files
ercc_url = 'https://assets.thermofisher.com/TFS-Assets/LSG/manuals/cms_095046.txt'
filehandle, _ = urlretrieve(ercc_url)
ercc_table = pd.read_csv(filehandle, '\t')
ercc_table.head(n=3)

Unnamed: 0,Re-sort ID,ERCC ID,subgroup,concentration in Mix 1 (attomoles/ul),concentration in Mix 2 (attomoles/ul),expected fold-change ratio,log2(Mix 1/Mix 2)
0,1,ERCC-00130,A,30000.0,7500.0,4.0,2.0
1,2,ERCC-00004,A,7500.0,1875.0,4.0,2.0
2,3,ERCC-00136,A,1875.0,468.75,4.0,2.0


In [12]:
combined = sample_table.merge(assay_table, on='Sample Name')
combined.head(n=3)

Unnamed: 0,Source Name,Comment[LSDA Source Name],Comment[LSDA Biospecimen Subject ID],Comment[LSDA Biospecimen ID],Protocol REF_x,Sample Name,Characteristics[Organism],Term Source REF_x,Term Accession Number_x,Characteristics[Strain],Comment[Animal Source],"Characteristics[sex,http://purl.obolibrary.org/obo/PATO_0000047,EFO]",Term Source REF.1_x,Term Accession Number.1_x,Characteristics[Age at Launch],Unit_x,Term Source REF.2_x,Term Accession Number.2_x,"Characteristics[Diet,http://purl.bioontology.org/ontology/MESH/D004032,MESH]",Term Source REF.3_x,Term Accession Number.3_x,Comment[Feeding Schedule],Factor Value[Spaceflight],Term Source REF.4_x,Term Accession Number.4_x,Factor Value[Duration],Unit.1_x,Term Source REF.5_x,Term Accession Number.5_x,Factor Value[Euthanasia],Term Source REF.6_x,Term Accession Number.6_x,Factor Value[Dissection Condition],Term Source REF.7_x,Term Accession Number.7_x,Protocol REF.1_x,"Parameter Value[Euthanasia Method,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C90336,NCIT]",Term Source REF.8_x,Term Accession Number.8_x,Parameter Value[Carcass Preservation Method],...,Term Accession Number.7_y,"Parameter Value[Read Depth,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C155320,NCIT]",Unit.4_y,Term Source REF.8_y,Term Accession Number.8_y,Raw Data File,Parameter Value[Fastqc File Names],Term Source REF.9_y,Term Accession Number.9_y,Parameter Value[Multiqc File Names],Term Source REF.10_y,Term Accession Number.10_y,Protocol REF.4,"Parameter Value[trimmed sequence data,http://purl.obolibrary.org/obo/OBI_0002569,OBI]",Term Source REF.11_y,Term Accession Number.11_y,Parameter Value[Trimmed Sequence Data Fastqc File],Term Source REF.12_y,Term Accession Number.12_y,Parameter Value[Trimmed Sequence Data Multiqc File],Term Source REF.13_y,Term Accession Number.13_y,Parameter Value[Trimmed Report],Term Source REF.14,Term Accession Number.14,"Parameter Value[aligned sequence data,http://purl.obolibrary.org/obo/OBI_0002580,OBI]",Term Source REF.15,Term Accession Number.15,Parameter Value[Alignment Logs],Term Source REF.16,Term Accession Number.16,Parameter Value[Raw Counts Data File],Term Source REF.17,Term Accession Number.17,Parameter Value[Normalized Counts Data File],Term Source REF.18,Term Accession Number.18,"Parameter Value[differential expression analysis data transformation,http://purl.obolibrary.org/obo/OBI_0000650,OBI]",Term Source REF.19,Term Accession Number.19
0,LAR Baseline 3,3B18,598,5910,mouse habitation,Mmus_C57-6T_SPL_BSL_LAR_Rep1_B3,Mus musculus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,C57BL/6NTac,Taconic Biosciences,female,EFO,http://purl.obolibrary.org/obo/PATO_0000383,36,week,UO,http://purl.obolibrary.org/obo/UO_0000034,Nutrient Upgraded Rodent Food Bar (NuRFB),,,ad libitum,Basal Control,,,1,day,UO,http://purl.obolibrary.org/obo/UO_0000033,On Earth,,,Upon euthanasia,,,sample collection,"Isoflurane, Thoracotomy, Cardiac Puncture",,,,...,http://purl.obolibrary.org/obo/UO_0000187,106545145,read,SO,http://purl.obolibrary.org/obo/SO_0000150,GLDS-246_rna-seq_Mmus_C57-6T_SPL_BSL_LAR_Rep1_...,GLDS-246_rna-seq_Mmus_C57-6T_SPL_BSL_LAR_Rep1_...,,,"GLDS-246_rna-seq_raw_multiqc_report.html, GLDS...",,,GeneLab RNAseq data processing protocol,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep1_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep1_...,,,"GLDS-246_rna_seq_trimmed_multiqc_report.html, ...",,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep1_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep1_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep1_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep1_...,,,"GLDS-246_rna_seq_Normalized_Counts.csv, GLDS-2...",,,"GLDS-246_rna_seq_contrasts.csv, GLDS-246_rna_s...",,
1,LAR Baseline 4,7870,598,5910,mouse habitation,Mmus_C57-6T_SPL_BSL_LAR_Rep2_B4,Mus musculus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,C57BL/6NTac,Taconic Biosciences,female,EFO,http://purl.obolibrary.org/obo/PATO_0000383,36,week,UO,http://purl.obolibrary.org/obo/UO_0000034,Nutrient Upgraded Rodent Food Bar (NuRFB),,,ad libitum,Basal Control,,,1,day,UO,http://purl.obolibrary.org/obo/UO_0000033,On Earth,,,Upon euthanasia,,,sample collection,"Isoflurane, Thoracotomy, Cardiac Puncture",,,,...,http://purl.obolibrary.org/obo/UO_0000187,111068823,read,SO,http://purl.obolibrary.org/obo/SO_0000150,GLDS-246_rna-seq_Mmus_C57-6T_SPL_BSL_LAR_Rep2_...,GLDS-246_rna-seq_Mmus_C57-6T_SPL_BSL_LAR_Rep2_...,,,"GLDS-246_rna-seq_raw_multiqc_report.html, GLDS...",,,GeneLab RNAseq data processing protocol,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep2_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep2_...,,,"GLDS-246_rna_seq_trimmed_multiqc_report.html, ...",,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep2_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep2_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep2_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep2_...,,,"GLDS-246_rna_seq_Normalized_Counts.csv, GLDS-2...",,,"GLDS-246_rna_seq_contrasts.csv, GLDS-246_rna_s...",,
2,LAR Baseline 5,7318,598,5910,mouse habitation,Mmus_C57-6T_SPL_BSL_LAR_Rep3_B5,Mus musculus,NCBITAXON,http://purl.bioontology.org/ontology/NCBITAXON...,C57BL/6NTac,Taconic Biosciences,female,EFO,http://purl.obolibrary.org/obo/PATO_0000383,36,week,UO,http://purl.obolibrary.org/obo/UO_0000034,Nutrient Upgraded Rodent Food Bar (NuRFB),,,ad libitum,Basal Control,,,1,day,UO,http://purl.obolibrary.org/obo/UO_0000033,On Earth,,,Upon euthanasia,,,sample collection,"Isoflurane, Thoracotomy, Cardiac Puncture",,,,...,http://purl.obolibrary.org/obo/UO_0000187,110761283,read,SO,http://purl.obolibrary.org/obo/SO_0000150,GLDS-246_rna-seq_Mmus_C57-6T_SPL_BSL_LAR_Rep3_...,GLDS-246_rna-seq_Mmus_C57-6T_SPL_BSL_LAR_Rep3_...,,,"GLDS-246_rna-seq_raw_multiqc_report.html, GLDS...",,,GeneLab RNAseq data processing protocol,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep3_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep3_...,,,"GLDS-246_rna_seq_trimmed_multiqc_report.html, ...",,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep3_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep3_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep3_...,,,GLDS-246_rna_seq_Mmus_C57-6T_SPL_BSL_LAR_Rep3_...,,,"GLDS-246_rna_seq_Normalized_Counts.csv, GLDS-2...",,,"GLDS-246_rna_seq_contrasts.csv, GLDS-246_rna_s...",,


In [19]:
flightground = combined['Factor Value[Spaceflight]'].isin(['Space Flight', 'Ground Control'])
lar = combined['Factor Value[Euthanasia]'] == 'On Earth'

In [22]:
fewsamples = combined.loc(flightground)

TypeError: ignored