In [1]:
from Bio import Entrez
import os, pandas, re, lxml.etree, datetime, numpy, warnings, time

pandas.options.display.max_rows=100

In [2]:
# Date
d = datetime.datetime.today()
d = str(d.year)+"_"+str(d.month)+"_"+str(d.day)
#d = "2018_4_24"

wd_base = "/Users/kf/Dropbox/kfdata/02_Data/my_db/Ensembl/release-91/sra/"
wd = wd_base+d

# Set directory
if not os.path.exists(wd):
    os.mkdir(wd)
os.chdir(wd)

In [4]:
def get_search_term(species_name="", bioprojects=[], biosamples=[]):
    species_term = '"'+species_name+'"'+"[organism]"
    tissues = ["brain", "liver", "kidney", "testis", "testes", "heart", "cerebellum", "lung", "muscle", "spleen", "ovary", "colon", "thymus", "bone marrow", "lymph node", "skin", "blood", "adrenal", "placenta", "salivary gland", "uterus", "retina", "pancreas", "embryo"]
    tissue_term = "(" + " OR ".join(tissues) + ")"
    other_conditions = ['"platform illumina"[Properties]', '"type rnaseq"[Filter]', '"sra biosample"[Filter]']
    other_term = " AND ".join(other_conditions)
    excluded_conditions = ['"strategy mirna seq"[Properties]', '"strategy rip seq"[Properties]']
    excluded_term = " NOT ".join(excluded_conditions)
    if len(bioprojects) and len(biosamples):
        warnings.warn("Both bioprojects and biosamples cannot be specified.")
    elif len(bioprojects):
        bioproject_term = "(" + " OR ".join(bioprojects) + ")"
        search_term = species_term + " AND " + bioproject_term + " AND " + other_term + " NOT " + excluded_term
    elif len(biosamples):
        biosample_term = "(" + " OR ".join(biosamples) + ")"
        search_term = biosample_term
    else:        
        search_term = species_term + " AND " + tissue_term + " AND " + other_term + " NOT " + excluded_term
    return search_term

def fetch_sra_xml(species_name, search_term, save_xml=True, read_from_existing_file=False):
    file_xml = "SRA_"+species_name.replace(" ", "_")+".xml"
    flag = True
    if (read_from_existing_file)&(os.path.exists(file_xml)):
        with open(file_xml) as f:
            if '<Error>' in f.read():
                print(species_name, ': <Error> found in the saved file. Deleting...')
                os.remove(file_xml)
            else:
                print(species_name, ': reading xml from file')
                root = lxml.etree.parse(file_xml, parser=lxml.etree.XMLParser())
                flag = False
    if flag:
        sra_handle = Entrez.esearch(db="sra", term=search_term, retmax=10000000)
        sra_record = Entrez.read(sra_handle)
        record_ids = sra_record["IdList"]
        num_record = len(record_ids)
        retmax = 1000
        start_time = time.time()
        query_search_time = datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')
        root = None
        for i in numpy.arange(numpy.ceil(num_record//retmax)+1):
            start = int(i*retmax)
            end = int(((i+1)*retmax)-1) if num_record >= int(((i+1)*retmax)-1) else num_record
            print('processing ', start, '-', end, flush=True)
            handle = Entrez.efetch(db="sra", id=record_ids[start:end], rettype="full", retmode="xml", retmax=retmax)
            chunk = lxml.etree.parse(handle).getroot()
            if root is None:
                root = chunk
            else:
                root.append(chunk)
        elapsed_time = int(time.time() - start_time)
        xml_string = lxml.etree.tostring(root, pretty_print=True)
        for line in str(xml_string).split('\n'):
            if '<Error>' in line:
                print(line)
                if os.path.exists(file_xml):
                    os.remove(file_xml)
                raise Exception(species_name, ': <Error> found in the xml.')
        if save_xml:
            with open(file_xml, 'wb') as f:
                f.write(xml_string)
    return root

def sra_xml2table(species_name, xml_root):
    if isinstance(xml_root, lxml.etree._Element):
        xml_root = lxml.etree.ElementTree(xml_root)
    root = xml_root
    if isinstance(root, lxml.etree._ElementTree):
        sra_table = pandas.DataFrame()
        for entry in root.iter(tag="EXPERIMENT_PACKAGE"):
            items = []
            bioproject = entry.findall('.//EXTERNAL_ID[@namespace="BioProject"]')
            if not len(bioproject):
                labels = entry.findall('.//LABEL')
                for label in labels:
                    text = label.text
                    if text.startswith("PRJ"):
                        bioproject = [label]
                        break
            is_single = len(entry.findall('.//LIBRARY_LAYOUT/SINGLE'))
            is_paired = len(entry.findall('.//LIBRARY_LAYOUT/PAIRED'))
            if is_single:
                library_layout = ["single"]
            elif is_paired:
                library_layout = ["paired"]
            else:
                library_layout = [""]
            values = entry.findall('.//VALUE')
            is_protected = ["No"]
            if len(values):
                for value in values:
                    text = value.text
                    if not text is None:
                        if text.endswith("PROTECTED"):
                            is_protected = ["Yes"]
                            break
            items.append(["bioproject", bioproject])
            items.append(["scientific_name", entry.xpath('./SAMPLE/SAMPLE_NAME/SCIENTIFIC_NAME')])
            items.append(["biosample", entry.findall('.//EXTERNAL_ID[@namespace="BioSample"]')])
            items.append(["experiment", entry.xpath('./EXPERIMENT/IDENTIFIERS/PRIMARY_ID')])
            items.append(["run", entry.xpath('./RUN_SET/RUN/IDENTIFIERS/PRIMARY_ID')])
            items.append(["sra_primary", entry.xpath('./SUBMISSION/IDENTIFIERS/PRIMARY_ID')])
            items.append(["sra_sample", entry.xpath('./SAMPLE/IDENTIFIERS/PRIMARY_ID')])
            items.append(["sra_study", entry.xpath('./EXPERIMENT/STUDY_REF/IDENTIFIERS/PRIMARY_ID')])
            items.append(["published_date", entry.xpath('./RUN_SET/RUN/@published')])
            items.append(["exp_title", entry.xpath('./EXPERIMENT/TITLE')])
            items.append(["design", entry.xpath('./EXPERIMENT/DESIGN/DESIGN_DESCRIPTION')])
            items.append(["lib_name", entry.xpath('./EXPERIMENT/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_NAME')])
            items.append(["lib_strategy", entry.xpath('./EXPERIMENT/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_STRATEGY')])
            items.append(["lib_source", entry.xpath('./EXPERIMENT/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_SOURCE')])
            items.append(["lib_selection", entry.xpath('./EXPERIMENT/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_SELECTION')])
            items.append(["lib_layout", library_layout])
            items.append(["nominal_length", entry.xpath('./EXPERIMENT/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_LAYOUT/PAIRED/@NOMINAL_LENGTH')])
            items.append(["nominal_sdev", entry.xpath('./EXPERIMENT/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_LAYOUT/PAIRED/@NOMINAL_SDEV')])
            items.append(["spot_length", entry.xpath('./EXPERIMENT/DESIGN/SPOT_DESCRIPTOR/SPOT_DECODE_SPEC/SPOT_LENGTH')])
            items.append(["read_index", entry.xpath('./EXPERIMENT/DESIGN/SPOT_DESCRIPTOR/SPOT_DECODE_SPEC/READ_SPEC/READ_INDEX')])
            items.append(["read_class", entry.xpath('./EXPERIMENT/DESIGN/SPOT_DESCRIPTOR/SPOT_DECODE_SPEC/READ_SPEC/READ_CLASS')])
            items.append(["read_type", entry.xpath('./EXPERIMENT/DESIGN/SPOT_DESCRIPTOR/SPOT_DECODE_SPEC/READ_SPEC/READ_TYPE')])
            items.append(["base_coord", entry.xpath('./EXPERIMENT/DESIGN/SPOT_DESCRIPTOR/SPOT_DECODE_SPEC/READ_SPEC/BASE_COORD')])
            items.append(["instrument", entry.xpath('./EXPERIMENT/PLATFORM/ILLUMINA/INSTRUMENT_MODEL')])
            items.append(["lab", entry.xpath('./SUBMISSION/@lab_name')])
            items.append(["center", entry.xpath('./SUBMISSION/@center_name')])
            items.append(["submitter_id", entry.xpath('./SUBMISSION/IDENTIFIERS/SUBMITTER_ID')])
            items.append(["study_title", entry.xpath('./STUDY/DESCRIPTOR/STUDY_TITLE')])
            items.append(["pubmed_id", entry.xpath('./STUDY/STUDY_LINKS/STUDY_LINK/XREF_LINK/ID')])
            items.append(["sample_title", entry.xpath('./SAMPLE/TITLE')])
            items.append(["taxid", entry.xpath('./SAMPLE/SAMPLE_NAME/TAXON_ID')])
            items.append(["sample_description", entry.xpath('./SAMPLE/DESCRIPTION')])
            items.append(["total_spots", entry.xpath('./RUN_SET/RUN/@total_spots')])
            items.append(["total_bases", entry.xpath('./RUN_SET/RUN/@total_bases')])
            items.append(["size", entry.xpath('./RUN_SET/RUN/@size')])
            items.append(["is_protected", is_protected])
            row = []
            for item in items:
                try:
                    if isinstance(item[1][0], (lxml.etree._ElementUnicodeResult, int, str)):
                        row.append(str(item[1][0]))
                    else:
                        row.append(item[1][0].text)
                except:
                    row.append("")
            try:
                column_names
            except:
                column_names = []
                for item in items:
                    column_names.append(item[0])
            row_df = pandas.DataFrame(row).T
            row_df.columns = column_names
            sas = entry.xpath('./SAMPLE/SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE')
            for sa in sas:
                tag = sa.xpath('./TAG')
                if not tag[0].text == None:
                    tag = tag[0].text.lower()
                    tag = re.sub(" \(.*", "", tag)
                    tag = re.sub(" ", "_", tag)
                    if not tag in row_df.columns:
                        value = sa.xpath('./VALUE')
                        if len(value):
                            value = value[0].text
                            if tag in column_names:
                                tag = tag+"_2"
                            sa_df = pandas.DataFrame([value])
                            sa_df.columns = [tag]
                            row_df = pandas.concat([row_df,sa_df], axis=1)
            sra_table = pandas.concat([sra_table, row_df], ignore_index=True)
        sra_table == sra_table.fillna("")
        if "scientific_name" in sra_table.columns and len(sra_table.loc[sra_table["scientific_name"] == "", "scientific_name"]):        
            sra_table.loc[sra_table["scientific_name"] == "", "scientific_name"] = species_name
        return sra_table
    else:
        raise Exception("Unknown input type.", type(root))

def exclude_by_id(sra_table, bioprojects=[], biosamples=[]):
    print('Running: exclude_by_id')
    if not 'exclusion' in sra_table.columns:
        sra_table['exclusion'] = 'no'
    sra_table.loc[sra_table['bioproject'].isin(bioprojects),'exclusion'] = 'bioproject'
    sra_table.loc[sra_table['biosample'].isin(biosamples),'exclusion'] = 'biosample'
    return sra_table

def standardize_orthographical_variants(sra_table):
    print('Running: standardize_orthographical_variants')
    sra_table = sra_table.replace('not applicable', '')
    sra_table = sra_table.replace('Not applicable', '')
    sra_table = sra_table.replace('Not Applicable', '')
    sra_table = sra_table.replace('not collected', '')  
    sra_table = sra_table.replace('Missing', '') 
    sra_table = sra_table.replace('missing', '')
    return sra_table

def column_aggregation(sra_table, column_aggregation_list):
    print('Running: column_aggregation')
    for item in column_aggregation_list.items():
        aggregate_to = item[0]
        for aggregate_from in item[1]:
            if (aggregate_from in sra_table.columns)&(aggregate_from!=''):
                if not aggregate_to in sra_table.columns:
                    sra_table[aggregate_to] = ''
                is_from_empty = (sra_table[aggregate_from].isnull())|(sra_table[aggregate_from].astype(str)=='')
                is_to_empty = (sra_table[aggregate_to].isnull())|(sra_table[aggregate_to].astype(str)=='')
                new_annotations = sra_table.loc[(~is_from_empty)&(is_to_empty), aggregate_from].astype(str)+'['+aggregate_from+']'
                sra_table.loc[(~is_from_empty)&(is_to_empty), aggregate_to] = new_annotations
                new_annotations = sra_table.loc[(~is_from_empty)&(~is_to_empty), aggregate_to].astype(str)+"; "+sra_table.loc[(~is_from_empty)&(~is_to_empty), aggregate_from].astype(str)+'['+aggregate_from+']'
                sra_table.loc[(~is_from_empty)&(~is_to_empty), aggregate_to] = new_annotations
                sra_table = sra_table.drop(aggregate_from, 1)
    sra_table = reorder_columns(sra_table, omit_misc=False)
    return sra_table

def standardize_tissue(sra_table, tissue_aggregation_list):
    print('Running: standardize_tissue')
    sra_table['tissue_original'] = sra_table['tissue']
    sra_table['tissue'] = sra_table['tissue'].map(lambda x: str(x).lower())
    sra_table['tissue'] = sra_table['tissue'].map(lambda x: re.sub(r'[0-9]+', '', str(x)))
    sra_table['tissue'] = sra_table['tissue'].map(lambda x: re.sub('-', ' ', str(x)))
    sra_table['tissue'] = sra_table['tissue'].map(lambda x: re.sub('_', ' ', str(x)))
    sra_table['tissue'] = sra_table['tissue'].map(lambda x: re.sub(' \(.*', '', str(x)))
    sra_table['tissue'] = sra_table['tissue'].map(lambda x: re.sub('pouch young ', '', str(x)))   
    sra_table["tissue"] = sra_table["tissue"].map(lambda x: re.sub("large white ", "", str(x)))
    sra_table["tissue"] = sra_table["tissue"].map(lambda x: re.sub("pietrain ", "", str(x)))
    sra_table["tissue"] = sra_table["tissue"].map(lambda x: re.sub("duroc ", "", str(x)))
    sra_table["tissue"] = sra_table["tissue"].map(lambda x: re.sub("cherry valley duck ", "", str(x)))
    sra_table['tissue'] = sra_table['tissue'].replace(';.*','',regex=True)
    sra_table['tissue'] = sra_table['tissue'].replace(':.*','',regex=True)
    sra_table['tissue'] = sra_table['tissue'].replace('\[.*','',regex=True)
    sra_table['tissue'] = sra_table['tissue'].replace('^ ','',regex=True)
    sra_table['tissue'] = sra_table['tissue'].replace('^ ','',regex=True)
    sra_table['tissue'] = sra_table['tissue'].replace(' $','',regex=True)
    sra_table['tissue'] = sra_table['tissue'].replace(' $','',regex=True)
    sra_table.loc[sra_table.tissue=='nan','tissue'] = ''
    for item in tissue_aggregation_list.items():
        to_value = item[0]
        from_values = [ i for i in item[1] if i!='' ]
        for from_value in from_values:
            if from_value in list(sra_table['tissue']):
                sra_table['tissue'] = sra_table['tissue'].replace(from_value, to_value)
    return sra_table

def standardize_sex(sra_table):
    print("Running: standardize_sex")
    sra_table["sex"] = sra_table["sex"].map(lambda x: re.sub("female", "F", str(x)))
    sra_table["sex"] = sra_table["sex"].map(lambda x: re.sub("male", "M", str(x)))
    sra_table["sex"] = sra_table["sex"].map(lambda x: re.sub("FEM", "F", str(x)))
    sra_table["sex"] = sra_table["sex"].map(lambda x: re.sub("MALE", "M", str(x)))
    sra_table["sex"] = sra_table["sex"].map(lambda x: re.sub("Male", "M", str(x)))
    sra_table["sex"] = sra_table["sex"].map(lambda x: re.sub("Female", "F", str(x)))
    sra_table["sex"] = sra_table["sex"].map(lambda x: re.sub("fem", "F", str(x)))
    sra_table["sex"] = sra_table["sex"].map(lambda x: str(x).upper())
    return sra_table

def standardize_scientific_name(sra_table):
    print("Running: standardize_scientific_name")
    sra_table["sci_name_original"] = sra_table["scientific_name"]
    sra_table["scientific_name"] = sra_table["scientific_name"].replace("Canis familiaris", "Canis lupus")
    sra_table["scientific_name"] = sra_table["scientific_name"].replace("Xenopus (Silurana) tropicalis", "Xenopus tropicalis") 
    sra_table["scientific_name"] = [re.sub(r'(.+)(\s)(.+)(\s)(.+)', r"\1\2\3", sp) for sp in sra_table["scientific_name"]]
    return sra_table

def manual_attribute_filling(sra_table):
    print("Running: manual_attribute_filling")
    sra_table.loc[sra_table["bioproject"]=="PRJNA215077", "tissue"] = sra_table.loc[sra_table["bioproject"]=="PRJNA215077", "lib_name"].map(lambda x: re.sub("cja_", "", str(x)))
    sra_table.loc[sra_table["bioproject"]=="PRJNA157897", "tissue"] = sra_table.loc[sra_table["bioproject"]=="PRJNA157897", "lib_name"].map(lambda x: re.sub("Beagle ", "", str(x)))
    sra_table.loc[sra_table["bioproject"]=="PRJNA177703", "tissue"] = sra_table.loc[sra_table["bioproject"]=="PRJNA177703", "exp_title"].map(lambda x: re.sub("Canine ", "", str(x)))
    sra_table.loc[sra_table["bioproject"]=="PRJDB3843", "tissue"] = sra_table.loc[sra_table["bioproject"]=="PRJDB3843", "age"]
    sra_table.loc[sra_table["bioproject"]=="PRJEB11491", "tissue"] = sra_table.loc[sra_table["bioproject"]=="PRJEB11491", "cell"]
    sra_table.loc[sra_table["bioproject"]=="PRJNA70959", "tissue"] = sra_table.loc[sra_table["bioproject"]=="PRJNA70959", "exp_title"]    
    sra_table.loc[sra_table["bioproject"]=="PRJEB7620", "tissue"] = sra_table.loc[sra_table["bioproject"]=="PRJEB7620", "cell"]
    sra_table.loc[sra_table["bioproject"]=="PRJNA403803", "tissue"] = sra_table.loc[sra_table["bioproject"]=="PRJNA403803", "source_name"]
    sra_table.loc[sra_table["bioproject"]=="PRJNA391214", "treatment"] = sra_table.loc[sra_table["bioproject"]=="PRJNA391214", "sample_title"]
    sra_table.loc[sra_table["bioproject"]=="PRJEB19268", "tissue"] = sra_table.loc[sra_table["bioproject"]=="PRJEB19268", "sample_title"].map(lambda x: re.sub(" RNA-seq", "", str(x)))

    sra_table.loc[sra_table["bioproject"]=="PRJNA173328", "tissue"] = ["uterus", "uterus", "ovary"]
    sra_table.loc[sra_table["bioproject"]=="PRJDB90", "tissue"] = "embryo"
    sra_table.loc[sra_table["bioproject"]=="PRJNA417542", "tissue"] = "vascular smooth muscle cells"
    sra_table.loc[sra_table["bioproject"]=="PRJNA222780", "tissue"] = "embryo"
    sra_table.loc[sra_table["bioproject"]=="PRJDB2921", "tissue"] = "brain"
    sra_table.loc[sra_table["bioproject"]=="PRJNA209394", "tissue"] = "spleen"
    sra_table.loc[sra_table["bioproject"]=="PRJDB1766", "tissue"] = "cerebellum"
    sra_table.loc[sra_table["bioproject"]=="PRJEB12300", "tissue"] = "blood"
    sra_table.loc[sra_table["bioproject"]=="PRJNA189967", "tissue"] = "blood"
    sra_table.loc[sra_table["bioproject"]=="PRJNA168072", "tissue"] = "liver"
    sra_table.loc[sra_table["bioproject"]=="PRJEB24166", "tissue"] = "macrophage"
    sra_table.loc[sra_table["bioproject"]=="PRJEB21709", "tissue"] = "liver"
    sra_table.loc[sra_table["bioproject"]=="PRJNA188394", "tissue"] = "liver"
    sra_table.loc[sra_table["bioproject"]=="PRJEB7406", "tissue"] = "spleen"
    sra_table.loc[sra_table["bioproject"]=="PRJEB22373", "tissue"] = "macrophage"
    sra_table.loc[sra_table["bioproject"]=="PRJNA341964", "tissue"] = "spleen"
    sra_table.loc[sra_table["bioproject"]=="PRJEB12613", "tissue"] = "skeletal muscle"
    
    sra_table.loc[sra_table["biosample"]=="SAMN00706768", "tissue"] = "testis"
    sra_table.loc[sra_table["biosample"]=="SAMN00709563", "tissue"] = "thymus"
    sra_table.loc[sra_table["biosample"]=="SAMN00727963", "tissue"] = "brain"
    sra_table.loc[sra_table["biosample"]=="SAMN00739297", "tissue"] = "brain"
    sra_table.loc[sra_table["biosample"]=="SAMN00739310", "tissue"] = "brain"
    sra_table.loc[sra_table["biosample"]=="SAMN00739311", "tissue"] = "brain"
    sra_table.loc[sra_table["biosample"]=="SAMN00739312", "tissue"] = "brain"
    sra_table.loc[sra_table["biosample"]=="SAMN00739313", "tissue"] = "brain"
    sra_table.loc[sra_table["biosample"]=="SAMN01766804", "tissue"] = "prefrontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN01766813", "tissue"] = "prefrontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN01768054", "tissue"] = "prefrontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN01768055", "tissue"] = "prefrontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN01768056", "tissue"] = "prefrontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN01768057", "tissue"] = "prefrontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN01768058", "tissue"] = "prefrontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN01768059", "tissue"] = "prefrontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN01768060", "tissue"] = "prefrontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN01768061", "tissue"] = "prefrontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN01768062", "tissue"] = "prefrontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN01768063", "tissue"] = "prefrontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN01768064", "tissue"] = "prefrontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN01768065", "tissue"] = "prefrontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN01768066", "tissue"] = "prefrontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN00715283", "tissue"] = "testis"
    sra_table.loc[sra_table["biosample"]=="SAMN00715284", "tissue"] = "retina"
    sra_table.loc[sra_table["biosample"]=="SAMN00778253", "tissue"] = "frontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN00778254", "tissue"] = "frontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN00778255", "tissue"] = "frontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN00778256", "tissue"] = "frontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN00778257", "tissue"] = "frontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN00778258", "tissue"] = "frontal cortex"
    sra_table.loc[sra_table["biosample"]=="SAMN00216499", "tissue"] = "heart"
    sra_table.loc[sra_table["biosample"]=="SAMN00216500", "tissue"] = "heart"
    sra_table.loc[sra_table["biosample"]=="SAMN00216504", "tissue"] = "heart"
    sra_table.loc[sra_table["biosample"]=="SAMN00216505", "tissue"] = "heart"
    sra_table.loc[sra_table["biosample"]=="SAMN00216506", "tissue"] = "heart"
    sra_table.loc[sra_table["biosample"]=="SAMN00216507", "tissue"] = "heart"
    sra_table.loc[sra_table["biosample"]=="SAMN00216508", "tissue"] = "heart"
    sra_table.loc[sra_table["biosample"]=="SAMN02401346", "tissue"] = "bone marrow"
    sra_table.loc[sra_table["biosample"]=="SAMN02427293", "tissue"] = "temporal lobe"
    sra_table.loc[sra_table["biosample"]=="SAMN02427294", "tissue"] = "pituitary"
    sra_table.loc[sra_table["biosample"]=="SAMN02427295", "tissue"] = "lung"
    sra_table.loc[sra_table["biosample"]=="SAMN02427296", "tissue"] = "skeletal muscle"
    sra_table.loc[sra_table["biosample"]=="SAMN02427297", "tissue"] = "thymus"
    sra_table.loc[sra_table["biosample"]=="SAMN00749950", "tissue"] = "testis"
    sra_table.loc[sra_table["biosample"]=="SAMN00761040", "tissue"] = "brain"
    sra_table.loc[sra_table["biosample"]=="SAMN00761041", "tissue"] = "liver"
    sra_table.loc[sra_table["biosample"]=="SAMN00761045", "tissue"] = "kidney"
    sra_table.loc[sra_table["biosample"]=="SAMN00761046", "tissue"] = "kidney"
    sra_table.loc[sra_table["biosample"]=="SAMN00761050", "tissue"] = "kidney"
    sra_table.loc[sra_table["biosample"]=="SAMN00761051", "tissue"] = "embryo"
    sra_table.loc[sra_table["biosample"]=="SAMN00761052", "tissue"] = "embryo"
    sra_table.loc[sra_table["biosample"]=="SAMN00761053", "tissue"] = "embryo"
    sra_table.loc[sra_table["biosample"]=="SAMN00761054", "tissue"] = "embryo"
    sra_table.loc[sra_table["biosample"]=="SAMN00761055", "tissue"] = "embryo"
    sra_table.loc[sra_table["biosample"]=="SAMN00761056", "tissue"] = "embryo"
    sra_table.loc[sra_table["biosample"]=="SAMN00761057", "tissue"] = "embryo"
    sra_table.loc[sra_table["biosample"]=="SAMN01120728", "tissue"] = "testis"
    sra_table.loc[sra_table["biosample"]=="SAMN01120729", "tissue"] = "placenta"
    sra_table.loc[sra_table["biosample"]=="SAMN01121009", "tissue"] = "blood"
    sra_table.loc[sra_table["biosample"]=="SAMN01766975", "tissue"] = "liver"
    sra_table.loc[sra_table["biosample"]=="SAMN00991521", "tissue"] = "liver"
    sra_table.loc[sra_table["biosample"]=="SAMN00991522", "tissue"] = "spleen"
    sra_table.loc[sra_table["biosample"]=="SAMN00991523", "tissue"] = "heart"
    sra_table.loc[sra_table["biosample"]=="SAMN00991525", "tissue"] = "ascending colon"
    sra_table.loc[sra_table["biosample"]=="SAMN00991526", "tissue"] = "kidney"
    sra_table.loc[sra_table["biosample"]=="SAMN00991527", "tissue"] = "lung"
    sra_table.loc[sra_table["biosample"]=="SAMN00991528", "tissue"] = "cerebellum"
    sra_table.loc[sra_table["biosample"]=="SAMN00765498", "tissue"] = "heart"
    sra_table.loc[sra_table["biosample"]=="SAMN00765499", "tissue"] = "ovary"
    sra_table.loc[sra_table["biosample"]=="SAMN00765500", "tissue"] = "brain"
    sra_table.loc[sra_table["biosample"]=="SAMN00767849", "tissue"] = "skeletal muscle"
    sra_table.loc[sra_table["biosample"]=="SAMN00767850", "tissue"] = "liver"
    sra_table.loc[sra_table["biosample"]=="SAMN00767851", "tissue"] = "lung"
    sra_table.loc[sra_table["biosample"]=="SAMN00771449", "tissue"] = "embryo"
    sra_table.loc[sra_table["biosample"]=="SAMN00791550", "tissue"] = "adrenal gland"
    sra_table.loc[sra_table["biosample"]=="SAMN00014283", "tissue"] = "brain"
    sra_table.loc[sra_table["biosample"]=="SAMN00013560", "tissue"] = "brain"
    sra_table.loc[sra_table["biosample"]=="SAMN00013366", "tissue"] = "brain"
    sra_table.loc[sra_table["biosample"]=="SAMN00016407", "tissue"] = "lung/trachea"
    sra_table.loc[sra_table["biosample"]=="SAMN00014285", "tissue"] = "pancreas"
    sra_table.loc[sra_table["biosample"]=="SAMN00013365", "tissue"] = "spleen"
    sra_table.loc[sra_table["biosample"]=="SAMN00013362", "tissue"] = "brain"
    sra_table.loc[sra_table["biosample"]=="SAMN00014284", "tissue"] = "lung/trachea"
    sra_table.loc[sra_table["biosample"]=="SAMN04284181", "tissue"] = "GMP 100 cells"
    sra_table.loc[sra_table["biosample"]=="SAMN04284167", "tissue"] = "GMP 100 cells"
    sra_table.loc[sra_table["biosample"]=="SAMN04284116", "tissue"] = "CMP 100 cells"
    sra_table.loc[sra_table["biosample"]=="SAMN04284065", "tissue"] = "CMP 100 cells"
    sra_table.loc[sra_table["biosample"]=="SAMN04284002", "tissue"] = "MEP 100 cells"
    sra_table.loc[sra_table["biosample"]=="SAMN04284129", "tissue"] = "MEP 100 cells"
    sra_table.loc[sra_table["biosample"]=="SAMN04284133", "tissue"] = "hematopoietic stem 100 cells"
    sra_table.loc[sra_table["biosample"]=="SAMN04284013", "tissue"] = "hematopoietic stem 100 cells"
    sra_table.loc[sra_table["biosample"]=="SAMN04284141", "tissue"] = "Purkinje 50 cells"
    sra_table.loc[sra_table["biosample"]=="SAMN04284062", "tissue"] = "Purkinje 50 cells"
    sra_table.loc[sra_table["biosample"]=="SAMN04284113", "tissue"] = "cerebral cortex, layer 5 50 cells"
    sra_table.loc[sra_table["biosample"]=="SAMN04284164", "tissue"] = "cerebral cortex, layer 5 50 cells"
    sra_table.loc[sra_table["biosample"]=="SAMN04284179", "tissue"] = "CH12.LX immortalized cell line"
    sra_table.loc[sra_table["biosample"]=="SAMN04283998", "tissue"] = "CH12.LX immortalized cell line"
    sra_table.loc[sra_table["biosample"]=="SAMN04283997", "tissue"] = "CH12.LX immortalized cell line"
    sra_table.loc[sra_table["biosample"]=="SAMEA2689605", "tissue"] = "thymus"
    sra_table.loc[sra_table["biosample"]=="SAMEA2689601", "tissue"] = "ovary"
    sra_table.loc[sra_table["biosample"]=="SAMEA2689599", "tissue"] = "lung"
    sra_table.loc[sra_table["biosample"]=="SAMEA2689597", "tissue"] = "kidney"
    sra_table.loc[sra_table["biosample"]=="SAMEA2689596", "tissue"] = "heart"
    sra_table.loc[sra_table["biosample"]=="SAMEA2689594", "tissue"] = "blood"
    sra_table.loc[sra_table["biosample"]=="SAMEA2689604", "tissue"] = "testis"
    sra_table.loc[sra_table["biosample"]=="SAMEA2689603", "tissue"] = "spleen"
    sra_table.loc[sra_table["biosample"]=="SAMEA2689602", "tissue"] = "skin"
    sra_table.loc[sra_table["biosample"]=="SAMEA2689600", "tissue"] = "muscle"
    sra_table.loc[sra_table["biosample"]=="SAMEA2689598", "tissue"] = "liver"
    sra_table.loc[sra_table["biosample"]=="SAMEA2689595", "tissue"] = "brain"
    sra_table.loc[sra_table["biosample"]=="SAMN01853600", "tissue"] = "skin"
    sra_table.loc[sra_table["biosample"]=="SAMN01853601", "tissue"] = "skin"
    sra_table.loc[sra_table["biosample"]=="SAMN01853599", "tissue"] = "skin"
    sra_table.loc[sra_table["biosample"]=="SAMN01853598", "tissue"] = "skin"    
    sra_table.loc[sra_table["biosample"]=="SAMN00998531", "tissue"] = "heart"
    sra_table.loc[sra_table["biosample"]=="SAMN00998528", "tissue"] = "brain"
    sra_table.loc[sra_table["biosample"]=="SAMN00998533", "tissue"] = "skeletal muscle"
    sra_table.loc[sra_table["biosample"]=="SAMN00998535", "tissue"] = "ovary"
    sra_table.loc[sra_table["biosample"]=="SAMN00998532", "tissue"] = "liver"
    sra_table.loc[sra_table["biosample"]=="SAMN00998529", "tissue"] = "testis"
    sra_table.loc[sra_table["biosample"]=="SAMN00998536", "tissue"] = "blood"
    sra_table.loc[sra_table["biosample"]=="SAMN00998534", "tissue"] = "lung"
    sra_table.loc[sra_table["biosample"]=="SAMN00998537", "tissue"] = "kidney"
    sra_table.loc[sra_table["biosample"]=="SAMD00004034", "tissue"] = "kidney"
    sra_table.loc[sra_table["biosample"]=="SAMD00004035", "tissue"] = "skeletal muscle"
    sra_table.loc[sra_table["biosample"]=="SAMN02781105", "tissue"] = "liver"
    sra_table.loc[sra_table["biosample"]=="SAMN02781095", "tissue"] = "testis"
    sra_table.loc[sra_table["biosample"]=="SAMN02781093", "tissue"] = "kidney"
    sra_table.loc[sra_table["biosample"]=="SAMN02781090", "tissue"] = "brain"
    sra_table.loc[sra_table["biosample"]=="SAMN02781081", "tissue"] = "heart"
    sra_table.loc[sra_table["biosample"]=="SAMN02781078", "tissue"] = "skeletal muscle"
    sra_table.loc[sra_table["biosample"]=="SAMN02781061", "tissue"] = "skin"
    sra_table.loc[sra_table["biosample"]=="SAMD00043263", "tissue"] = "liver"
    sra_table.loc[sra_table["biosample"]=="SAMN01780059", "tissue"] = "testis"
    sra_table.loc[sra_table["biosample"]=="SAMEA2553415", "tissue"] = "fibroblast"
    sra_table.loc[sra_table["biosample"]=="SAMN02205043", "tissue"] = "liver"
    
    return sra_table


def exclude_treatment_terms(sra_table, control_term_list):
    print('Running: exclude_treatment_terms')
    for item in control_term_list.items():
        column = item[0]
        sra_table[column] = sra_table[column].astype(str)
        for control_term in item[1]:
            if not control_term=='':
                bioprojects = sra_table['bioproject'][sra_table[column].str.contains(control_term)].unique()
                for bioproject in bioprojects:
                    if bioproject != '':
                        is_bioproject = (sra_table['bioproject']==bioproject)
                        is_control_term = (sra_table[column].str.contains(control_term))
                        sra_table.loc[(is_bioproject & -is_control_term), 'exclusion'] = 'treatment'
    return sra_table

def nspot_cutoff(sra_table, min_nspots=5000000):
    print('Running: nspot_cutoff')
    sra_table['total_spots'] = sra_table['total_spots'].fillna(0).astype(int)
    sra_table.loc[-(sra_table['total_spots']==0) & (sra_table['total_spots'] < min_nspots), 'exclusion'] = 'low_nspots'
    return sra_table

def exclude_entry_with_keyword(sra_table):
    print("Running: exclude_entry_with_keyword")
    sra_table.loc[~sra_table['antibody'].isnull(), 'exclusion'] = 'immunoprecipitation'   
    sra_table.loc[~sra_table['cell'].isnull(), 'exclusion'] = 'cell_culture'
    for col in ['exp_title','study_title','design','sample_title','sample_description','lib_name','protocol',]:
        sra_table.loc[:,col] = sra_table.loc[:,col].astype(str)
        sra_table.loc[sra_table[col].str.contains('RipSeq').fillna(False), 'exclusion'] = 'immunoprecipitation'
        sra_table.loc[sra_table[col].str.contains('chrom_RNAseq').fillna(False), 'exclusion'] = 'immunoprecipitation'
        sra_table.loc[sra_table[col].str.contains('RNAi').fillna(False), 'exclusion'] = 'RNAi'
        sra_table.loc[sra_table[col].str.contains('shRNA RNA-seq').fillna(False), 'exclusion'] = 'small_RNA'
        sra_table.loc[sra_table[col].str.contains('smRNA').fillna(False), 'exclusion'] = 'small_RNA'
        sra_table.loc[sra_table[col].str.contains('CAGE').fillna(False), 'exclusion'] = 'CAGE'
        sra_table.loc[sra_table[col].str.contains('piRNA').fillna(False), 'exclusion'] = 'small_RNA'
        sra_table.loc[sra_table[col].str.contains('^exposed to').fillna(False), 'exclusion'] = 'treatment'
    for col in ['treatment','protocol',]:
        sra_table.loc[sra_table[col].str.contains('miRNA').fillna(False), 'exclusion'] = 'small_RNA'
        sra_table.loc[sra_table[col].str.contains('sRNA').fillna(False), 'exclusion'] = 'small_RNA'
        sra_table.loc[sra_table[col].str.contains('uranium').fillna(False), 'exclusion'] = 'treatment'
    for col in ['age',]:
        sra_table.loc[sra_table[col].str.contains('embryo').fillna(False), 'exclusion'] = 'embryonic'
        sra_table.loc[sra_table[col].str.contains('Embryo').fillna(False), 'exclusion'] = 'embryonic'
        sra_table.loc[sra_table[col].str.contains('somite').fillna(False), 'exclusion'] = 'embryonic'
        sra_table.loc[sra_table[col].str.contains('cell').fillna(False), 'exclusion'] = 'embryonic'
        sra_table.loc[sra_table[col].str.contains('Fertiliz').fillna(False), 'exclusion'] = 'embryonic'
        sra_table.loc[sra_table[col].str.contains('fertiliz').fillna(False), 'exclusion'] = 'embryonic'
        sra_table.loc[sra_table[col].str.contains('oocyte').fillna(False), 'exclusion'] = 'embryonic'
    for col in ['genotype',]:
        sra_table.loc[sra_table[col].str.contains('foxn3').fillna(False), 'exclusion'] = 'transgenic'
    for col in ['exp_title','study_title','design','sample_title','sample_description','lib_name','experiment','treatment','protocol','age',]:
        sra_table.loc[sra_table[col].str.contains('single cell').fillna(False), 'exclusion'] = 'single_cell'
        sra_table.loc[sra_table[col].str.contains('Single cell').fillna(False), 'exclusion'] = 'single_cell'
        sra_table.loc[sra_table[col].str.contains('Single Cell').fillna(False), 'exclusion'] = 'single_cell'
    return sra_table

def remove_redundant_biosample(sra_table):
    print('Running: remove_redundant_biosample')
    redundant_bool = sra_table.duplicated(subset=['bioproject', 'biosample'], keep = 'first')
    sra_table.loc[redundant_bool, 'exclusion'] = 'redundant_biosample'
    return sra_table

def maximize_bioproject_sampling(df, target_n=10):
    df['bioproject'] = df['bioproject'].fillna('unknown')
    df['is_sampled'] = 'No'
    df['is_qualified'] = 'No'
    df.loc[(df.exclusion=='no'), 'is_qualified'] = 'Yes'
    while len(df.loc[(df['is_sampled']=='Yes')&(df.exclusion=='no'),:]) < target_n:
        if len(df) <= target_n:
            df.loc[(df.exclusion=='no'), 'is_sampled'] = 'Yes'
            break
        else:
            df_unselected = df.loc[(df['is_sampled']=='No')&(df.exclusion=='no'),:]
            bioprojects = df_unselected['bioproject'].unique()
            if len(bioprojects) == 0:
                break
            remaining_n = target_n - len(df.loc[df['is_sampled']=='Yes',:])            
            select_n = min([len(bioprojects), remaining_n])
            selected_bioprojects = numpy.random.choice(bioprojects, size=select_n, replace=False)
            selected_index = []
            for bioproject in selected_bioprojects:
                index = numpy.random.choice(df_unselected.index[df_unselected['bioproject']==bioproject], size=1, replace=False)
                selected_index.append(int(index))
            df.loc[selected_index, 'is_sampled'] = 'Yes'
    return df

def label_sampled_data(sra_table, suppress_to=10, tissue_selection=[]):
    print('Running: label_sampled_data')
    sra_table_labeled = pandas.DataFrame()
    species = sra_table['scientific_name'].unique()
    for sp in species:
        sp_table = sra_table.loc[sra_table['scientific_name'] == sp, :]
        tissues = sp_table['tissue'].unique()
        for tissue in tissues:
            sp_tissue = sp_table.loc[sp_table['tissue'] == tissue, :]
            sp_tissue = maximize_bioproject_sampling(df=sp_tissue, target_n=suppress_to)
            sra_table_labeled = pandas.concat([sra_table_labeled, sp_tissue], axis=0)
    sra_table_labeled['is_selected_tissue'] = 'Yes'
    if len(tissue_selection):
        sra_table_labeled.loc[-(sra_table_labeled['tissue'].isin(tissue_selection)),'is_selected_tissue'] = 'No'
    return sra_table_labeled

def make_cross_tables(sra_table, n_sp_cutoff=5, suppress_tissue=True, sample_upper=True):
    print('Running: make_cross_tables')
    if suppress_tissue:
        sra_table = sra_table.loc[sra_table['is_selected_tissue']=='Yes',:]
    if sample_upper:
        sra_table = sra_table.loc[sra_table['is_sampled']=='Yes',:]    
    sra_reduced = sra_table[['scientific_name', 'biosample', 'tissue']]
    sra_pivot = sra_reduced.pivot_table(columns='tissue',index='scientific_name', aggfunc='count')
    sra_pivot.columns = sra_pivot.columns.get_level_values(1)
    column_sort = sra_pivot.count(axis='index').sort_values(ascending=False).index
    index_sort = sra_pivot.count(axis='columns').sort_values(ascending=False).index
    sra_pivot = sra_pivot.loc[index_sort,column_sort]
    sra_pivot_reduced = sra_pivot.loc[:,sra_pivot.count(axis='index') >= n_sp_cutoff]
    column_sort = sra_pivot_reduced.count(axis='index').sort_values(ascending=False).index
    index_sort = sra_pivot_reduced.count(axis='columns').sort_values(ascending=False).index
    sra_pivot_reduced = sra_pivot_reduced.loc[index_sort,column_sort]
    return sra_pivot_reduced

def reorder_columns(sra_table, omit_misc):
    column_names = ['scientific_name','tissue','tissue_original','genotype','sex','age','treatment','source_name','is_sampled','is_selected_tissue',
                    'is_qualified','exclusion',
                    'protocol','bioproject','biosample','experiment',
                    'run','sra_primary','sra_sample','sra_study','study_title','exp_title','design','sample_title','sample_description','lib_name',
                    'lib_layout','lib_strategy','lib_source','lib_selection','instrument','total_spots','total_bases','size','nominal_length','nominal_sdev',
                    'spot_length','read_index','read_class','read_type','base_coord','lab','center','submitter_id','pubmed_id',
                    'taxid','published_date','biomaterial_provider','cell','location','antibody','batch','misc']
    if omit_misc:
        sra_table_reordered = sra_table.loc[:,column_names]
    else:
        misc_columns = [ col for col in sra_table.columns if col not in column_names ]
        sra_table_reordered = sra_table.loc[:,column_names+misc_columns]
    return sra_table_reordered

In [5]:
# Curation lists
column_aggregation_list = {
    'age':["arrayexpress-developmentalstage","agedays","age_classification","age_description","age_in_years","agecat","dev_stage",'ages',
           "development_satge","development_stage","developmental_stage","developmentalstage","differentiation_stages","embryonic_day",
           "embryonic_stage","female_age","sample_comment","stage","age_at_collection","age_classification",'age_description','age_in_years',
           'age_of_collection','age_of_fly_in_days_post_eclosion','agecat','agedays','age{category}','animal_age_at_collection','day',
           'arrayexpress-timepoint','avg_age_after_second_6_months_of_open_access','collection_timing_in_reference_to_e_lineage',
           'day_relative_to_weaning','dev-stage','developmental_time','duration','embryo_stage','female_birth_date','gestational_age',
           'gestational_age_at_sample_collection','hotta_stage','male_age','male_birth_date','male_collection_date','male_death_date',
           'timepoint','trimester','birth_date','collection_date','death_date','female_collection_date','female_death_date','geo_loc_name',
           'sample_date','sampling_date','specimen_collection_date','time_-_blastomeres','animal_length','date','developemental_stage',
           'colection_date','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','',],
    'antibody':['chip_antibody','chip_or_ip_antibody','clip-antibody','clip_antibody','rip_antibody','rna_binding_protein',
                '','','','','','','','','','','','','','','','','',],
    'batch':['adapter_barcode','animal','animal_id','animalid','barcode','biological_replicate','biorep','brain_code','clutch','cohort',
            'collected_by','cow','custom_field','donator_monkey_number','donor_id','horse_number','iclip_barcode','id','illumina_rna-seq_bar_codes',
            'index','individual','individual_id','individuals','internal_id','labanimalnumber','labexpid','mahpic_non_human_primate_individual_id',
            'monkey_id','non_human_primate_individual_id','number','oocyte/embryo_number','pig_id','replicate','rna_processing_batch','run_id',
            'sample','sample-type','sample_coding','sample_descriptor','sample_id','sample_identifier','sample_number','series','specimen_voucher',
            'study','study_group','study_phase','subject','subject_id','subjectid','technical_replicate','tissue_abbreviation','tissue_code',
            'umc_id','unique_id','uniqueid','wur_marker_type','animal_number','ebi_equivalent_biosample','ercc','flow_cell','flowcell','lane',
             'lane_num','num_replicons','cell_barcode','submission_identifier','library_id','library_index_sequence_used_to_demultiplex',
             'replicate_/_experiment','sequencing_pool','plate_col_id','plate_row_id','replicate_no.','run_no.','sire','unique_identifier',
             '','','','','','','','','','','','','','','','','','','','','','','','','','','','','',],
    'biomaterial_provider':["biosourceprovider","female_biomaterial_provider","male_biomaterial_provider",'library_preparation_location',
                    'material_provider','availability','','','',],
    'bioproject':['bioproject_id','bioprojectid','','','','','',],
    'cell':["cell_type","cell_line","cell_class","cell_description","cell_subtype","arrayexpress-celltype","culture_collection","culture_conditions",
           'cell-line','cell-type','cell_or_tissue_type','eosinophil','feature','germ_layer','cells_derived_from','number_cells','cell_organism',
            'cell_culture_protocol','cell_population','cell_typing','','','','','','','','','',],
    'protocol':["extract_protocol","female_sample_collection_protocol","protocol","sample_extraction_method",'experimental_protocol',
                  "chemical_treatment_of_isolated_rna",'amplification','assay','experiment_target','extraction_protocol','fixation','fraction',
                  'library_selection','library_source','library_strategy','library_type','libraryprotocol','lysis_buffer_ion_concentrations',
                  'lysis_strategy','male_sample_collection_protocol','meoh-fixed','molecule_subtype','prep_type','preperation_kit',
                  'purification_protocol','rin','rna_fraction','rna_rin_values_after_globin_depletion','rna_rin_values_before_globin_depletion',
                  'rna_subtype','sample_material','sample_storage','sample_storage_processing','small_rna_classes','specimen_collection_protocol',
                  'specimen_with_known_storage_state','tissue_state','store_cond','datatype_description','datatype','monosome_enrichment_strategy',
                  'rna_concentration_after_globin_depletion','rna_concentration_before_globin_depletion','rna_input','sequencing_type','date_run',
                  'instrument_model','ngs_platform','paired_end_seq?','paired_or_single-end','pe_read_length','platform','quality','route',
                  'read_len_orig','readtype','readtype_description','sequencer','sequencing_method','extraction','indrops_version',
                'minimum_counts_per_cell_threshold_used_to_remove_background_barcodes','minimum_reads_per_cell_during_initial_processing',
                'spike-in','','','','','','','','','','','','','','','','','','',],
    'nominal_length':['mean_insert_size','','','','','','','','','','','','','','','','','','',],
    'genotype':["arrayexpress-genotype","arrayexpress-phenotype","arrayexpress-rnai","background_strain","breed","cultivar","chicken_line",
                "ecotype","full_genotype","genetic_background","genetic_line","genetic_modification","genotype/variaion","genotype/variation",
                "germline_knock-down","germline_knockdown_and_other_transgenes","isolate","line","phenotype","ploidy","population",
                "snp_allelic_state","strain","strain/background","strain_origin","strains","sub_species","subspecies","variety",'breed_name',
                'arrayexpress-strainorline','breed/line','breed_type','breeding_history','breeds','chicken_breed','clonality','marker',
                'dstim_knockdown_status','environmental_history','fly_line','origin','phenotype/variation','pig_breed','pig_type','propagation',
                'reporter_construct','rhesus_monkey_origin','rna_interference','strain_background','strain_description','strain_prrsv','strainorline',
                'transfecting_vector','transgenic_strain','virus_group','arrayexpress-species','parental_strain','tax_id','transgene',
                'genotype/phenotype','cross','dad','driver','qtl_genotype/haplotype','','','','','','','','','','','','','','','','','','','','','','','','',],
    'location':['birth_location','country','feeding_barn','female_birth_location','geographic_location','lat_lon','library_preparation_location_latitude',
              'library_preparation_location_latitude_units','library_preparation_location_longitude','library_preparation_location_longitude_units',
              'local_origin','male_birth_location','sampling_site','seq_center','sequencing_location','sequencing_location_latitude','station',
              'sequencing_location_latitude_units','sequencing_location_longitude','sequencing_location_longitude_units','source','biome',
                'env_biome','env_feature','env_material','environment','environmental_package','','','','','','','','','','','','','','','','','','','','','','','','','','','','',],
    'misc':['ena-checklist','ena-first-public','ena-last-update','estimated_size','investigation_type','lab_description','library_layout','mapalgorithm',
            'mapalgorithm_description','new_attribute','notes','organism','project','project_name','species','sra_experiment_accession',
            'biosamplemodel','sample_year','number_of_pieces','specimen_picture_url','submission_description','submission_title','unknown',
            'body_weight','file_no.','num_parts_in_pool','processing_date','','','','','','','','','','','',],
    'nominal_length':['mean_insert_size','','','','','','','','','','','','','','','',],
    'sex':["arrayexpress-sex","gender","host_sex",'cell_sex','gender_type','obsolete_sex','both_sexes_pooled','','',],
    'tissue':["arrayexpress-organismpart","bio_material","body_site","biopsy_site","brain_region","description","experiment_set","explant",
              "isolation-source","isolation_source","label","mixedtissues","muscle_type","organ/tissue","organ_part","organism_part",
              "organismpart","oviduct","region","sample_name","sample_origin","sampling_position","source_name","tag","tissue-type",
              "tissue/cell","tissue/cell_type","tissue_location","tissue_or_dev_stage","tissue_origin","tissue_source","tissue_type","title",
              "arrayexpress-organismpart","organism_part",'embryo_region','organsim_part','tissue_/_cells','tissue_lib','tissue_region',
              '','','','','','','','','','','','','','','','','','','',],
    'treatment':["agent","arrayexpress-diseasestate","arrayexpress-growthcondition","arrayexpress-immunoprecipitate","behavior",
                 "biopsy_day","challenged_with","chemical_treatment_of_isolated_rna","diet","diet_fed","dietary_group","disease","disease_status",
                 "domestication","drip","energy_balance","fed_status","feeding","feeding_type","fertility_group","food","group","health_state",
                 "infect","infected_with","infection","infection_status","inoculation","maternal_diet","meat_quality","mptp_treatment","oxidation",
                 "pathogen","percent_marinade_uptake_at_24h_post-slaughter","phenotype_sample_type","restriction_model",'altitude','dpi',
                 "salmonella_shedding_status","status","stimulation","stimulus","stress","survival","transplant_type","treated_with",
                 "treatment_group","pregnancy_outcome","isol_growth_condt",'agent','aliquot','ammonia_concentration','ammonia_exposure',
                 'average_altitude','blastocyst_rate','breeding_method','bull_ntm','calcium_intake','cell_status','cellular_compartment',
                 'chemical','collection_time','comment','concentration','condition','culture_condition','culture-collection','culture_type',
                 'custom_name','day_of_infection','day_post_infection','days_at_29_c','days_post_infection','differentiation_time','disease_stage',
                 'drinking_category','drug','drug_for_synchronization','egg_production','enzyme_treatment','exposure_time','extracellular_component',
                 'fasted_status','fed_for','feed_efficiency','feeding_period','feeding_treatment','growth_condition','growth_protocol',
                 'harmonized_disease_score','harvest_time','health-state','health_status','health_status_at_collection','heat_hours',
                 'hematocrit','hemoglobin_measurement','host','host_tissue_sampled','hours_at_restrictive_temperature','hp-prrsv_infection',
                 'infection_route','infectious_agent','inflammatory_stimulus_treatment','injury','kinetics','lens_type','lesion_score','lesion_number',
                 'library_name','litter','loin_weight_group','lps_exposure','material','maternal_treatment','mating_status','mock','mode','morpholino',
                 'muscle_name','neutralization_group','normalized?','number_born_alive','oxygen_concentration','parity_number','passage',
                 'passage_number','passagers','passages','phase','pooled','pregnancy_status','pregnancy_time','purity','resorted',
                 'rna_treatment','sample_group','sample_processing','sampleid','sampling_time_point','sampling_to_preparation_interval',
                 'selection_criteria','serologic_response_status','shear_force','skin_color','social_rank','sorted','specific_host','stim_time',
                 'stud_book_number','superovulated','temperature','thermal_treatment','time','time-point','time_point','ega-disease',
                 'time_post_infection','timepoint_postchallenge','training','training_timepoint','treatment_description','tumor','tumor_grading',
                 'vaccine','vaccine_administration','vaccine_group','viremia_level','virus','virus_dose','d','days_post-infection','ega-subject',
                 'ega-treatment','ercc_pool','host_taxid','lymphocyte','monocyte','morphology','neutrophil','platelet_count','rfi_value',
                 'sample_type','source_of_female','sperm_dna_fragmentation_index','srek_ligation','time/hpf','total_number_born','treatments',
                 'type','weight','arrayexpress-compound','arrayexpress-dose','assay_type','backfat_thickness_of_live(mm)','basophil',
                 'photorecptors_remaining','productivity','hours/days_after_injury','disease_state','mehg_exposure','operation',
                 'post-injury_time_point','embryo_source','earlobe_color','reproductive_status','specimen_size','specimen_volume','specimen_weight',
                 'liver_phenotype','clinical_information','oestrous_peroid','secondary_description','finishing_diet','gestation_duration',
                 'growth_rate','heart_weight','information','light_treatment','maternal_food','number_of_passages','paternal_experience',
                 'personal_experience','physiological_conditions','rfi_group','sample_role','slaughter_weight','target','target_source',
                 'temperature_regimen','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','',],
}

tissue_aggregation_list = {
    'adrenal':['adrenal gland',],
    'adipose':[ "white adipose","adipose   subcutaneous", "adipose   visceral", "subcutaneous adipose", "fat",'white adipose subcutaneous',
    'cardiac adipose','adipose tissue','brown adipose tissue','cardiac adipose','subcutaneous adipose tissue','','','','','','','','','','','','','','',],
    'bone':['bones',],
    'blood':["whole blood","blood, peripheral vein","blood sample",],
    'brain':[ "a brain", "b brain", "normal brain", "whole brain", "brain, whole", "whole brains", "brain cerebrum", "brain left hemisphere", 
             "brain right hemisphere", "male  brain", "female brain", "brain ground control", "larval brain", "brain tissue",'brain gl',
             'brain without ventral nerve cord','brain space flight',],
    'breast':["breast   mammary tissue",],
    'cerebellum':["brain cerebellum","brain, cerebellum","brain   cerebellum",'cerebellar hemisphere','','','','','','','','','','','','','','',],
    'cerebral cortex':["dog cerebral cortex",],
    'colon':['ascending colon','descending colon','gut colon','spiral colon','sigmoid colon','','','','','','','','','','','','','','','',],
    'embryo':[ "embryos", "single embryo", "single embryo ", "whole embryo", "whole embryos", "whole embryo of  individuals", "whole-embryo", 
              "whole embryo homogenate", "embryo, whole, . days gestation  (n=)", "day  elongated embryo pool", "day  hatched embryo pool", 
              "body tissue", "mixed sex embryos",'about  h old embryos','mrnaseq of whole embryo','ivf embryos','living one cell embryo',
              'older embryos','post gastrulation embryo','two to four cell embryo','wild type embryos, tap treated','whole embryo, min intervals',
              'cell embryo','mixed stage embryos','early embryo','late gastrula embryo','dissociated embryo','whole embryo stage .',
              'embryos,  hr after egg laying','hr embryo','whole   hour embryo','total embryo','mrnaseq of whole embryo','whole embryo lysate',
              'intact embryo','hh whole embryo','cell embryo at pronuclear','blastocyst stage embryo','morula stage embryo','eight cell stage embryo',
              'four cell stage embryo','two cell stage embryo','pronuclear stage embryo','embryo, whole, . days gestation','embryo whole embryo',
              'hpf whole embryo','drosophila embryo','single stage  embryo','','','','','','','','','','','','','','','','','','','','','','','','','','','','','',],
    'fibroblast':[ "skin fibroblasts from adult naked mole-rat.", "skin fibroblasts from adult naked mole rat.", "fibroblast, skin", 
                  "primary skin fibroblasts", "skin fibroblast",'','','','','','','','','','','','','','','','','','',],
    'frontal cortex':["brain frontal cortex", "brain, frontal cortex", "brain   frontal cortex",'','','','','','','','','','','','','','','','','','',], 
    'gill':['gills','','','','','','','','','','','','','','','','','','',],
    'heart':["a heart", "b heart", "male  heart", "female heart", "the total rnas were isolated from the chimpanzee heart with trizol",
             'heart/coronary','heart, left ventricle','heart left ventricle','','','','','','','','','','','','','','','','','',],
    'hippocampus':['brain   hippocampus','hippocampus ca','','','','','','','','','','','','','','','','','','',],
    'hypothalamus':["dog hypothalamus","brain   hypothalamus",'','','','','','','','','','','','','','','','','','',],
    'kidney':[ "normal kidney", "a left kidney", "a right kidney", "male  kidney", "female kidney", "head kidney","head kidney tissue",],
    'liver':["normal liver", "a liver", "b liver", "male  liver", "female liver", "liver ground control", "adult liver",'liver space flight','newborn liver',
             'liver left lateral lobe','','','','','','','','','','','','','','','','','',],
    'lung':[ "left lung", "lung healthy", "lung control ", "lung control", "whole lung", "a lung", "Lung control_", "Lung control_1", 
            "Lung control_2", "Lung control_3", "male  lung", "female lung", "lung/trachea",'lung day','lungs','','','','','','','','','','','','','','','','','','',],
    'lymph node':[ "Ferret, lymph nodes, control", "normal lymph node", "lymph nodes", "supramammary lymph node", 
                  "submandibular lymph node", "popliteal lymph node", "mesenteric lymph node",'bronchial lymph node',
                  'abomasal lymph node','ferret, lymph nodes, control','intestinal lymph node','lymph node mesenteric',
                  'lymph node prescapular','para rectal lymph node','prescapular lymph node','tracheobronchial lymph node',
                  'tracheobronchial lymph node, iav s     dpi siv ecu','mesenteric lymphonodes','axillary lymphonodes','','','','','','','',],
    'mushroom body':['mshroom body',],
    'oocyte':['mature oocyte','stage  oocyte','','','','','','','','','','','','','','','','','',],
    'ovary':[ "ovaries", "virgin female ovary", "adult ovary", "adult ovaries", "adult drosophila ovaries", "ovary ground control", "left ovary", 
             "right ovary",'adult female ovary, no treatment, no gal activation','overy','ovary germarium to stage','ovary stage  + egg chambers',
             'ovary stage  to','ovarian tissues','ovary space flight','ovary tissue','','','','','','','','','','','',],
    'pancreas':['female pancreas','female  pancreas','','','','','','','','','','','','','','','','','','',],
    'placenta':['placent','placenta + membranes','term placenta','','','','','','','','','','','','','','','','','',],
    'pituitary':['brain pituitary','pituitary gland','anterior pituitary','','','','','','','','','','','','','','','','','',],
    'prefrontal cortex':["brain, prefrontal cortex",],
    'retina':[ "retina, rpe, choroid", "central retina", "peripheral retina", "whole retina", "adult retina",],
    'salivary gland':['salivary glands','major salivary gland','right salivary gland','left salivary gland',],
    'skeletal muscle':[ "muscle", "female  skeletal muscle", "skeletal_muscle", "body muscle", "muscle tissues", "muscle   skeletal", 
                       "muscle biceps", "muscle long dorsal", "biceps brachii", "pelvic limb", "longissimus muscle", "longissimus lumborum muscle", 
                       "longissimus dorsi", "semimembranosus muscle", "breast musle", "breast muscle", "longissimus thoracis", "gluteus medius muscle", 
                       "pectoralis superficialis muscle", "pectoralis major muscle", "psoas major muscle", "longissimus dorsi muscle", 
                       "biceps femoris muscle", "fast muscle", "slow muscle", "indirect flight muscle", "longissimus lumborum", "body wall muscle", 
                       "pectoralis", "longuissimus dorsi", "skeletal muscle tissue of biceps brachii", "longissimus thoracis muscle", "longissimus muslce", 
                       "pectoralis major", "soleus muscle", "pectoral muscle", "gluteus medius",'leg musle','leg muscle','longissimus muscles',
                       'longissimus dorsi muscle tissues','skeletal muscle tissue','thoracic muscle','thorax skeletal muscle','infraspinatus muscle, pre',
                       'longissimus dorsi muscle tissueslongissimus dorsi muscle tissues','muscle abdominal','muscle gastrocnemian','crico arytenoid muscle',
                       'ventral lateral sacrocaudal muscle','gluteal muscle','','','','','','','',],
    'skin':[ "skin, upper arm", "skin, unspecified", "black skin", "white skin",   "back skin", "dorsal skin", "ventral skin", "skin back", "skin side", 
            "male skin", "female  skin", "skin   sun exposed", "skin   not sun exposed", "tarsometatarsal skin", "thigh skin", "ear skin", "stkin",
            'wing skin','earlobe skin','skin tissue of inner side of the wing','abdomen skin tissue','skin of back','','','','','','','','','','','','','','','',],
    'sperm':['f sperm','','','','','','','','','','','','','','','','','','',],
    'spinal cord':["brain   spinal cord",],
    'spleen':[ "spleen tissue", "a spleen", "b spleen", "d spleen", "male spleen", "female  spleen", "chicken splenic tissue",'pleen',],
    'temporal lobe':["brain, temporal lobe","brain temporal lobe","temperal lobe",],
    'tongue':['tongue.',],
    'testis':[ "testes", "super bull testis", "adult testis", "male  testis", "virgin testes", "testis ground control", "dissected testis", "mature testis", 
              "adult testes",'fly testis','testis space flight','whole testes','','','','','','','','','','','','','','','','',],
    'whole organism':["whole worms","whole fly",'worm','whole body','whole worm','whole animal','whole flies','whole fish','','','','','','','','','','',],
}

control_term_list = {
    'treatment':["mock", "control", "Control", "none", "CNTL", "CON", "normal", "untreated", "Mock", "mock", "Ad Libitum", "uninfected", 
                 "Uninfected", "in the absence of", "kontrol", "None", "Thermoneutral",'21%[oxygen_concentration]','DMSO','Normal',' 0hrs',
                 '^0 ppm','un-infected','','','','','',],
    'genotype':["wild", "Wild", "N2", "OregonR", "Oregon R", "Canton S", "Canton-S", "y w", "yw", "wt",'\+\/\+'],
    'sample_title':["wild","Wild type","control","Control","CON_",],
    'sample_description':["control",],
    'lib_name':["control",],
    'exp_title':["control","Control_","CON_","TimePoint1_","0 days post infection",],
}

tissue_selection = ["liver","brain","kidney","testis","heart","ovary"]


In [6]:
# Entrez search conditions
Entrez.email = "YOUR_EMAIL_ADDRESS"

In [2]:
# Read Ensembl species list
ensembl_table = pandas.read_csv("/Users/kf/Dropbox/kfdata/02_Data/my_db/Ensembl/release-91/species_list/Ensembl_Release91_species_list.csv", sep=",")
ensembl_sp = ensembl_table["Scientific name"].unique()
ensembl_sp_binom = sorted(set([re.sub(r'(.+)(\s)(.+)(\s)(.+)', r"\1\2\3", sp) for sp in ensembl_sp]))
print('Number of species for SRA search:', len(ensembl_sp))

Number of species for SRA search: 105


In [8]:
# Get SRA table
sra_table_species = dict()
for sp in ensembl_sp_binom:
    print(sp)
    sp_file_name = "sra_table_sp_"+sp.replace(' ', '_')+'_'+d+".tsv"
    if sp == "Homo sapiens":
        bioprojects = ["PRJEB2445","PRJNA143955","PRJNA143627","PRJNA154333","PRJNA186646","PRJNA272542","PRJNA276463",
                       "PRJNA233428","PRJNA280600"]
    elif sp == "Mus musculus":
        bioprojects = ["PRJNA66167","PRJNA176589","PRJNA177791","PRJNA184055","PRJNA186646","PRJNA324710","PRJNA317431",
                       "PRJNA315706"]
    elif sp == "Danio rerio":
        bioprojects = ["PRJNA276667", "PRJNA260259", "PRJEB2368", "PRJNA207719", "PRJNA255848","PRJNA293388","PRJNA379145",
                      "PRJNA297460","PRJNA317597","PRJNA266803","PRJNA263496","PRJNA431371","PRJNA371613"]
    elif sp == "Rattus norvegicus":
        bioprojects = ["PRJEB3197","PRJNA177791","PRJNA184055","PRJNA218629","PRJNA238328","PRJEB6938","PRJNA236460",
                       "PRJNA264000","PRJNA325290"]
    elif sp == "Saccharomyces cerevisiae":
        continue
    else:
        bioprojects = []
    if os.path.exists(sp_file_name):
        print(sp, ': reading from tsv file.')
        sra_table_species[sp] = pandas.read_csv(sp_file_name, sep='\t', header=0, low_memory=False)
    else:
        search_term = get_search_term(species_name=sp, bioprojects=bioprojects, biosamples=[])
        root = fetch_sra_xml(species_name=sp, search_term=search_term, save_xml=False, read_from_existing_file=False)
        sra_table_species[sp] = sra_xml2table(species_name=sp, xml_root=root)
        sra_table_species[sp].to_csv(sp_file_name, sep="\t", index=False)
sra_table_multisp = pandas.concat(sra_table_species.values())
sra_table_multisp.to_csv("sra_table_multisp_"+d+".tsv", sep="\t", index=False)


Ailuropoda melanoleuca
Ailuropoda melanoleuca : reading xml from file
Anas platyrhynchos
Anas platyrhynchos : reading xml from file
Anolis carolinensis
Anolis carolinensis : reading xml from file
Aotus nancymaae
Aotus nancymaae : reading xml from file
Astyanax mexicanus
Astyanax mexicanus : reading xml from file
Bos taurus
Bos taurus : reading xml from file
Caenorhabditis elegans
Caenorhabditis elegans : reading xml from file
Callithrix jacchus
Callithrix jacchus : reading xml from file
Canis lupus
Canis lupus : reading xml from file
Carlito syrichta
Carlito syrichta : reading xml from file
Cavia aperea
Cavia aperea : reading xml from file
Cavia porcellus
Cavia porcellus : reading xml from file
Cebus capucinus
Cebus capucinus : reading xml from file
Ceratotherium simum
Ceratotherium simum : reading xml from file
Cercocebus atys
Cercocebus atys : reading xml from file
Chinchilla lanigera
Chinchilla lanigera : reading xml from file
Chlorocebus sabaeus
Chlorocebus sabaeus : reading xml fr

In [9]:
# GTEx project with >10k samples
do_gtex=False
if do_gtex:
    sp = "Homo sapiens"
    bioprojects = ["PRJNA75899"]
    search_term = get_search_term(species_name=sp, bioprojects=bioprojects, biosamples=[])
    root = fetch_sra_xml(species_name=sp, search_term=search_term, save_xml=True)
    sra_table_gtex = sra_xml2table(species_name=sp, xml_root=root)    
    sra_table_gtex.to_csv("sra_table_GTEx_"+d+".tsv", sep="\t", index=False)

In [10]:
# Additional Bioprojects and Biosamples
sra_table_additional = pandas.DataFrame()
species_bioprojects = ["Sus scrofa", "PRJEB1213",]
species = species_bioprojects[0::2]
bioprojects = species_bioprojects[1::2]
for (sp,bioproject) in zip(species,bioprojects):
    search_term = bioproject
    root = fetch_sra_xml(species_name=sp, search_term=search_term, save_xml=True)
    sra_table_tmp = sra_xml2table(species_name=sp, xml_root=root)    
    sra_table_additional = pandas.concat([sra_table_additional, sra_table_tmp])

biosamples = ["SAMN03299101","SAMN03299100","SAMN03299102","SAMN03105187","SAMN03105186","SAMN04486032","SAMN02739515","SAMN02739515",
              "SAMN02739513","SAMN02739297",]
sp = "additional_biosamples"
search_term = get_search_term(species_name=sp, bioprojects=[], biosamples=biosamples)
root = fetch_sra_xml(species_name=sp, search_term=search_term, save_xml=True)
sra_table_biosamples = sra_xml2table(species_name=sp, xml_root=root)    
sra_table_additional = pandas.concat([sra_table_additional, sra_table_biosamples])
sra_table_additional.to_csv("sra_table_additional_"+d+".tsv", sep="\t", index=False)

processing  0 - 100
processing  0 - 9


In [11]:
# Concatenate and save SRA table
sra_table_multisp = pandas.read_table("sra_table_multisp_"+d+".tsv", sep="\t", dtype=str)
sra_table_additional = pandas.read_table("sra_table_additional_"+d+".tsv", sep="\t", dtype=str)
sra_table = pandas.concat([sra_table_multisp, sra_table_additional], axis=0, ignore_index=True)
if do_gtex:
    sra_table_gtex = pandas.read_table("sra_table_GTEx_"+d+".tsv", sep="\t", dtype=str)
    sra_table = pandas.concat([sra_table_multisp, sra_table_gtex, sra_table_additional], axis=0, ignore_index=True)
sra_table = sra_table.drop_duplicates(subset=["bioproject", "biosample", "experiment", "run", "sra_sample", "sra_study"], keep = "first")
sra_table = sra_table.sort_values(by=["scientific_name", "bioproject", "tissue", "treatment", "genotype", "age", "sex"])
sra_table.to_csv("sra_table_"+d+".tsv", sep="\t", index=False)
sra_table = pandas.read_table("sra_table_"+d+".tsv", sep="\t", dtype=str)
sra_table = sra_table.fillna("")

In [12]:
# Curation
start = time.time()
sra_table = pandas.read_csv('sra_table_'+d+'.tsv', sep='\t', header=0, low_memory=False)
sra_table_curated = standardize_orthographical_variants(sra_table)
sra_table_curated = exclude_by_id(sra_table_curated, bioprojects=[], biosamples=[])
sra_table_curated = column_aggregation(sra_table_curated, column_aggregation_list)
sra_table_curated = exclude_entry_with_keyword(sra_table_curated)
sra_table_curated = manual_attribute_filling(sra_table_curated)
sra_table_curated = standardize_scientific_name(sra_table_curated)
sra_table_curated = standardize_tissue(sra_table_curated, tissue_aggregation_list)
sra_table_curated = standardize_sex(sra_table_curated)
sra_table_curated.to_csv('sra_table_column_aggregated_'+d+'.tsv', sep='\t', index=False)
sra_table_curated = exclude_treatment_terms(sra_table_curated, control_term_list)
sra_table_curated = nspot_cutoff(sra_table_curated, min_nspots=5000000)
sra_table_curated = remove_redundant_biosample(sra_table_curated)
sra_table_curated = label_sampled_data(sra_table_curated, suppress_to=20, tissue_selection=tissue_selection)
sra_table_curated = reorder_columns(sra_table_curated, omit_misc=True)
sra_table_curated.to_csv('sra_table_curated_'+d+'.tsv', sep='\t', index=False)
print(time.time() - start, '[sec]')

Running: standardize_orthographical_variants
Running: exclude_by_id
Running: column_aggregation


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Running: exclude_entry_with_keyword
Running: manual_attribute_filling
Running: standardize_scientific_name
Running: standardize_tissue
Running: standardize_sex
Running: exclude_treatment_terms
Running: nspot_cutoff
Running: remove_redundant_biosample
Running: label_sampled_data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item]

409.4720039367676 [sec]


In [13]:
# Cross table
sra_qualified_pivot = make_cross_tables(sra_table_curated, n_sp_cutoff=5, suppress_tissue=False, sample_upper=False)
sra_qualified_pivot.to_csv('sra_table_pivot_qualified_'+d+'.tsv', sep='\t')
sra_selected_pivot = make_cross_tables(sra_table_curated, n_sp_cutoff=5, suppress_tissue=False, sample_upper=True)
sra_selected_pivot.to_csv('sra_table_pivot_selected_'+d+'.tsv', sep='\t')

Running: make_cross_tables
Running: make_cross_tables


In [14]:
# Reduced table
sra_table_qualified = sra_table_curated.loc[(sra_table_curated['is_qualified']=='Yes') & (sra_table_curated['is_selected_tissue']=='Yes'),:]
sra_table_qualified = sra_table_qualified.loc[(sra_table_qualified['scientific_name']!='Drosophila melanogaster'),:]
sra_table_qualified.to_csv('sra_table_qualified_'+d+'.tsv', sep='\t', index=False)
sra_table_reduced = sra_table_curated.loc[(sra_table_curated['is_sampled']=='Yes') & (sra_table_curated['is_selected_tissue']=='Yes'),:]
sra_table_reduced.to_csv('sra_table_reduced_'+d+'.tsv', sep='\t', index=False)

In [15]:
# dif

sra_dirs = os.listdir(wd_base)
#previous_dir = sra_dirs[len(sra_dirs)-2]
previous_dir = '2018_5_1'
previous_path = wd_base+previous_dir+'/'+'sra_table_pivot_qualified_'+previous_dir+'.tsv'
previous_qualified_pivot = pandas.read_csv(previous_path, header=0, index_col=0, sep='\t')
dif_pivot = sra_qualified_pivot.subtract(previous_qualified_pivot, fill_value=0)
dif_pivot = dif_pivot.loc[sra_qualified_pivot.index,sra_qualified_pivot.columns]
dif_pivot.to_csv('sra_table_pivot_qualified_dif_'+d+'_'+previous_dir+'.tsv', sep='\t')