# Sample Metadata

I have done my first submission of RNA-Seq data to GEO, but they really want biological sample metadata to add for indexing purposes. This is not an easy problem to solve for every sample, but if I can get most things sorted out then that will be good enough. I want to focus on these categories:

* Sex
* Developmental stage
* strain
* tissue
* cell type

I am not sure if I will be able to get a reasonable parsing of strain, but at least I can try the others.

In [1]:
import os
import sys
from pathlib import Path
import re
from collections import defaultdict
from yaml import load

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

In [2]:
# Setup notebook via the project helper (ncbi_remap.notebook.Nb).
# Judging by the cell output, this also reports the last-updated date and the
# current git hash; the returned config object is kept in `nbconfig`.
nbconfig = Nb.setup_notebook()

Please check output/fbgn2chrom.tsv. If it does not exist, run bin/fbgn2chrom.py
last updated: 2018-07-31 
Git hash: 8506138704bf2e17010368949820110db52a4599


In [3]:
from pymongo import MongoClient

# Default to a local MongoDB; a host name stored in ../output/.mongodb_host
# takes precedence whenever that file exists.
host = 'localhost'
try:
    with open('../output/.mongodb_host', 'r') as fh:
        host = fh.read().strip()
except FileNotFoundError:
    pass

# Handles used by the rest of the notebook: client -> 'sra' database -> 'ncbi' collection.
mongoClient = MongoClient(host=host, port=27017)
db = mongoClient['sra']
ncbi = db['ncbi']

## Get list of uploaded samples

In [9]:
# Only the first column of the GEO sample sheet is read; its unique
# `sample_name` values are the samples that were uploaded.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
sample_section = pd.read_csv(
    '/media/psf/Promise_Pegasus/fearjm/ncbi_remap/geo-wf/justin.fear@nih.gov/sample_section.tsv',
    sep='\t',
    usecols=[0],
)
sample_list = sample_section['sample_name'].unique().tolist()

In [10]:
# Empty table to fill in: one row per uploaded sample, one column per
# biological category of interest.
metadata_columns = ['sex', 'developmental stage', 'tissue', 'strain', 'cell type']
metadata = pd.DataFrame([], index=sample_list, columns=metadata_columns)
metadata.head()

Unnamed: 0,sex,developmental stage,tissue,strain,cell type
DRX013093,,,,,
DRX013094,,,,,
DRX014765,,,,,
DRX014766,,,,,
DRX014767,,,,,


## Attribute dump

In [11]:
# Aggregation pipeline, one named stage per step:
#   1. keep only the uploaded samples,
#   2. emit one document per entry of sra.sample.attributes,
#   3. flatten each entry to (srx, name, value).
match_uploaded = {'$match': {'_id': {'$in': sample_list}}}
unwind_attributes = {'$unwind': {'path': '$sra.sample.attributes'}}
project_name_value = {
    '$project': {
        '_id': 0,
        'srx': '$_id',
        'name': '$sra.sample.attributes.name',
        'value': '$sra.sample.attributes.value',
    }
}

attribute_records = ncbi.aggregate([match_uploaded, unwind_attributes, project_name_value])
attrs = pd.DataFrame(list(attribute_records))

In [12]:
# Sanity check: (rows, columns) of the long-format attribute table.
attrs.shape

(70753, 3)

In [13]:
# Initial normalization
#
# Runs in two passes so that separator clean-up happens *before* synonym
# matching. In a single pass 'dev_stage' only became 'dev stage' (and
# 'cell / tissue type' only 'cell/tissue type') and was never folded into the
# canonical term.
separator_dict = {
    '_': ' ',
    '; ': ';',
    ' / ': '/',
}
replace_dict = {
    'dev stage': 'developmental stage',
    'dev-stage': 'developmental stage',
    'developemntal stage': 'developmental stage',
    'development stage': 'developmental stage',
    'developmental stage': 'developmental stage',
    'develpomental stage': 'developmental stage',
    'cell/tissue type': 'tissue',
    'tissue lib': 'tissue',
    'tissue source': 'tissue',
    'tissue type': 'tissue',
    'tissue/cell type': 'tissue',
}

# Keys are escaped so they always match literally, and synonyms are tried
# longest-first so a short key can never shadow a longer key it is a prefix of.
separator_regex = re.compile('|'.join(re.escape(key) for key in separator_dict))
regex = re.compile(
    '|'.join(re.escape(key) for key in sorted(replace_dict, key=len, reverse=True))
)


def normalize_stage_tissue_text(text):
    """Normalize separators, then map stage/tissue synonyms to canonical terms.

    Non-string cells (e.g. NaN left behind by ``.str.lower()``) are returned
    unchanged instead of raising ``TypeError`` inside ``re.sub``.
    """
    if not isinstance(text, str):
        return text
    text = separator_regex.sub(lambda m: separator_dict[m.group()], text)
    return regex.sub(lambda m: replace_dict[m.group()], text)


for column in ('name', 'value'):
    attrs[column] = attrs[column].str.lower().map(normalize_stage_tissue_text)

In [14]:
# Long -> wide: one row per SRX, one column per attribute name.
# aggfunc='first' keeps a single value when an SRX has several attributes
# that share the same (normalized) name.
attrsp = attrs.pivot_table(values='value', index='srx', columns='name', aggfunc='first').copy()

In [15]:
# Bag of words per sample: every non-missing attribute value of an SRX,
# joined into one space-separated string.
bow = {
    srx: ' '.join(row.dropna().values.tolist())
    for srx, row in attrsp.iterrows()
}

In [16]:
# List every (normalized) attribute name present, to spot remaining synonyms.
print(sorted(attrsp.columns.tolist()))

['adapter', 'adapter barcode', 'affinity purification', 'age', 'age of flies', 'age of fly in days post eclosion', 'agent', 'ages', 'alternate taxon id 1', 'alternate taxon id 2', 'amplification', 'antibody', 'assay', 'background strain', 'barcode', 'barcode-kit', 'batch', 'biological replicate', 'biological replicate number', 'biological replicates', 'biomarker', 'biomaterial provider', 'bioproject id', 'bioprojectid', 'biorep', 'biosamplemodel', 'birth date', 'birth location', 'bloomington stock center id', 'body site', 'breed', 'breeding history', 'breeding method', 'cage', 'cell class', 'cell line', 'cell line background', 'cell passages', 'cell subtype', 'cell type', 'cell type marker', 'cells', 'cells derived from', 'checksum', 'chip antibody', 'chip or ip antibody', 'clip antibody', 'colection date', 'collected by', 'collection date', 'compound', 'condition', 'crosses', 'cultivar', 'culture collection', 'custom name', 'days after eclosion', 'days after treatment', 'days at 29 c'

In [243]:
#print(sorted(attrs.value.unique().tolist()))

In [25]:
def parse_sex(attrsp, metadata):
    """Fill ``metadata['sex']`` in place.

    Two sources are used, in order:

    1. The 'sex' column of ``attrsp``, normalized with the mapping stored in
       ``../geo-wf/config/sex.yaml`` (a mapped value of the string 'None'
       means "treat as missing").
    2. For samples that are *still* missing a value, a keyword search for the
       whole words 'female' / 'male' in the notebook-level ``bow`` dict
       (SRX -> joined attribute text). Both present -> 'mixed'.

    Parameters
    ----------
    attrsp : pandas.DataFrame
        Wide attribute table indexed by SRX; must contain a 'sex' column.
    metadata : pandas.DataFrame
        Table indexed by sample with a 'sex' column; updated in place.
    """
    # Local import keeps this cell self-contained. ``yaml.load`` without an
    # explicit Loader is deprecated/rejected by newer PyYAML releases and can
    # construct arbitrary objects; ``safe_load`` is enough for a plain mapping.
    from yaml import safe_load

    with open('../geo-wf/config/sex.yaml') as fh:
        sex_norm = safe_load(fh)

    sex_norm = {k: (np.nan if v == 'None' else v) for k, v in sex_norm.items()}

    # Work on a new Series rather than replacing in place on a column selected
    # from ``attrsp``.
    dat = attrsp['sex'].replace(sex_norm)
    metadata.update(dat.dropna())

    # Only samples that still have no value. The previous
    # ``metadata.sex.isna().index`` returned the *entire* index, so the keyword
    # fallback overwrote values that had just been set from the curated mapping.
    missing = metadata.index[metadata['sex'].isna()].tolist()
    for srx in missing:
        string = bow.get(srx)
        if string is None:
            # No attribute text available for this sample.
            continue

        mf = re.search(r'\bfemale\b', string)
        mm = re.search(r'\bmale\b', string)
        if mf and mm:
            metadata.loc[srx, 'sex'] = 'mixed'
        elif mf:
            metadata.loc[srx, 'sex'] = 'female'
        elif mm:
            metadata.loc[srx, 'sex'] = 'male'


parse_sex(attrsp, metadata)

In [27]:
def parse_dev(attrsp, metadata):
    """Fill ``metadata['developmental stage']`` in place.

    Values come from the 'developmental stage' column of ``attrsp`` and are
    normalized with the mapping in ``../geo-wf/config/dev_stage.yaml``; a
    mapped value of the string 'None' means "treat as missing".

    Parameters
    ----------
    attrsp : pandas.DataFrame
        Wide attribute table indexed by SRX; must contain a
        'developmental stage' column.
    metadata : pandas.DataFrame
        Table indexed by sample; updated in place.
    """
    # ``yaml.load`` without an explicit Loader is deprecated/rejected by newer
    # PyYAML releases and can construct arbitrary objects; ``safe_load`` is
    # enough for a plain mapping.
    from yaml import safe_load

    with open('../geo-wf/config/dev_stage.yaml') as fh:
        dev_norm = safe_load(fh)

    dev_norm = {k: (np.nan if v == 'None' else v) for k, v in dev_norm.items()}

    # New Series instead of an in-place replace on a column selected from ``attrsp``.
    dat = attrsp['developmental stage'].replace(dev_norm)
    metadata.update(dat.dropna())


parse_dev(attrsp, metadata)

In [39]:
def parse_tissue(attrsp, metadata):
    """Fill ``metadata['tissue']`` in place.

    Values come from the 'tissue' column of ``attrsp`` and are normalized with
    the mapping in ``../geo-wf/config/tissue.yaml``; a mapped value of the
    string 'None' means "treat as missing".

    Parameters
    ----------
    attrsp : pandas.DataFrame
        Wide attribute table indexed by SRX; must contain a 'tissue' column.
    metadata : pandas.DataFrame
        Table indexed by sample; updated in place.
    """
    # ``yaml.load`` without an explicit Loader is deprecated/rejected by newer
    # PyYAML releases and can construct arbitrary objects; ``safe_load`` is
    # enough for a plain mapping.
    from yaml import safe_load

    with open('../geo-wf/config/tissue.yaml') as fh:
        tissue_norm = safe_load(fh)

    tissue_norm = {k: (np.nan if v == 'None' else v) for k, v in tissue_norm.items()}

    # New Series instead of an in-place replace on a column selected from ``attrsp``.
    dat = attrsp['tissue'].replace(tissue_norm)
    metadata.update(dat.dropna())


parse_tissue(attrsp, metadata)

In [11]:
# Query the raw attributes again so the cell-type normalization below starts
# from untouched names and values. Pipeline stages:
#   1. keep only the uploaded samples,
#   2. emit one document per entry of sra.sample.attributes,
#   3. flatten each entry to (srx, name, value).
match_uploaded = {'$match': {'_id': {'$in': sample_list}}}
unwind_attributes = {'$unwind': {'path': '$sra.sample.attributes'}}
project_name_value = {
    '$project': {
        '_id': 0,
        'srx': '$_id',
        'name': '$sra.sample.attributes.name',
        'value': '$sra.sample.attributes.value',
    }
}

attribute_records = ncbi.aggregate([match_uploaded, unwind_attributes, project_name_value])
attrs = pd.DataFrame(list(attribute_records))

In [12]:
# Sanity check: (rows, columns) of the freshly queried long-format attribute table.
attrs.shape

(70753, 3)

In [42]:
# Initial normalization
#
# Separators are cleaned up first, then cell-type synonyms are folded into
# 'cell type'. Doing both in one pass left e.g. 'cell_line' as 'cell line'.
separator_dict = {
    '_': ' ',
    '; ': ';',
    ' / ': '/',
}
replace_dict = {
    'cell/tissue type': 'cell type',
    'tissue/cell type': 'cell type',
    'cell class': 'cell type',
    'cell line': 'cell type',
    'cell line background': 'cell type',
    'cell subtype': 'cell type',
    'cell type': 'cell type',
    'cells': 'cell type',
    'cells derived from': 'cell type',
}

# Regex alternation takes the first alternative that matches, so in dict order
# 'cell line' shadowed 'cell line background' (giving 'cell type background')
# and 'cells' shadowed 'cells derived from'. Trying the longest keys first
# fixes that; re.escape keeps every key a literal match.
separator_regex = re.compile('|'.join(re.escape(key) for key in separator_dict))
regex = re.compile(
    '|'.join(re.escape(key) for key in sorted(replace_dict, key=len, reverse=True))
)


def normalize_cell_type_text(text):
    """Normalize separators, then map cell-type synonyms to 'cell type'.

    Non-string cells (e.g. NaN left behind by ``.str.lower()``) are returned
    unchanged instead of raising ``TypeError`` inside ``re.sub``.
    """
    if not isinstance(text, str):
        return text
    text = separator_regex.sub(lambda m: separator_dict[m.group()], text)
    return regex.sub(lambda m: replace_dict[m.group()], text)


for column in ('name', 'value'):
    attrs[column] = attrs[column].str.lower().map(normalize_cell_type_text)

In [43]:
# Long -> wide again, now with cell-type synonyms folded into one 'cell type'
# column; aggfunc='first' keeps a single value per (SRX, attribute name).
attrsp = attrs.pivot_table(values='value', index='srx', columns='name', aggfunc='first').copy()

In [44]:
# Rebuild the bag of words from the re-normalized table: every non-missing
# attribute value of an SRX joined into one space-separated string.
bow = {}
for srx, row in attrsp.iterrows():
    # Was `row.d.opna()`, a garbled `dropna` that raises AttributeError.
    bow[srx] = ' '.join(row.dropna().values.tolist())

In [48]:
def parse_cell_type(attrsp, metadata):
    """Fill ``metadata['cell type']`` in place.

    Values come from the 'cell type' column of ``attrsp`` and are normalized
    with the mapping in ``../geo-wf/config/cell_type.yaml``; a mapped value of
    the string 'None' means "treat as missing".

    Parameters
    ----------
    attrsp : pandas.DataFrame
        Wide attribute table indexed by SRX; must contain a 'cell type' column.
    metadata : pandas.DataFrame
        Table indexed by sample; updated in place.
    """
    # ``yaml.load`` without an explicit Loader is deprecated/rejected by newer
    # PyYAML releases and can construct arbitrary objects; ``safe_load`` is
    # enough for a plain mapping.
    from yaml import safe_load

    with open('../geo-wf/config/cell_type.yaml') as fh:
        norm = safe_load(fh)

    norm = {k: (np.nan if v == 'None' else v) for k, v in norm.items()}

    # New Series instead of an in-place replace on a column selected from ``attrsp``.
    dat = attrsp['cell type'].replace(norm)
    metadata.update(dat.dropna())


parse_cell_type(attrsp, metadata)

In [51]:
# Samples for which a cell type could be assigned.
metadata['cell type'].dropna()

ERX173563    S2
ERX173565    S2
ERX173570    S2
ERX173571    S2
ERX173572    S2
ERX173578    S2
ERX173581    S2
ERX173588    S2
ERX173594    S2
ERX173597    S2
ERX173598    S2
ERX173601    S2
ERX173605    S2
ERX173607    S2
ERX173611    S2
ERX173614    S2
ERX173615    S2
ERX173617    S2
ERX173618    S2
ERX173625    S2
ERX173631    S2
ERX173640    S2
ERX173642    S2
ERX173647    S2
ERX173651    S2
ERX173658    S2
ERX173659    S2
ERX173661    S2
ERX173664    S2
ERX173668    S2
             ..
SRX976314    S3
SRX976315    S3
SRX976316    S3
SRX976317    S3
SRX976318    S3
SRX976319    S3
SRX976320    S3
SRX976321    S3
SRX976322    S3
SRX976323    S3
SRX976324    S3
SRX976325    S3
SRX976326    S3
SRX981410    S2
SRX981411    S2
SRX981412    S2
SRX981413    S2
SRX981414    S2
SRX981415    S2
SRX982548    S2
SRX982550    S2
SRX982551    S2
SRX982552    S2
SRX982553    S2
SRX984881    S2
SRX984882    S2
SRX984883    S2
SRX984887    S2
SRX984888    S2
SRX984889    S2
Name: cell type, Length:

In [52]:
# Display the assembled metadata table (sex / stage / tissue / strain / cell type per sample).
metadata

Unnamed: 0,sex,developmental stage,tissue,strain,cell type
DRX013093,,,,,
DRX013094,,,,,
DRX014765,female,,,,
DRX014766,female,,,,
DRX014767,male,,,,
DRX014768,male,,,,
DRX014769,female,,,,
DRX014770,female,,,,
DRX014771,male,,,,
DRX014772,male,,,,


In [315]:
# Bag-of-words dict as a named Series so it can be joined onto other tables by SRX.
bowS = pd.Series(bow, name='bow')

In [332]:
# One tab-separated line per developmental stage: the stage, then every distinct
# bag-of-words string seen for that stage.
# NOTE(review): `devs` is not defined in any cell shown here -- presumably a table
# indexed by SRX with a 'developmental stage' column; confirm before re-running.
helper_lines = []
for stage, stage_rows in devs.join(bowS, how='left').groupby('developmental stage'):
    stage_text = ' '.join(stage_rows['bow'].unique().tolist())
    helper_lines.append(f'{stage}\t{stage_text}\n')
helper = ''.join(helper_lines)

In [333]:
# Dump the per-stage helper text to a scratch file for manual inspection.
with open('/tmp/helper.tsv', 'w') as fo:
    fo.write(helper)