This notebook updates all file annotations for the PCBC project. There are four required file annotations for any assay file uploaded to Synapse:

* `UID` - a universal ID for this file
* `dataType` - `mRNA`, `miRNA`, or `methylation`
* `fileType` - `bam`, `fastq`, etc.
* `fileSubType` - `fileType`-specific; e.g., `bam` can have `mapped` or `unmapped`.

This script uses the `dataType` field to get assay-specific files (using a query lookup dictionary, `dataTypesToQuery`), and then uses the `UID` to merge the existing file annotations with the table metadata (which is obtained through a lookup dictionary, `dataTypesToMetadataTable`).

In [2]:
# Standard library
import logging
import sys
from multiprocessing.dummy import Pool

# Third-party / helper modules
import numpy
import pandas as pd
import synapseclient
import synapseHelpers

# Logger used by the cells below.
logger = logging.getLogger('pcbc update annots')
logger.setLevel(logging.DEBUG)

# Pool of 10 workers used to map the per-file annotation update.
mp = Pool(10)

# Synapse client, logged in up front so later cells can use it directly.
syn = synapseclient.Synapse(skip_checks=True)
syn.login(silent=True)

# Data types handled by this notebook.
dataTypes = ['mRNA', 'miRNA', 'methylation']

# Data type -> Synapse ID of the table holding that assay's metadata.
dataTypesToMetadataTable = {
    'mRNA': 'syn3156503',
    'miRNA': 'syn3219876',
    'methylation': 'syn3156828',
}

# Data type -> query returning the (id, UID) pairs of that assay's files.
dataTypesToQuery = {
    'mRNA': "select id,UID from file where projectId=='syn1773109' AND dataType=='mRNA'",
    'miRNA': "select id,UID from file where projectId=='syn1773109' AND dataType=='miRNA'",
    'methylation': "select id,UID from file where projectId=='syn1773109' AND dataType=='methylation' AND fileType=='idat'",
}

In [5]:
def fixDict(d):
    """Drop NaN-valued entries from a two-level dictionary.

    Parameters
    ----------
    d : dict
        Mapping of outer key -> inner dict of field name -> value
        (e.g. the result of ``DataFrame.transpose().to_dict()``).

    Returns
    -------
    dict
        A new dict with the same outer keys. Each inner dict contains only
        the entries whose value is not NaN. Lists, and values that cannot
        be tested for NaN (strings, ``None``, ...), are kept unchanged.
        ``d`` itself is not modified.
    """

    newd = {}
    # .items() works on both Python 2 and Python 3; iteritems() is Python 2 only.
    for k, v in d.items():
        tmp = {}
        for k2, v2 in v.items():
            # Lists are always kept. isinstance() is required here:
            # `v2 is list` is only true for the `list` type object itself.
            if isinstance(v2, list):
                tmp[k2] = v2
                continue
            try:
                is_missing = bool(numpy.isnan(v2))
            except (TypeError, ValueError):
                # TypeError: value is not numeric (str, None, ...).
                # ValueError: array-like result has no single truth value.
                # Either way the value is not a scalar NaN, so keep it.
                is_missing = False
            if not is_missing:
                tmp[k2] = v2
        newd[k] = tmp
    return newd

def updateDict(a, b):
    """Merge the entries of ``b`` into ``a`` and return ``a``.

    ``a`` is updated in place (values from ``b`` win when a key is present
    in both) and the very same object is returned; ``b`` is left untouched.

    Note: no Synapse system fields ("creationDate", "etag", "id", "uri")
    are removed here; callers that need them stripped must do so themselves.
    """

    a.update(b)
    return a

def updateAnnots(synid, lookupDict):
    """Merge looked-up annotations into one Synapse entity and store them.

    The lookup table is passed in (instead of the annotations themselves)
    so the function can be mapped over a list of Synapse IDs in parallel.

    synid      -- Synapse ID of the entity to update.
    lookupDict -- dict keyed by Synapse ID; each value is a dict of
                  annotations to merge into the entity's existing ones.
    """
    logger.info("%s" % synid)
    entity = syn.get(synid, downloadFile=False)
    current = syn.getAnnotations(entity)
    # Merge into a copy so the fetched annotations object is not modified.
    combined = updateDict(current.copy(), lookupDict[entity.properties.id])
    syn.setAnnotations(entity, combined)


In [6]:
# Driver cell: for each data type, join the file list with the metadata table
# on UID and apply the resulting per-file annotations.
# When doupdate is False everything is computed but updateAnnots is never called.
doupdate = True
# Data types processed in this run; use the commented line to restrict to one.
updateDataTypes = dataTypes
# updateDataTypes = ['mRNA']

for dataType in updateDataTypes:
        
    # Metadata: fetch the table entity for this data type, then read all of
    # its rows into a DataFrame.
    logger.info("Getting %s metadata" % dataType)
    metaSchema = syn.get(dataTypesToMetadataTable[dataType])
    metaResults = syn.tableQuery("select * from %s" % metaSchema.id)
    meta = metaResults.asDataFrame()
    
    # All files: run the per-data-type query and convert the result to a DataFrame.
    # NOTE(review): filterSynapseFields=False presumably keeps Synapse system
    # columns in the result -- confirm against synapseHelpers.query2df.
    logger.info("Getting %s file list" % dataType)
    fileTbl = synapseHelpers.query2df(syn.chunkedQuery(dataTypesToQuery[dataType]), 
                                      filterSynapseFields=False)
    
    # Merge metadata and files. A left join keeps every file row; files whose
    # UID has no match in the metadata get NaN in the metadata columns.
    logger.info("Merging %s" % dataType)
    
    merged = pd.merge(left=fileTbl[['id', 'UID']],  right=meta,
                      how="left", left_on="UID", right_on="UID")
    
    # Index rows by Synapse ID so that, after transposing, to_dict() yields
    # {synapse id: {column: value}}.
    merged.index = merged.id
    merged.drop("id", axis=1, inplace=True)
    mergedDict = merged.transpose().to_dict()
    # fixDict returns a cleaned copy of the nested dict (see its definition).
    mergedDict2 = fixDict(mergedDict)
    
    # Update annotations: map updateAnnots over every Synapse ID using the pool.
    # updateAnnots has no return statement, so res is a list of None.
    if doupdate:
        logger.info("Updating %s annotations" % dataType)
        res = mp.map(lambda x: updateAnnots(x, mergedDict2), mergedDict2.keys())

INFO:pcbc update annots:Getting mRNA metadata
INFO:pcbc update annots:Getting mRNA file list
INFO:pcbc update annots:Merging mRNA
INFO:pcbc update annots:Updating mRNA annotations
INFO:pcbc update annots:syn3351708
INFO:pcbc update annots:syn2251038
INFO:pcbc update annots:syn3915404
INFO:pcbc update annots:syn2252425
INFO:pcbc update annots:syn3323667
INFO:pcbc update annots:syn2874523
INFO:pcbc update annots:syn2246877
INFO:pcbc update annots:syn2247059
INFO:pcbc update annots:syn2788734
INFO:pcbc update annots:syn3351678
INFO:pcbc update annots:syn2252095
INFO:pcbc update annots:syn2252927
INFO:pcbc update annots:syn2788737
INFO:pcbc update annots:syn2247237
INFO:pcbc update annots:syn2248594
INFO:pcbc update annots:syn2252427
INFO:pcbc update annots:syn2253099
INFO:pcbc update annots:syn3639711
INFO:pcbc update annots:syn3326142
INFO:pcbc update annots:syn2252051
INFO:pcbc update annots:syn2253599
INFO:pcbc update annots:syn2252421
INFO:pcbc update annots:syn2246875
INFO:pcbc updat

Create CSV FileHandle [####################]100.00%     504/504  Done...
Downloading  [####################]100.00%     140.1kB/140.1kB query_results.csv Done...
Downloaded   [####################]100.00%     140.1kB/140.1kB query_results.csv Done...
    

INFO:pcbc update annots:Getting miRNA file list
INFO:pcbc update annots:Merging miRNA
INFO:pcbc update annots:Updating miRNA annotations
INFO:pcbc update annots:syn3224836
INFO:pcbc update annots:syn3268937
INFO:pcbc update annots:syn3223359
INFO:pcbc update annots:syn2248456
INFO:pcbc update annots:syn3276191
INFO:pcbc update annots:syn2248331
INFO:pcbc update annots:syn3241225
INFO:pcbc update annots:syn3261312
INFO:pcbc update annots:syn3223571
INFO:pcbc update annots:syn3261129
INFO:pcbc update annots:syn3261135
INFO:pcbc update annots:syn3261112
INFO:pcbc update annots:syn2248336
INFO:pcbc update annots:syn2248454
INFO:pcbc update annots:syn3223577
INFO:pcbc update annots:syn3268935
INFO:pcbc update annots:syn2248359
INFO:pcbc update annots:syn2247625
INFO:pcbc update annots:syn3276189
INFO:pcbc update annots:syn2247750
INFO:pcbc update annots:syn3261136
INFO:pcbc update annots:syn3261113
INFO:pcbc update annots:syn3241231
INFO:pcbc update annots:syn3268933
INFO:pcbc update annots

 [####################]100.00%     1/1  Done...
Downloading  [####################]100.00%     69.9kB/69.9kB query_results.csv Done...
Downloaded   [####################]100.00%     69.9kB/69.9kB query_results.csv Done...
    

INFO:pcbc update annots:Getting methylation file list
INFO:pcbc update annots:Merging methylation
INFO:pcbc update annots:Updating methylation annotations
INFO:pcbc update annots:syn2654073
INFO:pcbc update annots:syn2654103
INFO:pcbc update annots:syn2653916
INFO:pcbc update annots:syn2654022
INFO:pcbc update annots:syn2654116
INFO:pcbc update annots:syn2654130
INFO:pcbc update annots:syn2653930
INFO:pcbc update annots:syn2677687
INFO:pcbc update annots:syn2653934
INFO:pcbc update annots:syn2653921
INFO:pcbc update annots:syn2654074
INFO:pcbc update annots:syn2654104
INFO:pcbc update annots:syn2654058
INFO:pcbc update annots:syn2654140
INFO:pcbc update annots:syn2653920
INFO:pcbc update annots:syn2654054
INFO:pcbc update annots:syn2654119
INFO:pcbc update annots:syn2653929
INFO:pcbc update annots:syn2654115
INFO:pcbc update annots:syn2654040
INFO:pcbc update annots:syn2654075
INFO:pcbc update annots:syn2654129
INFO:pcbc update annots:syn2654059
INFO:pcbc update annots:syn2653927
INFO: