This notebook updates file annotations for the PCBC project for those that can be mapped by C4_Cell_Line_ID. 

There are two required file annotations for any assay file uploaded to Synapse:

* `C4_Cell_Line_ID` - Used to join to metadata
* `dataType` - `Karyotype`, `CNV`, or `Teratoma`

This script uses the `dataType` field to get assay-specific files (using a query lookup dictionary, `dataTypesToQuery`), and then uses `C4_Cell_Line_ID` to merge the existing file annotations with the table metadata (which is obtained through a lookup dictionary, `dataTypesToMetadataTable`).

In [12]:
import sys
import pandas as pd
import numpy
import logging

# Notebook-wide logger; DEBUG level lets the INFO progress messages emitted
# by the cells below through.
logger = logging.getLogger('pcbc update annots')
logger.setLevel(logging.DEBUG)

# multiprocessing.dummy.Pool has the multiprocessing API but is backed by
# threads. The 10-worker pool is used further down to update annotations
# for several files concurrently.
from multiprocessing.dummy import Pool
mp = Pool(10)

import synapseHelpers
import synapseclient

# Shared Synapse client used by the cells below.
# NOTE(review): login() is called without explicit credentials, so it
# presumably relies on a cached session or local config file -- confirm.
syn = synapseclient.Synapse(skip_checks=True)
syn.login(silent=True)

# Assay types handled by this notebook; each one is a valid `dataType`
# annotation value on the files in the project.
dataTypes = ['Karyotype', 'CNV', 'Teratoma']

# Metadata table to join against, per data type. All three currently share
# the same Synapse table.
dataTypesToMetadataTable = dict.fromkeys(dataTypes, 'syn2767694')

# File query per data type: the same query with only the dataType filter
# changing, so it is generated from the list above.
dataTypesToQuery = {
    name: "select id,C4_Cell_Line_ID from file where projectId=='syn1773109' AND dataType=='%s'" % name
    for name in dataTypes
}

In [13]:
def fixDict(d):
    """Drop NaN-valued entries from a two-level dictionary.

    ``d`` maps an outer key to an inner ``{field: value}`` dictionary. A new
    dictionary with the same outer keys is returned in which every inner
    entry whose value is NaN has been removed. ``d`` itself is not modified.

    Lists are always kept as-is. Values that ``numpy.isnan`` cannot handle
    (it raises ``TypeError`` for them, e.g. strings or ``None``) are not
    NaN by definition and are kept as well.

    """

    newd = {}
    # .items() rather than .iteritems() so this runs on Python 2 and 3.
    for k, v in d.items():
        tmp = {}
        for k2, v2 in v.items():
            # isinstance, not `v2 is list`: the identity test compares the
            # value to the `list` type object itself and is never true, so
            # lists used to fall through to numpy.isnan below.
            if isinstance(v2, list):
                tmp[k2] = v2
                continue
            try:
                missing = numpy.isnan(v2)
            except TypeError:
                # Not a numeric type, so it cannot be NaN.
                missing = False
            if not missing:
                tmp[k2] = v2
        newd[k] = tmp
    return newd

def updateDict(a, b):
    """Merge ``b`` into ``a`` in place and return ``a``.

    Keys present in both dictionaries take the value from ``b``. ``a`` is
    mutated; pass a copy if the original must be preserved.

    Note: no Synapse bookkeeping fields ("creationDate", "etag", "id",
    "uri") are removed here; that step was previously sketched in a comment
    but never enabled.

    """

    a.update(b)
    return a

def updateAnnots(synid, lookupDict):
    """Merge looked-up annotations into one Synapse entity and store them.

    ``lookupDict`` maps Synapse ids to the annotations that should be added
    for that entity. Taking the whole lookup table plus a single id (rather
    than the annotations themselves) keeps the function easy to map over a
    list of ids in parallel.

    """
    logger.info("%s" % synid)
    entity = syn.get(synid, downloadFile=False)
    # Work on a copy of the current annotations; entries from the lookup
    # table override existing ones with the same key.
    merged_annots = updateDict(syn.getAnnotations(entity).copy(),
                               lookupDict[entity.properties.id])
    syn.setAnnotations(entity, merged_annots)


In [14]:
# Set to False for a dry run: metadata and files are still fetched and
# merged, but no annotations are written back.
doupdate = True
# Data types to process; swap in the commented-out line to restrict the run.
updateDataTypes = dataTypes
# updateDataTypes = ['Teratoma']
# Column present in both the file query result and the metadata table.
_joinKey = "C4_Cell_Line_ID"

for dataType in updateDataTypes:
        
    # Metadata
    # Pull the full metadata table registered for this data type.
    logger.info("Getting %s metadata" % dataType)
    metaSchema = syn.get(dataTypesToMetadataTable[dataType])
    metaResults = syn.tableQuery("select * from %s" % metaSchema.id)
    meta = metaResults.asDataFrame()
    
    # All files
    # Files annotated with this data type, as a DataFrame.
    # NOTE(review): the merge below relies on the result having 'id' and
    # C4_Cell_Line_ID columns -- confirm query2df names them that way.
    logger.info("Getting %s file list" % dataType)
    fileTbl = synapseHelpers.query2df(syn.chunkedQuery(dataTypesToQuery[dataType]), 
                                      filterSynapseFields=False)
        
    # Merge metadata and files
    logger.info("Merging %s" % dataType)
    
    # Left join: every file row is kept, and files without a matching
    # metadata row get NaN in the metadata columns.
    merged = pd.merge(left=fileTbl[['id', _joinKey]],  right=meta,
                      how="left", left_on=_joinKey, right_on=_joinKey)
    
    # Index by Synapse id and drop the id/join columns so that only the
    # metadata columns remain as values.
    merged.index = merged.id
    merged.drop("id", axis=1, inplace=True)
    merged.drop(_joinKey, axis=1, inplace=True)
        
    # Transposing first makes to_dict() produce {synid: {column: value}};
    # fixDict then strips the NaN entries left behind by the left join.
    mergedDict = merged.transpose().to_dict()
    mergedDict2 = fixDict(mergedDict)

    # Update annotations
    # One updateAnnots call per file id, spread over the pool `mp`.
    if doupdate:
        logger.info("Updating %s annotations" % dataType)
        res = mp.map(lambda x: updateAnnots(x, mergedDict2), mergedDict2.keys())

INFO:pcbc update annots:Getting Karyotype metadata


 [####################]100.00%     1/1  Done...
Downloading  [####################]100.00%     39.8kB/39.8kB query_results.csv Done...
Downloaded   [####################]100.00%     39.8kB/39.8kB query_results.csv Done...
    

INFO:pcbc update annots:Getting Karyotype file list
INFO:pcbc update annots:Merging Karyotype
INFO:pcbc update annots:Updating Karyotype annotations
INFO:pcbc update annots:syn2679209
INFO:pcbc update annots:syn2679186
INFO:pcbc update annots:syn2679180
INFO:pcbc update annots:syn2679182
INFO:pcbc update annots:syn2679184
INFO:pcbc update annots:syn2679168
INFO:pcbc update annots:syn2679166
INFO:pcbc update annots:syn2679164
INFO:pcbc update annots:syn2679162
INFO:pcbc update annots:syn2679160
INFO:pcbc update annots:syn2679187
INFO:pcbc update annots:syn2679183
INFO:pcbc update annots:syn2679185
INFO:pcbc update annots:syn2679181
INFO:pcbc update annots:syn2679165
INFO:pcbc update annots:syn2679189
INFO:pcbc update annots:syn2679169
INFO:pcbc update annots:syn2679210
INFO:pcbc update annots:syn2679212
INFO:pcbc update annots:syn2679161
INFO:pcbc update annots:syn2679211
INFO:pcbc update annots:syn2679188
INFO:pcbc update annots:syn2679203
INFO:pcbc update annots:syn2679192
INFO:pcbc u