In [1]:
import os
import rdflib as rdf
#import csv for reading csv files
import csv
#import for reading XLS data dictionary file
import pandas as pd
import uuid
from glob import glob
import tarfile
import dicom as dcm
import re

In [2]:
g = rdf.Graph()

In [3]:
nidash = rdf.Namespace("http://nidm.nidash.org#")
prov = rdf.Namespace("http://www.w3.org/ns/prov#")
ncit = rdf.Namespace("http://ncitt.ncit.nih.gov/")
fbirn = rdf.Namespace("http://www.birncommunity.org/collaborators/function-birn/")
xsd = rdf.Namespace("http://www.w3.org/2001/XMLSchema#")
rdfs = rdf.Namespace("http://www.w3.org/2000/01/rdf-schema#")
foaf = rdf.Namespace("http://xmlns.com/foaf/0.1/")
vc = rdf.Namespace("http://www.w3.org/2006/vcard/ns#")
dicom = rdf.Namespace("http://neurolex.org/wiki/Category:DICOM_term/")
dct = rdf.Namespace("http://purl.org/dc/terms/")
dctypes = rdf.Namespace("http://purl.org/dc/dcmitype/")
dcat = rdf.Namespace("http://www.w3.org/ns/dcat#")
nfo = rdf.Namespace("http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#")
list(g.namespaces())

[('xml', rdflib.term.URIRef(u'http://www.w3.org/XML/1998/namespace')),
 ('rdf', rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#')),
 ('xsd', rdflib.term.URIRef(u'http://www.w3.org/2001/XMLSchema#')),
 ('rdfs', rdflib.term.URIRef(u'http://www.w3.org/2000/01/rdf-schema#'))]

In [4]:
g.bind('nidash', nidash)
g.bind('prov', prov)
g.bind('ncit', ncit)
g.bind('fbirn', fbirn)
g.bind('xsd',xsd)
g.bind('rdfs',rdfs)
g.bind('foaf',foaf)
g.bind('vc',vc)
g.bind('dicom',dicom)
g.bind('dct',dct)
g.bind('dctypes', dctypes)
g.bind('dcat', dcat)
g.bind('nfo',nfo)
list(g.namespaces())

[('xml', rdflib.term.URIRef(u'http://www.w3.org/XML/1998/namespace')),
 ('fbirn',
  rdflib.term.URIRef(u'http://www.birncommunity.org/collaborators/function-birn/')),
 ('vc', rdflib.term.URIRef(u'http://www.w3.org/2006/vcard/ns#')),
 ('ncit', rdflib.term.URIRef(u'http://ncitt.ncit.nih.gov/')),
 ('rdfs', rdflib.term.URIRef(u'http://www.w3.org/2000/01/rdf-schema#')),
 ('nfo',
  rdflib.term.URIRef(u'http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#')),
 ('prov', rdflib.term.URIRef(u'http://www.w3.org/ns/prov#')),
 ('nidash', rdflib.term.URIRef(u'http://nidm.nidash.org#')),
 ('dctypes', rdflib.term.URIRef(u'http://purl.org/dc/dcmitype/')),
 ('rdf', rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#')),
 ('foaf', rdflib.term.URIRef(u'http://xmlns.com/foaf/0.1/')),
 ('xsd', rdflib.term.URIRef(u'http://www.w3.org/2001/XMLSchema#')),
 ('dicom',
  rdflib.term.URIRef(u'http://neurolex.org/wiki/Category:DICOM_term/')),
 ('dcat', rdflib.term.URIRef(u'http://www.w3.org/ns/dcat

#Functions for AssessmentOM

In [5]:
def nidm_add_elements(g,line,assessment_name, variable_name_id, question_id, type_id):
    #Create new node for each element with metadata consistent with NIDM-Experiment assessment acquistion
    g.add((nidash[line[variable_name_id]], rdf.RDF.type, nidash["DataElement"]))
    g.add((nidash[line[variable_name_id]], rdf.RDF.type, rdf.RDF.Property))
    g.add((nidash[line[variable_name_id]], prov["label"], rdf.Literal(line[variable_name_id])))
    if (type_id in line.keys()):
        g.add((nidash[line[variable_name_id]], nidash["DataType"], rdf.Literal(line[type_id])))
    if (question_id in line.keys()):
        g.add((nidash[line[variable_name_id]], nidash["Question"], rdf.Literal(line[question_id])))
    if ('SCORESEQ' in line.keys()):
        g.add((nidash[line[variable_name_id]], nidash["Sequence"], rdf.Literal(line["SCORESEQ"])))   
    #make association with assessment
    g.add((nidash[assessment_name],prov["hadMember"], nidash[line[variable_name_id]]))
def nidm_create_assessment(g, assessment_name):
    g.add((nidash[assessment_name], rdf.RDF.type, nidash["DataStructure"]))
    g.add((nidash[assessment_name], rdf.RDF.type, prov["Entity"]))
    g.add((nidash[assessment_name],prov["label"],rdf.Literal(assessment_name)))
def nidm_add_codedproperty(g,line,valueset_id, codedvalue_id, score_code_id, score_label_id):
    codedvalue_uri = safe_uri(codedvalue_id)
    g.add((nidash[codedvalue_uri], rdf.RDF.type, nidash["CodedProperty"]))
    g.add((nidash[codedvalue_uri], rdf.RDF.type, rdf.RDF.Property))
    g.add((nidash[codedvalue_uri],nidash["code"], rdf.Literal(line[score_code_id])))
    g.add((nidash[codedvalue_uri], prov["label"], rdf.Literal(line[score_label_id])))
    g.add((nidash[valueset_id], prov["hadMember"], nidash[codedvalue_uri]))
def safe_uri(string):
    return string.strip().replace(" ","_").replace("-", "_").replace(",", "_").replace("(", "_").replace(")","_").replace("'","_").replace("/", "_")
def nidm_create_assessment_acquisition_object(g, object_id, assessment_type):
    #g.add((nidash[object_id], rdf.RDF.type, nidash["AcquisitionObject"]))
    g.add((nidash[object_id], rdf.RDF.type, nidash[assessment_type]))
    g.add((nidash[object_id], rdf.RDF.type, prov["Entity"]))
    g.add((nidash[object_id], rdf.RDF.type, nidash["Assessment"]))
def nidm_add_elements_assessment_acquisition_object(g, object_id, element, value):
    g.add((nidash[object_id], nidash[element], rdf.Literal(value)))
    

#Functions for ExperimentOM - Investigation Level

In [6]:
def nidm_create_investigation(g, uid, expid):
    g.add((nidash[uid], rdf.RDF.type, dctypes["Dataset"]))
    g.add((nidash[uid], rdf.RDF.type, nidash["Investigation"]))
    g.add((nidash[uid], rdf.RDF.type, prov["Entity"]))
    g.add((nidash[uid], fbirn["ExperimentID"], rdf.Literal(expid, lang='en')))
def nidm_add_investigation_metadata(g, uid, name, description, baseuri, storagetype):
    g.add((nidash[uid], dct["title"], rdf.Literal(name, lang='en')))
    g.add((nidash[uid], dct["description"], rdf.Literal(description, lang='en')))
    g.add((nidash[uid], dct["publisher"], rdf.URIRef(baseuri)))
    g.add((nidash[uid], dcat["accessURL"], rdf.URIRef(baseuri)))
    g.add((nidash[uid], fbirn["StorageType"], rdf.Literal(storagetype, lang='en')))
def nidm_add_role(g, uid, role):
    g.add((uid, rdf.RDF.type, prov["Role"]))
    g.add((uid, prov["label"], rdf.Literal(role)))
def nidm_add_investigation_PI(g, uid, expid, first, last, email):
    g.add((nidash[uid], rdf.RDF.type, prov["Person"]))
    g.add((nidash[uid], foaf["givenName"], rdf.Literal(first)))
    g.add((nidash[uid], foaf["familyName"], rdf.Literal(last)))
    g.add((nidash[uid], vc["email"], rdf.Literal(email)))
    #connect Person to Experiment and add Role
    nidm_add_role(g, nidash["PI"], "Principle Investigator")
    g.add((nidash[uid], prov["hadRole"], nidash["PI"]))
    g.add((nidash[uid], prov["wasAssociatedWith"], nidash[expid]))
    

In [7]:
#experiment and person dictionaries for lookup
exp_dict = {}
exp_collection_dict = {}
person_dict = {}
#Create investigation, parse metadata
variables = pd.read_csv("./HID_Download/phaseIII_experiment_info.csv")
#iterate over the variables in the experiment info export
for index, row in variables.iterrows():
    #create UUID
    expid = "Investigation_" + str(uuid.uuid1())
    #keep UUID -> uniqueid mappings for experiments in dictionary in case there are multiple experiments
    exp_dict[expid] = row["uniqueid"]
    #add investigation using HID experiment ID
    nidm_create_investigation(g, expid,exp_dict[expid])
    #add other metadata from file
    nidm_add_investigation_metadata(g, expid, row["name"], row["description"],row["baseuri"],row["storagetype"])

    #create an investigation collection / investigation activity
    #collectid = "InvestigationCollection_" + str(uuid.uuid1())
    #store URL for later use
    #exp_collection_dict[str(row["uniqueid"])] = collectid
    
    activityid = "InvestigationActivity_"+str(uuid.uuid1())
    #g.add((nidash[collectid], rdf.RDF.type, prov["Collection"]))
    g.add((nidash[activityid], rdf.RDF.type, prov["Activity"]))
    #add label for debugging/model evaluation
    g.add((nidash[expid],prov["label"], rdf.Literal("Investigation Collection")))
    g.add((nidash[activityid],prov["label"], rdf.Literal("Investigation Process Activity")))
    #associate investigation entity with activity
    g.add((nidash[expid],prov["wasGeneratedBy"], nidash[activityid]))
    
    #Create PI person, parse metadata
    person_variables = pd.read_csv("./HID_Download/phaseIII_PI_info.csv")
    for person_index, person_row in person_variables.iterrows():
        #create UUID
        PIid = "Person_" + str(uuid.uuid1())
        #keep UUID -> uniqueid mappings for persons in dictionary in case there are multiple person
        person_dict[PIid] = person_row["uniqueid"]
        #add PI 
        nidm_add_investigation_PI(g, PIid, expid, person_row["first_name"], person_row["last_name"], person_row["email"])
        #associate person with investigation activity
        g.add((nidash[activityid],prov["wasAssociatedWith"], nidash[PIid]))
        #associate investigation collection with person
        g.add((nidash[expid],prov["wasAttributedTo"], nidash[PIid]))


#Image series (DICOM) download parsing/conversion to NIDM

In [8]:
#Image series functions
def nidm_add_session_person(g, uid, subjid):
    g.add((nidash[uid], rdf.RDF.type, prov["Person"]))
    g.add((nidash[uid], ncit["subjectID"], rdf.Literal(subjid)))
    nidm_add_role(g, nidash["Participant"], "Participant")
    g.add((nidash[uid], prov["hadRole"], nidash["Participant"]))
def nidm_add_scanner(g, uid, dicom_hdr):
    g.add((nidash[scanner_id], rdf.RDF.type, prov["Agent"]))
    g.add((nidash[scanner_id], dicom["Manufacturer"], rdf.Literal(ds[0x0008, 0x0070].value)))
    g.add((nidash[scanner_id], dicom["ManufacturerModelName"], rdf.Literal(ds[0x0008,0x1090].value)))
    g.add((nidash[scanner_id], dicom["MagneticFieldStrength"], rdf.Literal(ds[0x0018,0x0087].value)))
    g.add((nidash[scanner_id], dicom["DeviceSerialNumber"], rdf.Literal(ds[0x0018,0x1000].value)))
    g.add((nidash[scanner_id], dicom["SoftwareVersion"], rdf.Literal(ds[0x0018,0x1020].value)))
def nidm_add_dicom_metadata(g, uid, dicom_hdr):
    g.add((nidash[uid], dicom["ScanningSequence"], rdf.Literal(ds[0x0018,0x0020].value)))
    g.add((nidash[uid], dicom["SequenceVariant"], rdf.Literal(ds[0x0018,0x0021].value)))
    g.add((nidash[uid], dicom["ScanOptions"], rdf.Literal(ds[0x0018,0x0022].value)))
    g.add((nidash[uid], dicom["MRAcquisitionType"], rdf.Literal(ds[0x0018,0x0023].value)))
    g.add((nidash[uid], dicom["SequenceName"], rdf.Literal(ds[0x0018,0x0024].value)))
    g.add((nidash[uid], dicom["AngioFlag"], rdf.Literal(ds[0x0018,0x0025].value)))
    g.add((nidash[uid], dicom["SliceThickness"], rdf.Literal(ds[0x0018,0x0050].value)))
    g.add((nidash[uid], dicom["RepetitionTime"], rdf.Literal(ds[0x0018,0x0080].value)))
    g.add((nidash[uid], dicom["EchoTime"], rdf.Literal(ds[0x0018,0x0081].value)))
    g.add((nidash[uid], dicom["NumberofAverages"], rdf.Literal(ds[0x0018,0x0083].value)))
    g.add((nidash[uid], dicom["ImagingFrequency"], rdf.Literal(ds[0x0018,0x0084].value)))
    g.add((nidash[uid], dicom["ImagedNucleus"], rdf.Literal(ds[0x0018,0x0085].value)))
    g.add((nidash[uid], dicom["EchoNumber"], rdf.Literal(ds[0x0018,0x0086].value)))
    g.add((nidash[uid], dicom["MagneticFieldStrength"], rdf.Literal(ds[0x0018,0x0087].value)))
    g.add((nidash[uid], dicom["SpacingBetweenSlices"], rdf.Literal(ds[0x0018,0x0088].value)))
    g.add((nidash[uid], dicom["NumberofPhaseEncodingSteps"], rdf.Literal(ds[0x0018,0x0089].value)))
    g.add((nidash[uid], dicom["EchoTrainLength"], rdf.Literal(ds[0x0018,0x0091].value)))
    g.add((nidash[uid], dicom["PercentSampling"], rdf.Literal(ds[0x0018,0x0093].value)))
    g.add((nidash[uid], dicom["PercentPhaseFieldofView"],rdf.Literal(ds[0x0018,0x0094].value)))
    g.add((nidash[uid], dicom["PixelBandwidth"],rdf.Literal(ds[0x0018,0x0095].value)))
    g.add((nidash[uid], dicom["ProtocolName"],rdf.Literal(ds[0x0018,0x1030].value)))
    g.add((nidash[uid], dicom["TransmitCoilName"],rdf.Literal(ds[0x0018,0x1251].value)))
    g.add((nidash[uid], dicom["AcquisitionMatrix"],rdf.Literal(ds[0x0018,0x1310].value)))
    g.add((nidash[uid], dicom["InplanePhaseEncodingDirection"],rdf.Literal(ds[0x0018,0x1312].value)))
    g.add((nidash[uid], dicom["FlipAngle"], rdf.Literal(ds[0x0018,0x1314].value)))
    g.add((nidash[uid], dicom["VariableFlipAngleFlag"], rdf.Literal(ds[0x0018,0x1315].value)))
    g.add((nidash[uid], dicom["SAR"], rdf.Literal(ds[0x0018,0x1316].value)))
    g.add((nidash[uid], dicom["dB_dt"], rdf.Literal(ds[0x0018,0x1318].value)))
    g.add((nidash[uid], dicom["PatientPosition"], rdf.Literal(ds[0x0018,0x5100].value)))

In [9]:
#traverse directory structure, store visit/study metadata parsed from directory names per
#FBIRN hierarchy, parse DICOM metadata at lowest level and format according to NIDM
root_dir = "./HID_Download/"
subj_dict={}
#session_dict={}

#create visit dictionary
visit_dict = {}  
for dname in os.listdir(root_dir):
    if ((os.path.isdir(os.path.join(root_dir,dname))) and (not dname.startswith('.'))):
        #highest level directory contains subj-dirs with subjectID
        #grab subjectID and store in NIDM-Experiment graph
        #subject ID should be unique so can drop the UUID...and then we don't need a dictionary
        #subj_uid = dname + "_" + str(uuid.uuid1())
        subj_uid = str(dname)
        subj_dict[dname] = subj_uid
        nidm_add_session_person(g,subj_uid, dname)
    
        ###VISIT COLLECTION TAKES PLACE OF THIS CODE....####
        #create session activity and collection
        #create an investigation collection / investigation activity
        #collectid = "SessionCollection_" + str(uuid.uuid1())
        
        #key session collection by subjectID+visit_number
        #session_dict[dname + "_" + tokens[2]] = collectid
        #activityid = "SessionActivity_"+str(uuid.uuid1())
        #g.add((nidash[collectid], rdf.RDF.type, prov["Collection"]))
        #g.add((nidash[activityid], rdf.RDF.type, prov["Activity"]))
        
        #associate person with these session collections/activities
        #g.add((nidash[collectid], prov["wasAttributedTo"], nidash[subj_uid]))
        #g.add((nidash[activityid], prov["wasAssociatedWith"], nidash[subj_uid]))

        #now start traversing, next level is visit level
      
        for visitname in os.listdir(os.path.join(root_dir,dname)):
            if ((os.path.isdir(os.path.join(root_dir,dname,visitname))) and (not visitname.startswith('.'))):
                #next sub dirs are visits
                #parse visit info from directory name
                tokens=visitname.split("__")
                #token[0] = visitname, token[1] = siteid, token[2] = visit number
                #add visit entity and associate wtih session collection
                #visit unique id is "Visit_"+visit number+visit name + subjectID
                visitid = "AcquisitionCollection_" + str(tokens[2]) + "_" + str(tokens[0]) + "_" + dname
                visit_dict[dname + "_" + tokens[2]] = visitid
                g.add((nidash[visitid], rdf.RDF.type, prov["Entity"]))
                g.add((nidash[visitid], rdf.RDF.type, nidash["Session"]))
                g.add((nidash[visitid], ncit["VisitNum"], rdf.Literal(tokens[2])))
                g.add((nidash[visitid], ncit["StudySiteNumber"], rdf.Literal(tokens[1])))
                g.add((nidash[visitid], prov["label"], rdf.Literal(tokens[0])))
                #g.add((nidash[collectid], prov["hadMember"], nidash[visitid]))
                #visit collection is the "session" collection, attribute to subject
                g.add((nidash[visitid], prov["wasAttributedTo"], nidash[subj_dict[dname]]))
                
                #create scanner dictionary for use in scanners associated with series acquisitions
                scanner_dict={}
                
                #Study Level
                for studyname in os.listdir(os.path.join(root_dir,dname, visitname)):
                    #Skip Levels until Series Level...where do we put multiple studies in model?
                    if ((os.path.isdir(os.path.join(root_dir,dname,visitname,studyname))) and (not studyname.startswith('.'))):
                        for seriesname in os.listdir(os.path.join(root_dir,dname, visitname,studyname)):
                            if ((os.path.isdir(os.path.join(root_dir,dname,visitname,studyname,seriesname))) and (not seriesname.startswith('.'))):
                                #create acquisition activity
                                session_act_id = "AcquisitionActivity" + str(uuid.uuid1())     
                                g.add((nidash[session_act_id], rdf.RDF.type, prov["Activity"]))
                                
                                      
                                #get some metadata about the activity by parsing the DICOM tags
                                fname = os.path.join(root_dir,dname,visitname,studyname,seriesname,"Native","Original__0001","DICOM.tar.gz")
                                tar = tarfile.open(fname, "r:gz")
                                try:
                                    tar_contents = tar.getmembers()
                                    for files in tar_contents:
                                        if ".dcm" in files.name:
                                            dcmfilename = files.name
                                            #print dcmfilename + "found!"
                                            break
                                     
                                    dcmfile = tar.extractfile(dcmfilename)
                                        
                                    #for member in tar.getmembers():
                                    #    print member
                                    #now get DICOM header tags
                                    ds = dcm.read_file(dcmfile)
                                    #add acquisition time to activity object
                                    g.add((nidash[session_act_id], prov["startedAtTime"], rdf.Literal(ds[0x0008,0x0031].value)))
                                    #create scanner unless it already exists then just associate with acquisition object
                                    scanner_key = str(ds[0x0008, 0x0070].value)+ "_"+str(ds[0x0008,0x1090].value)+"_"+str(ds[0x0018,0x0087].value)
                                    if scanner_dict.has_key(scanner_key):
                                        #associate with acquisition activity and scan entity
                                        g.add((nidash[session_act_id], prov["wasGeneratedBy"],nidash[scanner_dict[scanner_key]] ))
                                        
                                    else:
                                        #create scanner agent
                                        #key is manufacturer+model+field_strength
                                        scanner_id = scanner_key + str(uuid.uuid1())
                                        scanner_dict[scanner_key] = scanner_id
                                        nidm_add_scanner(g, scanner_id, ds)
                                        #associate scanner agent with acquisition activity
                                        g.add((nidash[session_act_id], prov["Used"], nidash[scanner_id]))
                                        #create scan entity
                                        session_id = "AcquistionObject_" + str(uuid.uuid1())
                                        g.add((nidash[session_id], rdf.RDF.type, prov["Entity"]))
                                        #add some metadata....what metadata do we add?
                                        nidm_add_dicom_metadata(g, session_id, ds)
                                        #here we need to annotate acquisition object as an anatomical or structural
                                        #scan.... 
                                        #for FBIRN the directory names will tell us but for other data sets
                                        #we should match the DICOM tags to a classification scheme
                                        if ( ((seriesname.lower().find("t1"))>0 ) or ((seriesname.lower().find("t2"))>0)):
                                            #add attribute for anatomical scan
                                            g.add((nidash[session_id], rdf.RDF.type, nidash["MRAnatomical"])) 
                                        else:
                                            g.add((nidash[session_id], rdf.RDF.type, nidash["MRFunctional"])) 
                             
                                        
                                        #add filename/location to entity
                                        g.add((nidash[session_id], nfo["filename"], rdf.Literal("DICOM.tar.gz")))
                                        g.add((nidash[session_id], prov["atLocation"], rdf.Literal(os.path.join(dname,visitname,studyname,seriesname,"Native","Original__0001"))))
                                        
                                        #create association with anatomical activity
                                        g.add((nidash[session_id], prov["wasGeneratedBy"], nidash[session_act_id]))
                                except (tarfile.TarError,tarfile.ReadError,tarfile.CompressionError,tarfile.StreamError,tarfile.ExtractError,tarfile.HeaderError):
                                    print "Error opening DICOM file"+dcmfile
                                   
                                    
                                
                                        
                                        
                                              
                                        
                                
                          
                

#Assessment modeling

In [14]:
#HID assessments are downloaded with filenames "data_download_[AssessmentName].csv" where [AssessmentName] matches the 
#name of the assessment in the data dictionary files.
#For FBIRN phase III there are CMINDS assessments along with HID/tablet clinical assessments.  For the CMINDS assessments
#each assessment has a separate XLS file with the data dictionary.  For the HID/tablet clinical assessments, all
#assessments are in the same data dictionary XLS file...so there will be some complexity in modeling these...


#Since we are going to store the data dictionaries in a separate TTL file, create a new graph
#dd_graph = rdf.Graph()
#changed to use the same file as the rest of the experiment data
dd_graph = g
#bind namespaces
dd_graph.bind('nidash', nidash)
dd_graph.bind('prov', prov)
dd_graph.bind('ncit', ncit)
dd_graph.bind('nidash', nidash)
dd_graph.bind('fbirn', fbirn)

#Find data_download_[*].csv and loop through them
data_dictionary_dir = "./HID_DataDictionary"
#Added acquired assessment directory for code clarity
assessment_dir = root_dir

#create dictionary of assessment UIDs
assessment_dict = {}
acquired_assessment_dict = {}

#create dictionary of value set, coded properties
coded_property_dict = {}

#non-coded property variables
assessment_variables_dict = {}

#for dd_file in os.listdir(os.path.join(root_dir, "data_download*")):
for dd_file in glob(os.path.join(root_dir, "data_download*")):
    datadic_file = ""
    assessment_name = ""

    #do data dictionary modeling first so parse assessment name from dd_file and see if it's one of the CMINDS tests
    tokens=dd_file.split("_")
    #tokens[1]==data, tokens[2]==download, tokens[3]==assessment_name.csv
    #strip off .csv
    tokens[3] = re.sub('\.csv$','', tokens[3])

    #look for matching file in data_dictionary_dir
    for datadic in glob(os.path.join(data_dictionary_dir,"*")):
        #note, CMINDS assessment data dictionaries are stored in separate files named by the assessment vs. FBIRN clinical
        #data collected on the tablet are all stored in the same data dictionary file....so if we can't find a filename
        #matching tokens[3] then it's not a CMINDS assessment
        if (datadic.find(tokens[3])!= -1):
            
            #check if assessment name has already been encountered, if so use existing ID
            #if assessment_dict.has_key("CMINDS_"+ tokens[3]):
            #    assessment_id = assessment_dict["CMINDS_"+ tokens[3]]
            
            #Removed "CMINDS" string from these assessment keys (see previous 2 lines)...decided not to differentiate
            #CMINDS assessments from tablet-based clinical assessments.  We could do this using an agent which is
            #probably the sematically correct way since they were collected using different devices.
            if assessment_dict.has_key(tokens[3]):
                assessment_id = assessment_dict[tokens[3]]
            
            else:
                #print "found matching data dictionary: " + datadic
                #assessment_id = "CMINDS_"+ tokens[3] + "_" + str(uuid.uuid1())
                #assessment_id = "CMINDS_"+ tokens[3]
                #assessment_dict["CMINDS_"+ tokens[3]] = assessment_id
                
                #this doesn't need to be a dictionary anymore since we ditched the UUIDs.  This dictionary
                #was used to map assessment name -> assessment name + UUID
                assessment_id = tokens[3]+"DataStructure"
                assessment_dict[tokens[3]] = assessment_id
            
               
            #create assessment entity
            nidm_create_assessment(dd_graph, assessment_id)
            
            
            #now open CMINDS data dictionary and start modeling
            #Note, CMINDS data dictionaries are 1 sheet, no value-sets for these
            xls = pd.ExcelFile(datadic)
            #3rd sheet is summary variables
            variables = xls.parse(2)
            #iterate over the variables in the data dictionary
            for index, row in variables.iterrows():
                #add elements to RDF graph for variable definitions
                nidm_add_elements(dd_graph,row,assessment_id, "Variable Name", "Description", "Valid_Values") 
                #store assessment variables in dictionary
                assessment_variables_dict[row["Variable Name"]] =row["Variable Name"] 
                
            
                  
        else:
            #assessment not a CMINDS assessment so parse data dictionary stuff from HID PhaseIII data dictionary
            xls = pd.ExcelFile(os.path.join(data_dictionary_dir,"FBIRN_PhaseIII_Assessment_DataDictionary_20110330.xls"))
            variables = xls.parse(0)
            value_sets = xls.parse(1)
            #iterate over the variables in the data dictionary
            for index, row in variables.iterrows():
            
                #check if assessment name has already been encountered, if so use existing ID
                if assessment_dict.has_key(tokens[3]):
                    assessment_id = assessment_dict[tokens[3]]
                else:
                    #print "found matching data dictionary: " + datadic
                    #assessment_id = tokens[3] + "_" + str(uuid.uuid1())
                    
                    #this doesn't need to be a dictionary anymore since we ditched the UUIDs.  This dictionary
                    #was used to map assessment name -> assessment name + UUID
                    assessment_id = tokens[3]+"DataStructure"
                    assessment_dict[tokens[3]] = assessment_id
            
                #create assessment entity
                nidm_create_assessment(dd_graph, assessment_id)
            
                
                #if assessment name in column A matches the assessment name in the data_download_[assessment name].csv
                #filename then we'll start parsing the value set information from the other columns
                if (row['Assessment Name'].find(tokens[3])!=-1):
                    #print "Found match: " + tokens[3] + " in " + row['Assessment Name']
                    #add elements to RDF graph for variable definitions
                    nidm_add_elements(dd_graph,row,assessment_id, "Data ID", "Question Text","") 
              
                    #store assessment variables in list
                    assessment_variables_dict[row["Data ID"]] = row["Data ID"]

            
                    #look for value sets in data dictionary that match the assessment name+variable name
                    query_sets = value_sets[value_sets["Data ID"].str.contains(row["Data ID"])]
                    #iterate over the value set rows and add to RDF graph
                    for query_index, query_row in query_sets.iterrows():
                        #print query_row['SCORECODE'] + "," + query_row['SCORELABEL'] 
                        #add attribute to coded data element for valueset
                        #valueset_id = assessment_id+"_"+row["Data ID"]+"_ValueSet"
                        valueset_id = row["Data ID"]+"_ValueSet"
                        dd_graph.add((nidash[row["Data ID"]], nidash["ValueSet"], nidash[valueset_id]))
                        #create value set collection
                        dd_graph.add((nidash[valueset_id], rdf.RDF.type, prov["Collection"]))
                        dd_graph.add((nidash[valueset_id], rdf.RDF.type, nidash["ValueSet"]))
                        nidm_add_codedproperty(dd_graph, query_row, valueset_id ,row["Data ID"] + "_" + str(query_row["SCORECODE"]), "SCORECODE","SCORELABEL" )
                        #This if for finding a coded property URL from the acquired data.  If the assessment variable
                        #is a coded-property then in the acquired assessment data entity instead of using the code literal 
                        #we use the coded-property URL by searching this dictionary 
                        coded_property_dict[row["Data ID"]] = str(query_row["SCORECODE"])
    
    #print"Coded Property Dict"
    #print coded_property_dict
    
    #print "Assessments variables list"
    #print assessment_variables_list
    
    #parse/model measured assessments
    input_file = csv.DictReader(open(dd_file))
    
    #dictionary keys to skip during assessment data modeling
    nondata_keys = ['SubjectID', 'SiteID', 'ExperimentID', 'VisitID', 'SegmentID']
    
    #print coded_property_dict
    
    #figure out which experiment, subj, site, and visit each row corresponds to and model assessment data
    #match with assessment from data dictionary
    if assessment_dict.has_key(tokens[3]):
        assessment_id = assessment_dict[tokens[3]]
    else:
        #if this happens we won't be able to link the assessment data to a data dictionary class
        #so let's just create an ad-hoc assessment in the data dictionary with no items unfortunately.  We could
        #create an empirical data dictionary from the collected assessment data as we did with the SimpleData 
        #example: https://github.com/incf-nidash/nidm/tree/master/nidm/nidm-experiment/simpledata_example_ohbm_hack_2016
        assessment_id = tokens[3]+"DataStructure"+"_"+str(uuid.uuid1())
        nidm_create_assessment(dd_graph, assessment_id)
        print "ERROR: Assessment data found with no matching data dictionary class!!"
        print "Creating assessment object in data dictionary with no elements. Consider creating empirical data dictionary!"

 
    #each row is a subject observation key'd by subjID, siteID, ExperimentID, and VisitID
    for line in input_file:
        #create assessment collection activity for each subject/project/visit/assessment
        activity_uri = "AcquisitionActivity_"+line["SubjectID"]+"_"+str(line["VisitID"]).rjust(4,'0')
        #add activity to graph
        g.add((nidash[activity_uri], rdf.RDF.type, prov["Activity"]))
        #add label for debugging/model evaluation
        g.add((nidash[activity_uri],prov["label"], rdf.Literal(" Assessment Data Collection Activity")))
        
        #match data to correct experiment 
        if exp_collection_dict.has_key(str(line["ExperimentID"])):
            #print "Found Experiment Collection Object"
            #associate acquired assessment with experiment acquisition activity
            g.add((nidash[activity_uri], prov["wasAssociatedWith"], nidash[exp_collection_dict[str(line["ExperimentID"])]]))
        #associate activity with subject agent
        if subj_dict.has_key(str(line["SubjectID"])):
            #associated activity with subject
            g.add((nidash[activity_uri], prov["wasAssociatedWith"], nidash[subj_dict[str(line["SubjectID"])]]))
        else:
            #create subject
            subj_uid = str(line["SubjectID"])
            subj_dict[str(line["SubjectID"])] = subj_uid
            nidm_add_session_person(g,subj_uid, subj_uid)
    
        #associate activity with Visit
        if visit_dict.has_key(str(line["SubjectID"]) + "_" + str(line["VisitID"]).rjust(4,'0')):
            #print "Found Visit Collection Object"
            #associate acitivity with visit
            g.add((nidash[activity_uri], prov["wasAssociatedWith"], nidash[visit_dict[str(line["SubjectID"]) + "_" + str(line["VisitID"]).rjust(4,'0')]]))
        else:
            #if not session_dict.has_key(str(line["SubjectID"]) + "_" + str(line["VisitID"]).rjust(4,'0')):
                #create a session for this visit
            #    collectid = "SessionCollection_" + str(uuid.uuid1())
            #    session_dict[str(line["SubjectID"]) + "_" + str(line["VisitID"]).rjust(4,'0')] = collectid
            
            #add visit to session collection
            #add a visit collection for this visit
            visitid = "AcquisitionCollection_" + str(line["VisitID"]).rjust(4,'0') + "_" + "AssessmentAcquisition" + "_" + str(line["SubjectID"])
            visit_dict[str(line["SubjectID"]) + "_" + str(line["VisitID"]).rjust(4,'0')] = visitid
            g.add((nidash[visitid], rdf.RDF.type, prov["Entity"]))
            g.add((nidash[visitid], rdf.RDF.type, nidash["Session"]))
            g.add((nidash[visitid], ncit["VisitNum"], rdf.Literal(line["VisitID"])))
            g.add((nidash[visitid], ncit["StudySiteNumber"], rdf.Literal(line["SiteID"])))
            g.add((nidash[visitid], prov["label"], rdf.Literal("Assessment Acquisition Visit")))
            g.add((nidash[visitid], rdf.RDF.type, nidash["Assessment"]))
        
            #g.add((nidash[session_dict[str(line["SubjectID"]) + "_" + str(line["VisitID"]).rjust(4,'0')]], prov["hadMember"], nidash[visitid]))
            #associate subject with visit collection
            #visit collection is the "session" collection, attribute to subject
            g.add((nidash[visitid], prov["wasAttributedTo"], nidash[subj_dict[str(line["SubjectID"])]]))
                
            #associate assessment acquisition acitivity with visit
            g.add((nidash[activity_uri], prov["wasAssociatedWith"], nidash[visit_dict[str(line["SubjectID"]) + "_" + str(line["VisitID"]).rjust(4,'0')]]))

        #create an entity for the assessment data
        acquired_assessment_id = tokens[3] + "_" + line["SubjectID"] + "_" + str(line["VisitID"]).rjust(4,'0')
        nidm_create_assessment_acquisition_object(g, acquired_assessment_id, assessment_id)
        #add assessment items to entity but need to skip subjID, siteID, experimentID, and visitID columns
        
        for items in input_file.fieldnames:
            #if this key is not one of the non-data columns (see nondata_keys above) then it's data from the 
            #assessment
            if items not in nondata_keys:
                #check if variable_codedvalue is a coded property, if so pass the coded property URL instead of the actual value
                #find assessment item variable name from compound assessmentname_variablename patterns in 
                #acquired assessment download files
                #found=0
                for key in coded_property_dict.keys():
                    if (items.find(key) > 0):
                        var_name_indx  = items.find(key)
                        #add data to our entity where value is the URL of the coded-property
                        
                        nidm_add_elements_assessment_acquisition_object(g, acquired_assessment_id, items[var_name_indx:], nidash[items[var_name_indx:]] + "_" + safe_uri(coded_property_dict[key]))
                        #found = 1
                        #print "items = " + items + " ,var_name_indx = " + str(var_name_indx) + " , key= " + key + " ,items[var_name_indx:] = " + items[var_name_indx:] + \
                        #" , found = " + str(found)
                     
                        break
                else:
                    #this is not a coded property so just get variable name without extra compound name
                    #stuff that appears in the acquired assessment download file
                    for name in assessment_variables_dict.keys():
                        if (items.find(name) > 0):
                            #print "non-coded property: " + name
                            #name is the variable name 
                            #add the data to our entity
                            nidm_add_elements_assessment_acquisition_object(g, acquired_assessment_id, name, line[items])
        
        #add association with activity
        g.add((nidash[acquired_assessment_id], prov["wasGeneratedBy"], nidash[activity_uri]))

In [15]:
#print g.serialize(format='turtle')

In [16]:
with open("FBIRNPhaseIII_Experiment.ttl",'w') as f:
    f.write(g.serialize(format='turtle'))
#with open("FBIRNPhaseIII_DataDictionary.ttl",'w') as f:
#    f.write(dd_graph.serialize(format='turtle'))


In [17]:
from rdflib.tools import rdf2dot
with open("FBIRNPhaseIII_Experiment.dot",'w') as f:
    s = rdf2dot.rdf2dot(g, f)
#with open("FBIRNPhaseIII_DataDictionary.dot",'w') as f:
#    s = rdf2dot.rdf2dot(dd_graph, f)


UnicodeEncodeError: 'ascii' codec can't encode character u'\u2013' in position 461: ordinal not in range(128)