# OMOP to Phenopackets Conversion



In [1]:
import pymssql
import convertPheno
import time

## SQL Database Connection
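Replace the placeholders below with the appropriate server connection details and database name. **All connection inputs to be changed are within this block**; the remaining inputs (database prefixes, person IDs, file paths, user name) are set in the *Input Variables* block that follows.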

In [2]:
# Connection settings for the SQL Server instance (replace every placeholder)
server = 'INPUT SERVER DETAILS'
user = 'INPUT USER DETAILS'
password = 'INPUT PASSWORD'
database = 'INPUT DATABASE NAME'

# Open the connection, then obtain the cursor through which the queries
# in the cells below are executed
conn = pymssql.connect(
    server=server,
    user=user,
    password=password,
    database=database,
)
cur = conn.cursor()

# Input Variables

In [3]:
# Prefix (database + schema) of the tables storing patient data
db = 'DATABASENAME.dbo.'

# Prefix (database + schema) of the tables storing OHDSI vocabulary data
ohdsi_db = 'OHDSI_DATABASENAME.dbo.'

# Parenthesised, comma separated list of person_ids (any number of pids >= 1)
pid = '(123456,123457,123458)'

# Location of the semantic type mapping file
sem_mapping_file = '../data/semantic_type_map.csv'

# User name, passed to the Phenopacket metadata
name = 'INPUT USER NAME'

# Prefix prepended to each output file name when the Phenopacket JSONs are written
output_path = '/FILE PATH TO STORE PHENOPACKETS/'

# Phenopacket Creation

## Domain Transformation

The result of this block is a dictionary for each domain (*Individual, Condition, PhenotypicFeature, Measurement, Treatment, Procedure*). Each dictionary's keys are person IDs, and their values are a list of entries for that entity. This block also produces logging for experimental evaluation. 

For each domain, four functions are executed: 
* *get_DOMAIN_query* extracts the OMOP SQL data 
* *parse_DOMAIN* converts the SQL records to the fields specified by Phenopackets documentation
* *createDictDOMAIN* transforms data types and values, and formats the data according to the Phenopackets specification
* *createPhenoDOMAIN* converts each individual's domain dictionaries to Phenopacket objects

Special notes:
* **Individual**: this entity combines data extracted from both the *individual* and the *vital_status* queries. The output for the Individual entity is a dictionary whose values are themselves Individual dictionaries (as opposed to the other domains, whose values are lists), given that the subject data has only one entry for each field. 
* **Condition and PhenotypicFeature**: *parse_Conditions* implements semantic type filtering and provides two outputs. The second output is combined with the extracted data from the *PhenotypicFeature* query. 





In [None]:
# Start the clock: every progress message below reports the minutes elapsed
# since this point (the timer is not reset between domains)
t1 = time.time()

# --- Individual and Vitals ---
# Two queries feed the Individual entity: the individual data and the vital status
cur.execute(convertPheno.get_individual_query(pid, db))
records = cur.fetchall()
mydict = convertPheno.parse_Individual(records)

cur.execute(convertPheno.get_vitalstatus_query(pid, db))
records = cur.fetchall()
vsdict = convertPheno.parse_VitalStatus(records)

idict_all = convertPheno.createDictIndividual(mydict, vsdict)

# One Phenopacket Individual per key of idict_all
indiv_phenos = {
    person: convertPheno.createPhenoIndividual(entry)
    for person, entry in idict_all.items()
}

ellapsed_time = (time.time() - t1) / 60
print(f'Individual data extracted and transformed - {ellapsed_time:.01f} min')

# --- Conditions ---
# The semantic type mapping is handed to parse_Conditions, which returns two
# outputs; the second one (phedict1) is reused in the PhenotypicFeatures step
phefeatures = convertPheno.get_sem_mapping(sem_mapping_file)

cur.execute(convertPheno.get_condition_query(pid, db, ohdsi_db))
records = cur.fetchall()
condict, phedict1 = convertPheno.parse_Conditions(records, phefeatures)
conlist = convertPheno.createListDictConditions(condict)

condition_phenos = {
    person: convertPheno.createPhenoConditions(entries)
    for person, entries in conlist.items()
}

ellapsed_time = (time.time() - t1) / 60
print(f'Conditions data extracted and transformed - {ellapsed_time:.01f} min')

# --- PhenotypicFeatures ---
cur.execute(convertPheno.get_phenofeature_query(pid, db, ohdsi_db))
records = cur.fetchall()
phedict2 = convertPheno.parse_PhenoFeatures(records)

# Features come from two sources: the second output of parse_Conditions and
# the PhenotypicFeature query; both are formatted, then merged
phelist1 = convertPheno.createListDictPhenoFeature(phedict1, flag='condition')
phelist2 = convertPheno.createListDictPhenoFeature(phedict2, flag='observation')
phelist = convertPheno.combineDicts(phelist1, phelist2)

feature_phenos = {
    person: convertPheno.createPhenoFeature(entries)
    for person, entries in phelist.items()
}

ellapsed_time = (time.time() - t1) / 60
print(f'PhenotypicFeature data extracted and transformed - {ellapsed_time:.01f} min')

# --- Measurement ---
cur.execute(convertPheno.get_measurement_query(pid, db, ohdsi_db))
records = cur.fetchall()
mesdict = convertPheno.parse_Measurements(records)
meslist = convertPheno.createListDictMeasurements(mesdict)

measurement_phenos = {
    person: convertPheno.createPhenoMeasurement(entries)
    for person, entries in meslist.items()
}

ellapsed_time = (time.time() - t1) / 60
print(f'Measurement data extracted and transformed - {ellapsed_time:.01f} min')

# --- Treatment ---
cur.execute(convertPheno.get_treatment_query(pid, db, ohdsi_db))
records = cur.fetchall()
txdict = convertPheno.parse_Treatments(records)
txlist = convertPheno.createListDictTreatment(txdict)

# Key is the PID; value is a list of
# phenopackets.schema.v2.core.medical_action_pb2.Treatment entities
treatment_phenos = {
    person: convertPheno.createPhenoTreatment(entries)
    for person, entries in txlist.items()
}

ellapsed_time = (time.time() - t1) / 60
print(f'Treatment data extracted and transformed - {ellapsed_time:.01f} min')

# --- Procedure ---
cur.execute(convertPheno.get_procedure_query(pid, db, ohdsi_db))
records = cur.fetchall()
procdict = convertPheno.parse_Procedures(records)
proclist = convertPheno.createListDictProcedures(procdict)

# Key is the PID; value is a list of
# phenopackets.schema.v2.core.medical_action_pb2.Procedure entities
procedure_phenos = {
    person: convertPheno.createPhenoProcedure(entries)
    for person, entries in proclist.items()
}

ellapsed_time = (time.time() - t1) / 60
print(f'Procedure data extracted and transformed - {ellapsed_time:.01f} min')

## Phenopacket Generation
* Generate metadata for the given Phenopacket
* Generate the overall Phenopacket, combining domains
* Write the Phenopacket to JSON

In [None]:
# Restart the clock for the generation stage
t1 = time.time()

# A single metadata object is built once and shared by every Phenopacket
meta_data = convertPheno.createMetadata(name)

count_pids = 0

# The loop variable is deliberately NOT named `pid`: `pid` holds the
# user-supplied person_id list string that the domain queries are built from,
# and rebinding it here would corrupt those queries if that cell is re-run.
for person_id, subject in indiv_phenos.items():

    # Treatments and procedures are optional per person; absent ones are passed as None
    medicalactpheno = convertPheno.createPhenoMedicalAction(
        txpheno=treatment_phenos.get(person_id),
        procpheno=procedure_phenos.get(person_id),
    )

    # Combine the domains; a domain with no entry for this person is passed as None
    pheno = convertPheno.createPheno(
        myid=str(person_id),
        meta_data=meta_data,
        subject=subject,
        phenotypic_features=feature_phenos.get(person_id),
        measurements=measurement_phenos.get(person_id),
        diseases=condition_phenos.get(person_id),
        medical_actions=medicalactpheno if len(medicalactpheno) > 0 else None,
    )

    # Write Phenopacket: <output_path>phenopacket_<YYYYMMDD>_<person id>.json
    outputfile = output_path + "phenopacket_" + time.strftime("%Y%m%d") + '_' + str(person_id) + '.json'
    pheno_json = convertPheno.MessageToJson(pheno)
    # Explicit encoding so the JSON file does not depend on the platform default
    with open(outputfile, 'w', encoding='utf-8') as of:
        of.write(pheno_json)
    print(f'{person_id}: phenopacket written in file {outputfile}')

    # Tracking time: report progress every 10 Phenopackets
    count_pids += 1
    ellapsed_time = (time.time() - t1) / 60
    if count_pids % 10 == 0:
        print(f'{count_pids} - {ellapsed_time:.01f} min')