In [1]:
import csv
import pandas as pd
import re
import numpy as np
import urllib.parse
import requests

In [2]:
# Load the original PANGAEA unit -> UCUM mappings. header=0 discards the file's
# own header row so the two columns take the names given here.
dfori = pd.read_csv(
    'data/pangaea_mappings_original.csv',
    sep=',',
    header=0,
    names=['unit', 'ucum'],
    encoding='utf-8',
)

In [3]:
# Tag every row with its origin so rows stay traceable once the sources are combined.
# (The bare `dfori.shape` that used to precede this line was not the cell's last
# expression, so it displayed nothing and has been removed.)
dfori['source'] = 'pangaea'

In [4]:
# Preview the first rows of the PANGAEA mappings, now carrying the 'source' column.
dfori.head()

Unnamed: 0,unit,ucum,source
0,#,n,pangaea
1,(K2)m/W,m2.K/W,pangaea
2,(\\(eq\\)),{eq},pangaea
3,(eq),{eq},pangaea
4,AD,{AD},pangaea


In [5]:
# (rows, columns) of the PANGAEA mapping table.
dfori.shape

(174, 3)

In [6]:
# Load the tab-separated concept table and label its rows with the 'fhir' source tag.
dfext = pd.read_csv('data/concepts.tsv', delimiter='\t', header=0, encoding='utf-8')
dfext = dfext.assign(source='fhir')
dfext.head(1)

Unnamed: 0,Code,Descriptive Name,Code System,Definition,Date Created,Synonym,Status,Kind of Quantity,Date Revised,ConceptID,Dimension,Unnamed: 11,source
0,%,Percent [Most Common Healthcare Units],PH_UnitsOfMeasure_UCUM_Expression,,07/01/2004,%,Active,Most Common Healthcare Units,12/08/2005,Percent,1,,fhir


In [7]:
# (rows, columns) of the concept table before it is narrowed to the mapping columns.
dfext.shape

(343, 13)

In [8]:
# Keep only the synonym (source notation), its UCUM code and the origin label.
wanted_columns = ['Synonym', 'Code', 'source']
dfext = dfext.loc[:, wanted_columns]

In [9]:
# Preview the narrowed concept table (Synonym, Code, source).
dfext.head()

Unnamed: 0,Synonym,Code,source
0,%,%,fhir
1,/uL,/uL,fhir
2,IU/L,[iU]/L,fhir
3,"K/uL, x10^3/mm^3",10*3/uL,fhir
4,"M/uL, x10^6/mm^3",10*6/uL,fhir


In [10]:
# Align the concept table's column names with the PANGAEA table, then stack the
# two frames under a fresh 0..n-1 index.
# NOTE(review): some Synonym values hold several comma-separated notations in one
# string (e.g. "K/uL, x10^3/mm^3"); they are kept as a single unit here — confirm
# that this is intended.
fhir_mappings = dfext.rename(columns={'Synonym': 'unit', 'Code': 'ucum'})
df = pd.concat([dfori, fhir_mappings], ignore_index=True)
df.shape

(517, 3)

In [11]:
# Rows that exactly repeat an earlier row (same unit, ucum and source).
df.loc[df.duplicated(keep='first')]

Unnamed: 0,unit,ucum,source
209,%,%,fhir
312,pg,pg,fhir
370,fL,fL,fhir
403,mmol/kg,mmol/kg,fhir
456,pg/mL,pg/mL,fhir
457,ng/mL,ng/mL,fhir
459,ug/mL,ug/mL,fhir
461,g/mL,g/mL,fhir
464,ug/dL,ug/dL,fhir
466,mg/dL,mg/dL,fhir


In [12]:
# Quick look at the first two rows of the combined table.
df.head(2)

Unnamed: 0,unit,ucum,source
0,#,n,pangaea
1,(K2)m/W,m2.K/W,pangaea


In [13]:
# Another external mapping file: ISO+ unit notation -> UCUM (IsoToUcum).
# read_excel has no `encoding` parameter; current pandas raises TypeError when
# one is passed, so the argument is dropped.
dfext2 = pd.read_excel('data/IsoToUcum.xlsx')
dfext2['source'] = 'hl7'
dfext2.head()

Unnamed: 0,ISO+,UCUM,source
0,(arb_u),[arb'U],hl7
1,(bdsk_u),[bdsk'U],hl7
2,(bsa),{bsa},hl7
3,(cal),cal,hl7
4,(cfu),{cfu},hl7


In [14]:
# Bring the ISO+ table onto the shared (unit, ucum, source) layout and append it
# to the combined frame under a fresh 0..n-1 index.
hl7_mappings = dfext2.rename(columns={'ISO+': 'unit', 'UCUM': 'ucum'})
df_final = pd.concat([df, hl7_mappings], ignore_index=True)
df_final.shape

(613, 3)

In [15]:
# Quick look at the first two rows of the table holding all three sources.
df_final.head(2)

Unnamed: 0,unit,ucum,source
0,#,n,pangaea
1,(K2)m/W,m2.K/W,pangaea


In [16]:
# How many rows exactly repeat an earlier row across the combined sources.
df_final.loc[df_final.duplicated(keep='first')].shape

(20, 3)

In [17]:
# Drop exact repeats, keeping the first occurrence of each row.
df_final = df_final.drop_duplicates(keep='first')
df_final.shape

(593, 3)

In [18]:
# Remove every mapping whose unit begins with a slash.
starts_with_slash = df_final['unit'].astype(str).str.startswith('/')
df_final = df_final.loc[~starts_with_slash]

In [19]:
# Drop identity mappings: a row whose unit already equals its UCUM code adds nothing.
df_final = df_final.loc[df_final['unit'].ne(df_final['ucum'])]
df_final.shape

(439, 3)

In [20]:
# Sanity check: one UCUM code may be the target of many units, but a unit must
# never map to more than one UCUM code, so this selection should come back empty.
df_final.loc[df_final['unit'].duplicated()]

Unnamed: 0,unit,ucum,source


In [21]:
# Preview the filtered mapping table.
df_final.head()

Unnamed: 0,unit,ucum,source
0,#,n,pangaea
1,(K2)m/W,m2.K/W,pangaea
2,(\\(eq\\)),{eq},pangaea
3,(eq),{eq},pangaea
4,AD,{AD},pangaea


In [22]:
# Spot check: look up the mapping recorded for the unit 'sec'.
df_final.loc[df_final['unit'] == 'sec']

Unnamed: 0,unit,ucum,source
334,sec,s,fhir


In [23]:
# Order the mappings by unit, descending, and show the first ten rows.
df_final = df_final.sort_values(by='unit', ascending=False)
df_final.head(n=10)

Unnamed: 0,unit,ucum,source
173,Å,Ao,pangaea
172,µtorr,u[Torr],pangaea
171,µg-at,n{molecule},pangaea
170,µ,u,pangaea
169,±,{StandardDeviation},pangaea
168,°C,Cel,pangaea
167,°,deg,pangaea
342,yr,a,fhir
166,years,a,pangaea
165,year,a,pangaea


In [24]:
# (rows, columns) of the sorted table; sorting must not change the row count.
df_final.shape

(439, 3)

In [25]:
from datetime import datetime

# Build an output path stamped with today's date (YYYYMMDD) so that runs on
# different days do not overwrite each other's export.
datestring = datetime.now().strftime('%Y%m%d')
f = 'data/pangaea_mappings_{}.csv'.format(datestring)
f

'data/pangaea_mappings_20181022.csv'

In [26]:
# Write the mappings as bare data rows: no header line and no index column.
df_final.to_csv(
    f,
    header=False,
    index=False,
)

In [27]:
# Disabled (kept for reference): export only the PANGAEA units that do not
# appear in either external source.
#external_units = list(set(dfext.Synonym.tolist()+dfext2['ISO+'].tolist()))
#len(external_units), dfori.unit.nunique()
#df_no = dfori[~dfori['unit'].isin(external_units)]
#df_no.to_csv('pangaea_mappings_ori.csv', index=False, header=False)

Test all UCUM mappings

In [28]:
# Endpoint used to validate formatted units through the UCUM service.
HOST = 'http://localhost:8080'
UCUM_SERVICE_VALIDATE = HOST + '/pucum/v1/api/validate/'

# Distinct UCUM codes from the final mapping table; each is validated once.
ucumlist = list(df_final['ucum'].unique())

In [29]:
# Show every distinct UCUM code that will be sent to the validation service.
ucumlist

['Ao',
 'u[Torr]',
 'n{molecule}',
 'u',
 '{StandardDeviation}',
 'Cel',
 'deg',
 'a',
 '[syd_i]',
 '[cyd_i]/min',
 '[cyd_i]',
 '[yd_i]',
 '10*8',
 '10*6/{Specimen}',
 '10*6',
 '10*5',
 '10*3.{RBC}',
 '10*3',
 'wk',
 '{v}',
 'm[Torr]',
 'umol/mol{Cre}',
 'umol/mg{Cre}',
 'umol/g{Hgb}',
 'umol/g{Cre}',
 'u[iU]',
 'ug/mg{Cre}',
 'ug/m2',
 'ug/h',
 'ug/g{Hgb}',
 'ug/g{DryWeight}',
 'ug/g{Cre}',
 'ug/dL{rbc}',
 'ug/{TotalVolume}',
 'ug/{Specimen}',
 'ug/(100.g)',
 'ug/(kg.h)',
 'ug/(8.h.kg)',
 'ueq/L',
 'u[iU]/mL',
 '[tsp_us]',
 '[oz_tr]',
 '[Torr]',
 't/[cyd_i]',
 't/[yd_i]',
 '[ston_av]/h',
 't',
 '10*-1',
 '[tbs_us]',
 '{tablet}',
 'S',
 's',
 "[arb'U]",
 'm{rmcd}',
 'REM',
 '[rd_us]',
 '{quanta}',
 '[qt_us]',
 '[pt_us]',
 '[psi]',
 '{protein}',
 '[pptr]{v}',
 '[pptr]',
 '[ppm]{v}',
 '[ppm]',
 '[ppb]{v}',
 '{polyp}',
 '[pk_us]',
 'n{pixel}',
 '[pt_br]',
 '{photons}',
 '[phi]',
 'Pa',
 'pA',
 '[pMC]',
 '[pH]',
 '[oz_av]',
 'osm/kg',
 'osm/L',
 'osm',
 'Ohm ',
 'Oe',
 '[octa]',
 'nmol/mmo

In [30]:
# Send every distinct UCUM code to the validation endpoint and print the ones
# for which the service does not answer with HTTP 200.
for u in ucumlist:
    # NOTE(review): quote() leaves '/' unescaped by default, so a code such as
    # 'ug/h' adds an extra path segment — confirm the service route accepts that.
    q = UCUM_SERVICE_VALIDATE + urllib.parse.quote(u)
    resp = requests.get(q)
    # Check the status first and do not parse the body: the parsed JSON was never
    # used, and an error response without a JSON body would make resp.json()
    # raise, aborting the loop on exactly the codes it is meant to report.
    if resp.status_code != requests.codes.ok:
        print(u)