In [6]:
import csv
import pandas as pd
import numpy as np
import urllib.parse
import requests
import simplejson
import json
import re
import xml.etree.ElementTree as ET

In [7]:
class XML2DataFrame:

    def __init__(self, xml_data):
        self.root = ET.XML(xml_data)

    def parse_root(self, root):
        return [self.parse_element(child) for child in iter(root)]

    def parse_element(self, element, parsed=None):
        if parsed is None:
            parsed = dict()
        for key in element.keys():
            parsed[key] = element.attrib.get(key)
        if element.text:
            parsed[element.tag] = element.text
        for child in list(element):
            self.parse_element(child, parsed)
        return parsed

    def process_data(self):
        structure_data = self.parse_root(self.root)
        return pd.DataFrame(structure_data)


Read RH Pangea Quantites

In [8]:
root = ET.parse('data/quantities_pangaea.xml').getroot()
xmlstr = ET.tostring(root, encoding='utf8', method='xml')
xml2df = XML2DataFrame(xmlstr)
xml_df = xml2df.process_data()
xml_df.head()

Unnamed: 0,dimension,name,quantity,source,unit
0,L-1.Z,Abundance per Length/Frequency,\t,pan,n/m
1,L-2.Z,Individual Density,\t,pan,n/m2
2,L-3.Z,Abundance per Volume,\t,pan,n/m3
3,M-1.Z,Abundance per Mass,\t,pan,n/g
4,Z,Abundance or Count,\t,pan,n


In [9]:
xml_df = xml_df.drop(columns=['quantity', 'source'])
xml_df["Source"]= 'pangaea'
xml_df.shape

(25, 4)

In [10]:
xml_df= xml_df[['name','dimension','unit','Source']]
xml_df.head()

Unnamed: 0,name,dimension,unit,Source
0,Abundance per Length/Frequency,L-1.Z,n/m,pangaea
1,Individual Density,L-2.Z,n/m2,pangaea
2,Abundance per Volume,L-3.Z,n/m3,pangaea
3,Abundance per Mass,M-1.Z,n/g,pangaea
4,Abundance or Count,Z,n,pangaea


In [11]:
xml_df.shape

(25, 4)

QUDT

In [12]:
#validate formatted units through ucum service
HOST = 'http://localhost:8080'
UCUM_SERVICE_VALIDATE =HOST+'/pucum/v1/api/validate/'

In [13]:
df = pd.read_excel('data/QUDT_QuantityDimUnit.xlsx', encoidng="utf-8",headers=0, delimiter=",",converters={'Dimension':str,'Unit':str})
#df['Quantity']= df['Quantity'].astype(str)
#df['Dimension']= df['Dimension'].astype(str)
#df['UnitName']= df['UnitName'].astype(str)
#df['Unit']= df['Unit'].astype(str)

In [14]:
df['Source']='qudt'
df.head()

Unnamed: 0,Quantity,Dimension,UnitName,Unit,Source
0,Absolute Humidity,,,,qudt
1,Absorbed Dose,,Gray,Gy,qudt
2,Absorbed Dose,,Rad,RAD,qudt
3,Absorbed Dose Rate,L^2 T^-3,Gray per Second,Gy/s,qudt
4,Acceleration,,,,qudt


In [15]:
df[df.Quantity =="Absorbed Dose"]

Unnamed: 0,Quantity,Dimension,UnitName,Unit,Source
1,Absorbed Dose,,Gray,Gy,qudt
2,Absorbed Dose,,Rad,RAD,qudt


In [16]:
df[df.Quantity =="Angle"]

Unnamed: 0,Quantity,Dimension,UnitName,Unit,Source
12,Angle,,Radian,rad,qudt
13,Angle,,Arc Second,arcSec,qudt
14,Angle,,Arc Minute,arcMin,qudt
15,Angle,,Degree Angle,°,qudt
16,Angle,,Second Angle,,qudt
17,Angle,,Minute Angle,,qudt


In [17]:
df.dtypes, df.shape

(Quantity     object
 Dimension    object
 UnitName     object
 Unit         object
 Source       object
 dtype: object, (2401, 5))

In [18]:
#df.dropna(subset=['Dimension'], inplace=True)
# remove duplicate rows
#df = df.drop_duplicates().reset_index(drop=True)
#df.head()

In [19]:
df.shape,df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2401 entries, 0 to 2400
Data columns (total 5 columns):
Quantity     2401 non-null object
Dimension    2027 non-null object
UnitName     2332 non-null object
Unit         2049 non-null object
Source       2401 non-null object
dtypes: object(5)
memory usage: 93.9+ KB


((2401, 5), None)

In [20]:
df.nunique()

Quantity     234
Dimension    146
UnitName     785
Unit         541
Source         1
dtype: int64

In [21]:
#remove rows where dimension and unit are null
df =  df.dropna(subset=['Dimension','Unit'], how='all')
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Quantity,Dimension,UnitName,Unit,Source
0,Absorbed Dose,,Gray,Gy,qudt
1,Absorbed Dose,,Rad,RAD,qudt
2,Absorbed Dose Rate,L^2 T^-3,Gray per Second,Gy/s,qudt
3,Activity,,Becquerel,Bq,qudt
4,Activity,,Curie,Ci,qudt


In [22]:
df.shape

(2157, 5)

In [23]:
df[df.Unit =='a']

Unnamed: 0,Quantity,Dimension,UnitName,Unit,Source
63,Area,L^2,Are,a,qudt
75,Area,L^2,Are,a,qudt
87,Area,L^2,Are,a,qudt
99,Area,L^2,Are,a,qudt
228,Dimensionless,U,Atto,a,qudt
262,Dimensionless,U,Atto,a,qudt
296,Dimensionless,U,Atto,a,qudt
330,Dimensionless,U,Atto,a,qudt
364,Dimensionless,U,Atto,a,qudt
398,Dimensionless,U,Atto,a,qudt


In [24]:
def pattern_match(m):
    m = m.group(0).replace('-', '.')
    return m

In [25]:
#reformat units, dimensions
pattern  = re.compile(r'-[A-z]+') #ft^2-s-degF -> ft2.s.degF

def replaceUnits(u):
    if pd.isnull(u):
        return u
    else:
        u= u.replace('^','')
        u1 = pattern.sub(pattern_match, u)
        u2 = re.sub(r'\b(\s)\b', '.', u1)# L^2 M T^-1 -> L2.M.T-1 #\w is equal to [A-Za-z0-9_]
        u2 = u2.replace(" ", "")
        #replace last dot
        u2 = u2.rstrip('.')
        return u2
#t ='cm^2-m' , 'ft^2-s-degF', 'lbf / s', 'L^2 M T^-1 '
replaceUnits('C m-3')

'C.m-3'

In [26]:
#df['Dimension'] = df['Dimension'].apply(replaceUnits )
#df['UnitFormatted'] = df['Unit'].apply(replaceUnits )

In [27]:
df['DimensionFormatted'] = df['Dimension']
df["Ucum"] = np.nan
df.head()

Unnamed: 0,Quantity,Dimension,UnitName,Unit,Source,DimensionFormatted,Ucum
0,Absorbed Dose,,Gray,Gy,qudt,,
1,Absorbed Dose,,Rad,RAD,qudt,,
2,Absorbed Dose Rate,L^2 T^-3,Gray per Second,Gy/s,qudt,L^2 T^-3,
3,Activity,,Becquerel,Bq,qudt,,
4,Activity,,Curie,Ci,qudt,,


In [28]:
def validateUnit(uom):
    q = UCUM_SERVICE_VALIDATE+ urllib.parse.quote(uom) 
    resp = requests.get(q)
    data = resp.json()
    if (resp.status_code == requests.codes.ok):
        return data['ucum']
    else:
        return None

In [29]:
invalid_units =[]

In [30]:
for index, row in df.iterrows():
    if pd.notnull(row['Unit']):
        uom = row['Unit']
        ucum = validateUnit(uom)
        if not (ucum is None):
            #df.UnitFormatted.iloc[index] = ucum
            df.loc[index,'Ucum'] = ucum
        else:
            formattedUnit = replaceUnits(uom)
            ucum_fmt = validateUnit(formattedUnit)
            if not (ucum_fmt is None):
                df.loc[index,'Ucum'] = ucum_fmt
            else:
                invalid_units.append(uom)

In [31]:
#units_list = df['UnitFormatted'].unique()
#cleanedList = [x for x in units_list if str(x) != 'nan']
#len(units_list), len(cleanedList)

In [32]:
invalid_units= list(set(invalid_units))
len(set(invalid_units)), invalid_units[:10]

(94,
 ['statH',
  'statF',
  'rev',
  'kip',
  'Z',
  'Q_p',
  'Ti',
  'l_P',
  'rev/h',
  'therm (US)'])

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2157 entries, 0 to 2156
Data columns (total 7 columns):
Quantity              2157 non-null object
Dimension             2027 non-null object
UnitName              2147 non-null object
Unit                  2049 non-null object
Source                2157 non-null object
DimensionFormatted    2027 non-null object
Ucum                  1610 non-null object
dtypes: object(7)
memory usage: 118.0+ KB


In [34]:
#Replace dimensions
df['DimensionFormatted'] = df['DimensionFormatted'].str.replace('T I','Q')
df['DimensionFormatted'] = df['DimensionFormatted'].str.replace('Θ','C')
df['DimensionFormatted'] = df['DimensionFormatted'].apply(replaceUnits )
#df = df.drop_duplicates(subset=['Quantity', 'Dimension','Unit','Ucum'])
df = df.drop_duplicates()

In [35]:
df.shape

(847, 7)

In [36]:
df.head()

Unnamed: 0,Quantity,Dimension,UnitName,Unit,Source,DimensionFormatted,Ucum
0,Absorbed Dose,,Gray,Gy,qudt,,Gy
1,Absorbed Dose,,Rad,RAD,qudt,,RAD
2,Absorbed Dose Rate,L^2 T^-3,Gray per Second,Gy/s,qudt,L2.T-3,Gy/s
3,Activity,,Becquerel,Bq,qudt,,Bq
4,Activity,,Curie,Ci,qudt,,Ci


In [37]:
df[df.Ucum =="rad"]

Unnamed: 0,Quantity,Dimension,UnitName,Unit,Source,DimensionFormatted,Ucum
9,Angle,,Radian,rad,qudt,,rad
1613,Plane Angle,,Radian,rad,qudt,,rad


In [38]:
df[df.Ucum =="''"]

Unnamed: 0,Quantity,Dimension,UnitName,Unit,Source,DimensionFormatted,Ucum
10,Angle,,Arc Second,arcSec,qudt,,''
1617,Plane Angle,,Arc Second,arcSec,qudt,,''


In [39]:
#dfqudt = df.groupby( [ "Quantity", "Dimension"] )
#dfqudt.head()#
#dfqudt = df.groupby('Dimension')['Quantity'].apply(list)
#dfqudt.head()

In [40]:
df.shape, df.Unit.nunique(dropna=True), df.Ucum.nunique(dropna=True)

((847, 7), 541, 437)

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 847 entries, 0 to 2156
Data columns (total 7 columns):
Quantity              847 non-null object
Dimension             717 non-null object
UnitName              838 non-null object
Unit                  793 non-null object
Source                847 non-null object
DimensionFormatted    717 non-null object
Ucum                  617 non-null object
dtypes: object(7)
memory usage: 52.9+ KB


In [42]:
df = df[['Quantity','DimensionFormatted','Ucum','Source']]
df.head()

Unnamed: 0,Quantity,DimensionFormatted,Ucum,Source
0,Absorbed Dose,,Gy,qudt
1,Absorbed Dose,,RAD,qudt
2,Absorbed Dose Rate,L2.T-3,Gy/s,qudt
3,Activity,,Bq,qudt
4,Activity,,Ci,qudt


Combine QUDT and PANGAEA

In [43]:
df_final = pd.concat([df,xml_df.rename(columns={'name':'Quantity', 'dimension':'DimensionFormatted','unit':'Ucum','Source':'Source'})], ignore_index=True)

In [44]:
df_final.head()

Unnamed: 0,Quantity,DimensionFormatted,Ucum,Source
0,Absorbed Dose,,Gy,qudt
1,Absorbed Dose,,RAD,qudt
2,Absorbed Dose Rate,L2.T-3,Gy/s,qudt
3,Activity,,Bq,qudt
4,Activity,,Ci,qudt


In [45]:
#df_final.to_csv('qudt_quantities.csv', sep='\t', encoding='utf-8',index=False)
df_final.shape

(872, 4)

In [46]:
df_final[df_final.Ucum=='n/m3/s']

Unnamed: 0,Quantity,DimensionFormatted,Ucum,Source
858,Accumulation Rate,L-3.Z.T-1,n/m3/s,pangaea


In [47]:
out = df_final.to_json(orient='records').replace('[', '{\"Quantitites\":[',1)
out = out.replace(']', ']}', -1)

In [48]:
#with open('data/quantities.json', 'w') as outfile:  
    #json.dump(datastore, outfile)

In [49]:
from datetime import datetime
datestring = datetime.strftime(datetime.now(), '%Y%m%d')
f = 'data/quantities_'+datestring+'.json'
with open(f, 'w',encoding='utf-8') as f:
    f.write(out)