In [150]:
import csv
import pandas as pd
import re
import numpy as np
import string, math

In [151]:
# Load the default (first) sheet of the QUDT export; row 0 holds the column names.
# `read_excel` takes `header` (singular) and has no `delimiter` option, so the
# misspelled / CSV-only keywords that newer pandas rejects were removed.
df = pd.read_excel('data/ImportedQUDT.xlsx', header=0)

In [152]:
# Number of distinct non-null values in each column.
df.nunique()

Term              234
Abbreviation       35
Description       102
Comment             0
SemanticURI       234
URI                95
Root Term           1
Master Term         0
ID Terminology      1
ID Category         1
IDTermStatus        1
dtype: int64

In [153]:
# Blank out missing descriptions before casting to str: a bare astype(str)
# would turn every NaN into the literal text 'nan'.
df['Description'] = df['Description'].fillna('').astype(str)

In [154]:
# Per-column dtype and non-null count, plus the frame's memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 11 columns):
Term              234 non-null object
Abbreviation      46 non-null object
Description       234 non-null object
Comment           0 non-null float64
SemanticURI       234 non-null object
URI               99 non-null object
Root Term         234 non-null int64
Master Term       0 non-null float64
ID Terminology    234 non-null int64
ID Category       234 non-null int64
IDTermStatus      234 non-null int64
dtypes: float64(2), int64(4), object(5)
memory usage: 20.2+ KB


In [155]:
# Preview the first five rows.
df.head()

Unnamed: 0,Term,Abbreviation,Description,Comment,SemanticURI,URI,Root Term,Master Term,ID Terminology,ID Category,IDTermStatus
0,Absolute Humidity,,Absolute humidity is the mass of water in a pa...,,http://qudt.org/1.1/vocab/quantity#AbsoluteHum...,,0,,13,3,3
1,Absorbed Dose,,Absorbed dose (also known as Total Ionizing Do...,,http://qudt.org/1.1/vocab/quantity#AbsorbedDose,http://dbpedia.org/resource/Absorbed_dose,0,,13,3,3
2,Absorbed Dose Rate,,,,http://qudt.org/1.1/vocab/quantity#AbsorbedDos...,,0,,13,3,3
3,Acceleration,,Acceleration is the (instantaneous) rate of ch...,,http://qudt.org/1.1/vocab/quantity#Acceleration,http://dbpedia.org/resource/Acceleration,0,,13,3,3
4,Action,S,,,http://qudt.org/1.1/vocab/quantity#Action,,0,,13,3,3


In [156]:
# Rows whose Term repeats an earlier row's Term (the first occurrence is not flagged).
df.loc[df['Term'].duplicated()]

Unnamed: 0,Term,Abbreviation,Description,Comment,SemanticURI,URI,Root Term,Master Term,ID Terminology,ID Category,IDTermStatus


In [157]:
# 'Energy and Work' was noted as duplicated because it carries two
# skos:exactMatch links; list every row that uses this term.
df.loc[df['Term'] == 'Energy and Work']

Unnamed: 0,Term,Abbreviation,Description,Comment,SemanticURI,URI,Root Term,Master Term,ID Terminology,ID Category,IDTermStatus
63,Energy and Work,,,,http://qudt.org/1.1/vocab/quantity#EnergyAndWork,http://dbpedia.org/resource/Energy,0,,13,3,3


In [158]:
# 'Kinetic Energy' was noted as duplicated because QUDT defines the same term
# twice; list every row that uses this term.
df.loc[df['Term'] == 'Kinetic Energy']

Unnamed: 0,Term,Abbreviation,Description,Comment,SemanticURI,URI,Root Term,Master Term,ID Terminology,ID Category,IDTermStatus
106,Kinetic Energy,,The energy which a body possesses as a consequ...,,http://qudt.org/1.1/vocab/quantity#KineticEnergy,http://dbpedia.org/resource/Kinetic_energy,0,,13,3,3


In [159]:
#df.drop(df.index[[64,68]])

# Format Terms

In [160]:
#df['Description'] = df['Description'].str.strip()
#df["Description"] = df["Description"].map(str.strip)

In [161]:
# One or more consecutive newline / carriage-return / tab characters.
regex = re.compile(r'[\n\r\t]+')
# str.translate table mapping code points 0-31 to None, i.e. deleting them.
mpa = dict.fromkeys(range(32))

def formatControlChar(s):
    """Flatten a (possibly multi-line) description into a single line.

    Processing steps, in order:
      1. every run of newline, carriage-return or tab characters becomes one '|';
      2. any remaining character with a code point below 32 is removed;
      3. runs of whitespace collapse to a single space and the result is trimmed.

    A line holding only spaces between two line breaks is kept as an empty
    segment, so "a\\n \\nb" becomes "a| |b".

    Parameters
    ----------
    s : object
        Text to clean. None and float NaN (how pandas reports an empty cell)
        are treated as missing; any other non-string value is converted with str().

    Returns
    -------
    str
        The cleaned single-line text, or '' for a missing value.
    """
    # Missing cells arrive as None / NaN rather than str; map them to an empty
    # description instead of failing inside the regex substitution.
    if s is None or (isinstance(s, float) and math.isnan(s)):
        return ''
    txt = regex.sub('|', str(s))
    txt = txt.translate(mpa)
    return ' '.join(txt.split())

# Build the cleaned column next to the raw one so the two can be compared.
# A cell that is exactly 'nan' (the string form of a missing value) is blanked.
df['DescriptionFormatted'] = (
    df['Description']
    .map(formatControlChar)
    .replace('nan', '')
)

In [162]:
# Preview the first three rows, now including the DescriptionFormatted column.
df.head(3)

Unnamed: 0,Term,Abbreviation,Description,Comment,SemanticURI,URI,Root Term,Master Term,ID Terminology,ID Category,IDTermStatus,DescriptionFormatted
0,Absolute Humidity,,Absolute humidity is the mass of water in a pa...,,http://qudt.org/1.1/vocab/quantity#AbsoluteHum...,,0,,13,3,3,Absolute humidity is the mass of water in a pa...
1,Absorbed Dose,,Absorbed dose (also known as Total Ionizing Do...,,http://qudt.org/1.1/vocab/quantity#AbsorbedDose,http://dbpedia.org/resource/Absorbed_dose,0,,13,3,3,Absorbed dose (also known as Total Ionizing Do...
2,Absorbed Dose Rate,,,,http://qudt.org/1.1/vocab/quantity#AbsorbedDos...,,0,,13,3,3,


In [163]:
# Raw description at position 41 (a multi-line entry), before formatting.
df['Description'].iloc[41]

'Dynamic Pressure (indicated with q, or Q, and sometimes called velocity pressure) is the quantity defined by:\n\nq = 1/2 * ρv^2\n \nwhere (using SI units):\n\nq = dynamic pressure in pascals \nρ = fluid density in kg/m3 (e.g. density of air) \nv = fluid velocity in m/s'

In [164]:
# Same entry at position 41 after formatControlChar has flattened it.
df['DescriptionFormatted'].iloc[41]

'Dynamic Pressure (indicated with q, or Q, and sometimes called velocity pressure) is the quantity defined by:|q = 1/2 * ρv^2| |where (using SI units):|q = dynamic pressure in pascals |ρ = fluid density in kg/m3 (e.g. density of air) |v = fluid velocity in m/s'

In [165]:
# Swap the raw description for the cleaned one.
# Guarded so that re-running the cell is a no-op: without the guard a second
# run would drop the already renamed (cleaned) 'Description' column, because
# 'DescriptionFormatted' no longer exists to take its place.
if 'DescriptionFormatted' in df.columns:
    df = (
        df
        .drop('Description', axis=1)
        .rename(columns={'DescriptionFormatted': 'Description'})
    )

In [166]:
# Column order of the exported terms file.
cols = [
    'Term',
    'Abbreviation',
    'Description',
    'Comment',
    'SemanticURI',
    'URI',
    'Root Term',
    'Master Term',
    'ID Terminology',
    'ID Category',
    'IDTermStatus',
]

In [167]:
# Keep exactly the columns named in `cols`, in that order, then preview two rows.
df = df.loc[:, cols]
df.head(2)

Unnamed: 0,Term,Abbreviation,Description,Comment,SemanticURI,URI,Root Term,Master Term,ID Terminology,ID Category,IDTermStatus
0,Absolute Humidity,,Absolute humidity is the mass of water in a pa...,,http://qudt.org/1.1/vocab/quantity#AbsoluteHum...,,0,,13,3,3
1,Absorbed Dose,,Absorbed dose (also known as Total Ionizing Do...,,http://qudt.org/1.1/vocab/quantity#AbsorbedDose,http://dbpedia.org/resource/Absorbed_dose,0,,13,3,3


In [168]:
# Write the cleaned terms as a tab-separated UTF-8 file with LF line endings
# and no index column.
# pandas 1.5 renamed `line_terminator` to `lineterminator` and pandas 2.0
# removed the old spelling; an unknown keyword raises TypeError before anything
# is written, so try the current name first and fall back for older pandas.
try:
    df.to_csv('data/qudt_terms.csv', sep='\t', encoding='utf-8', lineterminator='\n', index=False)
except TypeError:
    df.to_csv('data/qudt_terms.csv', sep='\t', encoding='utf-8', line_terminator='\n', index=False)

In [169]:
# Every term name as a plain Python list, followed by how many there are.
quantityList = df['Term'].tolist()
len(quantityList)

234

# Format Relations

In [170]:
# Load the term-relation sheet of the same workbook; row 0 holds the column names.
# `headers`, `delimiter` and `index` are not `read_excel` parameters (newer
# pandas raises TypeError on them), so only the valid options are passed.
dfr = pd.read_excel('data/ImportedQUDT.xlsx', sheet_name='Term_Relation_All', header=0)
dfr.shape

(235, 3)

In [171]:
# Relation rows that repeat an earlier row in every column.
dfr.loc[dfr.duplicated()]

Unnamed: 0,Term,Relation Type,Term.1
37,Kinetic Energy,has broader term,Energy and Work


In [172]:
#dfr.to_csv('data/qudt_terms_relations.csv', sep='\t', encoding='utf-8', line_terminator='\n',index=False)
# Keep the first occurrence of each relation row and discard exact repeats,
# then show the resulting (rows, columns).
dfr = dfr.loc[~dfr.duplicated()]
dfr.shape

(234, 3)

In [173]:
# Preview the first five relation rows.
dfr.head()

Unnamed: 0,Term,Relation Type,Term.1
0,Angular Acceleration,has broader term,Acceleration
1,Linear Acceleration,has broader term,Acceleration
2,Serum or Plasma Level,has broader term,Amount of Substance Per Unit Volume
3,Plane Angle,has broader term,Angle
4,Solid Angle,has broader term,Angle


In [174]:
# Show every relation that mentions `param` on either side.
# regex=False: term names are literal text, so characters such as '(' or '+'
# must not be interpreted as a regular expression (the str.contains default).
# na=False: a missing cell counts as "no match" instead of putting NaN into
# the boolean mask.
param = 'Inverse Volume'
dfr[
    dfr['Term'].str.contains(param, regex=False, na=False)
    | dfr['Term.1'].str.contains(param, regex=False, na=False)
]

Unnamed: 0,Term,Relation Type,Term.1
49,Number Density,has broader term,Inverse Volume
185,Inverse Volume,has broader term,Quantity Kind


In [175]:
# Terms used in the relations' Term column that are absent from quantityList;
# an empty list means every one of them is a known term.
list(set(dfr['Term'].tolist()).difference(quantityList))

[]

In [176]:
# Write the de-duplicated relations as a tab-separated UTF-8 file with LF line
# endings and no index column.
# pandas 1.5 renamed `line_terminator` to `lineterminator` and pandas 2.0
# removed the old spelling; an unknown keyword raises TypeError before anything
# is written, so try the current name first and fall back for older pandas.
try:
    dfr.to_csv('data/qudt_terms_relations.csv', sep='\t', encoding='utf-8', lineterminator='\n', index=False)
except TypeError:
    dfr.to_csv('data/qudt_terms_relations.csv', sep='\t', encoding='utf-8', line_terminator='\n', index=False)