In [100]:
import csv
import pandas as pd
import re
import numpy as np
import string, math

In [101]:
# Load the first worksheet of the QUDT workbook; `header=0` (the default, spelled
# out here) takes the column names from the first row. The former `headers=` and
# `delimiter=` keywords are not read_excel parameters and raise TypeError on
# current pandas releases.
df = pd.read_excel('data/ImportedQUDT.xlsx', header=0)

In [102]:
# Count the distinct non-null values in each column.
df.nunique()

Term              234
Abbreviation       35
Description       102
Comment             0
SemanticURI       235
URI                96
Root Term           1
Master Term         0
ID Terminology      1
ID Category         1
IDTermStatus        1
dtype: int64

In [103]:
# Missing descriptions are read as NaN; blank them before the string cast so they
# do not become the literal text 'nan'.
df['Description'] = df['Description'].fillna('').astype(str)

In [104]:
# Summarise each column's non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236 entries, 0 to 235
Data columns (total 11 columns):
Term              236 non-null object
Abbreviation      46 non-null object
Description       236 non-null object
Comment           0 non-null float64
SemanticURI       236 non-null object
URI               101 non-null object
Root Term         236 non-null int64
Master Term       0 non-null float64
ID Terminology    236 non-null int64
ID Category       236 non-null int64
IDTermStatus      236 non-null int64
dtypes: float64(2), int64(4), object(5)
memory usage: 20.4+ KB


In [105]:
# Preview the first five rows.
df.head()

Unnamed: 0,Term,Abbreviation,Description,Comment,SemanticURI,URI,Root Term,Master Term,ID Terminology,ID Category,IDTermStatus
0,Absolute Humidity,,Absolute humidity is the mass of water in a pa...,,http://qudt.org/1.1/vocab/quantity#AbsoluteHum...,,0,,13,3,3
1,Absorbed Dose,,Absorbed dose (also known as Total Ionizing Do...,,http://qudt.org/1.1/vocab/quantity#AbsorbedDose,http://dbpedia.org/resource/Absorbed_dose,0,,13,3,3
2,Absorbed Dose Rate,,,,http://qudt.org/1.1/vocab/quantity#AbsorbedDos...,,0,,13,3,3
3,Acceleration,,Acceleration is the (instantaneous) rate of ch...,,http://qudt.org/1.1/vocab/quantity#Acceleration,http://dbpedia.org/resource/Acceleration,0,,13,3,3
4,Action,S,,,http://qudt.org/1.1/vocab/quantity#Action,,0,,13,3,3


In [106]:
# Rows whose Term repeats a label already seen in an earlier row.
df.loc[df['Term'].duplicated()]

Unnamed: 0,Term,Abbreviation,Description,Comment,SemanticURI,URI,Root Term,Master Term,ID Terminology,ID Category,IDTermStatus
64,Energy and Work,,,,http://qudt.org/1.1/vocab/quantity#EnergyAndWork,http://dbpedia.org/resource/Work_%28physics%29,0,,13,3,3
108,Kinetic Energy,,The energy which a body possesses as a consequ...,,http://qudt.org/1.1/vocab/quantity#KineticEnergy,http://dbpedia.org/resource/Kinetic_energy,0,,13,3,3


In [107]:
# Listed twice because the term carries two skos:exactMatch links (one row per link).
df.loc[df['Term'] == 'Energy and Work']

Unnamed: 0,Term,Abbreviation,Description,Comment,SemanticURI,URI,Root Term,Master Term,ID Terminology,ID Category,IDTermStatus
63,Energy and Work,,,,http://qudt.org/1.1/vocab/quantity#EnergyAndWork,http://dbpedia.org/resource/Energy,0,,13,3,3
64,Energy and Work,,,,http://qudt.org/1.1/vocab/quantity#EnergyAndWork,http://dbpedia.org/resource/Work_%28physics%29,0,,13,3,3


In [108]:
# Listed twice because QUDT itself defines this term under two separate entries.
df.loc[df['Term'] == 'Kinetic Energy']

Unnamed: 0,Term,Abbreviation,Description,Comment,SemanticURI,URI,Root Term,Master Term,ID Terminology,ID Category,IDTermStatus
68,Kinetic Energy,,,,http://qudt.org/1.1/vocab/quantity#EnergyKinetic,http://dbpedia.org/resource/Kinetic_energy,0,,13,3,3
108,Kinetic Energy,,The energy which a body possesses as a consequ...,,http://qudt.org/1.1/vocab/quantity#KineticEnergy,http://dbpedia.org/resource/Kinetic_energy,0,,13,3,3


In [None]:
#df.drop(df.index[[64,68]])

# Format Terms

In [24]:
#df['Description'] = df['Description'].str.strip()
#df["Description"] = df["Description"].map(str.strip)

In [25]:
# Matches any run of line feeds, carriage returns and tabs.
regex = re.compile(r'[\n\r\t]+')
# str.translate table mapping code points 0-31 to None, i.e. "delete this character".
mpa = dict.fromkeys(range(32))

def formatControlChar(s):
    """Flatten a free-text value onto a single line.

    Each run of newlines, carriage returns and tabs becomes one '|' marker,
    any remaining control characters (code points 0-31) are removed, and
    runs of whitespace are collapsed to single spaces with the ends trimmed.
    A whitespace-only line between two breaks is not merged, so it shows up
    as '| |' in the result.

    Parameters
    ----------
    s : str, None or float NaN
        The text to clean. None and NaN stand for "no text".

    Returns
    -------
    str
        The cleaned single-line text; '' when `s` is None or NaN.
    """
    # Missing cells come through as None/NaN, which regex.sub cannot process.
    if s is None or (isinstance(s, float) and math.isnan(s)):
        return ''
    txt = regex.sub('|', s)
    txt = txt.translate(mpa)
    return ' '.join(txt.split())

# Clean every description, then blank the entries that are exactly the text 'nan'
# (Series.replace matches whole values, not substrings).
df['DescriptionFormatted'] = (
    df['Description']
    .map(formatControlChar)
    .replace('nan', '')
)

In [26]:
# Preview the first three rows, which now include the DescriptionFormatted column.
df.head(3)

Unnamed: 0,Term,Abbreviation,Description,Comment,SemanticURI,URI,Root Term,Master Term,ID Terminology,ID Category,IDTermStatus,DescriptionFormatted
0,Absolute Humidity,,Absolute humidity is the mass of water in a pa...,,http://qudt.org/1.1/vocab/quantity#AbsoluteHum...,,0,,13,3,3,Absolute humidity is the mass of water in a pa...
1,Absorbed Dose,,Absorbed dose (also known as Total Ionizing Do...,,http://qudt.org/1.1/vocab/quantity#AbsorbedDose,http://dbpedia.org/resource/Absorbed_dose,0,,13,3,3,Absorbed dose (also known as Total Ionizing Do...
2,Absorbed Dose Rate,,,,http://qudt.org/1.1/vocab/quantity#AbsorbedDos...,,0,,13,3,3,


In [27]:
# Description at position 41, used as a sample containing embedded line breaks.
df['Description'].iloc[41]

'Dynamic Pressure (indicated with q, or Q, and sometimes called velocity pressure) is the quantity defined by:\n\nq = 1/2 * ρv^2\n \nwhere (using SI units):\n\nq = dynamic pressure in pascals \nρ = fluid density in kg/m3 (e.g. density of air) \nv = fluid velocity in m/s'

In [28]:
# The same sample after cleaning: line breaks now appear as '|' markers.
df['DescriptionFormatted'].iloc[41]

'Dynamic Pressure (indicated with q, or Q, and sometimes called velocity pressure) is the quantity defined by:|q = 1/2 * ρv^2| |where (using SI units):|q = dynamic pressure in pascals |ρ = fluid density in kg/m3 (e.g. density of air) |v = fluid velocity in m/s'

In [29]:
# Replace the original description column with its cleaned counterpart,
# mutating df in place.
df.drop(columns=['Description'], inplace=True)
df.rename({'DescriptionFormatted': 'Description'}, axis='columns', inplace=True)

In [30]:
# Column order used when selecting the columns of the terms frame.
cols = [
    'Term',
    'Abbreviation',
    'Description',
    'Comment',
    'SemanticURI',
    'URI',
    'Root Term',
    'Master Term',
    'ID Terminology',
    'ID Category',
    'IDTermStatus',
]

In [31]:
# Keep only the columns named in cols, in that order, then preview two rows.
df = df.loc[:, cols]
df.head(2)

Unnamed: 0,Term,Abbreviation,Description,Comment,SemanticURI,URI,Root Term,Master Term,ID Terminology,ID Category,IDTermStatus
0,Absolute Humidity,,Absolute humidity is the mass of water in a pa...,,http://qudt.org/1.1/vocab/quantity#AbsoluteHum...,,0,,13,3,3
1,Absorbed Dose,,Absorbed dose (also known as Total Ionizing Do...,,http://qudt.org/1.1/vocab/quantity#AbsorbedDose,http://dbpedia.org/resource/Absorbed_dose,0,,13,3,3


In [32]:
# Export the terms as tab-separated UTF-8 with LF line endings and no index
# column. `lineterminator` is the current name of this keyword (pandas >= 1.5);
# the old `line_terminator` spelling was removed in pandas 2.0.
df.to_csv('data/qudt_terms.csv', sep='\t', encoding='utf-8', lineterminator='\n', index=False)

In [85]:
# Every term label as a plain list (repeated labels included), and how many there are.
quantityList = df['Term'].tolist()
len(quantityList)

236

Unnamed: 0,Term,Abbreviation,Description,Comment,SemanticURI,URI,Root Term,Master Term,ID Terminology,ID Category,IDTermStatus
64,Energy and Work,,,,http://qudt.org/1.1/vocab/quantity#EnergyAndWork,http://dbpedia.org/resource/Work_%28physics%29,0,,13,3,3
108,Kinetic Energy,,The energy which a body possesses as a consequ...,,http://qudt.org/1.1/vocab/quantity#KineticEnergy,http://dbpedia.org/resource/Kinetic_energy,0,,13,3,3


# Format Relations

In [86]:
# Load the term-to-term relations worksheet, taking column names from its first
# row. The former `headers=`, `delimiter=` and `index=` keywords are not
# read_excel parameters and raise TypeError on current pandas releases.
dfr = pd.read_excel('data/ImportedQUDT.xlsx', sheet_name='Term_Relation_All', header=0)
dfr.shape

(235, 3)

In [87]:
# Relation rows that repeat an earlier row in every column.
dfr.loc[dfr.duplicated()]

Unnamed: 0,Term,Relation Type,Term.1
37,Kinetic Energy,has broader term,Energy and Work


In [88]:
# Keep the first occurrence of each relation row and discard exact repeats,
# then confirm the resulting size.
dfr = dfr.loc[~dfr.duplicated()]
dfr.shape

(234, 3)

In [89]:
# Show the first relation row.
dfr.head(1)

Unnamed: 0,Term,Relation Type,Term.1
0,Angular Acceleration,has broader term,Acceleration


In [109]:
# Relations that mention the search text on either side of the relation.
param = 'Energy Kinetic'
dfr[
    dfr['Term'].str.contains(param)
    | dfr['Term.1'].str.contains(param)
]

Unnamed: 0,Term,Relation Type,Term.1


In [98]:
# Labels in the relations' Term column that are absent from the terms list;
# an empty list means every one of them is known.
list(
    set(dfr['Term'].tolist())
    - set(quantityList)
)

[]

In [79]:
# Export the relations as tab-separated UTF-8 with LF line endings and no index
# column. `lineterminator` is the current name of this keyword (pandas >= 1.5);
# the old `line_terminator` spelling was removed in pandas 2.0.
dfr.to_csv('data/qudt_terms_relations.csv', sep='\t', encoding='utf-8', lineterminator='\n', index=False)

In [None]:
# Display the terms frame.
df