In [425]:
# external imports
import pandas as pd
import spacy

# internal imports
from PhraseEntityMatcher import PhraseEntityMatcher
# Load the small English spaCy pipeline once; it is handed to every
# PhraseEntityMatcher and used to parse the sample text further down.
nlp = spacy.load("en_core_web_sm")



In [426]:
# Method designed to:
# 1- identify acronyms defined in the elements of a list (list2edit)
# 2- save them into a dict (acronymsDict)
# 3- remove the acronym definition from each element of the list (list2edit)
def decoupleAcronym(list2edit,acronymsDict):
    """Strip a parenthesised acronym from each keyword and record it.

    For every element of ``list2edit`` that contains ``(...)``, the text of
    the first such parenthetical is taken as the acronym, every occurrence of
    that exact parenthetical is removed from the element, and the stripped
    acronym is stored in ``acronymsDict`` mapped to the cleaned element.
    Only the first parenthetical of an element is treated as an acronym.

    Args:
        list2edit: list of keyword strings; it is modified in place.
        acronymsDict: dict updated in place with ``{acronym: cleaned keyword}``.

    Returns:
        The same ``list2edit`` and ``acronymsDict`` objects, as a tuple.
    """
    for index,elem in enumerate(list2edit):
        start = elem.find('(')
        if start == -1:
            continue
        # Look for the closing parenthesis only *after* the opening one, so a
        # stray ')' earlier in the keyword cannot produce an empty slice.
        end = elem.find(')', start+1)
        if end == -1:
            continue
        acr = elem[start+1:end]
        list2edit[index] = elem.replace("("+acr+")", "").strip()
        # Empty or blank parentheses carry no acronym: drop them from the
        # keyword but do not register an '' key.
        if acr.strip():
            acronymsDict[acr.strip()] = list2edit[index]
    return list2edit, acronymsDict

In [427]:
# Test for decoupleAcronym: the acronym "triac" (written with a trailing blank
# inside the parentheses) should be removed from the keyword and stored,
# stripped, as a key of acrDict.
acrDict = {}
txt = ["triode for alternating current (triac )"]
txtclnd,acrDict = decoupleAcronym(txt, acrDict)
print(txtclnd,acrDict)

['triode for alternating current'] {'triac': 'triode for alternating current'}


In [428]:
# Method to
# 1- strip the acronym definitions found in the keyword lists of a dictionary (keywordsDict)
# 2- collect those acronyms into a single dict (acronymsDict)
# keywordsDict = {tag1: [...], tag2: [...], ..., tagN: [...]}
def cleanKeywordsDict(keywordsDict):
    """Run decoupleAcronym over every keyword list of ``keywordsDict``.

    Args:
        keywordsDict: dict mapping a tag to its list of keywords; each entry
            is reassigned with the list returned by decoupleAcronym.

    Returns:
        A tuple ``(keywordsDict, acronymsDict)``: the same dictionary that was
        passed in, and a new dict with the acronyms gathered from all tags.
    """
    collected = {}
    for tag in tuple(keywordsDict.keys()):
        cleanedList, collected = decoupleAcronym(keywordsDict[tag], collected)
        keywordsDict[tag] = cleanedList
    return keywordsDict, collected

In [429]:
# Test for cleanKeywordsDict: two tags, the first holding two keywords with an
# acronym definition, the second holding a keyword without one (which is left
# as is, trailing blank included).
txt = {"p1":["triode for alternating current (triac )", "alternate current (AC)"], "p2":["triode "]}
txtCleaned, acronymsDict = cleanKeywordsDict(txt)
print(txtCleaned)
print(acronymsDict)

{'p1': ['triode for alternating current', 'alternate current'], 'p2': ['triode ']}
{'triac': 'triode for alternating current', 'AC': 'alternate current'}


In [430]:
# Excel workbook holding the keyword lists (path relative to the notebook)
keywordSetFile = "./data/tag_keywords_lists.xlsx"
# Open the workbook once; the read_excel calls below all read from this handle
xls = pd.ExcelFile(keywordSetFile)

In [431]:
# Dictionary of keywords. Sets of keywords are grouped into classes.
# This dictionary is structured as follows:
# keywordsDict = {keyword_tag_ID: [list of keywords which belong to the same class]}
# If any of these keywords is found in the text, it should be tagged with
# the ID of its class (i.e., keyword_tag_ID)
keywordsDict = {}

In [432]:
# Read each sheet of the keyword workbook into its own DataFrame
mechKWDF      = pd.read_excel(xls, sheet_name='comp_mech')
elnHydKWDF    = pd.read_excel(xls, sheet_name='comp_eln_hyd')
assetsDF      = pd.read_excel(xls, sheet_name='assets')
systemsDF     = pd.read_excel(xls, sheet_name='systems')
toolsTreatsDF = pd.read_excel(xls, sheet_name='tools_treatments')
operandsDF    = pd.read_excel(xls, sheet_name='operands')
matCompDF     = pd.read_excel(xls, sheet_name='mat_comp')
reactionDF    = pd.read_excel(xls, sheet_name='reactions')
nucOrgDF      = pd.read_excel(xls, sheet_name='organizations')

In [433]:
# Clean dataframes: remove every non-breaking space (U+00A0) from the cells of
# each keyword sheet, modifying the DataFrames in place
for _kwFrame in (
    mechKWDF,
    elnHydKWDF,
    assetsDF,
    systemsDF,
    toolsTreatsDF,
    operandsDF,
    matCompDF,
    reactionDF,
    nucOrgDF,
):
    _kwFrame.replace(u'\xa0', u'', regex=True, inplace=True)

In [434]:
def _columnKeywords(frame, column):
    """Return the non-missing entries of ``frame[column]`` as lower-case strings.

    Missing cells (NaN) are dropped. Each remaining value is converted with
    ``str()`` before lower-casing, so a numeric cell does not raise a
    TypeError the way ``applymap(str.lower)`` does.
    """
    return [str(value).lower() for value in frame[column].dropna()]

# mechanical components
keywordsDict['comp_mech_fast']   = _columnKeywords(mechKWDF, 'Fasteners')
keywordsDict['comp_mech_rot']    = _columnKeywords(mechKWDF, 'Elements of rotary motion drive')
keywordsDict['comp_mech_struct'] = _columnKeywords(mechKWDF, 'Structural')
keywordsDict['comp_mech_spec']   = _columnKeywords(mechKWDF, 'Specific purpose')

# electrical, electronic, hydraulic and pneumatic components
keywordsDict['comp_elt_eln'] = _columnKeywords(elnHydKWDF, 'Electrical/electronic')
keywordsDict['comp_hyd_pne'] = _columnKeywords(elnHydKWDF, 'Hydraulic/Pneumatic')

# assets
keywordsDict['ast_mech']    = _columnKeywords(assetsDF, 'Mechanical')
keywordsDict['ast_elt']     = _columnKeywords(assetsDF, 'Electrical')
keywordsDict['ast_hyd_pne'] = _columnKeywords(assetsDF, 'Hydraulic/Pneumatic')
keywordsDict['ast_eln']     = _columnKeywords(assetsDF, 'Electronic')
keywordsDict['ast_I&C']     = _columnKeywords(assetsDF, 'I&C')
keywordsDict['ast_fuel']    = _columnKeywords(assetsDF, 'Fuel')

# systems and architectural elements
keywordsDict['sys']  = _columnKeywords(systemsDF, 'Systems')
keywordsDict['arch'] = _columnKeywords(systemsDF, 'Buildings/rooms')

# tools and treatments sheet
keywordsDict['tool'] = _columnKeywords(toolsTreatsDF, 'Tools')
keywordsDict['ops']  = _columnKeywords(toolsTreatsDF, 'Treatments/operations/testing')

# operands sheet
keywordsDict['opd_elt']     = _columnKeywords(operandsDF, 'Electrical')
keywordsDict['opd_hyd_pne'] = _columnKeywords(operandsDF, 'Hydraulic/Pneumatic')
keywordsDict['opd_prop']    = _columnKeywords(operandsDF, 'Properties')
keywordsDict['meas_units']  = _columnKeywords(operandsDF, 'Units')

# mat_comp sheet
keywordsDict['chem_cmpd'] = _columnKeywords(matCompDF, 'Chemical compounds')
keywordsDict['chem_elem'] = _columnKeywords(matCompDF, 'Chemical elements')
keywordsDict['mat']       = _columnKeywords(matCompDF, 'Material')
keywordsDict['mat_class'] = _columnKeywords(matCompDF, 'Material classes')

# reactions sheet
keywordsDict['chem_rx']   = _columnKeywords(reactionDF, 'Chemical reaction')
keywordsDict['deg_mech']  = _columnKeywords(reactionDF, 'Degradation mechanism')
keywordsDict['fail_type'] = _columnKeywords(reactionDF, 'Failure type')

# organization sheet
keywordsDict['nuc_org'] = _columnKeywords(nucOrgDF, 'acronym')

In [435]:
# Clean keywordsDict: strip acronym definitions from the keywords and collect
# the acronyms. cleanKeywordsDict returns the dictionary it was given, so
# keywordsDictCleaned and keywordsDict refer to the same, now cleaned, object.
keywordsDictCleaned, acronymsDict = cleanKeywordsDict(keywordsDict)  

In [436]:
# Build one phrase matcher per keyword class, labelled with the class tag and
# fed with that class's keyword list (same order as the keys of keywordsDict)
pmatcherList = [
    PhraseEntityMatcher(nlp, tag, keywordsDict[tag])
    for tag in keywordsDict.keys()
]

In [437]:
# Sample text used to exercise the matchers. The backslash continuations keep
# the leading blanks of the following lines inside the string.
rawdoc = "The shaft deflection is causing the safety cage to rattle. \
          Pumps not experiencing enough flow for the pumps to keep the check valves open during test. \
          Pump not experiencing enough flow during test. Shaft made noise. Vibration seems like it is coming from the shaft."

# Parse the text with spaCy, then pass the resulting document through every
# class matcher in turn; each matcher receives the output of the previous one.
doc = nlp(rawdoc)
processedDoc = doc
for pmatch in pmatcherList:
    processedDoc = pmatch(processedDoc)

# List every entity of the processed document with its label.
# NOTE(review): the recorded output contains "Shaft ORG", a label that is not
# one of the keywordsDict tags — presumably it comes from the spaCy pipeline's
# own entity recogniser; confirm how PhraseEntityMatcher treats existing entities.
for ent in processedDoc.ents:
    print(ent, ent.label_)

shaft comp_mech_rot
cage comp_mech_struct
flow opd_prop
Pump ast_hyd_pne
flow opd_prop
Shaft ORG
Vibration deg_mech
shaft comp_mech_rot
