# Processing of Condition Reports (CRs): HX

In [None]:
import os, sys, time
import re
import pandas as pd
import numpy as np
import spacy
import warnings
warnings.filterwarnings("ignore")
from matplotlib.ticker import MaxNLocator

# Make the DACKAR sources importable: they are expected two directories above
# the current working directory, under src/
cwd = os.getcwd()
pathToDACKAR = os.path.join(cwd, '..', '..', 'src')
sys.path.append(pathToDACKAR)

# Load the large English spaCy model without excluding any pipeline component
nlp = spacy.load("en_core_web_lg", exclude=[])

from dackar.text_processing.Preprocessing import Preprocessing
from dackar.utils.utils import getOnlyWords, getShortAcronym
from dackar.text_processing import Abbreviation 
from dackar.utils.nlp.nlp_utils import resetPipeline
from dackar.text_processing.AbbrExpander import AbbrExpander

from dackar.utils.nlp.nlp_utils import generatePatternList
from dackar.pipelines.GeneralEntity import GeneralEntity

## Import Data

In [None]:
# Load the raw heat-exchanger (HX) condition reports
data = pd.read_csv('raw_data/textual/HX_CR.csv')


def _lowercaseText(value):
    """Return *value* lowercased when it is a string; otherwise return it unchanged."""
    return value.lower() if isinstance(value, str) else value


# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use whichever element-wise method this pandas version provides.
_elementwise = data.map if hasattr(pd.DataFrame, 'map') else data.applymap
data = _elementwise(_lowercaseText)
data

# Identify equipment IDs

In [None]:
# Label and id attached to every equipment-ID pattern
idLabel = "equip_ID"
ID      = "equip_ID"

# Known equipment identifiers come from the 'ID' column of the MBSE object table;
# patterns are requested with attr="LEMMA"
ID_list = pd.read_csv('processed_data/customMBSEobject_ID.csv')['ID'].to_list()
patterns_IDs = generatePatternList(ID_list, label=idLabel, id=ID, nlp=nlp, attr="LEMMA")

# Reset the pipeline with an empty component list and turn off the stock "ner" pipe
# NOTE(review): presumably so that only the custom patterns produce entities - confirm against dackar
pipelines = []
resetPipeline(nlp, pipelines)
nlp.disable_pipes("ner")

ID_ents = GeneralEntity(nlp, patterns_IDs)

# One tuple of matched entity lemmas per report, in row order.
# Iterating the column values directly avoids using the iterrows() label as a
# positional .iloc index, which is only correct for a default RangeIndex.
ents = []
for text in data['Component']:
    if not isinstance(text, str):
        # Missing cell (e.g. NaN): nothing to match, and .lower() would raise
        ents.append(())
        continue
    newDoc = ID_ents(nlp(text.lower()))
    ents.append(tuple(ent.lemma_ for ent in newDoc.ents))
data['identifiedID'] = ents
data


# Identify nuclear-related entities

In [None]:
from dackar.utils import tagKeywordListReader as tklr

# Read the tag/keyword workbook and run the library's own checker on it
tagDict = tklr.entityLibrary('../../../DACKAR/data/tag_keywords_lists.xlsx')
tagDict.checker()

nuc_ent_dict = tagDict.getLibrary()

# Build one flat pattern list; each library key serves as both the label and
# the id of the patterns generated from its keyword list
patterns_ents = []
for tag in nuc_ent_dict:
    tagName = str(tag)
    tagPatterns = generatePatternList(nuc_ent_dict[tag], label=tagName, id=tagName, nlp=nlp, attr="LEMMA")
    patterns_ents.extend(tagPatterns)

In [None]:
# Reset the pipeline with an empty component list and turn off the stock "ner" pipe
pipelines = []
resetPipeline(nlp, pipelines)
nlp.disable_pipes("ner")

# General entity object
generalEntity_ents = GeneralEntity(nlp, patterns_ents)


def _extractEntityLemmas(texts, matcher):
    """Return one tuple of entity lemmas per element of *texts*.

    Args:
        texts: iterable of cell values (typically a DataFrame column).
        matcher: callable that takes a spaCy Doc and returns a Doc whose
            ``ents`` hold the matched entities (here a GeneralEntity object).

    Returns:
        list of tuples, in the same order as *texts*. Non-string cells
        (e.g. NaN for a missing value) yield an empty tuple.
    """
    lemmas = []
    for text in texts:
        if not isinstance(text, str):
            # Nothing to match, and .lower() would raise on a float NaN
            lemmas.append(())
            continue
        matchedDoc = matcher(nlp(text.lower()))
        lemmas.append(tuple(ent.lemma_ for ent in matchedDoc.ents))
    return lemmas


# Source text column -> column receiving the extracted entity tuples.
# Iterating the column values directly avoids using the iterrows() label as a
# positional .iloc index, which is only correct for a default RangeIndex.
_nerColumns = (
    ('Component', 'Component NER entities'),
    ('Issue Observed', 'Issue NER entities'),
    ('Inspection Method', 'Inspection NER entities'),
)
for sourceColumn, targetColumn in _nerColumns:
    data[targetColumn] = _extractEntityLemmas(data[sourceColumn], generalEntity_ents)

data

# Write to files: Nodes and Edges

In [None]:
# CR nodes: one row per condition report (date and report identifier)
data = data.rename(columns={'Report ID': 'Report_ID'})
data[['date', 'Report_ID']].to_csv('processed_data/CR_HX_nodes.csv', index=False)

# NER entity nodes: every distinct entity found in any of the three text columns
compEnt  = data['Component NER entities'].tolist()
issueEnt = data['Issue NER entities'].tolist()
# Kept in its own variable: reusing `compEnt` here dropped all component
# entities and counted the inspection entities twice
inspEnt  = data['Inspection NER entities'].tolist()
# Flatten the per-report tuples BEFORE de-duplicating, so an entity that occurs
# in several reports yields a single node; sorting makes the file reproducible
selected = sorted({ent for sub in compEnt + issueEnt + inspEnt for ent in sub})
temp_dict = {'entities': selected}
pd.DataFrame(temp_dict).to_csv('processed_data/entities_HX_nodes.csv', index=False)

# Edges
# Equipment-ID edges point from the equipment ID to the report;
# entity edges point from the report to the entity and carry the name of the
# text field the entity was found in.
id_edges_orig = []
id_edges_dest = []
ent_edges_orig = []
ent_edges_dest = []
ent_edges_attr = []

# Entity column -> value written to the 'attribute' field of its edges.
# The order here fixes the per-report edge order: component, issue, inspection.
_entityColumns = (
    ('Component NER entities', 'component'),
    ('Issue NER entities', 'issue'),
    ('Inspection NER entities', 'inspection'),
)

for _, row in data.iterrows():
    reportId = row['Report_ID']

    # `equipId` rather than `id`, which would shadow the builtin.
    # An empty tuple simply produces no edges, so no truthiness guard is needed.
    for equipId in row['identifiedID']:
        id_edges_orig.append(equipId)
        id_edges_dest.append(reportId)

    for column, attribute in _entityColumns:
        for ent in row[column]:
            ent_edges_orig.append(reportId)
            ent_edges_dest.append(ent)
            ent_edges_attr.append(attribute)

edges_ent_dict = {'orig':ent_edges_orig, 'dest':ent_edges_dest, 'attribute': ent_edges_attr}
edges_id_dict  = {'orig':id_edges_orig, 'dest':id_edges_dest}

pd.DataFrame(edges_ent_dict).to_csv('processed_data/HX_edges_ent.csv', index=False)
pd.DataFrame(edges_id_dict).to_csv('processed_data/HX_edges_id.csv', index=False)
