# Demo For NLP Workflow: based on branch wangc/nlp

### 1. Set Paths and Load Required Modules
- For the required libraries, please check SR2ML/dependencies.xml

In [1]:
# External Modules #
import os
import sys
import pandas as pd
import spacy
import logging
import numerizer 
#######################

# Settings #
# Make the framework sources importable: they are expected in the 'src'
# directory that sits next to the current working directory.
cwd = os.getcwd()
frameworkDir = os.path.abspath(os.path.join(cwd, os.pardir, 'src'))
# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if frameworkDir not in sys.path:
    sys.path.append(frameworkDir)
########################

# Internal Modules #
from dackar.workflows.RuleBasedMatcher import RuleBasedMatcher
from dackar import config
from dackar.utils.nlp.nlp_utils import generatePatternList
from dackar.utils.opm.OPLparser import OPMobject
from dackar.text_processing.Preprocessing import Preprocessing
from dackar.text_processing.Preprocessing import SpellChecker
#########################

# Log at INFO level; change `level` to logging.DEBUG for more verbose output.
logging.basicConfig(
    format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s',
    datefmt='%d-%b-%y %H:%M:%S',
    level=logging.INFO,
)
# Load the spaCy model with no pipeline components excluded.
nlp = spacy.load("en_core_web_lg", exclude=[])

  from .autonotebook import tqdm as notebook_tqdm
Warming up PyWSD (takes ~10 secs)... took 3.64813494682312 secs.
  _C._set_default_tensor_type(t)


### 2. Initialize variables 

In [2]:
# --- Text clean-up switches ---
# Preprocess the raw doc text, i.e., remove brackets, repeated characters
# and punctuation.
cleanDoc = True
# Convert natural-language numerics into ints and floats.
numerizerDoc = True
# Spell check the text and perform corrections.
spellCorrectDoc = False

# --- User-defined entity label / ID ---
entLabel, entId = "pump_component", "SSC"
# --- User-defined causal keyword label / ID ---
causalLabel, causalID = "causal_keywords", "causal"

# --- User-provided entity list and causal keyword list ---
ents = list()
causalList = list()

# Bracket kinds passed to the 'brackets' preprocessing step.
removeBrackets = ['curly', 'square', 'round']
# Characters passed to the 'repeating_chars' step.
# TODO: extend repeating_chars to handle a list of chars; right now only one
# char can be passed (the first entry is used below).
removeRepeatings = ['.']
# Punctuation marks passed to the 'punctuation' step. Right now punctuation
# is replaced with whitespace; we may need to replace it with nothing instead.
removePunctuation = ['/', "#", '~']
# TODO: add replace functions, for example, replace acronyms with full names.

# Ordered list of preprocessing steps. 'currency_symbols' and 'numbers' are
# intentionally left out (they were disabled in the original configuration).
preprocessorList = [
    'bullet_points', 'hyphenated_words', 'quotation_marks',
    'unicode', 'repeating_chars', 'accents',
    'brackets', 'html_tags', 'punctuation',
    'emails', 'emojis', 'hashtags',
    'phone_numbers', 'urls', 'user_handles',
    'whitespace', 'numerize',
]

# Per-step keyword options, keyed by step name.
preprocessorOptions = dict(
    repeating_chars=dict(chars=removeRepeatings[0], maxn=1),
    unicode=dict(form='NFKC'),
    accents=dict(fast=False),
    brackets=dict(only=removeBrackets),
    punctuation=dict(only=removePunctuation),
)

# Build the preprocessing object from the step list and the per-step options
# defined above; it is later called directly on the raw text (preprocess(doc)).
preprocess = Preprocessing(preprocessorList, preprocessorOptions)

### 3. Load the entity list and causal keyword list from files, or provide them directly

In [3]:
# Entities: flatten every cell of the entity CSV into a single list and add
# it to the user-provided entities. Empty cells (read by pandas as NaN, e.g.
# when columns have different lengths) are skipped, consistent with the
# dropna() applied to the causal keywords below.
entityFile = config.nlpConfig['files']['entity_file']
entityList = [
    entity
    for entity in pd.read_csv(entityFile).values.ravel().tolist()
    if pd.notna(entity)
]
ents.extend(entityList)

# Causal keywords: collect the unique, non-empty values of every column.
causalFilename = config.nlpConfig['files']['cause_effect_keywords_file']
ds = pd.read_csv(causalFilename, skipinitialspace=True)
for col in ds.columns:
    # dict.fromkeys de-duplicates like set() but keeps first-seen order, so
    # the keyword order is reproducible across kernel restarts (set ordering
    # of strings varies with hash randomization).
    causalList.extend(dict.fromkeys(ds[col].dropna()))

### 4. Generate patterns that can be used in NER 

In [4]:
# Both pattern lists are generated with the same spaCy pipeline and the same
# token attribute ("LEMMA"); only the terms, label and id differ.
patternArgs = {'nlp': nlp, 'attr': "LEMMA"}
patternsEnts = generatePatternList(ents, label=entLabel, id=entId, **patternArgs)
patternsCausal = generatePatternList(causalList, label=causalLabel, id=causalID, **patternArgs)

### 5. Create Rule-based matcher with entity list and causal entity list

In [5]:
# NOTE(review): the matcher's `entLabel` argument receives the entity ID
# (entId), not entLabel, and `causalKeywordLabel` receives causalID —
# presumably the matcher identifies entities by ID; confirm against
# RuleBasedMatcher.
matcher = RuleBasedMatcher(nlp, entLabel=entId, causalKeywordLabel=causalID)

# Register one entity ruler per pattern list.
entityRulers = (
    ('ssc_entity_ruler', patternsEnts),
    ('causal_keywords_entity_ruler', patternsCausal),
)
for rulerName, rulerPatterns in entityRulers:
    matcher.addEntityPattern(rulerName, rulerPatterns)

24-Oct-23 20:04:16 dackar.workflows.RuleBasedMatcher INFO     Create instance of RuleBasedMatcher
24-Oct-23 20:04:20 dackar.utils.nlp.nlp_utils INFO     Model: core_web_lg, Language: en
24-Oct-23 20:04:20 dackar.utils.nlp.nlp_utils INFO     Available pipelines:pysbdSentenceBoundaries, tok2vec, tagger, parser, attribute_ruler, lemmatizer, mergePhrase, normEntities, initCoref, aliasResolver, coreferee, anaphorCoref, anaphorEntCoref


### 6. Read Raw Text Data and Preprocess it

In [6]:
# Read the raw text data; users can also assign a raw string to `doc` here.
textFile = config.nlpConfig['files']['text_file']
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (e.g. cp1252 on Windows).
with open(textFile, 'r', encoding='utf-8') as ft:
    doc = ft.read()

# Clean the text with the configured preprocessing steps.
if cleanDoc:
    doc = preprocess(doc)
# Convert natural-language numerics into numbers.
# NOTE(review): 'numerize' is also listed in preprocessorList, so with both
# flags enabled the text may be numerized twice — confirm this is intended.
if numerizerDoc:
    doc = numerizer.numerize(doc)

### 7. Correct the doc 

In [7]:
# Spell checkers supported by SpellChecker; the first one is used below.
availCheckers = ['autocorrect', 'ContextualSpellCheck']
if spellCorrectDoc:
    checker = SpellChecker(doc, checker=availCheckers[0])
    # Show the words flagged as misspelled so the user can whitelist some.
    misspelledWords = checker.getMisspelledWords()
    print('MisspelledWords: ', ','.join(misspelledWords))
    updatedWords = input('Provide the words that will not be treated as misspelled words (comma sperated words):')
    # Drop empty tokens: ''.split(',') returns [''], so without this filter an
    # empty answer (or a stray comma) would add '' to the dictionary and the
    # emptiness check below would never be False.
    updatedWords = [word.strip() for word in updatedWords.split(',') if word.strip()]
    if updatedWords:
        checker.addWordsToDictionary(updatedWords)
    doc = checker.correct()

In [8]:
# The raw text needs to be converted to lower case so that spaCy can perform
# POS tagging correctly.
doc = doc.lower()

In [9]:
# Run the rule-based matcher on the preprocessed, lower-cased text. Per the
# log output of this cell, this performs health status extraction followed by
# causal relation extraction.
matcher(doc)

24-Oct-23 20:04:21 dackar.workflows.RuleBasedMatcher INFO     Start to extract health status
24-Oct-23 20:04:21 dackar.workflows.RuleBasedMatcher INFO     End of health status extraction!
24-Oct-23 20:04:21 dackar.workflows.RuleBasedMatcher INFO     Start to extract causal relation using OPM model information
'
'
'
'
'
'
24-Oct-23 20:04:21 dackar.workflows.RuleBasedMatcher INFO     End of causal relation extraction!


0: relay(37), relay(52); 1: asdvs(224), asdvs(247), asdvs(294), asdvs(311), asdvs(379); 2: fuse(331), fuse(357); 3: relay(425), relay(443), relay(462), relay(466), relay(481); 4: the duty cycle(440), it(450); 5: asdvs(486), asdvs(500), asdvs(547); 6: relay(556), relay(601), relay(620); 7: coil(591), it(608); 8: asdvs(672), asdvs(678); 9: relay(708), relay(720), relay(737); 10: the preventive maintenance optimization code(713), it(724)


TODO:
1. Be able to retrieve information in the Jupyter Notebook by loading CSV files
2. Be able to analyze results directly, for example, via a knowledge graph
3. Update rules for excavator use case
4. Review regexp 

In [10]:
# # Following used to retrieve causal effect information
# causalEffect = config.nlpConfig['files']['output_causal_effect_file']
# causalEffect = pd.read_csv(causalEffect)

In [11]:
# Load the health-status results from the CSV file named in the NLP config.
# The path gets its own name so `healthStatus` only ever refers to the table.
healthStatusFile = config.nlpConfig['files']['output_health_status_file']
healthStatus = pd.read_csv(healthStatusFile)

In [12]:
# Display the health-status table loaded from the CSV output.
healthStatus

Unnamed: 0.1,Unnamed: 0,entities,root,status keywords,health statuses,conjecture,sentence
0,0,control room,,,an acrid odor,False,"at the palisades nuclear plant, on june 16, 20..."
1,1,steam dump control relay,,,failed,False,investigation revealed that the steam dump con...
2,2,atmospheric steam dump valves,,,inoperable,False,investigation revealed that the steam dump con...
3,3,relay,,,replaced,False,the relay was replaced and the asdvs were retu...
4,4,asdvs,return,,service,False,the relay was replaced and the asdvs were retu...
...,...,...,...,...,...,...,...
67,67,relay,,,replaced,False,the fuse and relay were replaced.
68,68,relay,,show,,False,the preventive maintenance optimization code i...
69,69,relay,,,maintenance,True,this action will appropriately prioritize main...
70,70,atmospheric steam dump valves,,,inoperable,False,atmospheric steam dump valves inoperable due t...


In [13]:
# Print every row of the health-status table as a plain list, so that the
# full (untruncated) sentence text is visible.
for rowIdx in range(len(healthStatus)):
    rowValues = healthStatus.iloc[rowIdx]
    print(list(rowValues))

[0, 'control room', nan, nan, 'an acrid odor', False, 'at the palisades nuclear plant, on june 16, 2021, at 1550 edt, with the plant in mode 1, at 100% power, operations identified an acrid odor in the control room.']
[1, 'steam dump control relay', nan, nan, 'failed', False, 'investigation revealed that the steam dump control relay had failed, rendering all 4 atmospheric steam dump valves inoperable.']
[2, 'atmospheric steam dump valves', nan, nan, 'inoperable', False, 'investigation revealed that the steam dump control relay had failed, rendering all 4 atmospheric steam dump valves inoperable.']
[3, 'relay', nan, nan, 'replaced', False, 'the relay was replaced and the asdvs were returned to service.']
[4, 'asdvs', 'return', nan, 'service', False, 'the relay was replaced and the asdvs were returned to service.']
[5, 'control room', nan, nan, 'an acrid odor', False, 'at the palisades nuclear plant, on june 16, 2021, at 1550 edt, with the plant in mode 1, at 100% power, operators identi