# Demo for Rule-Based Natural Language Processing

### 1. Set up the path, so that the NLP modules can be found

In [1]:
import os
import sys
# Resolve the directory two levels above the current working directory and make it
# importable, so the project's NLP packages can be found by the import cells below.
nlpPath = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
# Only add the path once: re-running this cell must not pile up duplicate sys.path entries.
if nlpPath not in sys.path:
    sys.path.append(nlpPath)

### 2. Load spaCy module

In [2]:
import spacy
# Load the "en_core_web_lg" spaCy pipeline; the exclude list is empty, so no
# component is excluded at load time.
nlp = spacy.load(
    "en_core_web_lg",
    exclude=[],
)

### 3. Load other modules

In [3]:
import pandas as pd

### 4. Import NLP modules 

In [4]:
from nlp.RuleBasedMatcher import RuleBasedMatcher
from nlp import config
from nlp.nlp_utils import generatePatternList
from utils.nlpUtils.OPLparser import OPLentityParser

### 5. Set up logging  

In [5]:
import logging
# Configure the root logger at DEBUG level. Each record shows a timestamp,
# the logger name (padded to 20 chars), the level (padded to 8 chars) and the message.
logging.basicConfig(
    level=logging.DEBUG,
    datefmt='%d-%b-%y %H:%M:%S',
    format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s',
)

### 6. Read and process entities

In [6]:
# Location of the entity CSV, taken from the NLP configuration.
entityFile = config.nlpConfig['files']['entity_file']
# Flatten every cell of the CSV into one list. Empty cells are parsed by pandas as NaN;
# drop them so only real entries are turned into patterns (the causal-keyword cell
# applies the same filtering through dropna()).
entityList = [ent for ent in pd.read_csv(entityFile).values.ravel().tolist() if not pd.isna(ent)]
# De-duplicate the entities before generating patterns.
ents = set(entityList)
label = "pump_component"
entId = "SSC"
# Build the entity patterns with the shared label and id.
# NOTE(review): attr="LEMMA" presumably requests lemma-based matching — confirm in nlp_utils.
patternsOPM = generatePatternList(ents, label=label, id=entId, nlp=nlp, attr="LEMMA")

### 7. Read and process causal keywords 

In [7]:
# Label and id attached to every causal-keyword pattern.
causalLabel = "causal_keywords"
causalID = "causal"
# Reset on every run so re-executing this cell does not accumulate duplicate patterns.
patternsCausal = []
causalFilename = config.nlpConfig['files']['cause_effect_keywords_file']
# skipinitialspace=True makes pandas ignore blanks that follow each delimiter.
ds = pd.read_csv(causalFilename, skipinitialspace=True)
for col in ds.columns:
    # Columns may differ in length, so drop the missing cells before de-duplicating.
    # The set is named `causalKeywords` (not `vars`) so the Python builtin `vars`
    # is not shadowed in the notebook's global namespace.
    causalKeywords = set(ds[col].dropna())
    patternsCausal.extend(generatePatternList(causalKeywords, label=causalLabel, id=causalID, nlp=nlp, attr="LEMMA"))

### 8. Create Rule-based matcher with entity list and causal entity list

In [8]:
# Names under which the two sets of patterns are registered with the matcher.
name = 'ssc_entity_ruler'
causalName = 'causal_keywords_entity_ruler'

# Build the matcher on the loaded pipeline, passing the entity id and the
# causal-keyword id defined in the previous cells.
matcher = RuleBasedMatcher(
    nlp,
    entLabel=entId,
    causalKeywordLabel=causalID,
)

# Register the entity patterns first, then the causal-keyword patterns.
matcher.addEntityPattern(name, patternsOPM)
matcher.addEntityPattern(causalName, patternsCausal)

17-Oct-22 15:48:04 nlp.RuleBasedMatcher INFO     Create instance of RuleBasedMatcher
17-Oct-22 15:48:07 nlp.nlp_utils        INFO     Model: core_web_lg, Language: en
17-Oct-22 15:48:07 nlp.nlp_utils        INFO     Available pipelines:pysbdSentenceBoundaries, tok2vec, tagger, parser, attribute_ruler, lemmatizer, mergePhrase, normEntities, initCoref, aliasResolver, coreferee, anaphorCoref, anaphorEntCoref


### 9. Read the input text file (alternatively, users can provide a raw string)

In [9]:
# Path of the input text, taken from the NLP configuration.
textFile = config.nlpConfig['files']['text_file']
# Read the whole file into one string. The encoding is pinned so the text is decoded
# the same way on every platform instead of following the locale default.
# NOTE(review): assumes the text file is UTF-8 encoded — confirm for non-ASCII inputs.
with open(textFile, 'r', encoding='utf-8') as ft:
    doc = ft.read()

### 10. Process raw string data using matcher

In [10]:
# Run the rule-based matcher on the raw text read above. The call's return value
# is the last expression of the cell, so it is displayed as the cell output.
matcher(doc)

17-Oct-22 15:48:08 nlp.RuleBasedMatcher DEBUG    Entity Ruler Matches:
17-Oct-22 15:48:08 nlp.RuleBasedMatcher DEBUG    Print Coreference Info:
17-Oct-22 15:48:08 nlp.RuleBasedMatcher INFO     Start to extract health status
17-Oct-22 15:48:08 nlp.RuleBasedMatcher DEBUG    pump health status: A leak
17-Oct-22 15:48:08 nlp.RuleBasedMatcher DEBUG    pump health status: not operating
17-Oct-22 15:48:08 nlp.RuleBasedMatcher DEBUG    pump health status: inoperative
17-Oct-22 15:48:08 nlp.RuleBasedMatcher DEBUG    pump health status: signs of past leakage
17-Oct-22 15:48:08 nlp.RuleBasedMatcher DEBUG    Pump health status: not enough flow during test
17-Oct-22 15:48:08 nlp.RuleBasedMatcher DEBUG    pump shaft health status: Slight Vibrations
17-Oct-22 15:48:08 nlp.RuleBasedMatcher DEBUG    Pump health status: not responding
17-Oct-22 15:48:08 nlp.RuleBasedMatcher DEBUG    pump bearings health status: Rupture
17-Oct-22 15:48:08 nlp.RuleBasedMatcher DEBUG    pump shaft health status: degradatio

[('pump', 'pump_component', 'SSC'), ('pump', 'pump_component', 'SSC'), ('pump', 'pump_component', 'SSC'), ('pump', 'pump_component', 'SSC'), ('Pump', 'pump_component', 'SSC'), ('pump shaft', 'pump_component', 'SSC'), ('Pump', 'pump_component', 'SSC'), ('pump bearings', 'pump_component', 'SSC'), ('pump shaft', 'pump_component', 'SSC'), ('pump bearings', 'pump_component', 'SSC'), ('pump shaft', 'pump_component', 'SSC'), ('Power supply', 'pump_component', 'SSC'), ('Pump', 'pump_component', 'SSC'), ('power supply', 'pump_component', 'SSC'), ('Pump', 'pump_component', 'SSC'), ('impeller', 'pump_component', 'SSC'), ('Pump', 'pump_component', 'SSC'), ('impeller', 'pump_component', 'SSC'), ('pump', 'pump_component', 'SSC'), ('pump', 'pump_component', 'SSC'), ('pump shaft', 'pump_component', 'SSC'), ('pump', 'pump_component', 'SSC'), ('pump', 'pump_component', 'SSC'), ('pump shaft', 'pump_component', 'SSC'), ('motor', 'pump_component', 'SSC'), ('Pump', 'pump_component', 'SSC'), ('pump shaft', '

### 11. Access processed information from matcher

In [11]:
# Display the causal relations stored on the matcher after processing.
# NOTE(review): `_extractedCausals` is a private attribute; prefer a public accessor if one exists.
matcher._extractedCausals

[[pump bearings,
  Rupture,
  caused,
  pump shaft,
  degradation,
  Rupture of pump bearings caused pump shaft degradation.,
  False],
 [pump bearings,
  Rupture,
  caused,
  pump shaft,
  degradation,
  Rupture of pump bearings caused pump shaft degradation and consequent flow reduction.,
  False],
 [power supply,
  failure,
  due to,
  Pump,
  test,
  Pump test failed due to power supply failure.,
  False],
 [Pump,
  inspection,
  revealed,
  impeller,
  degradation,
  Pump inspection revealed excessive impeller degradation.,
  False],
 [Pump,
  inspection,
  revealed,
  impeller,
  degradation,
  Pump inspection revealed excessive impeller degradation likely due to cavitation.,
  True],
 [pump shaft,
  Several cracks,
  caused,
  pump,
  failure,
  Several cracks on pump shaft were observed; they could have caused pump failure within few days.,
  True],
 [pump shaft,
  vibration,
  causing,
  motor,
  vibrate,
  The pump shaft vibration appears to be causing the motor to vibrate as