# Demo For NLP Workflow: based on branch wangc/nlp

### 1. Set Paths and Load Required Modules
- For the required libraries, please check SR2ML/dependencies.xml

In [1]:
# External Modules #
import os
import sys
import pandas as pd
import spacy
import logging
import numerizer 
#######################

# Settings #
# Put the directory two levels above the current working directory on the
# import path so the internal modules imported below can be found.
nlpPath = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
sys.path.append(nlpPath)
########################

# Internal Modules #
from nlp.RuleBasedMatcher import RuleBasedMatcher
from nlp import config
from nlp.nlp_utils import generatePatternList
from utils.nlpUtils.OPLparser import OPLentityParser
from nlp.Preprocessing import Preprocessing
from nlp.Preprocessing import SpellChecker
#########################

# Root logger configuration; set logLevel to logging.DEBUG for verbose output.
logFormat = '%(asctime)s %(name)-20s %(levelname)-8s %(message)s'
logDateFormat = '%d-%b-%y %H:%M:%S'
logLevel = logging.INFO
logging.basicConfig(format=logFormat, datefmt=logDateFormat, level=logLevel)
# Load the large English spaCy pipeline; the empty exclude list drops no component.
nlp = spacy.load("en_core_web_lg", exclude=[])

### 2. Initialize variables 

In [2]:
# --- Processing switches ---
cleanDoc = True          # preprocess the raw doc text, i.e., remove brackets, repeated characters, punctuation
numerizerDoc = True      # convert natural language numerics into ints and floats
spellCorrectDoc = False  # spell check and perform corrections

# --- User-defined labels and IDs ---
entLabel, entId = "pump_component", "SSC"           # entity label / entity ID
causalLabel, causalID = "causal_keywords", "causal"  # causal keyword label / causal keyword ID

# --- User-provided lists (filled in by later cells) ---
ents = []        # entities
causalList = []  # causal keywords

# --- Targets for the cleanup steps ---
removeBrackets = ['curly', 'square', 'round']
removeRepeatings = ['.']
# TODO: extend repeating_chars to handle a list of chars; right now only one char can be passed
removePunctuation = ['/', "#", '~']  # right now punctuation is replaced with whitespace; we may need to replace it with None
# TODO: add replace functions, for example, replace acronyms with full name

# Names of the preprocessing steps, in the order they are listed to Preprocessing.
preprocessorList = [
    'bullet_points', 'hyphenated_words', 'quotation_marks', 'unicode',
    'repeating_chars', 'accents', 'brackets', 'html_tags', 'punctuation',
    # 'currency_symbols',
    'emails', 'emojis', 'hashtags',
    # 'numbers',
    'phone_numbers', 'urls', 'user_handles', 'whitespace', 'numerize',
]
# Per-step keyword options, keyed by step name.
preprocessorOptions = dict(
    repeating_chars=dict(chars=removeRepeatings[0], maxn=1),
    unicode=dict(form='NFKC'),
    accents=dict(fast=False),
    brackets=dict(only=removeBrackets),
    punctuation=dict(only=removePunctuation),
)

# Instantiate Preprocessing with the step names and their per-step options;
# the resulting object is called on the raw text in a later cell.
preprocess = Preprocessing(preprocessorList, preprocessorOptions)

### 3. Load entity list and causal list or provide directly

In [3]:
# Entities: flatten every cell of the entity CSV into one list and append it to ents.
entityFile = config.nlpConfig['files']['entity_file']
entityTable = pd.read_csv(entityFile)
entityList = entityTable.values.ravel().tolist()
ents += entityList

# Causal keywords: for each CSV column, add its distinct non-missing values to causalList.
causalFilename = config.nlpConfig['files']['cause_effect_keywords_file']
ds = pd.read_csv(causalFilename, skipinitialspace=True)
for colName in ds.columns:
    uniqueKeywords = set(ds[colName].dropna())
    causalList.extend(uniqueKeywords)

### 4. Generate patterns that can be used in NER 

In [4]:
# Keyword arguments shared by both pattern sets: the loaded spaCy pipeline and the token attribute.
# NOTE(review): attr="LEMMA" presumably makes the patterns match on token lemmas — confirm in nlp.nlp_utils.
sharedPatternArgs = {'nlp': nlp, 'attr': "LEMMA"}
patternsEnts = generatePatternList(ents, label=entLabel, id=entId, **sharedPatternArgs)
patternsCausal = generatePatternList(causalList, label=causalLabel, id=causalID, **sharedPatternArgs)

### 5. Create Rule-based matcher with entity list and causal entity list

In [5]:
# NOTE(review): the ID values (entId / causalID), not the label strings, are passed as
# entLabel / causalKeywordLabel — confirm this is what RuleBasedMatcher expects.
matcher = RuleBasedMatcher(nlp, entLabel=entId, causalKeywordLabel=causalID)

# Register the entity patterns first, then the causal keyword patterns.
for rulerName, rulerPatterns in (
    ('ssc_entity_ruler', patternsEnts),
    ('causal_keywords_entity_ruler', patternsCausal),
):
    matcher.addEntityPattern(rulerName, rulerPatterns)

19-Jan-23 10:46:34 nlp.RuleBasedMatcher INFO     Create instance of RuleBasedMatcher
19-Jan-23 10:46:37 nlp.nlp_utils        INFO     Model: core_web_lg, Language: en
19-Jan-23 10:46:37 nlp.nlp_utils        INFO     Available pipelines:pysbdSentenceBoundaries, tok2vec, tagger, parser, attribute_ruler, lemmatizer, mergePhrase, normEntities, initCoref, aliasResolver, coreferee, anaphorCoref, anaphorEntCoref


### 6. Read Raw Text Data and Preprocess it

In [6]:
# Load the raw text from the configured file (users can also assign a raw string to `doc` here).
textFile = config.nlpConfig['files']['text_file']
with open(textFile, mode='r') as rawTextHandle:
    doc = rawTextHandle.read()

# Optional cleanup, applied in order: the preprocessing pipeline, then numerizer.
doc = preprocess(doc) if cleanDoc else doc
# NOTE(review): preprocessorList already ends with a 'numerize' step, so numerals may be
# converted twice when both switches are on — confirm this is intended.
doc = numerizer.numerize(doc) if numerizerDoc else doc

### 7. Correct the doc 

In [7]:
# Spell-check back ends that can be selected; the first one is used below.
availCheckers = ['autocorrect', 'ContextualSpellCheck']
if spellCorrectDoc:
    checker = SpellChecker(doc, checker=availCheckers[0])
    # Show the words flagged as misspelled so the user can whitelist some of them.
    misspelledWords = checker.getMisspelledWords()
    print('MisspelledWords: ', ','.join(misspelledWords))
    updatedWords = input('Provide the words that will not be treated as misspelled words (comma sperated words):')
    # ''.split(',') returns [''], so blank entries (empty input, trailing or doubled
    # commas) must be dropped; otherwise an empty string is added to the dictionary.
    updatedWords = [word.strip() for word in updatedWords.split(',') if word.strip()]
    if updatedWords:
        checker.addWordsToDictionary(updatedWords)
    # Replace the document with the checker's corrected text.
    doc = checker.correct()

In [8]:
# The raw text needs to be converted to lower case so that spaCy can perform POS tagging correctly
doc = doc.lower()

In [9]:
# Run the rule-based matcher on the preprocessed text; per its log output this
# extracts health status and causal relations.
matcher(doc)

19-Jan-23 10:46:38 nlp.RuleBasedMatcher INFO     Start to extract health status
"
19-Jan-23 10:46:38 nlp.RuleBasedMatcher INFO     End of health status extraction!
19-Jan-23 10:46:38 nlp.RuleBasedMatcher INFO     Start to extract causal relation using OPM model information
19-Jan-23 10:46:38 nlp.RuleBasedMatcher INFO     End of causal relation extraction!
19-Jan-23 10:46:38 nlp.RuleBasedMatcher INFO     Start to use general extraction method to extract causal relation
19-Jan-23 10:46:38 nlp.RuleBasedMatcher INFO     End of causal relation extraction using general extraction method!






TODO:
1. Be able to retrieve information in the Jupyter Notebook by loading CSV files
2. Be able to analyze results directly, for example, with a knowledge graph
3. Update rules for the excavator use case
4. Review regular expressions

In [10]:
# # Following used to retrieve causal effect information
# causalEffect = config.nlpConfig['files']['output_causal_effect_file']
# causalEffect = pd.read_csv(causalEffect)

In [11]:
# Read the health status output CSV named in the configuration into a DataFrame.
healthStatusFile = config.nlpConfig['files']['output_health_status_file']
healthStatus = pd.read_csv(healthStatusFile)

In [12]:
# Display the loaded health status table as the cell output.
healthStatus

Unnamed: 0.1,Unnamed: 0,entities,root,status keywords,health statuses,conjecture,sentence
0,0,grease line,,,,False,grease line on bucket broken.
1,1,bucket,,,grease line,False,grease line on bucket broken.
2,2,steel tube,,,,False,steel tube on bucket clam broken.
3,3,bucket clam,,,steel tube,False,steel tube on bucket clam broken.
4,4,rh120c bucket front lip,,,,False,rh120c bucket front lip changeout.
5,5,grease line,,,\n,False,"loose grease line on bucket ,."
6,6,bucket,,,loose grease line,False,"loose grease line on bucket ,."
7,7,bucket teeth,,,missing,False,2 bucket teeth and adaptors missing.
8,8,adaptors,,,missing,False,2 bucket teeth and adaptors missing.
9,9,bucket cyl gland seal,,,leaking,False,l h bucket cyl gland seal leaking.


In [13]:
# Print every row of the health status table as a plain list, one row per line.
rowCount = len(healthStatus)
for rowPos in range(rowCount):
    rowValues = list(healthStatus.iloc[rowPos])
    print(rowValues)

[0, 'grease line', nan, nan, nan, False, 'grease line on bucket broken.']
[1, 'bucket', nan, nan, 'grease line', False, 'grease line on bucket broken.']
[2, 'steel tube', nan, nan, nan, False, 'steel tube on bucket clam broken.']
[3, 'bucket clam', nan, nan, 'steel tube', False, 'steel tube on bucket clam broken.']
[4, 'rh120c bucket front lip', nan, nan, nan, False, 'rh120c bucket front lip changeout.']
[5, 'grease line', nan, nan, '\n', False, 'loose grease line on bucket ,.']
[6, 'bucket', nan, nan, 'loose grease line', False, 'loose grease line on bucket ,.']
[7, 'bucket teeth', nan, nan, 'missing', False, '2 bucket teeth and adaptors missing.']
[8, 'adaptors', nan, nan, 'missing', False, '2 bucket teeth and adaptors missing.']
[9, 'bucket cyl gland seal', nan, nan, 'leaking', False, 'l h bucket cyl gland seal leaking.']
[10, 'lip pivot grease lines', 'replace', nan, 'busted', False, 'replace busted lip pivot grease lines.']
[11, 'boom valve clam pipe adapter', nan, nan, 'oil leak 