In [1]:
import os
import sys

# Put the directory three levels above the current working directory on the
# module search path.
nlpPath = os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
sys.path.append(nlpPath)

In [2]:
import pandas as pd

In [3]:
# Path of the raw CSV, resolved against the current working directory.
csvPath = os.path.join(os.getcwd(), 'excavator_2015_raw_forpdl.csv')

In [4]:
# Read the CSV at csvPath into a DataFrame with pandas' default parsing options.
excavator_df = pd.read_csv(csvPath)

In [5]:
# Display the first rows of the frame as a quick check of the columns.
excavator_df.head()

Unnamed: 0,BscStartDate,Asset,OriginalShorttext,PMType,Cost
0,2004-07-01,A,BUCKET WON'T OPEN,PM01,183.05
1,2005-03-20,A,L/H BUCKET CYL LEAKING.,PM01,407.4
2,2006-05-05,A,SWAP BUCKET,PM01,0.0
3,2006-07-11,A,FIT BUCKET TOOTH,PM01,0.0
4,2006-11-10,A,REFIT BUCKET TOOTH,PM01,1157.27


In [6]:
# Number of rows in the DataFrame.
len(excavator_df)

5485

In [7]:
# Show rows 191-199 (positional slice) of the short-text column.
excavator_df['OriginalShorttext'].iloc[191:200]

191          Change out bucket teeth.
192     REPLACE MISSING BUCKET TOOTH.
193    REPLACE A MISSING BUCKET TOOTH
194              REPLACE BUCKET TOOTH
195      BUCKET TOOTH MISSING    @R10
196     REPLACE MISSING BUCKET TOOTH.
197         REPAIR BUCKET WEAR PLATES
198                     BUCKET CRACKS
199              REPLACE BUCKET TOOTH
Name: OriginalShorttext, dtype: object

In [8]:
# Component entities collected by reading OriginalShorttext rows 0:200.
# Every entry ends with a comma (including the last one): adjacent string
# literals without a comma are silently concatenated by Python, which would
# merge two entities into one bogus name.
entity_list = [
    'Adapter',
    'Adapter pin',
    'Adaptor',
    'Adaptors',
    'Adaptor keeper pin',
    'Adaptor pins',
    'Air con.',
    'Boom',
    'Bucket',
    'Bucket clam',
    'Bucket clam cylinder',
    'Bucket clam pipe',
    'Bucket crowd cylinder',
    'Bucket cyl',
    'Bucket cylinder',
    'Bucket cylinder grease line',
    'Bucket grease line',
    'Bucket hose',
    'Bucket lube fittings',
    'Bucket pin',
    'Bucket ram hyd hose',
    'Bucket roll cylinder',
    'Bucket teeth',
    'Bucket tip',
    'Bucket tips',
    'Bucket tooth',
    'Bucket wear plates',
    'Gland seal',
    'Grease fitting',
    'Grease hose bucket pin',
    'Grease line',
    'Grease lines',
    'Hose',
    'Hydraulic oil hose',
    'Lip',
    'Lip ear',
    'Lip shroud',
    'Lip shrouds',
    'Lip shroud pin',
    'Lip tips',
    'Lower wear edge plate',
    'Lube fitting',
    'Lube hose',
    'Lube line',
    'Pin',
    'Pins',
    'Stick',
    'Teeth',
    'Tip',
    'Tooth',
    'Wear package',
    'Wear plates',
]

In [9]:
# One-column DataFrame ('Entities') built from entity_list.
entity_df = pd.DataFrame({'Entities': entity_list})

In [10]:
# Display the entity table.
entity_df

Unnamed: 0,Entities
0,Adapter
1,Adapter pin
2,Adaptor
3,Adaptors
4,Adaptor keeper pin
5,Adaptor pins
6,Air con.
7,Boom
8,Bucket
9,Bucket clam


In [11]:
# # write entities to a csv
# entityPath = os.path.join(os.getcwd(), '..', '..', 'data', 'testing_data', 'excavator_entities.csv')
# entity_df = pd.DataFrame({'Entities': entity_list})
# entity_df.to_csv(entityPath, index=False)

# # write text to a file
# txtPath = os.path.join(os.getcwd(), '..', '..', 'data', 'testing_data', 'excavator.txt')
# with open(txtPath, 'w') as f:
#     f.write(".\n".join(excavator_df['OriginalShorttext'][0:200]))

## Follow Congjian's Process

In [12]:
# Project-local matcher, configuration and pattern helper
# (presumably importable through the sys.path entry appended in the first cell — confirm).
from nlp.RuleBasedMatcher import RuleBasedMatcher
from nlp import config
from nlp.nlp_utils import generatePatternList
import spacy
# Load the large English spaCy pipeline; the empty exclude list drops no components.
nlp = spacy.load("en_core_web_lg", exclude=[])
import logging
# Root logger at DEBUG with timestamp, logger name, level and message.
logging.basicConfig(format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.DEBUG)

### Read and process entities

In [13]:
# Entity vocabulary: every cell of the configured entity CSV, flattened into a
# list and then de-duplicated.
entityFile = config.nlpConfig['files']['entity_file']
entityList = pd.read_csv(entityFile).values.ravel().tolist()
ents = set(entityList)

# Label and id passed to the pattern generator for these entities.
label, entId = "excavator_component", "SSC"
patternsOPM = generatePatternList(ents, label=label, id=entId, nlp=nlp, attr="LEMMA")

In [14]:
# Inspect the generated entity patterns.
patternsOPM

[{'label': 'excavator_component',
  'pattern': [{'LOWER': 'adaptor'}, {'LOWER': 'keeper'}, {'LOWER': 'pin'}],
  'id': 'SSC'},
 {'label': 'excavator_component',
  'pattern': [{'LEMMA': 'adaptor'}, {'LEMMA': 'keeper'}, {'LEMMA': 'pin'}],
  'id': 'SSC'},
 {'label': 'excavator_component',
  'pattern': [{'LOWER': 'bucket'}, {'LOWER': 'lube'}, {'LOWER': 'fittings'}],
  'id': 'SSC'},
 {'label': 'excavator_component',
  'pattern': [{'LEMMA': 'bucket'}, {'LEMMA': 'lube'}, {'LEMMA': 'fitting'}],
  'id': 'SSC'},
 {'label': 'excavator_component',
  'pattern': [{'LOWER': 'bucket'},
   {'LOWER': 'ram'},
   {'LOWER': 'hyd'},
   {'LOWER': 'hose'}],
  'id': 'SSC'},
 {'label': 'excavator_component',
  'pattern': [{'LEMMA': 'bucket'},
   {'LEMMA': 'ram'},
   {'LEMMA': 'hyd'},
   {'LEMMA': 'hose'}],
  'id': 'SSC'},
 {'label': 'excavator_component',
  'pattern': [{'LOWER': 'grease'},
   {'LOWER': 'hose'},
   {'LOWER': 'bucket'},
   {'LOWER': 'pin'}],
  'id': 'SSC'},
 {'label': 'excavator_component',
  'pat

### Read and process causal key words

In [15]:
# Build matcher patterns from the configured cause/effect keyword CSV,
# one batch of patterns per column.
causalLabel = "causal_keywords"
causalID = "causal"
patternsCausal = []
causalFilename = config.nlpConfig['files']['cause_effect_keywords_file']
# skipinitialspace=True ignores the blanks that follow each delimiter.
ds = pd.read_csv(causalFilename, skipinitialspace=True)
for col in ds.columns:
    # Named `keywords` rather than `vars`: in a notebook this name is global,
    # and `vars` would shadow the builtin vars() for every later cell.
    # dropna() removes missing cells before de-duplicating.
    keywords = set(ds[col].dropna())
    patternsCausal.extend(generatePatternList(keywords, label=causalLabel, id=causalID, nlp=nlp, attr="LEMMA"))

In [16]:
# Inspect the generated causal-keyword patterns.
patternsCausal

[{'label': 'causal_keywords',
  'pattern': [{'LOWER': 'stimulate'}],
  'id': 'causal'},
 {'label': 'causal_keywords',
  'pattern': [{'LEMMA': 'stimulate'}],
  'id': 'causal'},
 {'label': 'causal_keywords',
  'pattern': [{'LOWER': 'bring'}, {'LOWER': 'about'}],
  'id': 'causal'},
 {'label': 'causal_keywords',
  'pattern': [{'LEMMA': 'bring'}, {'LEMMA': 'about'}],
  'id': 'causal'},
 {'label': 'causal_keywords',
  'pattern': [{'LOWER': 'insitute'}],
  'id': 'causal'},
 {'label': 'causal_keywords',
  'pattern': [{'LEMMA': 'insitute'}],
  'id': 'causal'},
 {'label': 'causal_keywords',
  'pattern': [{'LOWER': 'decrease'}],
  'id': 'causal'},
 {'label': 'causal_keywords',
  'pattern': [{'LEMMA': 'decrease'}],
  'id': 'causal'},
 {'label': 'causal_keywords',
  'pattern': [{'LOWER': 'benefit'}],
  'id': 'causal'},
 {'label': 'causal_keywords',
  'pattern': [{'LEMMA': 'benefit'}],
  'id': 'causal'},
 {'label': 'causal_keywords', 'pattern': [{'LOWER': 'deny'}], 'id': 'causal'},
 {'label': 'causa

### Create Rule-based matcher with entity list and causal list

In [17]:
# Names under which the two pattern sets are registered on the matcher.
name = 'ssc_entity_ruler'
causalName = 'causal_keywords_entity_ruler'

matcher = RuleBasedMatcher(nlp, entLabel=entId, causalKeywordLabel=causalID)
# Entity patterns are added first, causal keywords second.
for rulerName, rulerPatterns in ((name, patternsOPM), (causalName, patternsCausal)):
    matcher.addEntityPattern(rulerName, rulerPatterns)

06-Dec-22 14:15:49 nlp.RuleBasedMatcher INFO     Create instance of RuleBasedMatcher
06-Dec-22 14:15:52 nlp.nlp_utils        INFO     Model: core_web_lg, Language: en
06-Dec-22 14:15:52 nlp.nlp_utils        INFO     Available pipelines:pysbdSentenceBoundaries, tok2vec, tagger, parser, attribute_ruler, lemmatizer, mergePhrase, normEntities, initCoref, aliasResolver, coreferee, anaphorCoref, anaphorEntCoref


### Read text file back in

In [18]:
# Read the whole configured text file into a single string.
textFile = config.nlpConfig['files']['text_file']
with open(textFile) as textHandle:
    doc = textHandle.read()

### Process data with matcher

In [19]:
# matcher(doc)

## Process line by line

In [20]:
# Row label of the work order to look at.
i = 1
# Print that row's short text for reference.
print(excavator_df['OriginalShorttext'][i])
# Create a new matcher and register the entity patterns, then the causal
# keyword patterns, under the same ruler names used in the earlier matcher cell.
name = 'ssc_entity_ruler'
matcher = RuleBasedMatcher(nlp, entLabel=entId, causalKeywordLabel=causalID)
matcher.addEntityPattern(name, patternsOPM)

causalName = 'causal_keywords_entity_ruler'
matcher.addEntityPattern(causalName, patternsCausal)
# NOTE(review): the per-row call is commented out, so the matcher below runs on
# the full `doc` string rather than on the row printed above — confirm this is intended.
# matcher(excavator_df['OriginalShorttext'][i])
matcher(doc)

06-Dec-22 14:15:52 nlp.RuleBasedMatcher INFO     Create instance of RuleBasedMatcher


L/H BUCKET CYL LEAKING.


06-Dec-22 14:15:55 nlp.nlp_utils        INFO     Model: core_web_lg, Language: en
06-Dec-22 14:15:55 nlp.nlp_utils        INFO     Available pipelines:pysbdSentenceBoundaries, tok2vec, tagger, parser, attribute_ruler, lemmatizer, mergePhrase, normEntities, initCoref, aliasResolver, coreferee, anaphorCoref, anaphorEntCoref
06-Dec-22 14:15:55 nlp.RuleBasedMatcher DEBUG    Entity Ruler Matches:
06-Dec-22 14:15:55 nlp.RuleBasedMatcher DEBUG    Print Coreference Info:
06-Dec-22 14:15:55 nlp.RuleBasedMatcher INFO     Start to extract health status
06-Dec-22 14:15:55 nlp.RuleBasedMatcher DEBUG    BUCKET health status: OPEN
06-Dec-22 14:15:55 nlp.RuleBasedMatcher DEBUG    BUCKET CYL health status: LEAKING
06-Dec-22 14:15:55 nlp.RuleBasedMatcher DEBUG    BUCKET TOOTH health status: TOOTH
06-Dec-22 14:15:55 nlp.RuleBasedMatcher DEBUG    BUCKET TOOTH health status: TOOTH
06-Dec-22 14:15:55 nlp.RuleBasedMatcher DEBUG    BUCKET CYLINDER health status: CYLINDER
06-Dec-22 14:15:55 nlp.RuleBasedMatche

[('BUCKET', 'excavator_component', 'SSC'), ('BUCKET CYL', 'excavator_component', 'SSC'), ('BUCKET', 'excavator_component', 'SSC'), ('BUCKET TOOTH', 'excavator_component', 'SSC'), ('BUCKET TOOTH', 'excavator_component', 'SSC'), ('BUCKET CYLINDER', 'excavator_component', 'SSC'), ('BUCKET TEETH', 'excavator_component', 'SSC'), ('LIP', 'excavator_component', 'SSC'), ('LIP', 'excavator_component', 'SSC'), ('lip', 'excavator_component', 'SSC'), ('BUCKET TEETH', 'excavator_component', 'SSC'), ('lip', 'excavator_component', 'SSC'), ('tip', 'excavator_component', 'SSC'), ('tip', 'excavator_component', 'SSC'), ('adaptor', 'excavator_component', 'SSC'), ('lip shrouds', 'excavator_component', 'SSC'), ('BUCKET', 'excavator_component', 'SSC'), ('ADAPTOR', 'excavator_component', 'SSC'), ('lip shroud', 'excavator_component', 'SSC'), ('adapter', 'excavator_component', 'SSC'), ('tooth', 'excavator_component', 'SSC'), ('TOOTH', 'excavator_component', 'SSC'), ('wear package', 'excavator_component', 'SSC')

06-Dec-22 14:15:55 nlp.RuleBasedMatcher DEBUG    BUCKET HOSE health status: HOSE
06-Dec-22 14:15:55 nlp.RuleBasedMatcher DEBUG    BUCKET TEETH health status: BROKEN
"
06-Dec-22 14:15:55 nlp.RuleBasedMatcher DEBUG    BUCKET CLAM CYLINDER health status: CYLINDER
06-Dec-22 14:15:55 nlp.RuleBasedMatcher DEBUG    BUCKET TOOTH health status: @R10
06-Dec-22 14:15:55 nlp.RuleBasedMatcher DEBUG    BUCKET WEAR PLATES health status: PLATES
06-Dec-22 14:15:55 nlp.RuleBasedMatcher DEBUG    BUCKET health status: CRACKS
06-Dec-22 14:15:55 nlp.RuleBasedMatcher INFO     End of health status extraction!
06-Dec-22 14:15:55 nlp.RuleBasedMatcher INFO     Start to extract causal relation using OPM model information
06-Dec-22 14:15:55 nlp.RuleBasedMatcher DEBUG    Conjuncts pairs: [[BUCKET]]
06-Dec-22 14:15:55 nlp.RuleBasedMatcher DEBUG    Conjuncts pairs: [[BUCKET CYL]]
06-Dec-22 14:15:55 nlp.RuleBasedMatcher DEBUG    Conjuncts pairs: [[BUCKET]]
06-Dec-22 14:15:55 nlp.RuleBasedMatcher DEBUG    Conjuncts pai

Identified Cause-Effect Pairs:

