# Emergent Activity Analysis demo

In [None]:
import os, sys
import warnings
import spacy
from spacy.matcher import Matcher
warnings.filterwarnings("ignore")

# Make a sibling `src` directory importable. The path is resolved from the
# current working directory (one level up, then `src`), so the result depends
# on where the kernel was started.
cwd = os.getcwd()
frameworkDir = os.path.abspath(os.path.join(cwd, os.pardir, 'src'))
# Guard the append so re-running this cell does not pile up duplicate entries.
if frameworkDir not in sys.path:
    sys.path.append(frameworkDir)

from dackar.text_processing.Preprocessing import Preprocessing
from dackar.text_processing.Abbreviation import Abbreviation

## Text example

In [6]:
# General example: sample text used by the cells below. It mixes work-order
# references written in several ways ("wo101", "wo# 103", "WO# 84658"), hour
# abbreviations ("8hr", "8-hr", "24 hrs"), alphanumeric identifiers, and
# bracket / quote / symbol characters.
content = """ wo101 wo 102  wo# 103 , wo#104 or wo #105 wo # 106 or a 107 wrong wo .
ABCD01D hrs- 8hr PERFORM 8-hr REPAIRS IF 10-hrs REQUIRED 24hrs (contingency 24 hrs).
1EFGH/J08K ERECT AB-7603 FOR IJKL-7148 XYZA7148abc OPGH0248 M-N 100 for WO# 84658 1BC/E08D-34r.
RELEASE [CLEARANCE] #3693 RED/Replace # the "A" ** (Switch).
A218-82-9171 -  REMOVE {INSUL}  [ISO].
"""

## Pipeline creation
`Preprocessing` requires a list of the desired preprocessors and a dictionary of any additional textacy options for them. Each top-level key of the options dictionary is the name of the preprocessor it configures. See the Text Preprocessing section of https://textacy.readthedocs.io/en/latest/ for the available options.

In [7]:
# Names of the preprocessing steps to run.
preprocessorList = [
    'hyphenated_words',
    'punctuation',
    'quotation_marks',
    'whitespace',
]

# Options are keyed by preprocessor name. Here the 'punctuation' step is given
# an explicit 'only' list of symbols; in the recorded output, characters outside
# this list (such as '-', '/', ',' and '.') are still present.
punctuationOnly = [
    '(', ')', '[', ']', '{', '}',
    '*', '#', '~', '@', '$', '^',
    ':', ';', '=', '_', '"', '!',
]
preprocessorOptions = {'punctuation': {'only': punctuationOnly}}

# Build the pipeline, run it on the sample text and show the result.
preprocess = Preprocessing(preprocessorList, preprocessorOptions)
post = preprocess(content)
print(post)

wo101 wo 102 wo 103 , wo 104 or wo 105 wo 106 or a 107 wrong wo .
ABCD01D hrs- 8hr PERFORM 8-hr REPAIRS IF 10-hrs REQUIRED 24hrs contingency 24 hrs .
1EFGH/J08K ERECT AB-7603 FOR IJKL-7148 XYZA7148abc OPGH0248 M-N 100 for WO 84658 1BC/E08D-34r.
RELEASE CLEARANCE 3693 RED/Replace the A Switch .
A218-82-9171 - REMOVE INSUL ISO .


### Abbreviation Handling

In [None]:
abbreviation = Abbreviation()
# Kept for reference — presumably shows the abbreviation dictionary currently in use:
#   abbrDict = abbreviation.getAbbreviation()
#   print(abbrDict)

# FIXME: substitution does not work well when the abbreviation is combined with
# '-' and numbers, such as '8-hr' or '8hr' (the recorded output shows '8hour'
# and '24hours', i.e. no space is inserted).
# Alternative dictionary with explicit '-' variants, kept for reference:
#   abbrDict = {'hr':'hour', 'hrs':'hours', '-hr':' hour', '-hrs':'hours'}
abbrDict = {
    'hr': 'hour',
    'hrs': 'hours',
}
# presumably reset=True discards previously registered abbreviations — TODO confirm
abbreviation.updateAbbreviation(abbrDict, reset=True)

# Apply the substitutions to the preprocessed text and show the result.
cleanedText = abbreviation.abbreviationSub(post)
print(cleanedText)

wo101 wo 102 wo 103 , wo 104 or wo 105 wo 106 or a 107 wrong wo. abcd01d hours- 8hour perform 8-hour repairs if 10-hours required 24hours contingency 24 hours. 1efgh/j08k erect ab-7603 for ijkl-7148 xyza7148abc opgh0248 m-n 100 for wo 84658 1bc/e08d-34r. release clearance 3693 red/replace the a switch. a218-82-9171 - remove insul iso. 


In [9]:
# Load the "en_core_web_lg" spaCy pipeline; its vocabulary backs the token matcher.
nlp = spacy.load("en_core_web_lg")
matcher = Matcher(nlp.vocab)
# Two alternative token patterns for work-order (WO) identifiers:
#   1. a token whose lowercase form is "wo", then zero or more punctuation
#      tokens, then an all-digit token (e.g. "wo 102");
#   2. a single token in which digits directly follow "wo" (e.g. "wo101").
# The regex is a raw string so `\d` reaches the regex engine unchanged instead
# of being parsed as an (invalid) Python string escape.
pattern = [
    [{"LOWER": "wo"}, {"IS_PUNCT": True, "OP": "*"}, {"IS_DIGIT": True}],
    [{"TEXT": {"REGEX": r"(?<=wo)\d+"}}],
]
matcher.add("IdentifyWO", pattern)
# Parse the cleaned text and collect (match_id, start, end) token-index triples.
doc = nlp(cleanedText)
matches = matcher(doc)

In [10]:
# Print the text of every matched token span, one per line.
for match_id, start, end in matches:
    print(doc[start:end].text)

wo101
wo 102
wo 103
wo 104
wo 105
wo 106
wo 84658


In [11]:
# Fresh matcher: flag every token whose text contains at least one ASCII letter
# and at least one digit (both lookaheads must succeed), e.g. "abcd01d" or "8hour".
matcher = Matcher(nlp.vocab)
# Raw string so `\S` is passed to the regex engine verbatim rather than being
# parsed as an (invalid) Python string escape.
pattern = [[{"TEXT": {"REGEX": r"(?=\S*[a-zA-Z])(?=\S*[0-9])"}}]]
# NOTE(review): the key "IdentifyWO" suggests work orders only, but this pattern
# matches any mixed letter/digit token — consider renaming after checking
# whether anything relies on the key.
matcher.add("IdentifyWO", pattern)
doc = nlp(cleanedText)
matches = matcher(doc)
# Print the text of every matched token span, one per line.
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

wo101
abcd01d
8hour
24hours
1efgh
j08k
ab-7603
ijkl-7148
xyza7148abc
opgh0248
1bc
e08d-34r
a218
