In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Here we define the regular-expression patterns applied to each label, and the templates used to place those labels in context within a point name.

In [4]:
import uuid
from weak_supervision import TemplateAnnotator, CharPretokenizer
import spacy
from skweak import utils

# Start from a blank English spaCy pipeline.
nlp = spacy.blank('en')
# Replace the default tokenizer with the project's custom character tokeniser,
# constructed from the pipeline's vocab. Every later nlp(...) call uses it.
nlp.tokenizer = CharPretokenizer(nlp.vocab)

# Regular-expression alternatives for each label, keyed by label name.
# The same dict is handed to every TemplateAnnotator created in this cell.
patterns = {
    # Building
    "BLD": [r"[a-z][a-z0-9]+"],
    # Locations: mostly rooms, plus a couple of specific descriptions
    "ROOM": [
        r"rm-[a-z0-9]+",
        r"bsmt",
        r"server room",
    ],
    # Equipment: mostly a class followed by an identifier. VFDs are special
    # because they are associated with other equipment (e.g. ahu-01-vsd), so
    # we've split them out. This may or may not be optimal.
    "EQUIP": [
        r"(crac|sf|rf|ef|ah|chwp|compres|hwp|hx)(-?[0-9]+)+",
        r"(vfd|fschw)",
    ],
    # Points: any alphanumerics except vfd
    "POINT": [r"(?!.*vfd)[a-z][a-z\- _0-9]*"],
    # Wildcard: matches any text
    "*": [r".*"],
}

# Each template string is used twice: as the annotator's name (used as key and
# for display) and as the actual pattern, hence it is passed as both the first
# and the second argument.
templates = [
    "<BLD>.<*>",
    "<*>.<EQUIP>.<*>",
    "<*>.<EQUIP>-<EQUIP>.<*>",
    "<*>.<EQUIP>-<EQUIP>",
    "<*>.<EQUIP>-<POINT>",
    "<*>.<ROOM>.<*>",
]
template_annotators = [
    TemplateAnnotator(template, template, patterns) for template in templates
]

# The point annotator is special: it is only applied when the previous rules
# don't match, so it is told to exclude every annotator defined above by name.
point_template = "<*>.<POINT>"
point_annotator = TemplateAnnotator(
    point_template,
    point_template,
    patterns,
    to_exclude=[annotator.name for annotator in template_annotators],
)


Construct a pipeline

In [5]:
from skweak import base

# Bundle every label function into a single skweak CombinedAnnotator so they
# can be applied to a document (or a stream of documents) in one call.
pipeline = base.CombinedAnnotator()
pipeline.add_annotators(*template_annotators)
# The point annotator is registered after the template annotators it excludes.
# NOTE(review): presumably the exclusion relies on those annotators having
# already run on the document — confirm before reordering.
# As the last expression, the returned CombinedAnnotator is echoed as cell output.
pipeline.add_annotator(point_annotator)

<skweak.base.CombinedAnnotator at 0x7fcca3786c40>

The next cell is useful for testing the label functions on a single point name, to ensure they match as expected.

In [6]:
# Pick one point name to try; the trailing comment names the template that is
# expected to fire. Uncomment a different line to test another case.
# text = "ebu3b.rm-1200c..zn-t"
text = "ebu3b.chwp3-vfd.dir-sts"  # "<*>.<EQUIP>-<EQUIP>.<*>"
# text = "ebu3b.rm-1208..ef2-c"   # "<*>.<EQUIP>-<POINT>"
# text = "ebu3b.fschw.chwp3-vfd"  # "<*>.<EQUIP>-<EQUIP>"

doc = nlp(text)

# Run each template annotator on its own and render the spans stored under
# its name, so a missing or unexpected match is easy to spot.
for annotator in template_annotators:
    print(annotator.name)
    annotated = annotator(doc)
    utils.display_entities(annotated, annotator.name)

<BLD>.<*>


<*>.<EQUIP>.<*>


<*>.<EQUIP>-<EQUIP>.<*>


<*>.<EQUIP>-<EQUIP>


<*>.<EQUIP>-<POINT>


<*>.<ROOM>.<*>


Load the BMS dataset and convert to spacy documents with a custom character tokeniser

In [7]:
# load corpus and tokenise
import csv

# Read the point names (first column of each CSV row) and tokenise each one
# into a spaCy Doc.
#   * newline='' is what the csv module asks for when it is given a file
#     object, so that line endings and embedded newlines are parsed correctly.
#   * csv.reader yields an empty list for a blank line; such rows are skipped
#     because row[0] would otherwise raise IndexError.
with open('experiments/ebu3b/eub_string.txt', newline='') as f:
    docs = [nlp(row[0]) for row in csv.reader(f) if row]

Apply the label functions to each document

In [8]:
from tqdm import tqdm
# Stream the corpus through the combined annotator with a progress bar.
# The total is passed explicitly (presumably because pipe() returns an
# iterator with no len() — confirm against skweak).
annotated_stream = pipeline.pipe(docs)
docs = list(tqdm(annotated_stream, total=len(docs)))

100%|██████████| 1074/1074 [00:04<00:00, 224.78it/s]


Randomly sample some examples to estimate coverage.

In [9]:
import random 

# Display a few randomly chosen annotated documents to eyeball coverage.
#   * A dedicated, seeded Random instance makes the sample reproducible on
#     Restart & Run All without touching the global random state.
#   * The sample size is capped at the corpus size, since random.sample raises
#     ValueError when asked for more items than the population holds.
SAMPLE_SIZE = 5
sample_rng = random.Random(0)

for doc in sample_rng.sample(docs, min(SAMPLE_SIZE, len(docs))):
    # "*" is the span key passed to display_entities here.
    utils.display_entities(doc, "*")

Run the aggregation model over the labelled documents

In [10]:
# aggregate
from skweak import aggregation

# Aggregate the label-function outputs with skweak's HMM model. The aggregated
# spans are stored under the name "hmm"; only the four real labels are listed
# (the "*" wildcard is not an output label).
# NOTE(review): sequence_labelling=False presumably disables token-level
# (BIO-style) labelling in favour of whole-span labels — confirm in skweak docs.
hmm = aggregation.HMM(
    "hmm",
    ["BLD", "ROOM", "EQUIP", "POINT"],
    sequence_labelling=False,
)

# Fit on the annotated corpus and write the aggregated spans back onto the docs.
docs = hmm.fit_and_aggregate(docs)

Starting iteration 1
Number of processed documents: 1000
Finished E-step with 1066 documents
Starting iteration 2


         1        4714.3815             +nan


Number of processed documents: 1000
Finished E-step with 1066 documents
Starting iteration 3


         2        4726.5188         +12.1373


Number of processed documents: 1000
Finished E-step with 1066 documents
Starting iteration 4


         3        4730.3866          +3.8678


Number of processed documents: 1000
Finished E-step with 1066 documents


         4        4732.6319          +2.2454


Load the gold annotations and generate comparable sets of gold and predicted entities for each annotated example.

In [11]:
import json

# Build two dicts keyed by the raw example text:
#   gold_spans[text]      -> set of (span text, label) from the gold annotations
#   predicted_spans[text] -> set of (span text, label) from the "hmm" spans
# Both dicts use example['text'] as the key, so the metrics cell can look up
# one from the other even if the tokeniser does not reproduce the input text
# exactly in doc.text.
gold_spans = {}
predicted_spans = {}
with open('experiments/ebu3b/annotations.jsonl') as f:
    for line in f:
        # Skip blank lines (e.g. stray newlines at the end of the file);
        # json.loads would raise on them.
        if not line.strip():
            continue
        example = json.loads(line)
        raw_text = example['text']

        # Predicted entities: run the label functions, then the aggregation model.
        doc = hmm(pipeline(nlp(raw_text)))
        predicted_spans[raw_text] = {
            (span.text, span.label_) for span in doc.spans['hmm']
        }

        # Gold entities: slice the annotated character offsets out of the raw
        # text so they are comparable with the predicted (text, label) pairs.
        gold_spans[raw_text] = {
            (raw_text[span['start']:span['end']], span['label'])
            for span in example['spans']
        }

Compute metrics

In [12]:
# Count exact (span text, label) matches across all annotated examples.
# The per-example values are sets, so the counts fall out of set algebra:
#   TP = gold and predicted, FN = gold only, FP = predicted only.
TP, FP, FN = 0, 0, 0

for key, predicted in predicted_spans.items():
    gold = gold_spans[key]
    TP += len(gold & predicted)
    FN += len(gold - predicted)
    FP += len(predicted - gold)

In [13]:
# Precision and recall are defined as 0 when there are no true positives,
# which also keeps both divisions away from a zero denominator.
if TP > 0:
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
else:
    precision = recall = 0

# F1 is the harmonic mean of precision and recall, or 0 when both are 0.
if precision + recall > 0:
    f1 = 2 * precision * recall / (precision + recall)
else:
    f1 = 0

precision, recall, f1

(0.9943767572633552, 0.9866707997520149, 0.9905087910378092)

Sample output of aggregation model

In [14]:
# Point name to run end to end; uncomment the alternative to try another one.
# text = "ebu3b.crac-4.main-fan"
text = "ebu3b.fschw.chwp3-vfd"

# Tokenise, apply the label functions, then the aggregation model.
doc = nlp(text)
doc = pipeline(doc)
doc = hmm(doc)

# Render the aggregated spans stored under "hmm".
utils.display_entities(doc, "hmm")

Summary of entities extracted

In [16]:
from collections import defaultdict

# Collect the distinct entity strings found by the aggregation model, grouped
# by label: entities[label] -> set of span texts.
entities = defaultdict(set)

for doc in docs:
    # Iterate the spans directly: no throwaway list of tuples is built, and the
    # notebook-level `text` variable is not overwritten by a loop variable.
    for span in doc.spans['hmm']:
        entities[span.label_].add(span.text)

# Number of distinct strings per label.
print([(label, len(texts)) for label, texts in entities.items()])
    

[('BLD', 1), ('ROOM', 243), ('POINT', 126), ('EQUIP', 25)]


Enumerate the point entities. These now need to be mapped to specific point classes

In [19]:
# Alphabetical list of the distinct strings labelled POINT in `entities`.
sorted(entities['POINT'])

['acc-time',
 'actclgsp',
 'acthtgsp',
 'ai1-actual',
 'ai2-actual',
 'alm',
 'ao1-cmd',
 'boxmode',
 'bsmt avg clg-pid1',
 'c',
 'chw-lead',
 'chwp-db',
 'clgmaxflo',
 'clgminflo',
 'clgpid-o',
 'co2-hi alarm',
 'co2-level',
 'commonsp',
 'compres1',
 'compres2',
 'curr-lim',
 'current',
 'cwdp-sp',
 'dasp hi out',
 'dasp-hi in',
 'dec-time',
 'dhw-vlv',
 'dir-cmd',
 'dir-sts',
 'direct-cmd',
 'direction',
 'dly',
 'dmpr-pos',
 'dmprpos',
 'do1 cmd',
 'do2 cmd',
 'dpr-c',
 'dpr-cmd',
 'drv-rdy',
 'econocyc',
 'ena-sts',
 'f',
 'fan-c',
 'fault',
 'flt reset',
 'flt-rst',
 'flt-sts',
 'fltr-dp',
 'freq-output',
 'glycool',
 'heating',
 'hi-head1',
 'hi-humd',
 'hoa-sts',
 'htgflow',
 'htgpid-o',
 'humd-pan',
 'humd-tol',
 'humdfier',
 'humidity',
 'hwdp-pid',
 'inv-byps',
 'iso',
 'k-energy',
 'ld-sw',
 'leak-shutdown',
 'lo-humd',
 'lo-temp',
 'local-1',
 'local-2',
 'lohum-stpt',
 'loss-air',
 'loss-h2o',
 'lotemp-stpt',
 'main-fan',
 'mtw-dp load shed',
 'mtws-t',
 'no-e-pow',
 'occ

Enumerate the equipment entities. These now need to be mapped to specific equipment classes

In [20]:
# Alphabetical list of the distinct strings labelled EQUIP in `entities`.
sorted(entities['EQUIP'])

['ah-1',
 'ah1-1',
 'chwp1',
 'chwp2',
 'chwp3',
 'chwp4',
 'crac-1',
 'crac-2',
 'crac-3',
 'crac-4',
 'ef2',
 'ef5',
 'fschw',
 'hwp1',
 'hwp2',
 'hx2',
 'rf1',
 'rf2',
 'rf3',
 'rf4',
 'sf1',
 'sf2',
 'sf3',
 'sf4',
 'vfd']