In [3]:
import numpy as np
import pandas as pd
from weighted_levenshtein import lev
import sklearn.cluster
from sklearn.metrics.cluster import adjusted_rand_score
import re
import itertools  
from nltk import ngrams
from eppy import modeleditor
from eppy.modeleditor import IDF
IDF.setiddname('C:/EnergyPlusV8-8-0/Energy+.idd')

import warnings
warnings.simplefilter(action='ignore', category=DeprecationWarning)

%matplotlib inline

## Extracting core information from BMS point tags

In [2]:
raw = pd.read_csv('../.csv')
tags = raw.copy()
tags['measure'] = tags['tag'].apply(lambda x: '_'.join(x.split('_')[2:]) )
len(tags['measure'].unique())

1743

## Tokenization of BMS tags using X-gram

In [4]:
def razors(string):
    m = len(string)-1
    result = []
    for k in range(min(m+1,7)):
        for bits in itertools.combinations(range(m), k):
            s = [1] * m
            for bit in bits:
                s[bit] = 0
            result.append(s)
    return result

def Xgrams(string):
    substrings = re.split('[ _,:]',string)
    if len(substrings) < 2:
        if len(substrings[0])>8:
            ngrams = [[substrings[0]]]
        else:
            ngrams = []
            for item in razors(string):
                ngram = []
                flag = 0
                for i in range(len(item)):
                    if item[i]:
                        ngram.append(string[flag:i+1])
                        flag = i+1
                ngram.append(string[flag:])
                ngram.sort(key = len,reverse=True)   # so that the longer substrings get matched first
                ngrams.append(ngram)
    else:
        ngrams = [[]]
        for s in substrings:
            if len(s)>3:
                ngrams = list(x+y for x in ngrams for y in [[s]])
            else:
                Ngram = []
                for item in razors(s):
                    ngram = []
                    flag = 0
                    for i in range(len(item)):
                        if item[i]:
                            ngram.append(s[flag:i+1])
                            flag = i+1
                    ngram.append(s[flag:])
                    ngram.sort(key = len,reverse=True)   # so that the longer substrings get matched first
                    Ngram.append(ngram)
                ngrams = list(x+y for x in ngrams for y in Ngram)
    return ngrams

In [5]:
OPCtag = pd.DataFrame(tags['measure'].unique(),columns=['tag'])
OPCtag['Xgrams'] = OPCtag['tag'].apply(lambda x: str.upper(' '.join(x.split('_'))))
OPCtag['Xgrams'] = OPCtag['Xgrams'].apply(lambda x: Xgrams(x))
OPCtag[:50]

Unnamed: 0,tag,Xgrams
0,Floor_1760001_Master_Hyperion_Enabled,"[[FLOOR, 1760001, MASTER, HYPERION, ENABLED]]"
1,Floor_1760001_Master_Hyperion_Mode_Sunny_or_Dark,"[[FLOOR, 1760001, MASTER, HYPERION, MODE, SUNN..."
2,Floor_1760001_Master_Loadshed_Enabled,"[[FLOOR, 1760001, MASTER, LOADSHED, ENABLED]]"
3,Floor_1760001_Timeclock_atrium_evening_Enabled,"[[FLOOR, 1760001, TIMECLOCK, ATRIUM, EVENING, ..."
4,Floor_1760001_Timeclock_Disable_Wall_Control_E...,"[[FLOOR, 1760001, TIMECLOCK, DISABLE, WALL, CO..."
5,Floor_1760001_Timeclock_Dusk_to_11_PM_Enabled,"[[FLOOR, 1760001, TIMECLOCK, DUSK, T, O, 1, 1,..."
6,Floor_1760001_Timeclock_Dusk_to_Dawn_Enabled,"[[FLOOR, 1760001, TIMECLOCK, DUSK, T, O, DAWN,..."
7,Floor_1760001_Timeclock_Interior_Atrium_Lighti...,"[[FLOOR, 1760001, TIMECLOCK, INTERIOR, ATRIUM,..."
8,Floor_1760001_Timeclock_Interior_Lighting_Non_...,"[[FLOOR, 1760001, TIMECLOCK, INTERIOR, LIGHTIN..."
9,ALT OAT FAIL,"[[A, L, T, O, A, T, FAIL], [A, L, T, OA, T, FA..."


## Defining the dictionary used to tokenize the EnergyPlus variables

In [6]:
def updateABB(abb,full):
    global ABBdict
    for i in full:
        if i in list(ABBdict['name']):
            ABBdict['abb'].loc[ABBdict['name']==i].item().add(abb[full.index(i)])
        else:
            ABBdict = ABBdict.append({'abb': set([abb[full.index(i)]]),'name':i}, ignore_index=True)

In [7]:
dict1 = pd.read_csv('../HVACacronym.csv')
dict2 = pd.read_csv('../HVACacronym1.csv')

dict1['full']=dict1['full'].apply(lambda x: str.lower(x))
dict2['name']=dict2['name'].apply(lambda x: str.lower(x))

ABBdict = pd.concat([dict1.rename({'full':'name'}, axis='columns'),dict2])
ABBdict['abb']= ABBdict[['abb']].values.tolist()
ABBdict = pd.DataFrame(ABBdict.groupby('name')['abb'].sum())
ABBdict['name']=ABBdict.index
ABBdict.rename(index=str, columns={0: "abb"},inplace=True)
ABBdict['abb']= ABBdict['abb'].apply(lambda x: set(x))

ABBdict.reset_index(drop=True,inplace=True)

In [8]:
abb = ['OC','TMP','TEMP','SA','STATIC','RA','RH','RM','BM','FLWR','RET','KWATT','KWHR','PM','ENGRY','FWD','PWR','AHU','FAN','SPEED','BYP','CLG','WATER','VLV','TP','SMK','FA','PM','BM','CURR','VOLT','PW']
full=['off coil','temperature','temperature','supply air','pressure','return air','relative humidity','room','btu meter','flow rate','return','power','energy','power meter','energy','supply','power','ahu','fan','speed','bypass','cooling','water','valve','time program','smoke','fresh air','power meter','btu meter','current','voltage','power']
updateABB(abb,full)

EPname = ['zone','air','energy','heating','cooling','air terminal','pump','boiler','condenser','setpoint','heat exchanger','humidifier','heater','water','people','window','infiltration','equipment','outdoor','drybulb','dewpoint','speed','angle','radiation','pressure','precipitation','occupant','lights','humidity ratio','facility','mass flow rate','interior','gas','office','room','level','extract','fan','supply','air loop','site','power','radiant','fan coil unit','indoor','heat pump','vav','vrf','return','inlet','outlet','relief air','rate']
EPabb = ['RM','A','KWHR','HT','CL','VAV','PMP','B','COND','SP','HX','HUMID','HTR','WTR','PPL','WD','INFIL','EQUIP','O','DB','DP','SPD','AGL','R','PR','P','OCC','LT','HR','FAC','FLWR','INTR','G','OFC','RM','LV','E','F','S','AHU','LOC','KWATT','R','FCU','I','CU','VAV','VRF','R','S','R','EA','KW']
updateABB(EPabb,EPname)

NODEname = ['supply side inlet','supply side outlet','coil air outlet','mixed inlet','splitter outlet','coil outlet','inlet','outlet','supply inlet']
NODEabb = ['R','S','OC','R','S','S','FWD','RET','R']
updateABB(NODEabb,NODEname)

testName = ['electricity','energy','power','demand','rate','coil air outlet','coil outlet']
testABB = ['KWHR','ENGRY','PWR','ENGRY','PWR','S','OC']
updateABB(testABB,testName)

iwName = ['dewpoint','relative humidity','humidity ratio','wind','direction','global','solar','zone','electricity','water','mass flow rate']
iwABB = ['DWP','HM','HM','W','DIREC','GLO','S','R','TOTKW','W','FL']
updateABB(iwABB,iwName)

iwName = ['humidity ratio','relative humidity','mass flow rate']
iwABB = ['HUM','HUM','FLO']
updateABB(iwABB,iwName)

sdeName = ['temperature','dewpoint','relative humidity','humidity ratio','pressure','barometric','wind','direction','solar','radiation','precipitation','electricity','energy','purchased','net','produced']
sdeABB = ['TEMPERATURE','DEW','HUMIDITY','HUMIDITY','PRESSURE','BAROMETRIC','WIND','DIRECTION','SOLAR','RADIATION','PRECIPITATION','POWER','ENERGY','RECEIVED','BALANCE','DELIVERED']
updateABB(sdeABB,sdeName)

sdeABB = ['KWH','SP']
sdeName = ['electricity','pressure']
updateABB(sdeABB,sdeName)

CSLabb = ['DEWPOINT','H','H','OCCUPANCY','USAGE','SUPPLY','CFM','STPT','ZN']
CSLname = ['dewpoint','humidity ratio','relative humidity','occupant','energy','inlet','flow rate','setpoint','zone']
updateABB(CSLabb,CSLname)

CSLabb = ['KCFM','SETPOINT','RETURN','RETURN','ZONE','SUPPLY','RETURN']
CSLname = ['flow rate','setpoint','supply inlet','outlet','zone','supply side outlet','return']
updateABB(CSLabb,CSLname)

CSL3abb = ['FLOW','RETURN']
CSL3name = ['flow rate','supply side inlet']
updateABB(CSL3abb,CSL3name)

## read the EnergyPlus variables as well as the groudtruth

In [11]:
truth = pd.read_csv('../.csv')
for i in truth.index:
    if truth.loc[i,'alter'] in [np.nan]:
        truth.loc[i,'truth'] = [truth.loc[i,'truth']]
    else:
        truth.loc[i,'truth'] = [truth.loc[i,'truth'],truth.loc[i,'alter']]
truth.drop(columns=['alter'],inplace = True)

## fuzzy string matching

In [12]:
def match(s,OPCtag,th):
    
    ngram = []
    for i in range(3,0,-1):
        grams = ngrams(s.split(' '),i)
        for gram in grams:
            ngram.append(' '.join(list(gram)))
    
    ABBs = []
    while len(ngram)>0:
        i = ngram[0]
        ngram.remove(i)
        try:
            ABBs.append(ABBdict.loc[ABBdict['name'] == i,'abb'].values[0])
            ngram = [s for s in ngram if (s not in i)]
        except:
            pass

    details = []
    for i in OPCtag.index:
        idx = 0
        m = 0
        r = 0
        for j in range(len(OPCtag['Xgrams'][i])):
            used = [0]*len(ABBs)    # so that a set of ABBs is only matched once
            count = 0
            for gram in OPCtag.loc[i,'Xgrams'][j]:
                for c in range(len(ABBs)):
                    if gram in ABBs[c] and used[c]==0:
                        used[c] = 1
                        count += len(gram)
                        break
            if count > m:
                m = count # to avoid division by zero error
                # harmonic average of ratio on both side
                r = 2/(len(ABBs)/sum(used) + len(OPCtag['tag'][i])/m)
                idx = j
        if r >= .01:
            details.append([round(r,4),OPCtag['tag'][i],tuple(OPCtag['Xgrams'][i][idx])])

    details = list(set(tuple(x) for x in details))
    details.sort()
    matches = [i[1] for i in details][-th:]

    return [matches,details,ABBs]

In [13]:
def check(result,th):
    for i in result.index:
        if len(result.loc[i,'matches']) == 0:
            if np.nan in result.loc[i,'truth']:
                result.loc[i,'test'] = 'TN'
            else:
                result.loc[i,'test'] = 'FN'
        elif np.nan in result.loc[i,'truth']:
            result.loc[i,'test'] = 'FP'
        else:
            for m in result.loc[i,'truth']:
                if m in result.loc[i,'matches'][-th:]:
                    result.loc[i,'test'] = 'TP'
                    break
                else:
                    result.loc[i,'test'] = 'FP1'
    return result

In [14]:
top5 = pd.DataFrame(columns = ['input','abbs','matches'])
ls = truth['input'].unique()
for i in range(len(ls)):
    test = match(ls[i],OPCtag,5)
    top5.loc[i,'input'] = ls[i]
    top5.loc[i,'abbs'] = test[2]
    top5.loc[i,'matches'] = test[0]
top5 = top5.join(truth.set_index('input'), on='input')

In [15]:
t5 = check(top5,5)
t5['test'].value_counts()

FP     414
TP      48
FP1      3
TN       1
Name: test, dtype: int64

In [16]:
t5[t5['test']=='FP1']

Unnamed: 0,input,abbs,matches,truth,test
27,zone mean air humidity ratio,"[{HR, HUM, H, HUMIDITY, HM}, {RM, R, ZONE, ZN}...","[hoa, oah, OA Humidity, oa_humidity, RA Humidity]","[ZNH HI, ZNH LO]",FP1
285,zone air node humidity ratio,"[{HR, HUM, H, HUMIDITY, HM}, {RM, R, ZONE, ZN}...","[hoa, oah, OA Humidity, oa_humidity, RA Humidity]","[ZNH HI, ZNH LO]",FP1
451,air relief node name temperature,"[{A}, {RLF}, {T, TEMP, TMP, TEMPERATURE}]","[OAT, oat, OA Temperature AV, OA Temp, oa_temp...",[Return Air Temp],FP1


In [18]:
t1 = check(top5,1)
t1['test'].value_counts()

FP     414
TP      27
FP1     24
TN       1
Name: test, dtype: int64

# baseline


In [20]:
# tokenization
OPCtag['ngram'] = OPCtag['tag'].apply(lambda x: str.upper(x))
for i in OPCtag.index:
    Ngrams = []    
    substrings = re.split('[ _,:]',OPCtag['ngram'][i])
    if len(substrings) < 2:
        if len(substrings[0])>8:
            Ngrams = [substrings[0]]
        else:
            for n in range(1,min(7,len(substrings[0])+1)):
                grams = ngrams(substrings[0],n)
                for gram in grams:
                    Ngrams.append(''.join(list(gram)))
    else:
        for s in substrings:
            if len(s)>3:
                Ngrams.append(s)
            else:
                for n in range(1,min(7,len(s)+1)):
                    grams = ngrams(s,n)
                    for gram in grams:
                        Ngrams.append(''.join(list(gram)))

    OPCtag.at[i,'ngram'] = Ngrams
OPCtag[:30]

Unnamed: 0,tag,Xgrams,ngram
0,Floor_1760001_Master_Hyperion_Enabled,"[[FLOOR, 1760001, MASTER, HYPERION, ENABLED]]","[FLOOR, 1760001, MASTER, HYPERION, ENABLED]"
1,Floor_1760001_Master_Hyperion_Mode_Sunny_or_Dark,"[[FLOOR, 1760001, MASTER, HYPERION, MODE, SUNN...","[FLOOR, 1760001, MASTER, HYPERION, MODE, SUNNY..."
2,Floor_1760001_Master_Loadshed_Enabled,"[[FLOOR, 1760001, MASTER, LOADSHED, ENABLED]]","[FLOOR, 1760001, MASTER, LOADSHED, ENABLED]"
3,Floor_1760001_Timeclock_atrium_evening_Enabled,"[[FLOOR, 1760001, TIMECLOCK, ATRIUM, EVENING, ...","[FLOOR, 1760001, TIMECLOCK, ATRIUM, EVENING, E..."
4,Floor_1760001_Timeclock_Disable_Wall_Control_E...,"[[FLOOR, 1760001, TIMECLOCK, DISABLE, WALL, CO...","[FLOOR, 1760001, TIMECLOCK, DISABLE, WALL, CON..."
5,Floor_1760001_Timeclock_Dusk_to_11_PM_Enabled,"[[FLOOR, 1760001, TIMECLOCK, DUSK, T, O, 1, 1,...","[FLOOR, 1760001, TIMECLOCK, DUSK, T, O, TO, 1,..."
6,Floor_1760001_Timeclock_Dusk_to_Dawn_Enabled,"[[FLOOR, 1760001, TIMECLOCK, DUSK, T, O, DAWN,...","[FLOOR, 1760001, TIMECLOCK, DUSK, T, O, TO, DA..."
7,Floor_1760001_Timeclock_Interior_Atrium_Lighti...,"[[FLOOR, 1760001, TIMECLOCK, INTERIOR, ATRIUM,...","[FLOOR, 1760001, TIMECLOCK, INTERIOR, ATRIUM, ..."
8,Floor_1760001_Timeclock_Interior_Lighting_Non_...,"[[FLOOR, 1760001, TIMECLOCK, INTERIOR, LIGHTIN...","[FLOOR, 1760001, TIMECLOCK, INTERIOR, LIGHTING..."
9,ALT OAT FAIL,"[[A, L, T, O, A, T, FAIL], [A, L, T, OA, T, FA...","[A, L, T, AL, LT, ALT, O, A, T, OA, AT, OAT, F..."


In [19]:
def onNgram(s,OPCtag,th):
    
    ngram = []
    for i in range(3,0,-1):
        grams = ngrams(s.split(' '),i)
        for gram in grams:
            ngram.append(' '.join(list(gram)))
    
    ABBs = []
    while len(ngram)>0:
        i = ngram[0]
        ngram.remove(i)
        try:
            ABBs.append(ABBdict.loc[ABBdict['name'] == i,'abb'].values[0])
            ngram = [s for s in ngram if (s not in i)]
        except:
            pass
    
    detail = []
    for i in OPCtag.index:
        r = 0
        used = [0]*len(ABBs)    # so that a set of ABBs is only matched once
        count = 0
        for gram in OPCtag['ngram'][i]:
            for c in range(len(ABBs)):
                if gram in ABBs[c] and used[c]==0:
                    used[c] = 1
                    count += 1
                    break
        if count >= 1:
            r = count/(len(ABBs)+len(OPCtag['ngram'][i])-count)
            detail.append([round(r,4),OPCtag['tag'][i]])

    detail = list(set(tuple(x) for x in detail))
    detail.sort()
    matches = [i[1] for i in detail][-th:]
            
    return [matches,detail,ABBs]

In [21]:
top5 = pd.DataFrame(columns = ['input','abbs','matches'])
ls = truth['input'].unique()
for i in range(len(ls)):
    test = onNgram(ls[i],OPCtag,5)
    top5.loc[i,'input'] = ls[i]
    top5.loc[i,'abbs'] = test[2]
    top5.loc[i,'matches'] = test[0]
top5 = top5.join(truth.set_index('input'), on='input')

In [22]:
t5 = check(top5,5)
t5['test'].value_counts()

FP     414
TP      33
FP1     18
TN       1
Name: test, dtype: int64

In [23]:
t1 = check(top5,1)
t1['test'].value_counts()

FP     414
FP1     33
TP      18
TN       1
Name: test, dtype: int64