In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
import matplotlib.dates as mdates
import re
import sklearn.cluster
from nltk import ngrams
from eppy import modeleditor
from eppy.modeleditor import IDF
IDF.setiddname('C:/EnergyPlusV8-8-0/Energy+.idd')

import warnings
warnings.simplefilter(action='ignore', category=DeprecationWarning)

%matplotlib inline

# on Tagnames

### When the building is not large, or the BMS is too detailed (different points at one location outnumber one point at different locations), the assumption won't hold. If the naming convention is clear enough, the measurement information can be extracted just by trimming the tag; there is no need for clustering.

In [2]:
# Load the raw point list and keep, for every tag, only the text that follows
# its last '.' or ':' separator.
raw = pd.read_csv('IW.csv')
tags = raw['tag'].map(lambda full_tag: re.split('[.:]', full_tag)[-1])
tags.head()

0    CSE
1    CV_
2    CWR
3    CWS
4    CWV
Name: tag, dtype: object

In [3]:
from itertools import combinations
def razors(string):
    """Enumerate 0/1 masks over the gaps between adjacent characters.

    A mask has ``len(string) - 1`` entries.  Masks are produced by starting
    from all ones and clearing ``k`` positions to zero, for every
    ``k`` from 0 up to ``min(len(string) - 1, 6)``, in the order in which
    ``itertools.combinations`` yields the cleared positions.

    Returns a list of masks; an empty string yields an empty list and a
    single character yields ``[[]]``.
    """
    gaps = len(string) - 1
    masks = []
    for n_cleared in range(min(gaps + 1, 7)):
        for cleared in combinations(range(gaps), n_cleared):
            zeroed = set(cleared)
            masks.append([0 if pos in zeroed else 1 for pos in range(gaps)])
    return masks

def Xgrams(string):
    """Return the segmentations of ``string`` described by ``razors``.

    Spaces, underscores, commas and colons are removed first.  For every
    mask returned by ``razors`` the cleaned string is cut after each position
    whose mask entry is truthy; the resulting pieces are ordered longest
    first (ties keep their left-to-right order) so that longer substrings
    get matched first.

    Returns a list of segmentations, each a list of substrings.
    """
    compact = ''.join(re.split('[ _,:]', string))
    segmentations = []
    for mask in razors(compact):
        pieces = []
        start = 0
        for pos, cut in enumerate(mask):
            if cut:
                pieces.append(compact[start:pos + 1])
                start = pos + 1
        # whatever follows the last cut is the final piece
        pieces.append(compact[start:])
        segmentations.append(sorted(pieces, key=len, reverse=True))
    return segmentations

In [4]:
# One row per distinct tag suffix, upper-cased.
OPCtag = pd.DataFrame(tags.unique(),columns=['tag'])
OPCtag['tag'] = OPCtag['tag'].str.upper()

# Drop tags of 7+ characters *before* enumerating segmentations: the number
# of segmentations grows combinatorially with tag length and those rows were
# discarded right afterwards anyway.  `.copy()` makes the filtered frame
# independent, so adding columns to it later does not raise
# SettingWithCopyWarning.
OPCtag = OPCtag[OPCtag['tag'].str.len()<7].copy()
OPCtag['Xgrams'] = OPCtag['tag'].apply(Xgrams)
OPCtag.head()

Unnamed: 0,tag,Xgrams
0,CSE,"[[C, S, E], [CS, E], [SE, C], [CSE]]"
1,CV_,"[[C, V], [CV]]"
2,CWR,"[[C, W, R], [CW, R], [WR, C], [CWR]]"
3,CWS,"[[C, W, S], [CW, S], [WS, C], [CWS]]"
4,CWV,"[[C, W, V], [CW, V], [WV, C], [CWV]]"


# create dict

In [5]:
def updateABB(abb,full):
    """Add abbreviation/name pairs to the global ``ABBdict`` table.

    ``abb`` and ``full`` are parallel sequences: ``abb[k]`` is an
    abbreviation of the name ``full[k]``.  A name may occur several times in
    ``full``; every abbreviation paired with it is recorded.

    For a name already present in ``ABBdict['name']`` the abbreviation is
    added to that row's set (the set is mutated in place).  Names not yet
    present are collected and appended as new rows with columns ``'abb'``
    (a set) and ``'name'``; the global ``ABBdict`` is then rebound to the
    extended frame with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``abb`` and ``full`` have different lengths.
    """
    global ABBdict
    if len(abb) != len(full):
        raise ValueError(
            'abb and full must have the same length (got %d and %d)'
            % (len(abb), len(full)))

    # name -> the set object stored in ABBdict, so .add() updates the table
    known = dict(zip(ABBdict['name'], ABBdict['abb']))
    fresh = {}
    # Pair by position: looking the position up with full.index(name) would
    # always return the first occurrence of a repeated name and silently
    # drop the abbreviations paired with the later occurrences.
    for short, name in zip(abb, full):
        if name in known:
            known[name].add(short)
        else:
            fresh.setdefault(name, set()).add(short)

    if fresh:
        # one concat for all new rows (DataFrame.append was removed in pandas 2.0)
        new_rows = pd.DataFrame({'abb': list(fresh.values()),
                                 'name': list(fresh.keys())})
        ABBdict = pd.concat([ABBdict, new_rows], ignore_index=True)

In [6]:
# Load the two acronym tables and lower-case their full names.
dict1 = pd.read_csv('HVACacronym.csv')
dict2 = pd.read_csv('HVACacronym1.csv')
dict1['full'] = dict1['full'].apply(str.lower)
dict2['name'] = dict2['name'].apply(str.lower)

# Stack both tables under a common 'name' column.  Each abbreviation is
# wrapped in a one-element list so that the per-name groupby sum
# concatenates the lists.
ABBdict = pd.concat([dict1.rename(columns={'full': 'name'}), dict2])
ABBdict['abb'] = ABBdict[['abb']].values.tolist()
ABBdict = ABBdict.groupby('name')['abb'].sum().to_frame()
ABBdict['name'] = ABBdict.index
ABBdict = ABBdict.rename(index=str, columns={0: "abb"})

# One row per name: a set of abbreviations plus the name, on a 0..n-1 index.
ABBdict['abb'] = ABBdict['abb'].apply(set)
ABBdict = ABBdict.reset_index(drop=True)

In [7]:
# Hand-written additions to ABBdict.  Every updateABB(...) call below takes
# two parallel lists: abbreviations first, full names second, aligned by
# position.
# NOTE(review): 'temperature', 'energy', 'power', 'power meter' and
# 'btu meter' each occur more than once in `full` -- confirm that updateABB
# records the abbreviation of every occurrence.
abb = ['OC','TMP','TEMP','SA','STATIC','RA','RH','RM','BM','FLWR','RET','KWATT','KWHR','PM','ENGRY','FWD','PWR','AHU',
      'FAN','SPEED','BYP','CLG','WATER','VLV','TP','SMK','FA','PM','BM','CURR','VOLT','PW']
full=['off coil','temperature','temperature','supply air','pressure','return air','relative humidity','room',
      'btu meter','flow rate','return','power','energy','power meter','energy','supply','power','ahu',
      'fan','speed','bypass','cooling','water','valve','time program','smoke','fresh air','power meter','btu meter',
      'current','voltage','power']
updateABB(abb,full)

# EPname[k] is paired with EPabb[k] (54 entries each).
EPname = ['zone','air','coil','energy','heating','cooling','air terminal','pump','boiler','condenser','setpoint',
          'heat exchanger','humidifier','heater','water','people','window','infiltration','equipment','outdoor',
          'drybulb','dewpoint','speed','angle','radiation','pressure','precipitation','occupant','lights',
          'humidity ratio','facility','mass flow rate','interior','gas','office','room','level','extract',
          'fan','supply','air loop','site','power','radiant','fan coil unit','indoor','heat pump',
          'vav','vrf','return','inlet','outlet','relief air','rate']

EPabb = ['RM','A','C','KWHR','HT','CL','VAV','PMP','B','COND','SP','HX','HUMID','HTR','WTR','PPL','WD','INFIL',
        'EQUIP','O','DB','DP','SPD','AGL','R','PR','P','OCC','LT','HR','FAC','FLWR','INTR','G','OFC','RM','LV',
        'E','F','S','AHU','LOC','KWATT','R','FCU','I','CU','VAV','VRF','R','S','R','EA','KW']
updateABB(EPabb,EPname)

# NODEname[k] is paired with NODEabb[k]; the names are multi-word phrases.
NODEname = ['supply side inlet','supply side outlet','coil air outlet','mixed inlet','splitter outlet',
           'coil outlet','inlet','outlet','supply inlet']

NODEabb = ['R','S','OC','R','S','S','FWD','RET','R']

updateABB(NODEabb,NODEname)

# testName[k] is paired with testABB[k]; these calls run after the ones
# above, so they extend entries that already exist.
testName = ['electricity','energy','power','demand','rate','coil air outlet','coil outlet']

testABB = ['KWHR','ENGRY','PWR','ENGRY','PWR','S','OC']

updateABB(testABB,testName)

In [8]:
# Further name/abbreviation pairs; iwName[k] is paired with iwABB[k].
iwName = ['dewpoint','relative humidity','humidity ratio','wind','direction','global','solar',
          'zone','electricity','water','mass flow rate']
iwABB = ['DWP','HM','HM','W','DIREC','GLO','S','R','TOTKW','W','FL']
updateABB(iwABB,iwName)

# iwName / iwABB are rebound here for a second, smaller batch.
iwName = ['humidity ratio','relative humidity','mass flow rate']
iwABB = ['HUM','HUM','FLO']
updateABB(iwABB,iwName)

# Display every ABBdict row whose name contains 'flow rate'.
ABBdict[ABBdict['name'].str.contains('flow rate')]

Unnamed: 0,abb,name
287,{FLWR},flow rate
324,"{FLWR, FLO, FL}",mass flow rate


# map

In [9]:
def match(s,OPCtag,th):
    """Rank the tags in ``OPCtag`` against the space-separated name ``s``.

    ``s`` is expanded into word 3-grams, 2-grams and 1-grams (longest
    first).  Each n-gram that equals an entry of the global
    ``ABBdict['name']`` contributes that row's abbreviation set, and every
    remaining n-gram that is a substring of the matched one is dropped.

    Each segmentation in ``OPCtag['Xgrams']`` is then compared with the
    abbreviation sets: a segment scores its length when it occurs in a set
    that has not been used yet for that segmentation.  For every tag the
    segmentation covering the most characters is kept and scored with the
    harmonic mean of (sets hit / sets found) and (characters covered / tag
    length).

    Parameters
    ----------
    s : str
        Name to look up, words separated by single spaces.
    OPCtag : pandas.DataFrame
        Must provide the columns ``'tag'`` and ``'Xgrams'``.
    th : int
        Number of best-scoring tags to return; ``th <= 0`` returns none.

    Returns
    -------
    list
        ``[matches, details, ABBs]`` where ``details`` is the ascending list
        of distinct ``(score, tag, segmentation)`` tuples with score >= 0.01,
        ``matches`` holds the tags of its last ``th`` entries (best last) and
        ``ABBs`` is the list of abbreviation sets found for ``s``.
    """
    phrases = [' '.join(gram)
               for n in range(3, 0, -1)
               for gram in ngrams(s.split(' '), n)]

    ABBs = []
    while phrases:
        phrase = phrases.pop(0)
        hits = ABBdict.loc[ABBdict['name'] == phrase, 'abb'].values
        # An unknown phrase is simply skipped; any other failure (missing
        # column, undefined ABBdict, ...) is allowed to surface instead of
        # being swallowed by a bare except.
        if len(hits) == 0:
            continue
        ABBs.append(hits[0])
        # shorter n-grams contained in the matched phrase are already covered
        phrases = [p for p in phrases if p not in phrase]

    details = []
    # Iterate the two columns positionally so duplicate index labels cannot
    # turn a cell lookup into a Series.
    for tag, segmentations in zip(OPCtag['tag'], OPCtag['Xgrams']):
        best_cover = 0
        best_idx = 0
        score = 0
        for idx, segments in enumerate(segmentations):
            used = [False] * len(ABBs)    # so that a set of ABBs is only matched once
            cover = 0
            for segment in segments:
                for c, abb_set in enumerate(ABBs):
                    if not used[c] and segment in abb_set:
                        used[c] = True
                        cover += len(segment)
                        break
            if cover > best_cover:
                # cover > 0 here, so at least one set was used and neither
                # division below can hit zero
                best_cover = cover
                # harmonic average of ratio on both side
                score = 2 / (len(ABBs) / sum(used) + len(tag) / best_cover)
                best_idx = idx
        if score >= .01:
            details.append((round(score, 4), tag, tuple(segmentations[best_idx])))

    details = sorted(set(details))
    # details[-0:] would return the whole list, so guard th <= 0 explicitly
    matches = [entry[1] for entry in details][-th:] if th > 0 else []

    return [matches,details,ABBs]

In [10]:
def check(result):
    """Label every row of ``result`` and store the label in ``result['test']``.

    Each row needs a ``'matches'`` list and a ``'truth'`` list.  A NaN inside
    ``truth`` means that no tag is expected for the row.

    Labels
    ------
    'TN'  : no matches, no tag expected
    'FN'  : no matches although a tag is expected
    'FP'  : matches returned although no tag is expected
    'TP'  : at least one expected tag is among the matches
    'FP1' : matches returned, but none of them is an expected tag

    The frame is modified in place and also returned.  Rows are read by
    position, so duplicate index labels are handled, and NaN is detected by
    value rather than by object identity.
    """
    labels = []
    for matches, truth in zip(result['matches'], result['truth']):
        # NaN is the only float that differs from itself
        nothing_expected = any(isinstance(t, float) and t != t for t in truth)
        if len(matches) == 0:
            labels.append('TN' if nothing_expected else 'FN')
        elif nothing_expected:
            labels.append('FP')
        elif any(t in matches for t in truth):
            labels.append('TP')
        else:
            labels.append('FP1')
    result['test'] = labels
    return result

# reading E+ outputs

In [13]:
# Load the ground truth and fold the optional 'alter' column into 'truth',
# so that every row carries a list of one or two acceptable tags.
truth = pd.read_csv('truthIW.csv')
# Build the whole column at once: writing a list into a single cell through
# .loc is interpreted differently across pandas versions, and missing values
# are detected with pd.isna rather than by identity with np.nan.
truth['truth'] = pd.Series(
    [[primary] if pd.isna(alternative) else [primary, alternative]
     for primary, alternative in zip(truth['truth'], truth['alter'])],
    index=truth.index, dtype=object)
truth = truth.drop(columns=['alter'])
# rows that list more than one acceptable tag
truth[truth['truth'].str.len()>1]

Unnamed: 0,input,truth
0,site outdoor air drybulb temperature,"[TEMP, OAT]"
199,ahu cooling coil water inlet node temperature,"[SWT, CWS]"
200,ahu cooling coil water inlet node mass flow rate,"[FLS, FLO]"
205,ahu cooling coil water outlet node temperature,"[RWT, CWR]"
206,ahu cooling coil water outlet node mass flow rate,"[FLS, FLO]"


In [22]:
# Run match() on every distinct truth['input'] value, keeping the 5
# best-scoring tags per input (the frame is still called `top7`).
ls = truth['input'].unique()
# Collect plain records and build the frame once: growing it row by row
# through .loc is quadratic, and list-valued cells assigned that way are
# handled inconsistently across pandas versions.
records = []
for name in ls:
    matches, _details, abbs = match(name,OPCtag,5)
    records.append({'input': name, 'abbs': abbs, 'matches': matches})
top7 = pd.DataFrame(records, columns = ['input','abbs','matches'])
# attach the truth lists by input name
top7 = top7.join(truth.set_index('input'), on='input')

In [42]:
# Spot check: run match() on the single name 'electricity' with th=7 and
# display [matches, details, ABBs].
match('electricity',OPCtag,7)

[['TOTKW'], [(1.0, 'TOTKW', ('TOTKW',))], [{'KWHR', 'TOTKW'}]]

In [23]:
# Label every row with check() -- it writes top7['test'] in place and
# returns the same frame -- then count the rows per label.
t7 = check(top7)
t7['test'].value_counts()

FP     439
TP      17
TN       8
FP1      2
Name: test, dtype: int64

In [15]:
def check1(result):
    """Label every row of ``result``, judging only the last entry of ``matches``.

    Each row needs a ``'matches'`` list and a ``'truth'`` list.  A NaN inside
    ``truth`` means that no tag is expected for the row.

    Labels
    ------
    'TN'  : no matches, no tag expected
    'FN'  : no matches although a tag is expected
    'FP'  : matches returned although no tag is expected
    'TP'  : the last entry of ``matches`` is one of the expected tags
    'FP1' : matches returned, but the last one is not an expected tag

    The frame is modified in place and also returned.  Rows are read by
    position, so duplicate index labels are handled, and NaN is detected by
    value rather than by object identity.
    """
    labels = []
    for matches, truth in zip(result['matches'], result['truth']):
        # NaN is the only float that differs from itself
        nothing_expected = any(isinstance(t, float) and t != t for t in truth)
        if len(matches) == 0:
            labels.append('TN' if nothing_expected else 'FN')
        elif nothing_expected:
            labels.append('FP')
        elif any(t == matches[-1] for t in truth):
            labels.append('TP')
        else:
            labels.append('FP1')
    result['test'] = labels
    return result

In [16]:
# Re-label the same frame with check1(), which overwrites top7['test'],
# then count the rows per label.
t7 = check1(top7)
t7['test'].value_counts()

FP     413
TP      14
TN       8
FP1      5
Name: test, dtype: int64

# baseline


In [17]:
def onNgram(s,OPCtag,th):
    """Baseline ranking of tags by the character n-grams in ``OPCtag['ngram']``.

    ``s`` is expanded into word 3-grams, 2-grams and 1-grams (longest
    first).  Each n-gram that equals an entry of the global
    ``ABBdict['name']`` contributes that row's abbreviation set, and every
    remaining n-gram that is a substring of the matched one is dropped.

    For every tag, ``count`` is the number of its n-grams found in an
    abbreviation set that has not been used yet for that tag.  Tags with at
    least one hit are scored with the Jaccard-style ratio
    ``count / (len(ABBs) + len(tag n-grams) - count)``.

    Parameters
    ----------
    s : str
        Name to look up, words separated by single spaces.
    OPCtag : pandas.DataFrame
        Must provide the columns ``'tag'`` and ``'ngram'``.
    th : int
        Number of best-scoring tags to return; ``th <= 0`` returns none.

    Returns
    -------
    list
        ``[matches, detail, ABBs]`` where ``detail`` is the ascending list of
        distinct ``(score, tag)`` tuples, ``matches`` holds the tags of its
        last ``th`` entries (best last) and ``ABBs`` is the list of
        abbreviation sets found for ``s``.
    """
    phrases = [' '.join(gram)
               for n in range(3, 0, -1)
               for gram in ngrams(s.split(' '), n)]

    ABBs = []
    while phrases:
        phrase = phrases.pop(0)
        hits = ABBdict.loc[ABBdict['name'] == phrase, 'abb'].values
        # An unknown phrase is simply skipped; any other failure (missing
        # column, undefined ABBdict, ...) is allowed to surface instead of
        # being swallowed by a bare except.
        if len(hits) == 0:
            continue
        ABBs.append(hits[0])
        # shorter n-grams contained in the matched phrase are already covered
        phrases = [p for p in phrases if p not in phrase]

    detail = []
    # Iterate the two columns positionally so duplicate index labels cannot
    # turn a cell lookup into a Series.
    for tag, tag_grams in zip(OPCtag['tag'], OPCtag['ngram']):
        used = [False] * len(ABBs)    # so that a set of ABBs is only matched once
        count = 0
        for gram in tag_grams:
            for c, abb_set in enumerate(ABBs):
                if not used[c] and gram in abb_set:
                    used[c] = True
                    count += 1
                    break
        if count >= 1:
            # Jaccard similarity
            r = count / (len(ABBs) + len(tag_grams) - count)
            detail.append((round(r, 4), tag))

    detail = sorted(set(detail))
    # detail[-0:] would return the whole list, so guard th <= 0 explicitly
    matches = [entry[1] for entry in detail][-th:] if th > 0 else []

    return [matches,detail,ABBs]

# test = match(str.lower('Zone Mean Air Temperature'),OPCtag)
# set(tuple(x) for x in test[0])

In [18]:
# Baseline features: for every tag, all contiguous character n-grams with
# n = 1 .. min(6, len(tag)), shortest first.  The tag is used as-is, so any
# '_' it contains shows up in the n-grams.
OPCtag['ngram'] = OPCtag['tag'].apply(
    lambda tag: [''.join(chars)
                 for size in range(1, min(7, len(tag) + 1))
                 for chars in ngrams(tag, size)]
)
OPCtag.tail()

Unnamed: 0,tag,Xgrams,ngram
93,8,[[8]],[8]
94,UFHM,"[[U, F, H, M], [UF, H, M], [FH, U, M], [HM, U,...","[U, F, H, M, UF, FH, HM, UFH, FHM, UFHM]"
95,UFT,"[[U, F, T], [UF, T], [FT, U], [UFT]]","[U, F, T, UF, FT, UFT]"
96,MSP,"[[M, S, P], [MS, P], [SP, M], [MSP]]","[M, S, P, MS, SP, MSP]"
97,MV_,"[[M, V], [MV]]","[M, V, _, MV, V_, MV_]"


In [19]:
# Run the baseline onNgram() on every distinct truth['input'] value, keeping
# the 7 best-scoring tags per input.
ls = truth['input'].unique()
# Collect plain records and build the frame once: growing it row by row
# through .loc is quadratic, and list-valued cells assigned that way are
# handled inconsistently across pandas versions.
records = []
for name in ls:
    matches, _detail, abbs = onNgram(name,OPCtag,7)
    records.append({'input': name, 'abbs': abbs, 'matches': matches})
top7 = pd.DataFrame(records, columns = ['input','abbs','matches'])
# attach the truth lists by input name
top7 = top7.join(truth.set_index('input'), on='input')

In [20]:
# Label the baseline results with check() (writes top7['test'] in place)
# and count the rows per label.
t7 = check(top7)
t7['test'].value_counts()

FP     439
TP      12
TN       8
FP1      7
Name: test, dtype: int64

In [21]:
# Re-label the baseline results with check1(), which overwrites
# top7['test'], and count the rows per label.
t7 = check1(top7)
t7['test'].value_counts()

FP     439
FP1     10
TP       9
TN       8
Name: test, dtype: int64

In [35]:
# Show the rows currently labelled 'FP1'.
top7[top7['test']=='FP1']

Unnamed: 0,input,abbs,matches,truth,test
149,supply side inlet temperature,"[{FWD, S, SUP}, {FWD, S}, {T, TEMP, TMP}, {R}]","[RMTEMP, SATEMP, SS, RMTEMPSPB, RATEMPSP, RMTE...",[RATEMP],FP1
167,air outlet node temperature,"[{A}, {R}, {T, TEMP, TMP}]","[RMTEMPSP, ATEMPSP, RATEMPLOALM, RMTEMP, SATEM...",[OCTEMP],FP1
203,water outlet node temperature,"[{WATER, WTR}, {R}, {T, TEMP, TMP}]","[TRIP, WATERDETALM, RMTEMPSPB, RATEMPSP, RMTEM...",[BMRETTMP],FP1
431,outlet temperature,"[{R}, {T, TEMP, TMP}]","[TMP, TRIP, RMTEMPSPB, RATEMPSP, RMTEMPSP, RAT...",[OCTEMP],FP1


In [27]:
# Show the rows currently labelled 'FP1' (same query as In [35]; the
# execution counts indicate it was run against a different top7).
top7[top7['test']=='FP1']

Unnamed: 0,input,abbs,matches,truth,test
110,supply inlet setpoint temperature,"[{SUP, FWD, S}, {FWD, S}, {SP}, {TMP, TEMP, T}]","[RMTEMPSP, SATEMP, ATEMPSP, SASTATICSP, SS, TM...",[SATEMPSP],FP1
146,supply side outlet setpoint temperature,"[{S}, {SP}, {TMP, TEMP, T}]","[RMTEMPSPB, STS, RATEMPSP, RMTEMPSP, SATEMP, A...",[SATEMPSP],FP1
147,supply side outlet relative humidity,"[{S}, {RH}]","[SMKDET, TMPSP, AFS, STS, VSD, RARH, SS]",[RATEMPSP],FP1
153,supply side inlet relative humidity,"[{R}, {RH}]","[RATEMP, RMTEMP, BMPWR, RACO2, DMPR, TRIP, RARH]",[RATEMPSP],FP1
167,air outlet node temperature,"[{A}, {R, RET}, {TMP, TEMP, T}]","[ATEMPSP, BMRETTMP, RATEMPLOALM, RMTEMP, SATEM...",[OCTEMP],FP1
431,outlet temperature,"[{R, RET}, {TMP, TEMP, T}]","[TRIP, RMTEMPSPB, RATEMPSP, RMTEMPSP, BMRETTMP...",[OCTEMP],FP1


In [55]:
# Write the labelled results to TOcheck.csv in the working directory.
t7.to_csv('TOcheck.csv')