In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
import matplotlib.dates as mdates
import re
import sklearn.cluster
from nltk import ngrams
from eppy import modeleditor
from eppy.modeleditor import IDF
IDF.setiddname('C:/EnergyPlusV8-8-0/Energy+.idd')

import warnings
warnings.simplefilter(action='ignore', category=DeprecationWarning)

%matplotlib inline

# Preprocessing the OPC tag names

In [2]:
# Load the raw OPC point list; the 'raw' column holds the full point name.
raw = pd.read_csv('SDE4.csv')
OPCtag = raw.copy()

# Keep only the text after the last '.' (e.g. 'WEATHER STATION.OA_CO2' -> 'OA_CO2').
OPCtag['tag'] = OPCtag['raw'].apply(lambda x: x.split('.')[-1])

# Rows at positions 188..331 start with three '_'-separated fields that are
# dropped here (e.g. 'PAHU_L2_01_CMD' -> 'CMD').
# The write goes through .loc: the chained form `OPCtag['tag'][188:332] = ...`
# may assign into a temporary copy and leave OPCtag unchanged.
prefixed_rows = OPCtag.index[188:332]
OPCtag.loc[prefixed_rows, 'tag'] = OPCtag.loc[prefixed_rows, 'tag'].apply(
    lambda x: '_'.join(x.split('_')[3:]))

OPCtag.head()

Unnamed: 0,raw,tag
0,WEATHER STATION.OA_CO2,OA_CO2
1,WEATHER STATION.RAINFAIL,RAINFAIL
2,Weather Station.Wind speed,Wind speed
3,Weather Station.Wind direction in degrees,Wind direction in degrees
4,Weather Station.Temp,Temp


In [3]:
from itertools import combinations
def razors(string):
    """Enumerate 0/1 masks over the gaps between the characters of `string`.

    A string of length L has L-1 gaps.  Every returned mask is a list of
    L-1 flags that starts as all ones and has k positions set to zero, for
    every k from 0 up to min(L-1, 6) and every choice of k positions.
    Xgrams() interprets a 1 as "split here" and a 0 as "keep joined".

    Returns an empty list for the empty string and [[]] for one character.
    """
    n_gaps = len(string) - 1
    masks = []
    for n_zeros in range(min(n_gaps + 1, 7)):
        for zero_positions in combinations(range(n_gaps), n_zeros):
            zeros = set(zero_positions)
            masks.append([0 if gap in zeros else 1 for gap in range(n_gaps)])
    return masks

def Xgrams(string):
    """Return the candidate segmentations of a tag.

    Spaces, underscores, commas and colons are removed from `string`; then,
    for every mask produced by razors(), the remaining text is cut after
    each position whose flag is truthy.  Each segmentation is a list of
    substrings ordered longest first.
    """
    compact = ''.join(re.split('[ _,:]', string))
    segmentations = []
    for mask in razors(compact):
        pieces = []
        start = 0
        for pos, cut_here in enumerate(mask):
            if cut_here:
                pieces.append(compact[start:pos + 1])
                start = pos + 1
        pieces.append(compact[start:])
        # longest pieces first, so that the longer substrings get matched first
        segmentations.append(sorted(pieces, key=len, reverse=True))
    return segmentations

In [4]:
# Upper-case every tag (the abbreviations registered later in this notebook are upper-case).
OPCtag['tag'] = OPCtag['tag'].apply(lambda x: str.upper(x))

# 'Xgrams' keeps the plain tag string everywhere except positions 188..331,
# where it holds the list of segmentations returned by Xgrams().
# The column is built in one pass: the chained form
# `OPCtag['Xgrams'][188:332] = ...` may assign into a temporary copy.
OPCtag['Xgrams'] = pd.Series(
    [Xgrams(tag) if 188 <= pos < 332 else tag
     for pos, tag in enumerate(OPCtag['tag'])],
    index=OPCtag.index, dtype=object)
# OPCtag = OPCtag[OPCtag['tag'].str.len()<7]
OPCtag[180:190]

Unnamed: 0,raw,tag,Xgrams
180,SDE4 MSB1.MSB1 kWh Net Balance,MSB1 KWH NET BALANCE,MSB1 KWH NET BALANCE
181,SDE4 MSB2.MSB2 kWh Net Delivered,MSB2 KWH NET DELIVERED,MSB2 KWH NET DELIVERED
182,SDE4 MSB2.MSB2 kWh Net Received,MSB2 KWH NET RECEIVED,MSB2 KWH NET RECEIVED
183,SDE4 MSB2.MSB2 kWh Net Balance,MSB2 KWH NET BALANCE,MSB2 KWH NET BALANCE
184,SDE4 Net Elec Energy Balance.SDE4 Net Elec Ene...,SDE4 NET ELEC ENERGY BALANCE,SDE4 NET ELEC ENERGY BALANCE
185,SDE4 Main BTU.Main BTU Energy in kWh Net,MAIN BTU ENERGY IN KWH NET,MAIN BTU ENERGY IN KWH NET
186,SDE4 Main BTU.BTU Cooling Elec Energy,BTU COOLING ELEC ENERGY,BTU COOLING ELEC ENERGY
187,SDE4 Net Energy Balance.SDE4 Net Energy Balance,SDE4 NET ENERGY BALANCE,SDE4 NET ENERGY BALANCE
188,PAHU_L2_01_CMD,CMD,"[[C, M, D], [CM, D], [MD, C], [CMD]]"
189,PAHU_L2_01_PFILT,PFILT,"[[P, F, I, L, T], [PF, I, L, T], [FI, P, L, T]..."


In [41]:
# OPCtag.to_csv('med.csv')
# OPCtag.groupby('tag')['raw','Xgrams'].max().to_csv('med.csv')

# create dict

In [5]:
def updateABB(abb,full):
    """Register abbreviation/name pairs in the global ``ABBdict``.

    ``abb`` and ``full`` are parallel sequences: ``abb[k]`` is an
    abbreviation of the name ``full[k]``.  ``ABBdict`` is a DataFrame with a
    'name' column and an 'abb' column holding one ``set`` of abbreviations
    per name.  When a name is already present, the abbreviation is added to
    its set in place; otherwise a new row is appended and the global is
    rebound to the enlarged frame.

    Pairs are taken position by position, so a name that occurs several
    times in ``full`` receives every abbreviation listed for it (looking the
    position up with ``full.index(name)`` would always return the first one).

    Raises
    ------
    ValueError
        If ``abb`` and ``full`` have different lengths.
    """
    global ABBdict
    if len(abb) != len(full):
        raise ValueError('abb and full must have the same length '
                         '(got %d and %d)' % (len(abb), len(full)))
    for short, name in zip(abb, full):
        hit = ABBdict['name'] == name
        if hit.any():
            # The cells hold set objects, so add() updates ABBdict in place.
            for known in ABBdict.loc[hit, 'abb']:
                known.add(short)
        else:
            # DataFrame.append no longer exists in pandas >= 2.0; concat is equivalent.
            new_row = pd.DataFrame({'abb': [{short}], 'name': [name]})
            ABBdict = pd.concat([ABBdict, new_row], ignore_index=True)

In [6]:
# Two acronym tables: dict1 stores the spelled-out name in 'full', dict2 in 'name'.
dict1 = pd.read_csv('HVACacronym.csv')
dict2 = pd.read_csv('HVACacronym1.csv')

# Names are compared in lower case.
dict1['full'] = dict1['full'].apply(str.lower)
dict2['name'] = dict2['name'].apply(str.lower)

# Stack both tables under a common 'name' column, then collapse to one row
# per name whose 'abb' cell is the set of all abbreviations seen for it.
acronyms = pd.concat([dict1.rename(columns={'full': 'name'}), dict2])
ABBdict = (
    acronyms.groupby('name')['abb']
    .apply(set)
    .reset_index()[['abb', 'name']]
)

In [7]:
# Hand-written vocabulary.  `abb` and `full` are parallel lists of equal
# length: upper-case abbreviations and the lower-case names they stand for.
# updateABB() folds them into the global ABBdict.
abb = ['OC','TMP','TEMP','SA','STATIC','RA','RH','RM','BM','FLWR','RET','KWATT','KWHR','PM','ENGRY','FWD','PWR','AHU',
      'FAN','SPEED','BYP','CLG','WATER','VLV','TP','SMK','FA','PM','BM','CURR','VOLT','PW']
full=['off coil','temperature','temperature','supply air','pressure','return air','relative humidity','room',
      'btu meter','flow rate','return','power','energy','power meter','energy','supply','power','ahu',
      'fan','speed','bypass','cooling','water','valve','time program','smoke','fresh air','power meter','btu meter',
      'current','voltage','power']
updateABB(abb,full)

# Second name/abbreviation table (the EP prefix presumably refers to
# EnergyPlus output vocabulary -- TODO confirm).
EPname = ['zone','air','coil','energy','heating','cooling','air terminal','pump','boiler','condenser','setpoint',
          'heat exchanger','humidifier','heater','water','people','window','infiltration','equipment','outdoor',
          'drybulb','dewpoint','speed','angle','radiation','pressure','precipitation','occupant','lights',
          'humidity ratio','facility','mass flow rate','interior','gas','office','room','level','extract',
          'fan','supply','air loop','site','power','radiant','fan coil unit','indoor','heat pump',
          'vav','vrf','return','inlet','outlet','relief air','rate']

EPabb = ['RM','A','C','KWHR','HT','CL','VAV','PMP','B','COND','SP','HX','HUMID','HTR','WTR','PPL','WD','INFIL',
        'EQUIP','O','DB','DP','SPD','AGL','R','PR','P','OCC','LT','HR','FAC','FLWR','INTR','G','OFC','RM','LV',
        'E','F','S','AHU','LOC','KWATT','R','FCU','I','CU','VAV','VRF','R','S','R','EA','KW']
updateABB(EPabb,EPname)

# Multi-word node phrases and the abbreviations assigned to them.
NODEname = ['supply side inlet','supply side outlet','coil air outlet','mixed inlet','splitter outlet',
           'coil outlet','inlet','outlet']

NODEabb = ['R','S','OC','R','S','S','FWD','RET']

updateABB(NODEabb,NODEname)

# Further pairs; names such as 'energy', 'power', 'rate' and 'coil outlet'
# were already registered above, so these add abbreviations to existing entries.
testName = ['electricity','energy','power','demand','rate','coil air outlet','coil outlet']

testABB = ['KWHR','ENGRY','PWR','ENGRY','PWR','S','OC']

updateABB(testABB,testName)

In [8]:
# Additional name/abbreviation pairs (meaning of the 'iw' prefix is not
# visible in this notebook -- TODO document the data source).
iwName = ['dewpoint','relative humidity','humidity ratio','wind','direction','global','solar',
          'zone','electricity','water','mass flow rate']
iwABB = ['DWP','HM','HM','W','DIREC','GLO','S','R','TOTKW','W','FL']
updateABB(iwABB,iwName)

# iwName / iwABB are rebound here; the first pair of lists has already been
# consumed by the updateABB() call above.
iwName = ['humidity ratio','relative humidity','mass flow rate']
iwABB = ['HUM','HUM','FLO']
updateABB(iwABB,iwName)

In [9]:
# Whole-word "abbreviations" (presumably aimed at the space-separated tags
# such as 'DEW POINT TEMPERATURE' -- TODO confirm).
sdeName = ['temperature','dewpoint','relative humidity','humidity ratio','pressure','barometric','wind','direction',
          'solar','radiation','precipitation','electricity','energy','purchased','net','produced']
sdeABB = ['TEMPERATURE','DEW','HUMIDITY','HUMIDITY','PRESSURE','BAROMETRIC','WIND','DIRECTION','SOLAR',
         'RADIATION','PRECIPITATION','POWER','ENERGY','RECEIVED','BALANCE','DELIVERED']
updateABB(sdeABB,sdeName)

# sdeABB / sdeName are rebound for two more pairs.
sdeABB = ['KWH','SP']
sdeName = ['electricity','pressure']
updateABB(sdeABB,sdeName)

# ABBdict[ABBdict['name'].str.contains('dew')]

# Mapping input names to OPC tags

In [10]:
def match(s,OPCtag,th):
    """Rank OPC tags by how well they match the abbreviations of a name.

    The space-separated name ``s`` is expanded into word 3-grams, 2-grams
    and single words (in that order).  Every phrase found in the global
    ``ABBdict['name']`` contributes its set of abbreviations; phrases that
    are substrings of a matched phrase are then discarded.  Each row of
    ``OPCtag`` is scored with the harmonic mean of
    (abbreviation sets hit / all sets) and (amount of tag matched / tag size).

    Parameters
    ----------
    s : str
        Lower-case, space-separated name to look up.
    OPCtag : pandas.DataFrame
        Needs 'tag' and 'Xgrams' columns.  For index labels 188..331 the
        'Xgrams' cell must be a list of segmentations (lists of substrings),
        where matched size is counted in characters.  For every other row it
        must be a space-separated string, where size is counted in tokens.
    th : int
        Number of best-scoring tags to return.

    Returns
    -------
    list
        ``[matches, details, ABBs]``: the tags of the last ``th`` entries of
        ``details``; the de-duplicated, ascending-sorted
        ``(score, tag, pieces)`` tuples; and the list of abbreviation sets.

    Relies on the module-level names ``ABBdict`` and ``ngrams`` (nltk).
    """
    words = s.split(' ')
    phrases = [' '.join(gram) for n in range(3, 0, -1) for gram in ngrams(words, n)]

    ABBs = []
    while phrases:
        phrase = phrases.pop(0)
        found = ABBdict.loc[ABBdict['name'] == phrase, 'abb']
        if len(found) == 0:
            # Phrase is not in the dictionary.  Checked explicitly instead of
            # with a bare `except`, so genuine errors are no longer hidden.
            continue
        ABBs.append(found.values[0])
        # Drop shorter phrases contained in the one just matched.
        phrases = [p for p in phrases if p not in phrase]

    details = []
    for i in OPCtag.index:
        if i > 187 and i < 332:
            r = 0
            idx = 0
            m = 0
            segmentations = OPCtag['Xgrams'][i]
            for j, pieces in enumerate(segmentations):
                used = [0]*len(ABBs)    # so that a set of ABBs is only matched once
                count = 0
                for gram in pieces:
                    for c in range(len(ABBs)):
                        if gram in ABBs[c] and used[c] == 0:
                            used[c] = 1
                            count += len(gram)
                            break
                if count > m:
                    # count > m >= 0 guarantees at least one hit, so neither
                    # sum(used) nor m is zero in the division below.
                    m = count
                    # harmonic average of ratio on both side
                    r = 2/(len(ABBs)/sum(used) + len(OPCtag['tag'][i])/m)
                    idx = j
            if r >= .01:
                details.append([round(r,4), OPCtag['tag'][i], tuple(segmentations[idx])])
        else:
            tokens = OPCtag['Xgrams'][i].split(' ')
            used = [0]*len(ABBs)
            m = 0
            for token in tokens:
                for c in range(len(ABBs)):
                    if token in ABBs[c] and used[c] == 0:
                        used[c] = 1
                        m += 1
                        break
            if m > 0:
                r = 2/(len(ABBs)/sum(used) + len(tokens)/m)
                details.append([round(r,4), OPCtag['tag'][i], tuple(tokens)])

    details = sorted(set(tuple(x) for x in details))
    matches = [entry[1] for entry in details][-th:] #top 10 vs top 7 (10%) tested

    return [matches,details,ABBs]

In [25]:
def check(result):
    """Label every row of ``result`` by comparing 'truth' with 'matches'.

    Writes the label into the 'test' column (created or overwritten) of the
    frame passed in and returns that same frame:

    * 'TN'  - no matches and no truth value
    * 'FN'  - no matches although a truth value exists
    * 'FP'  - matches returned but no truth value exists
    * 'TP'  - the truth value is one of the matches
    * 'FP1' - matches returned, truth exists, but is not among them

    A missing truth value is detected with ``pd.isna`` so that any NaN
    counts, not only the ``np.nan`` object itself.  Rows are visited by
    position, so duplicate index labels are handled as well.
    """
    labels = []
    for matched, truth in zip(result['matches'], result['truth']):
        has_truth = not pd.isna(truth)
        if len(matched) == 0:
            labels.append('FN' if has_truth else 'TN')
        elif not has_truth:
            labels.append('FP')
        elif truth in matched:
            labels.append('TP')
        else:
            labels.append('FP1')
    result['test'] = labels
    return result

# reading E+ outputs

In [12]:
# Ground-truth table; the 'alter' column is not used below.
truth = pd.read_csv('truthSDE.csv').drop(columns=['alter'])
# truth[truth['truth'].str.len()>1]

In [13]:
# Run match() for every distinct input name, keeping the 7 best tags.
# Rows are collected first and the frame is built once: growing a DataFrame
# with `top7.loc[i, col] = <list>` is slow and fragile for list-valued cells.
ls = truth['input'].unique()
# add some input variables to map
rows = []
for name in ls:
    test = match(name, OPCtag, 7)    # [matches, details, ABBs]
    rows.append({'input': name, 'abbs': test[2], 'matches': test[0]})
top7 = pd.DataFrame(rows, columns=['input', 'abbs', 'matches'])
# Attach the remaining ground-truth columns to each input.
top7 = top7.join(truth.set_index('input'), on='input')

In [14]:
# Label every row of top7 (check() writes the 'test' column and returns the
# same frame, so t7 and top7 are one object), then tally the labels.
t7 = check(top7)
t7['test'].value_counts()

FP     427
TP      26
TN      10
FP1      3
Name: test, dtype: int64

In [15]:
# Rows currently labelled 'FP1': a truth tag exists but the matches miss it.
top7[top7['test']=='FP1']

Unnamed: 0,input,abbs,matches,truth,test
0,site outdoor air drybulb temperature,"[{LOC}, {O}, {A}, {DB}, {TEMP, TMP, TEMPERATUR...","[RA_CO2, SA-TSP, SA_CO2, OC-T, OC_T, RA_T, SA_T]",TEMP,FP1
25,zone mean air temperature,"[{RM, R}, {A}, {TEMP, TMP, TEMPERATURE, T}]","[RA_CO2, SA-TSP, SA_RH, TEMP, SA_T, TRP, RA_T]",STATUS TEMP LEVEL,FP1
283,zone air node temperature,"[{RM, R}, {A}, {TEMP, TMP, TEMPERATURE, T}]","[RA_CO2, SA-TSP, SA_RH, TEMP, SA_T, TRP, RA_T]",STATUS TEMP LEVEL,FP1


In [18]:
def check1(result):
    """Strict variant of the labelling: only the best match may be correct.

    Writes the label into the 'test' column (created or overwritten) of the
    frame passed in and returns that same frame:

    * 'TN'  - no matches and no truth value
    * 'FN'  - no matches although a truth value exists
    * 'FP'  - matches returned but no truth value exists
    * 'TP'  - the truth value equals the LAST entry of 'matches'
              (the highest-scoring one when the list is sorted ascending)
    * 'FP1' - matches returned, truth exists, but the last entry differs

    A missing truth value is detected with ``pd.isna`` so that any NaN
    counts, not only the ``np.nan`` object itself.  Rows are visited by
    position, so duplicate index labels are handled as well.
    """
    labels = []
    for matched, truth in zip(result['matches'], result['truth']):
        has_truth = not pd.isna(truth)
        if len(matched) == 0:
            labels.append('FN' if has_truth else 'TN')
        elif not has_truth:
            labels.append('FP')
        elif truth == matched[-1]:
            labels.append('TP')
        else:
            labels.append('FP1')
    result['test'] = labels
    return result

# Re-label top7 with the strict rule of check1(); this overwrites the 'test'
# column written by the earlier check() call.
t7 = check1(top7)
t7['test'].value_counts()

FP     427
FP1     15
TP      14
TN      10
Name: test, dtype: int64

# baseline


In [19]:
def onNgram(s,OPCtag,th):
    """Baseline ranking: score tags by Jaccard similarity on plain n-grams.

    The abbreviation sets for ``s`` are collected from word 3-grams, 2-grams
    and single words: every phrase found in the global ``ABBdict['name']``
    contributes its set, and phrases contained in a matched phrase are then
    dropped.  Each row of ``OPCtag`` is scored as
    ``hits / (len(ABBs) + number of grams - hits)``.

    Parameters
    ----------
    s : str
        Lower-case, space-separated name to look up.
    OPCtag : pandas.DataFrame
        Needs 'tag' and 'ngram' columns.  For index labels 188..331 the
        'ngram' cell must be a list of substrings; for every other row it
        must be a space-separated string.
    th : int
        Number of best-scoring tags to return.

    Returns
    -------
    list
        ``[matches, detail, ABBs]``: the tags of the last ``th`` entries of
        ``detail``; the de-duplicated, ascending-sorted tuples
        (``(score, tag)`` for rows 188..331, ``(score, tag, tokens)``
        otherwise); and the list of abbreviation sets.

    Relies on the module-level names ``ABBdict`` and ``ngrams`` (nltk).
    """
    words = s.split(' ')
    phrases = [' '.join(gram) for n in range(3, 0, -1) for gram in ngrams(words, n)]

    ABBs = []
    while phrases:
        phrase = phrases.pop(0)
        found = ABBdict.loc[ABBdict['name'] == phrase, 'abb']
        if len(found) == 0:
            # Phrase is not in the dictionary.  Checked explicitly instead of
            # with a bare `except`, so genuine errors are no longer hidden.
            continue
        ABBs.append(found.values[0])
        # Drop shorter phrases contained in the one just matched.
        phrases = [p for p in phrases if p not in phrase]

    detail = []
    for i in OPCtag.index:
        used = [0]*len(ABBs)    # so that a set of ABBs is only matched once
        if i > 187 and i < 332:
            grams = OPCtag['ngram'][i]
            count = 0
            for gram in grams:
                for c in range(len(ABBs)):
                    if gram in ABBs[c] and used[c] == 0:
                        used[c] = 1
                        count += 1
                        break
            if count >= 1:
                # Jaccard similarity
                r = count/(len(ABBs)+len(grams)-count)
                detail.append([round(r,4), OPCtag['tag'][i]])
        else:
            # Tokens come from the 'ngram' column that was just scored.
            # Reading them from 'Xgrams' instead would add a dependency on an
            # unrelated column.
            tokens = OPCtag['ngram'][i].split(' ')
            m = 0
            for token in tokens:
                for c in range(len(ABBs)):
                    if token in ABBs[c] and used[c] == 0:
                        used[c] = 1
                        m += 1
                        break
            if m > 0:
                r = m/(len(ABBs)+len(tokens)-m)
                detail.append([round(r,4), OPCtag['tag'][i], tuple(tokens)])

    detail = sorted(set(tuple(x) for x in detail))
    matches = [entry[1] for entry in detail][-th:] #top 10 vs top 7 (10%) tested

    return [matches,detail,ABBs]

# test = match(str.lower('Zone Mean Air Temperature'),OPCtag)
# set(tuple(x) for x in test[0])

In [20]:
# Baseline features: 'ngram' starts as a copy of the tag; for rows 188..331 it
# is replaced by every contiguous substring of the tag with length 1..6
# (shorter tags stop at their own length), shortest first, left to right.
OPCtag['ngram'] = OPCtag['tag']
for row in range(188, 332):
    tag = OPCtag.loc[row, 'tag']
    longest = min(6, len(tag))
    OPCtag.at[row, 'ngram'] = [tag[start:start + size]
                               for size in range(1, longest + 1)
                               for start in range(len(tag) - size + 1)]
# OPCtag[-30:]

In [21]:
# Run the baseline onNgram() for every distinct input name, keeping the 5
# best tags (the frame keeps the name top7 because later cells refer to it).
# Rows are collected first and the frame is built once: growing a DataFrame
# with `top7.loc[i, col] = <list>` is slow and fragile for list-valued cells.
ls = truth['input'].unique()
# add some input variables to map
rows = []
for name in ls:
    test = onNgram(name, OPCtag, 5)    # [matches, detail, ABBs]
    rows.append({'input': name, 'abbs': test[2], 'matches': test[0]})
top7 = pd.DataFrame(rows, columns=['input', 'abbs', 'matches'])
# Attach the remaining ground-truth columns to each input.
top7 = top7.join(truth.set_index('input'), on='input')

In [26]:
# Label the baseline results with check() and tally the labels.
# NOTE(review): the execution counts (26 here, 24 in the next cell) show these
# cells were run out of order -- rerun top to bottom before trusting the outputs.
t7 = check(top7)
t7['test'].value_counts()

FP     427
TP      26
TN      10
FP1      3
Name: test, dtype: int64

In [24]:
# Label the baseline results with the strict rule of check1() and tally them.
t7 = check1(top7)
t7['test'].value_counts()

FP     427
FP1     18
TP      11
TN      10
Name: test, dtype: int64

In [27]:
# Rows of the baseline results currently labelled 'FP1'.
top7[top7['test']=='FP1']

Unnamed: 0,input,abbs,matches,truth,test
94,demand side inlet 1 setpoint temperature,"[{ENGRY}, {S, FWD}, {SP}, {TEMP, TMP, TEMPERAT...","[DEW POINT TEMPERATURE, SA_T, STATUS TEMP LEVE...",SA-TSP,FP1
148,supply side outlet setpoint temperature,"[{S}, {SP}, {TEMP, TMP, TEMPERATURE, T}]","[SA_T, DEW POINT TEMPERATURE, STATUS TEMP LEVE...",SA-TSP,FP1
457,cooling coil outlet temperature,"[{S, OC}, {CLG, CL}, {TEMP, TMP, TEMPERATURE, T}]","[SA_T, DEW POINT TEMPERATURE, STATUS TEMP LEVE...",OC_T,FP1


In [22]:
# Inspect the full [matches, details, ABBs] result of match() for one name.
match('zone mean air temperature',OPCtag,5)

[['SA_RH', 'TEMP', 'SA_T', 'TRP', 'RA_T'],
 [(0.1667, 'ST_P_STPT', ('S', 'T', 'P', 'S', 'T', 'P', 'T')),
  (0.2222, 'SA_CO2', ('S', 'A', 'C', 'O', '2')),
  (0.25, 'PFILT', ('P', 'F', 'I', 'L', 'T')),
  (0.25, 'SA_SP', ('S', 'A', 'S', 'P')),
  (0.25, 'SFILT', ('S', 'F', 'I', 'L', 'T')),
  (0.2857, 'FA_C', ('F', 'A', 'C')),
  (0.2857, 'OC-T', ('O', 'C', '-', 'T')),
  (0.2857, 'OC_T', ('O', 'C', 'T')),
  (0.3077, 'EC_MOTOR_C', ('E', 'C', 'M', 'O', 'T', 'O', 'R', 'C')),
  (0.3077, 'SA_PRES_SP', ('S', 'A', 'P', 'R', 'E', 'S', 'S', 'P')),
  (0.3333, 'AF1', ('A', 'F', '1')),
  (0.3333, 'AF2', ('A', 'F', '2')),
  (0.3333, 'AF3', ('A', 'F', '3')),
  (0.3333, 'AF4', ('A', 'F', '4')),
  (0.3333, 'DEW POINT TEMPERATURE', ('DEW', 'POINT', 'TEMPERATURE')),
  (0.3333, 'STATUS TEMP LEVEL', ('STATUS', 'TEMP', 'LEVEL')),
  (0.3333, 'STS', ('S', 'T', 'S')),
  (0.4, 'AF', ('A', 'F')),
  (0.4444, 'RA_CO2', ('R', 'A', 'C', 'O', '2')),
  (0.4444, 'SA-TSP', ('S', 'A', '-', 'T', 'S', 'P')),
  (0.5, 'SA_RH', ('

In [23]:
# Inspect the full [matches, detail, ABBs] result of onNgram() for the same name.
onNgram('zone mean air temperature',OPCtag,5)

[['DEW POINT TEMPERATURE', 'STATUS TEMP LEVEL', 'TRP', 'RA_T', 'TEMP'],
 [(0.0244, 'ST_P_STPT'),
  (0.0435, 'EC_MOTOR_C'),
  (0.0435, 'SA_CO2'),
  (0.0435, 'SA_PRES_SP'),
  (0.0588, 'PFILT'),
  (0.0588, 'SA_SP'),
  (0.0588, 'SFILT'),
  (0.0833, 'FA_C'),
  (0.0833, 'OC-T'),
  (0.0833, 'OC_T'),
  (0.0909, 'RA_CO2'),
  (0.0909, 'SA-TSP'),
  (0.125, 'AF1'),
  (0.125, 'AF2'),
  (0.125, 'AF3'),
  (0.125, 'AF4'),
  (0.125, 'SA_RH'),
  (0.125, 'STS'),
  (0.1818, 'SA_T'),
  (0.2, 'AF'),
  (0.2, 'DEW POINT TEMPERATURE', ('DEW', 'POINT', 'TEMPERATURE')),
  (0.2, 'STATUS TEMP LEVEL', ('STATUS', 'TEMP', 'LEVEL')),
  (0.2857, 'TRP'),
  (0.3, 'RA_T'),
  (0.3333, 'TEMP', ('TEMP',))],
 [{'R', 'RM'}, {'A'}, {'T', 'TEMP', 'TEMPERATURE', 'TMP'}]]