In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.dates as mdates
import re
import sklearn.cluster
from nltk import ngrams
from eppy import modeleditor
from eppy.modeleditor import IDF
IDF.setiddname('C:/EnergyPlusV8-8-0/Energy+.idd')

import warnings
warnings.simplefilter(action='ignore', category=DeprecationWarning)

%matplotlib inline

# reading E+ outputs

In [2]:
# Collect the lower-cased names of the model objects listed in `className`;
# the parsing cells below look for these names inside E+ output descriptions.
idf1 = IDF('../Ventus E+/ventus5.24.idf')
className = ['AIRLOOPHVAC','AIRCONDITIONER:VARIABLEREFRIGERANTFLOW','ZONE','PLANTLOOP','BOILER:HOTWATER'
             ,'DISTRICTCOOLING','ZONELIST','ZONEHVAC:FOURPIPEFANCOIL']#,'COIL:COOLING:WATER'
names = []
for n in className:
    for i in idf1.idfobjects[n]:
        try:
            names.append(str.lower(i.Name))
        except Exception:
            # Objects without a usable `Name` field fall back to `Heat_Pump_Name`.
            # `Exception` (rather than a bare except) keeps Ctrl-C / SystemExit working.
            names.append(str.lower(i.Heat_Pump_Name))
# Generic locations that are not objects of the classes listed above.
names.extend(['environment', 'whole building', 'facility', 'building', 'plant', 'hvac'])

In [3]:
# Parse the data dictionary of the E+ meter file into `mtr`, one row per meter,
# indexed by the report ID.
with open('../Ventus E+/ventus5.24.mtr','r') as EP:   # closed even if read() fails
    content = EP.read()
content = content.split('\nEnd of Data Dictionary')[0].split('\n')[6:]
utility = ['Electricity','Gas','Gasoline','Diesel','Coal','FuelOil#1','FuelOil#2','Propane','OtherFuel1',
           'OtherFuel2','Water','Steam','DistrictCooling','DistrictHeating','ElectricityPurchased',
           'ElectricitySurplusSold','ElectricityNet','EnergyTransfer','Carbon Equivalent']
usage = ['InteriorLights','ExteriorLights','InteriorEquipment','ExteriorEquipment','Fans','Pumps','Heating',
         'Cooling','HeatRejection','Humidifier','HeatRecovery','DHW','Cogeneration','Refrigeration','WaterSystems',
         'HeatingCoils','CoolingCoils','Chillers','Boilers','Baseboard','HeatRecoveryForCooling','HeatRecoveryForHeating',
         'PlantLoopHeatingDemand','PlantLoopCoolingDemand']

# Build all rows first and create the frame once (growing a frame with .loc is quadratic).
records = []
for line in content:
    string = line.split(' !')[0]
    lowered = str.lower(string)
    # Longest matching keyword of each kind wins (stable sort: last of the longest).
    source = sorted([s for s in utility if s in string], key=len)
    use = sorted([s for s in usage if s in string], key=len)
    measure = ''.join([hits[-1] for hits in [source,use] if len(hits)>0])
    # Split the CamelCase keyword(s) into words by inserting a space before each capital.
    for idx in range(len(measure)-1,0,-1):
        if 'A' <= measure[idx] <= 'Z':
            measure = measure[:idx]+' '+measure[idx:]
    records.append({
        'ID': int(string.split(',')[0]),
        # First known object name found in the line; 'building' when none is found.
        'location': next((s for s in names if s in lowered), 'building'),
        'measure': str.lower(measure),
        'unit': str.lower(string[string.find('['):]),
        'tag': 'meter',
        'resolution': str.lower(line.split(' !')[1].split(' ')[0]),
    })
mtr = pd.DataFrame(records, columns = ['ID','location','measure','unit','tag','resolution'], dtype=object)
mtr.set_index(['ID'],inplace=True)
mtr.head()

Unnamed: 0_level_0,location,measure,unit,tag,resolution
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
62,facility,electricity,[j],meter,monthly
7356,facility,gas,[j],meter,monthly
1214,building,electricity interior equipment,[j],meter,monthly
92,building,electricity interior lights,[j],meter,monthly
1234,offices,electricity interior equipment,[j],meter,monthly


In [4]:
# reading eso
# Parse the data dictionary of the E+ output file into `eso`, one row per
# output variable that is not already described by the meter table `mtr`.
with open('../Ventus E+/ventus5.24.eso','r') as EP:   # closed even if read() fails
    content = EP.read()
content = content.split('\nEnd of Data Dictionary')[0].split('\n')[6:]

# Build all rows first and create the frame once (growing a frame with .loc is quadratic).
records = []
for line in content:
    elements = line.split(' !')[0].split(',')
    if int(elements[0]) in mtr.index:
        continue   # this ID is a meter
    owner = str.lower(elements[-2])
    # Longest known name contained in the key field (stable sort: last of the longest).
    ls = sorted([nm for nm in names if nm in owner], key=len)
    if not ls:
        raise ValueError('no known location name found in eso entry: ' + line)
    location = ls[-1]
    variable = str.lower(elements[-1][:elements[-1].find('[')-1])
    if len(location) == len(elements[-2]):
        measure = variable
    else:
        # Keep what is left of the key field as a prefix of the variable name.
        measure = owner.replace(location+' ','') +' '+variable
    records.append({
        'ID': int(elements[0]),
        'location': location,
        'measure': measure.replace('system node ',''),
        'unit': str.lower(elements[-1][elements[-1].find('['):]),
        'tag': 'variable',
        'resolution': str.lower(line.split(' !')[1].split(' ')[0]),
    })
eso = pd.DataFrame(records, columns = ['ID','location','measure','unit','tag','resolution'], dtype=object)

eso.set_index(['ID'],inplace=True)
eso.head()
# ID can be different for different model

Unnamed: 0_level_0,location,measure,unit,tag,resolution
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6,environment,site outdoor air drybulb temperature,[c],variable,monthly
7,environment,site outdoor air dewpoint temperature,[c],variable,monthly
8,environment,site outdoor air humidity ratio,[kgwater/kgdryair],variable,monthly
9,environment,site outdoor air relative humidity,[%],variable,monthly
10,environment,site outdoor air barometric pressure,[pa],variable,monthly


In [5]:
# Stack the meter and variable tables into one lookup table ordered by ID.
# NOTE: drop_duplicates() compares column values only; the ID index is not considered.
inEP = pd.concat([mtr,eso]).drop_duplicates().sort_index()
len(inEP)

4068

# Parsing the OPC tag names

In [6]:
def locIndex(s):
    """Return the index in tag ``s`` at which its longest separator-free segment starts.

    The tag is split on digits and underscores; the piece before the first
    separator is ignored and the longest remaining segment is located in ``s``
    (first occurrence). Callers slice ``s[:index]`` / ``s[index:]`` to split a tag.

    The segment is searched literally, so characters that are special in a
    regular expression ('(', '+', '.', ...) are handled correctly. A tag with
    no digit or underscore has nothing to split on and yields 0.
    """
    segments = re.split('[0123456789_]', s)[1:]
    if not segments:
        return 0
    longest = max(segments, key=len)
    # `longest` is a substring of `s` by construction, so find() never returns -1.
    return s.find(longest)

In [7]:
# Load the OPC point list and derive two columns from the slash-separated
# 'OPC Tag' path: component 3 ("type") and component 4 up to its first '.' ("tag").
raw = pd.read_csv('OPCtag.csv').set_index(['SN'])
raw['type'] = [path.split('/')[3] for path in raw['OPC Tag']]
raw['tag'] = [path.split('/')[4].split('.')[0] for path in raw['OPC Tag']]
OPC = raw.copy()
OPC.head()

Unnamed: 0_level_0,OPC Tag,Description,type,tag
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,/FACILITY/VENTUS/VENTUS_BLK1/AV1_3CLGVLV_TP_SP...,Time Program Control Set Point,VENTUS_BLK1,AV1_3CLGVLV_TP_SP
2,/FACILITY/VENTUS/VENTUS_BLK1/AV1_3VSD_TP_SP.PO...,Time Program Control Set Point,VENTUS_BLK1,AV1_3VSD_TP_SP
3,/FACILITY/VENTUS/VENTUS_BLK1/AV1_3CLGVLV_TP_EN...,Time Program Control,VENTUS_BLK1,AV1_3CLGVLV_TP_ENA
4,/FACILITY/VENTUS/VENTUS_BLK1/AV1_3CLGVLV_VSD_T...,Time Program,VENTUS_BLK1,AV1_3CLGVLV_VSD_TP
5,/FACILITY/VENTUS/VENTUS_BLK1/AV1_2FANSPEED.POI...,Fan Speed Monitoring,VENTUS_BLK1,AV1_2FANSPEED


In [8]:
# Rule-based split of every tag at locIndex(): the head is 'loc', the tail is 'measure'.
OPC['loc'] = [tag[:locIndex(tag)] for tag in OPC['tag']]
OPC['measure'] = [tag[locIndex(tag):] for tag in OPC['tag']]
# Integer label per distinct measure, numbered in order of first appearance.
unique = list(OPC['measure'].unique())
label_of = {measure: position for position, measure in enumerate(unique)}
OPC['label'] = OPC['measure'].map(label_of)
OPC.tail()

Unnamed: 0_level_0,OPC Tag,Description,type,tag,loc,measure,label
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
811,/FACILITY/VENTUS/VENTUS_PMBM/VPWQ1-6PMKWHR.PRE...,Power Meter Kilowatt Hour,VENTUS_PMBM,VPWQ1-6PMKWHR,VPWQ1-6,PMKWHR,65
812,/FACILITY/VENTUS/VENTUS_PMBM/VLTQ1-7PMCURRC.PR...,Power Meter Current C,VENTUS_PMBM,VLTQ1-7PMCURRC,VLTQ1-7,PMCURRC,53
813,/FACILITY/VENTUS/VENTUS_PMBM/VLTQ1-1PMKWATT.PR...,Power Meter Kilowatt,VENTUS_PMBM,VLTQ1-1PMKWATT,VLTQ1-1,PMKWATT,59
814,/FACILITY/VENTUS/VENTUS_PMBM/VPWQ1-8PMVOLTBC.P...,Power Meter Voltage B-C,VENTUS_PMBM,VPWQ1-8PMVOLTBC,VPWQ1-8,PMVOLTBC,55
815,/FACILITY/VENTUS/VENTUS_PMBM/VPWQ1-12PMVOLTAB....,Power Meter Voltage A-B,VENTUS_PMBM,VPWQ1-12PMVOLTAB,VPWQ1-12,PMVOLTAB,54


# clustering for measure extraction

In [9]:
from weighted_levenshtein import lev
from sklearn.metrics.cluster import adjusted_rand_score

# Substitution-cost matrix passed to lev(): every substitution costs 1,
# except replacing one digit character by another digit, which costs 0.25.
substitute_costs = np.ones((128, 128), dtype=np.float64)
digit_codes = [ord(ch) for ch in '0123456789']
substitute_costs[np.ix_(digit_codes, digit_codes)] = 0.25

In [10]:
# Pairwise similarity between tag names: the negated weighted Levenshtein distance.
# NOTE: `names` is rebound here; from this cell on it holds the OPC tag names.
OPCtag = raw.copy()
names = np.asarray(OPCtag['tag'])
weighted_lev = -1*np.array([
    [lev(w1, w2, substitute_costs=substitute_costs) for w1 in names]
    for w2 in names
])
weighted_lev

array([[ -0.  ,  -5.  ,  -3.  , ..., -15.25, -15.  , -15.25],
       [ -5.  ,  -0.  ,  -8.  , ..., -13.25, -14.25, -14.25],
       [ -3.  ,  -8.  ,  -0.  , ..., -16.  , -16.  , -16.25],
       ...,
       [-15.25, -13.25, -16.  , ...,  -0.  ,  -7.25,  -8.  ],
       [-15.  , -14.25, -16.  , ...,  -7.25,  -0.  ,  -3.25],
       [-15.25, -14.25, -16.25, ...,  -8.  ,  -3.25,  -0.  ]])

In [11]:
# Sweep DBSCAN settings (eps 5..29, min_samples 2..3). For every combination record
#   score: adjusted Rand index of the clustering against the labels in OPC['label']
#   left : number of tags DBSCAN labelled -1
score = pd.DataFrame()
left = pd.DataFrame()
for eps in range(5,30):
    for min_samples in range(2,4):
        test = sklearn.cluster.DBSCAN(eps=eps, min_samples=min_samples).fit(weighted_lev)
        score.loc[min_samples,eps] = adjusted_rand_score(test.labels_,OPC['label'])
        left.loc[min_samples,eps] = sum(test.labels_==-1)
print(score)
left

         5         6         7         8         9         10        11  \
2  0.140611  0.350934  0.510598  0.577372  0.761455  0.823174  0.879549   
3  0.060777  0.230467  0.440299  0.516447  0.692097  0.753673  0.810634   

         12        13        14    ...           20        21        22  \
2  0.878137  0.878137  0.878137    ...     0.858823  0.848101  0.848101   
3  0.809418  0.809418  0.809418    ...     0.822296  0.812746  0.814068   

         23        24        25        26        27        28        29  
2  0.846924  0.836794  0.839784  0.840298  0.832999  0.811139  0.791686  
3  0.819910  0.810362  0.827632  0.833529  0.827613  0.807715  0.788410  

[2 rows x 25 columns]


Unnamed: 0,5,6,7,8,9,10,11,12,13,14,...,20,21,22,23,24,25,26,27,28,29
2,156.0,30.0,9.0,8.0,8.0,8.0,7.0,7.0,7.0,7.0,...,7.0,7.0,6.0,5.0,5.0,4.0,3.0,3.0,2.0,2.0
3,268.0,128.0,73.0,64.0,64.0,64.0,63.0,63.0,63.0,63.0,...,47.0,47.0,46.0,41.0,41.0,28.0,21.0,19.0,16.0,16.0


In [11]:
def all_substr(data):
    """Greedily collect substrings of ``data[0]`` that occur in every string of ``data``.

    Scans ``data[0]`` from left to right. At each position the longest slice
    starting there that is contained in all strings is recorded, and the scan
    resumes right after it; positions with no common slice are skipped.

    Parameters
    ----------
    data : list of str

    Returns
    -------
    list of str
        Common substrings in the order they appear in ``data[0]``;
        an empty list when ``data`` is empty or nothing is shared.
    """
    if not data:
        return []
    first = data[0]
    n = len(first)
    substr = []
    i = 0
    while i < n:
        # Try the longest window first: n - i characters are left from position i.
        for j in range(n - i, 0, -1):
            piece = first[i:i+j]
            if all(piece in x for x in data):
                substr.append(piece)
                i += j - 1   # the `i += 1` below steps past the matched piece
                break
        i += 1
    return substr

In [12]:
from itertools import combinations
def razors(string, max_joins=6):
    """Enumerate cut masks for splitting ``string`` into consecutive pieces.

    A mask has one entry per gap between adjacent characters
    (``len(string) - 1`` entries): 1 means "cut here", 0 means "keep the two
    characters together". Masks are listed by increasing number of zeros,
    from the all-ones mask up to masks with ``max_joins`` zeros.

    Parameters
    ----------
    string : str
    max_joins : int, optional
        Largest number of zeros allowed in a mask (default 6, the limit that
        was previously hard-coded).

    Returns
    -------
    list of list of int
        ``[]`` for an empty string, ``[[]]`` for a single character.
    """
    m = len(string)-1
    result = []
    for k in range(min(m+1, max_joins+1)):
        for bits in combinations(range(m), k):
            mask = [1] * m
            for bit in bits:
                mask[bit] = 0
            result.append(mask)
    return result

def Xgrams(string):
    """Return the segmentations of ``string`` described by the masks from razors().

    Spaces, underscores, commas and colons are removed first. For every mask
    the string is cut after each position whose mask entry is truthy; the
    pieces of one segmentation are ordered longest first.

    Returns a list of segmentations, each a list of str.
    """
    string = ''.join(re.split('[ _,:]',string))
    segmentations = []
    for mask in razors(string):
        pieces = []
        start = 0
        for pos, cut in enumerate(mask):
            if cut:
                pieces.append(string[start:pos+1])
                start = pos+1
        pieces.append(string[start:])
        # longest pieces first, so that the longer substrings get matched first
        pieces.sort(key=len, reverse=True)
        segmentations.append(pieces)
    return segmentations

In [13]:
# Cluster the tag names, then take the last common substring of each cluster as
# its "measure"; the part of a tag in front of the measure becomes its "loc".
OPCtag = raw.copy()
names = np.asarray(OPCtag['tag'])
weighted_lev = -1*np.array([[lev(w1,w2,substitute_costs=substitute_costs) for w1 in names] for w2 in names])

density = sklearn.cluster.DBSCAN(eps=11, min_samples=2).fit(weighted_lev)
clusters = {}
for cluster_id in np.unique(density.labels_):
    clusters[cluster_id] = names[np.nonzero(density.labels_==cluster_id)]

extraction = {}
for cluster_id, members in clusters.items():
    common = all_substr(list(members))
    # A cluster whose members share no substring gets an empty measure instead of
    # raising IndexError on `[-1]`.
    extraction[cluster_id] = common[-1] if common else ''

OPCtag.drop(columns=['Description'],inplace = True)
OPCtag['label'] = density.labels_
OPCtag['measure'] = OPCtag['label'].apply(lambda x: extraction[x])
# Iterate over the rows themselves rather than over range(1, len+1), so the cell
# does not depend on the SN index being exactly 1..n.
OPCtag['loc'] = [tag[:tag.index(measure)] for tag, measure in zip(OPCtag['tag'], OPCtag['measure'])]

OPCtag['Xgrams'] = OPCtag['measure'].apply(lambda x: Xgrams(x))
OPCtag.head()

Unnamed: 0_level_0,OPC Tag,type,tag,label,measure,loc,Xgrams
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,/FACILITY/VENTUS/VENTUS_BLK1/AV1_3CLGVLV_TP_SP...,VENTUS_BLK1,AV1_3CLGVLV_TP_SP,0,CLGVLV_TP_SP,AV1_3,"[[C, L, G, V, L, V, T, P, S, P], [CL, G, V, L,..."
2,/FACILITY/VENTUS/VENTUS_BLK1/AV1_3VSD_TP_SP.PO...,VENTUS_BLK1,AV1_3VSD_TP_SP,1,VSD_TP_SP,AV1_3,"[[V, S, D, T, P, S, P], [VS, D, T, P, S, P], [..."
3,/FACILITY/VENTUS/VENTUS_BLK1/AV1_3CLGVLV_TP_EN...,VENTUS_BLK1,AV1_3CLGVLV_TP_ENA,2,CLGVLV_TP_ENA,AV1_3,"[[C, L, G, V, L, V, T, P, E, N, A], [CL, G, V,..."
4,/FACILITY/VENTUS/VENTUS_BLK1/AV1_3CLGVLV_VSD_T...,VENTUS_BLK1,AV1_3CLGVLV_VSD_TP,3,CLGVLV_VSD_TP,AV1_3,"[[C, L, G, V, L, V, V, S, D, T, P], [CL, G, V,..."
5,/FACILITY/VENTUS/VENTUS_BLK1/AV1_2FANSPEED.POI...,VENTUS_BLK1,AV1_2FANSPEED,4,FANSPEED,AV1_2,"[[F, A, N, S, P, E, E, D], [FA, N, S, P, E, E,..."


# Build the abbreviation dictionary

In [14]:
def updateABB(abb,full):
    """Add abbreviation/name pairs to the global ``ABBdict`` table.

    ``abb`` and ``full`` are parallel sequences: ``abb[k]`` is an abbreviation
    of the name ``full[k]`` (pairs are formed with ``zip``, so surplus entries
    of the longer sequence are ignored). For a name already present in
    ``ABBdict['name']`` the abbreviation is added to that row's set; otherwise
    a new row is appended.

    Pairs are taken position by position, so a name that appears several times
    in ``full`` receives all of its abbreviations (looking the position up with
    ``full.index`` would always pick the first one).
    """
    global ABBdict
    new_rows = {}   # name -> set of abbreviations, for names not yet in the table
    for short, name in zip(abb, full):
        known = ABBdict['name'] == name
        if known.any():
            # The cells hold set objects; mutating them updates the table in place.
            for existing in ABBdict.loc[known, 'abb']:
                existing.add(short)
        else:
            new_rows.setdefault(name, set()).add(short)
    if new_rows:
        # DataFrame.append was removed in pandas 2.0; concatenate all new rows at once.
        addition = pd.DataFrame({'abb': list(new_rows.values()), 'name': list(new_rows.keys())})
        ABBdict = pd.concat([ABBdict, addition], ignore_index=True)

In [35]:
# Seed the abbreviation table `ABBdict` from two CSV files. The result has one row
# per lower-cased name, with the column 'abb' holding a set of abbreviations.
dict1 = pd.read_csv('HVACacronym.csv')
dict2 = pd.read_csv('HVACacronym1.csv')

# Names are compared in lower case throughout the notebook.
dict1['full']=dict1['full'].apply(lambda x: str.lower(x))
dict2['name']=dict2['name'].apply(lambda x: str.lower(x))

# Align the column names ('full' -> 'name') and stack both tables.
ABBdict = pd.concat([dict1.rename({'full':'name'}, axis='columns'),dict2])
# Wrap every abbreviation in a one-element list so that summing per name concatenates them.
ABBdict['abb']= ABBdict[['abb']].values.tolist()
ABBdict = pd.DataFrame(ABBdict.groupby('name')['abb'].sum())
ABBdict['name']=ABBdict.index
ABBdict.rename(index=str, columns={0: "abb"},inplace=True)
# Lists -> sets: removes duplicates and lets updateABB() use .add().
ABBdict['abb']= ABBdict['abb'].apply(lambda x: set(x))

ABBdict.reset_index(drop=True,inplace=True)

In [36]:
# Hand-written additions to ABBdict. Each updateABB() call receives two parallel
# lists: abbreviations and the lower-case names they stand for.

# Abbreviations paired with the names they stand for.
abb = ['OC','TMP','TEMP','SA','STATIC','RA','RH','RM','BM','FLWR','RET','KWATT','KWHR','PM','ENGRY','FWD','PWR','AHU',
      'FAN','SPEED','BYP','CLG','WATER','VLV','TP','SMK','FA','PM','BM','CURR','VOLT','PW']
full=['off coil','temperature','temperature','supply air','pressure','return air','relative humidity','room',
      'btu meter','flow rate','return','power','energy','power meter','energy','supply','power','ahu',
      'fan','speed','bypass','cooling','water','valve','time program','smoke','fresh air','power meter','btu meter',
      'current','voltage','power']
updateABB(abb,full)

# Names (presumably words from E+ variable descriptions - TODO confirm) and their abbreviations.
EPname = ['zone','air','energy','heating','cooling','air terminal','pump','boiler','condenser','setpoint',
          'heat exchanger','humidifier','heater','water','people','window','infiltration','equipment','outdoor',
          'drybulb','dewpoint','speed','angle','radiation','pressure','precipitation','occupant','lights',
          'humidity ratio','facility','mass flow rate','interior','gas','office','room','level','extract',
          'fan','supply','air loop','site','power','radiant','fan coil unit','indoor','heat pump',
          'vav','vrf','return','inlet','outlet','relief air','rate']

EPabb = ['RM','A','KWHR','HT','CL','VAV','PMP','B','COND','SP','HX','HUMID','HTR','WTR','PPL','WD','INFIL',
        'EQUIP','O','DB','DP','SPD','AGL','R','PR','P','OCC','LT','HR','FAC','FLWR','INTR','G','OFC','RM','LV',
        'E','F','S','AHU','LOC','KWATT','R','FCU','I','CU','VAV','VRF','R','S','R','EA','KW']
updateABB(EPabb,EPname)

# Node-related phrases and their abbreviations.
NODEname = ['supply side inlet','supply side outlet','coil air outlet','mixed inlet','splitter outlet',
           'coil outlet','inlet','outlet','supply inlet']

NODEabb = ['R','S','OC','R','S','S','FWD','RET','R']

updateABB(NODEabb,NODEname)

# Further pairs added last (named "test" by the author).
testName = ['electricity','energy','power','demand','rate','coil air outlet','coil outlet']

testABB = ['KWHR','ENGRY','PWR','ENGRY','PWR','S','OC']

updateABB(testABB,testName)

In [17]:
def match(s,OPCtag,th):
    """Rank the measures in ``OPCtag`` against the description ``s``.

    1. ``s`` is split on spaces and its word 3-, 2- and 1-grams (in that order)
       are looked up in the global ``ABBdict``; each phrase found contributes
       its set of abbreviations, and remaining phrases that are substrings of
       a found phrase are discarded.
    2. For every row of ``OPCtag`` each segmentation in its 'Xgrams' cell is
       scored by the number of characters covered by pieces that belong to an
       abbreviation set (each set is used at most once per segmentation). The
       best segmentation gives the ratio ``r``: the harmonic mean of
       (sets used / sets found) and (characters covered / len(measure)).

    Parameters
    ----------
    s : str
        Lower-case description, words separated by single spaces.
    OPCtag : pandas.DataFrame
        Needs the columns 'measure' and 'Xgrams'.
    th : int
        How many of the best-scoring entries to return; ``th <= 0`` returns none.

    Returns
    -------
    list
        ``[matches, details, ABBs]``: ``details`` is the sorted, de-duplicated
        list of ``(r, measure, segmentation)`` tuples with ``r >= 0.01``;
        ``matches`` holds the measures of its last ``th`` entries (best last);
        ``ABBs`` is the list of abbreviation sets found in step 1.
    """
    words = s.split(' ')
    phrases = []
    for size in range(3,0,-1):
        for gram in ngrams(words,size):
            phrases.append(' '.join(list(gram)))

    ABBs = []
    while len(phrases)>0:
        phrase = phrases.pop(0)
        found = ABBdict.loc[ABBdict['name'] == phrase,'abb'].values
        if len(found) == 0:
            continue   # phrase not in the dictionary
        ABBs.append(found[0])
        phrases = [p for p in phrases if (p not in phrase)]

    details = []
    for i in OPCtag.index:
        measure = OPCtag.loc[i,'measure']
        segmentations = OPCtag.loc[i,'Xgrams']
        idx = 0
        m = 0
        r = 0
        for j, grams in enumerate(segmentations):
            used = [0]*len(ABBs)    # so that a set of ABBs is only matched once
            count = 0
            for gram in grams:
                for c in range(len(ABBs)):
                    if gram in ABBs[c] and used[c]==0:
                        used[c] = 1
                        count += len(gram)
                        break
            if count > m:
                # count > m >= 0 here, so neither division below can be by zero
                m = count
                # harmonic average of ratio on both side
                r = 2/(len(ABBs)/sum(used) + len(measure)/m)
                idx = j
        if r >= .01:
            details.append([round(r,4),measure,tuple(segmentations[idx])])

    details = list(set(tuple(x) for x in details))
    details.sort()
    # `[-0:]` would return the whole list, so th <= 0 is handled explicitly.
    matches = [d[1] for d in details][-th:] if th > 0 else []

    return [matches,details,ABBs]

In [18]:
def check(result):
    """Label every row of ``result`` by comparing 'matches' with 'truth' (top-k hit).

    The label is written to the column 'test' of ``result`` itself, and the same
    frame is returned:

    * 'TN'  - no matches, and the truth contains NaN
    * 'FN'  - no matches, but a truth value exists
    * 'FP'  - matches returned, although the truth contains NaN
    * 'TP'  - some truth value is among the matches
    * 'FP1' - matches returned, none of them is a truth value

    NaN is detected by value, so it does not have to be the ``np.nan`` object
    itself. A bare NaN in 'truth' is treated like ``[nan]`` - presumably an input
    without a row in the truth table; confirm this is the intended reading.
    """
    def _no_truth(truth):
        if isinstance(truth, float):
            return truth != truth
        return any(isinstance(t, float) and t != t for t in truth)

    for i in result.index:
        matches = result.loc[i,'matches']
        truth = result.loc[i,'truth']
        missing = _no_truth(truth)
        if len(matches) == 0:
            result.loc[i,'test'] = 'TN' if missing else 'FN'
        elif missing:
            result.loc[i,'test'] = 'FP'
        elif any(m in matches for m in truth):
            result.loc[i,'test'] = 'TP'
        elif len(truth) > 0:
            result.loc[i,'test'] = 'FP1'
        # an empty truth list leaves the row unlabelled
    return result

In [25]:
# Ground truth: per input, the list of accepted measures - 'truth' plus 'alter'
# when an alternative is given. A missing truth is stored as the np.nan object.
truth = pd.read_csv('truth_o.csv')
truth['truth'] = pd.Series(
    [[np.nan if pd.isna(t) else t] if pd.isna(a) else [np.nan if pd.isna(t) else t, a]
     for t, a in zip(truth['truth'], truth['alter'])],
    index=truth.index, dtype=object)   # built in one go; no list assignment through .loc
truth = truth.drop(columns=['alter'])
# truth

In [28]:
# Leftover from interactive use: removes a previously joined 'truth' column.
# Guarded so that a fresh "Restart & Run All" does not fail here, where `top5`
# does not exist yet or has no 'truth' column.
if 'top5' in globals():
    top5.drop(columns=['truth'], inplace=True, errors='ignore')

In [37]:
# Run match() for every distinct E+ measure, keeping the five best candidates.
ls = inEP['measure'].unique()
# add some input variables to map
records = []
for measure_name in ls:
    test = match(measure_name,OPCtag,5)
    records.append({'input': measure_name, 'abbs': test[2], 'matches': test[0]})
# One constructor call: avoids quadratic row-by-row growth, and list-valued cells
# are stored as lists whatever their length.
top5 = pd.DataFrame(records, columns = ['input','abbs','matches'])


In [38]:
# Attach the ground truth and label every row (top-5 criterion).
# Dropping a previously joined 'truth' column first makes the cell re-runnable;
# join() raises on overlapping column names otherwise.
top5 = top5.drop(columns=['truth'], errors='ignore').join(truth.set_index('input'), on='input')
t5 = check(top5)
t5['test'].value_counts()

FP     410
TP      49
TN       4
FP1      3
Name: test, dtype: int64

In [39]:
# Rows where candidates were returned but none of them is the truth.
top5.loc[top5['test'] == 'FP1']

Unnamed: 0,input,abbs,matches,truth,test
94,demand side inlet 1 setpoint temperature,"[{ENGRY}, {FWD, S}, {SP}, {TMP, T, TEMP}]","[RATEMPSP, RMTEMPSP, SATEMP, ATEMPSP, TMPSP]",[SATEMPSP],FP1
100,supply side outlet 1 setpoint temperature,"[{S}, {SP}, {TMP, T, TEMP}]","[RATEMPSP, RMTEMPSP, SATEMP, ATEMPSP, TMPSP]",[SATEMPSP],FP1
148,supply side outlet setpoint temperature,"[{S}, {SP}, {TMP, T, TEMP}]","[RATEMPSP, RMTEMPSP, SATEMP, ATEMPSP, TMPSP]",[SATEMPSP],FP1


## The three FP1 errors were caused by incorrect measure extraction: the true measure "SATEMPSP" is not among the extracted candidates

In [40]:
def check1(result):
    """Label every row of ``result`` like check(), but with a top-1 criterion.

    A row is 'TP' only when a truth value equals the last entry of 'matches'.
    The other labels are:

    * 'TN'  - no matches, and the truth contains NaN
    * 'FN'  - no matches, but a truth value exists
    * 'FP'  - matches returned, although the truth contains NaN
    * 'FP1' - matches returned, but the last one is not a truth value

    The label is written to the column 'test' of ``result`` itself, and the same
    frame is returned. NaN is detected by value, so it does not have to be the
    ``np.nan`` object itself. A bare NaN in 'truth' is treated like ``[nan]`` -
    presumably an input without a row in the truth table; confirm this is the
    intended reading.
    """
    def _no_truth(truth):
        if isinstance(truth, float):
            return truth != truth
        return any(isinstance(t, float) and t != t for t in truth)

    for i in result.index:
        matches = result.loc[i,'matches']
        truth = result.loc[i,'truth']
        missing = _no_truth(truth)
        if len(matches) == 0:
            result.loc[i,'test'] = 'TN' if missing else 'FN'
        elif missing:
            result.loc[i,'test'] = 'FP'
        elif any(m == matches[-1] for m in truth):
            result.loc[i,'test'] = 'TP'
        elif len(truth) > 0:
            result.loc[i,'test'] = 'FP1'
        # an empty truth list leaves the row unlabelled
    return result
# check1() writes its labels into `top5` and returns that same frame, so `t7` is
# another name for `top5` and the 'test' column now holds the top-1 labels.
t7 = check1(top5)
t7['test'].value_counts()

FP     410
TP      27
FP1     25
TN       4
Name: test, dtype: int64

# Baseline: matching on character n-grams

In [41]:
# Baseline features: for every tag, all character n-grams of its measure with
# n from 1 up to 6 (or up to the measure's length when it is shorter).
OPCtag['ngram'] = OPCtag['measure']
for tag_id in OPCtag.index:
    text = OPCtag.loc[tag_id,'measure']
    pieces = [''.join(gram)
              for size in range(1,min(7,len(text)+1))
              for gram in ngrams(text,size)]
    OPCtag.at[tag_id,'ngram'] = pieces
OPCtag.tail()

Unnamed: 0_level_0,OPC Tag,type,tag,label,measure,loc,Xgrams,ngram
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
811,/FACILITY/VENTUS/VENTUS_PMBM/VPWQ1-6PMKWHR.PRE...,VENTUS_PMBM,VPWQ1-6PMKWHR,88,PMKWHR,VPWQ1-6,"[[P, M, K, W, H, R], [PM, K, W, H, R], [MK, P,...","[P, M, K, W, H, R, PM, MK, KW, WH, HR, PMK, MK..."
812,/FACILITY/VENTUS/VENTUS_PMBM/VLTQ1-7PMCURRC.PR...,VENTUS_PMBM,VLTQ1-7PMCURRC,79,C,VLTQ1-7PM,[[C]],[C]
813,/FACILITY/VENTUS/VENTUS_PMBM/VLTQ1-1PMKWATT.PR...,VENTUS_PMBM,VLTQ1-1PMKWATT,104,PMKWATT,VLTQ1-1,"[[P, M, K, W, A, T, T], [PM, K, W, A, T, T], [...","[P, M, K, W, A, T, T, PM, MK, KW, WA, AT, TT, ..."
814,/FACILITY/VENTUS/VENTUS_PMBM/VPWQ1-8PMVOLTBC.P...,VENTUS_PMBM,VPWQ1-8PMVOLTBC,89,PMVOLTBC,VPWQ1-8,"[[P, M, V, O, L, T, B, C], [PM, V, O, L, T, B,...","[P, M, V, O, L, T, B, C, PM, MV, VO, OL, LT, T..."
815,/FACILITY/VENTUS/VENTUS_PMBM/VPWQ1-12PMVOLTAB....,VENTUS_PMBM,VPWQ1-12PMVOLTAB,80,PMVOLTAB,VPWQ1-12,"[[P, M, V, O, L, T, A, B], [PM, V, O, L, T, A,...","[P, M, V, O, L, T, A, B, PM, MV, VO, OL, LT, T..."


In [42]:
def onNgram(s,OPCtag,th):
    """Baseline ranking of the measures in ``OPCtag`` against the description ``s``.

    1. ``s`` is split on spaces and its word 3-, 2- and 1-grams (in that order)
       are looked up in the global ``ABBdict``; each phrase found contributes
       its set of abbreviations, and remaining phrases that are substrings of
       a found phrase are discarded.
    2. For every row of ``OPCtag`` the entries of its 'ngram' cell are compared
       with the abbreviation sets (each set is used at most once). With
       ``count`` hits the score is
       ``count / (len(ABBs) + len(ngram list) - count)``.

    Parameters
    ----------
    s : str
        Lower-case description, words separated by single spaces.
    OPCtag : pandas.DataFrame
        Needs the columns 'measure' and 'ngram'.
    th : int
        How many of the best-scoring entries to return; ``th <= 0`` returns none.

    Returns
    -------
    list
        ``[matches, detail, ABBs]``: ``detail`` is the sorted, de-duplicated list
        of ``(score, measure)`` tuples for rows with at least one hit;
        ``matches`` holds the measures of its last ``th`` entries (best last);
        ``ABBs`` is the list of abbreviation sets found in step 1.
    """
    words = s.split(' ')
    phrases = []
    for size in range(3,0,-1):
        for gram in ngrams(words,size):
            phrases.append(' '.join(list(gram)))

    ABBs = []
    while len(phrases)>0:
        phrase = phrases.pop(0)
        found = ABBdict.loc[ABBdict['name'] == phrase,'abb'].values
        if len(found) == 0:
            continue   # phrase not in the dictionary
        ABBs.append(found[0])
        phrases = [p for p in phrases if (p not in phrase)]

    detail = []
    for i in OPCtag.index:
        grams = OPCtag.loc[i,'ngram']
        used = [0]*len(ABBs)    # so that a set of ABBs is only matched once
        count = 0
        for gram in grams:
            for c in range(len(ABBs)):
                if gram in ABBs[c] and used[c]==0:
                    used[c] = 1
                    count += 1
                    break
        if count >= 1:
            # Jaccard similarity
            r = count/(len(ABBs)+len(grams)-count)
            detail.append([round(r,4),OPCtag.loc[i,'measure']])

    detail = list(set(tuple(x) for x in detail))
    detail.sort()
    # `[-0:]` would return the whole list, so th <= 0 is handled explicitly.
    matches = [d[1] for d in detail][-th:] if th > 0 else []

    return [matches,detail,ABBs]

# test = match(str.lower('Zone Mean Air Temperature'),OPCtag)
# set(tuple(x) for x in test[0])

In [43]:
# Run the baseline onNgram() for every distinct E+ measure, keeping the five best candidates.
ls = inEP['measure'].unique()
records = []
for measure_name in ls:
    test = onNgram(measure_name,OPCtag,5)
    records.append({'input': measure_name, 'abbs': test[2], 'matches': test[0]})
# One constructor call: avoids quadratic row-by-row growth, and list-valued cells
# are stored as lists whatever their length.
top5 = pd.DataFrame(records, columns = ['input','abbs','matches'])

In [44]:
# Attach the ground truth and label every baseline row (top-5 criterion).
# Dropping a previously joined 'truth' column first makes the cell re-runnable;
# join() raises on overlapping column names otherwise.
top5 = top5.drop(columns=['truth'], errors='ignore').join(truth.set_index('input'), on='input')
t5 = check(top5)
t5['test'].value_counts()

FP     410
FP1     27
TP      25
TN       4
Name: test, dtype: int64

In [45]:
# Top-1 criterion on the baseline result. check1() relabels `top5` itself and
# returns it, so `t7` is another name for `top5`.
t7 = check1(top5)
t7['test'].value_counts()

FP     410
FP1     39
TP      13
TN       4
Name: test, dtype: int64

In [83]:
# NOTE(review): `top7` is not assigned in any cell visible here, so this cell fails
# on a fresh kernel - presumably it was built like `top5` with th=7; TODO confirm.
top7[top7['input'].str.contains('air outlet node temperature')]

Unnamed: 0,input,abbs,matches,test,truth
157,ahu extract fan air outlet node temperature,"[{AHU}, {E}, {F, FAN}, {A}, {RET, R}, {TEMP, T...","[WATERDETALM, RMTEMP, SATEMP, FADMPR, RATEMPLO...",FP,[nan]
169,ahu cooling coil air outlet node temperature,"[{OC}, {AHU}, {CL, CLG}, {TEMP, TMP, T}]","[CLGVLV_VSD_TP, CLGVLV_TP_SP, TMP, OCTEMPSPACT...",TP,[OCTEMP]


In [32]:
# Spot check: match() returns [matches, details, ABBs] for one description, here with th=7.
match('ahu cooling water inlet node mass flow rate',OPCtag,7)

[['BMFWDTMP',
  'WATERDETALM',
  'CLGVLV',
  'SS',
  'BMFLWR',
  'CLGVLV_VSD_TP',
  'CLGVLV_TP_SP'],
 [(0.1111,
   'FADMPRPURGPOS',
   ('F', 'A', 'D', 'M', 'P', 'R', 'P', 'U', 'R', 'G', 'P', 'O', 'S')),
  (0.125,
   'OCTEMPSPACT',
   ('O', 'C', 'T', 'E', 'M', 'P', 'S', 'P', 'A', 'C', 'T')),
  (0.125,
   'OCTEMPSPOFF',
   ('O', 'C', 'T', 'E', 'M', 'P', 'S', 'P', 'O', 'F', 'F')),
  (0.125,
   'VSDPURGESPD',
   ('V', 'S', 'D', 'P', 'U', 'R', 'G', 'E', 'S', 'P', 'D')),
  (0.1333, 'SASTATICSP', ('S', 'A', 'S', 'T', 'A', 'T', 'I', 'C', 'S', 'P')),
  (0.1333, 'VSD_TP_ENA', ('V', 'S', 'D', 'T', 'P', 'E', 'N', 'A')),
  (0.1429, 'OCTEMPSPB', ('O', 'C', 'T', 'E', 'M', 'P', 'S', 'P', 'B')),
  (0.1429, 'RMTEMPSPB', ('R', 'M', 'T', 'E', 'M', 'P', 'S', 'P', 'B')),
  (0.1429, 'VSDMINSPD', ('V', 'S', 'D', 'M', 'I', 'N', 'S', 'P', 'D')),
  (0.1429, 'VSD_TP_SP', ('V', 'S', 'D', 'T', 'P', 'S', 'P')),
  (0.1538, 'FANSPEED', ('F', 'A', 'N', 'S', 'P', 'E', 'E', 'D')),
  (0.1538, 'RATEMPSP', ('R', 'A', 'T', '

In [34]:
# Spot check: match() returns [matches, details, ABBs] for a second description, th=7.
match('cooling coil outlet temperature',OPCtag,7)

[['RMTEMPSP', 'OCTEMPSPB', 'STS', 'ATEMPSP', 'TMPSP', 'SATEMP', 'OCTEMP'],
 [(0.125,
   'FADMPRPURGPOS',
   ('F', 'A', 'D', 'M', 'P', 'R', 'P', 'U', 'R', 'G', 'P', 'O', 'S')),
  (0.1429,
   'VSDPURGESPD',
   ('V', 'S', 'D', 'P', 'U', 'R', 'G', 'E', 'S', 'P', 'D')),
  (0.1429,
   'WATERDETALM',
   ('W', 'A', 'T', 'E', 'R', 'D', 'E', 'T', 'A', 'L', 'M')),
  (0.1667, 'VSDMINSPD', ('V', 'S', 'D', 'M', 'I', 'N', 'S', 'P', 'D')),
  (0.1818, 'FANSPEED', ('F', 'A', 'N', 'S', 'P', 'E', 'E', 'D')),
  (0.1818, 'PMVOLTAB', ('P', 'M', 'V', 'O', 'L', 'T', 'A', 'B')),
  (0.1818, 'PMVOLTBC', ('P', 'M', 'V', 'O', 'L', 'T', 'B', 'C')),
  (0.1818, 'PMVOLTCA', ('P', 'M', 'V', 'O', 'L', 'T', 'C', 'A')),
  (0.2, 'PMKWATT', ('P', 'M', 'K', 'W', 'A', 'T', 'T')),
  (0.2, 'PMPWFTR', ('P', 'M', 'P', 'W', 'F', 'T', 'R')),
  (0.2, 'RACO2SP', ('R', 'A', 'C', 'O', '2', 'S', 'P')),
  (0.2, 'RUNTIME', ('R', 'U', 'N', 'T', 'I', 'M', 'E')),
  (0.2, 'TIMEPRG', ('T', 'I', 'M', 'E', 'P', 'R', 'G')),
  (0.2857, 'TRIP', ('T'