#### packages used in the workflow

In [1]:
# to store and manipulate the data
import numpy as np
import pandas as pd
# to cluster the BMS points
from weighted_levenshtein import lev
import sklearn.cluster
from sklearn.metrics.cluster import adjusted_rand_score
# to edit and feature the strings
import re
from itertools import combinations 
from nltk import ngrams
# to interact with the EnergyPlus model
from eppy import modeleditor
from eppy.modeleditor import IDF
IDF.setiddname('C:/EnergyPlusV8-8-0/Energy+.idd')

import warnings
warnings.simplefilter(action='ignore', category=DeprecationWarning)

%matplotlib inline

# core information extraction from EnergyPlus (EP) variables

#### extracting the customized or default location names using IDF

In [3]:
# Load the EnergyPlus model (file name removed for confidentiality reasons)
idf1 = IDF('../.idf')

# IDF object classes whose object names are collected as candidate location names
className = [
    'AIRLOOPHVAC', 'AIRCONDITIONER:VARIABLEREFRIGERANTFLOW', 'ZONE', 'PLANTLOOP',
    'BOILER:HOTWATER', 'DISTRICTCOOLING', 'ZONELIST', 'ZONEHVAC:FOURPIPEFANCOIL',
]

names = []
for n in className:
    for i in idf1.idfobjects[n]:
        try:
            names.append(str.lower(i.Name))
        except:
            # when reading Name fails, the object's Heat_Pump_Name is used instead
            names.append(str.lower(i.Heat_Pump_Name))

# generic names added after the model-specific ones
names.extend(['environment', 'whole building', 'facility', 'building', 'plant', 'hvac'])

#### reading the output:meters variables from .mtr 

In [4]:
# Read the whole .mtr file; the context manager closes it even if reading fails.
with open('../.mtr','r') as EP: # file name removed for confidentiality reasons
    content = EP.read()

# the semantic rule is designed for .mtr file of EP V8.8.0 (can be incompatible for other versions) 
content = content.split('\nEnd of Data Dictionary')[0].split('\n')[6:] # just keep the data dictionary and split the samples
utility = ['Electricity','Gas','Gasoline','Diesel','Coal','FuelOil#1','FuelOil#2','Propane','OtherFuel1',
           'OtherFuel2','Water','Steam','DistrictCooling','DistrictHeating','ElectricityPurchased',
           'ElectricitySurplusSold','ElectricityNet','EnergyTransfer','Carbon Equivalent']
usage = ['InteriorLights','ExteriorLights','InteriorEquipment','ExteriorEquipment','Fans','Pumps','Heating',
         'Cooling','HeatRejection','Humidifier','HeatRecovery','DHW','Cogeneration','Refrigeration','WaterSystems',
         'HeatingCoils','CoolingCoils','Chillers','Boilers','Baseboard','HeatRecoveryForCooling','HeatRecoveryForHeating',
         'PlantLoopHeatingDemand','PlantLoopCoolingDemand']

# One record per dictionary line; the dataframe is built once at the end
# instead of being enlarged cell by cell with .loc.
records = []
for line in content:
    parts = line.split(' !')
    string = parts[0]       # "<ID>,...,<meter name> [unit]" part of the entry
    # first known location name contained in the entry; 'building' when none matches
    lowered = str.lower(string)
    location = next((s for s in names if s in lowered), 'building')
    # longest matching energy source / end use (stable sort: among equal lengths the last listed wins)
    source = sorted([s for s in utility if s in string], key=len)
    use = sorted([s for s in usage if s in string], key=len)
    measure = ''.join([test[-1] for test in [source,use] if len(test)>0])
    # insert a space before every capital letter except the first character ('InteriorLights' -> 'Interior Lights')
    for idx in range(len(measure)-1,0,-1):
        if measure[idx]>='A'and measure[idx]<='Z':
            measure = measure[:idx]+' '+measure[idx:]
    records.append({
        'ID': int(string.split(',')[0]),
        'location': location,
        'measure': str.lower(measure),
        'unit': str.lower(string[string.find('['):]),
        'tag': 'meter',
        'resolution': str.lower(parts[1].split(' ')[0]),   # first word after ' !'
    })

mtr = pd.DataFrame(records, columns = ['ID','location','measure','unit','tag','resolution']) # dataframe storing the meter variables
mtr.set_index(['ID'],inplace=True)
mtr.head()

Unnamed: 0_level_0,location,measure,unit,tag,resolution
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
62,facility,electricity,[j],meter,monthly
7356,facility,gas,[j],meter,monthly
1214,building,electricity interior equipment,[j],meter,monthly
92,building,electricity interior lights,[j],meter,monthly
1234,offices,electricity interior equipment,[j],meter,monthly


#### reading the output:variable variables from .eso 

In [7]:
# Read the whole .eso file; the context manager closes it even if reading fails.
with open('../.eso','r') as EP: # file name removed for confidentiality reasons
    content = EP.read()

# the semantic rule is designed for .eso file of EP V8.8.0 (can be incompatible for other versions)
content = content.split('\nEnd of Data Dictionary')[0].split('\n')[6:]

# One record per dictionary line; the dataframe is built once at the end
# instead of being enlarged cell by cell with .loc.
records = []
for line in content:
    parts = line.split(' !')
    elements = parts[0].split(',')
    variable_id = int(elements[0])
    if variable_id in mtr.index:  # skip the meter variables already processed in the last section
        continue
    key, variable = elements[-2], elements[-1]
    # longest known location name contained in the key (stable sort: last listed wins on ties)
    ls = [s for s in names if s in str.lower(key)]
    ls.sort(key = lambda s: len(s))
    if not ls:
        # previously surfaced as a bare "list index out of range"
        raise ValueError('no known location name found in .eso entry: %r' % line)
    location = ls[-1]
    bracket = variable.find('[')
    variable_name = str.lower(variable[:bracket-1])   # text before " [unit]"
    if len(location) == len(key):
        # the key is exactly the location name
        measure = variable_name
    else:
        # keep the rest of the key (e.g. a node or component name) as part of the measure
        measure = str.lower(key).replace(location+' ','') +' '+variable_name
    records.append({
        'ID': variable_id,
        'location': location,
        'measure': measure.replace('system node ',''),
        'unit': str.lower(variable[bracket:]),
        'tag': 'variable',
        'resolution': str.lower(parts[1].split(' ')[0]),   # first word after ' !'
    })

eso = pd.DataFrame(records, columns = ['ID','location','measure','unit','tag','resolution'])
eso.set_index(['ID'],inplace=True)
eso.head()

Unnamed: 0_level_0,location,measure,unit,tag,resolution
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6,environment,site outdoor air drybulb temperature,[c],variable,monthly
7,environment,site outdoor air dewpoint temperature,[c],variable,monthly
8,environment,site outdoor air humidity ratio,[kgwater/kgdryair],variable,monthly
9,environment,site outdoor air relative humidity,[%],variable,monthly
10,environment,site outdoor air barometric pressure,[pa],variable,monthly


In [8]:
# concatenate the meters and variables to get all the EP variables;
# drop_duplicates compares the column values only (the ID index is not part of the comparison)
inEP = pd.concat([mtr,eso]).drop_duplicates().sort_index()
len(inEP)

4068

# extracting core information from BMS point tags

#### initializing and preliminary trimming (the trimmed tags still contain location information that requires clustering to filter out)

In [10]:
raw = pd.read_csv('../.csv').set_index(['SN']) # file name removed for confidentiality reasons
# simply trimming off the irrelevant parts: keep the 5th '/'-separated field up to its first '.'
raw['tag'] = [path.split('/')[4].split('.')[0] for path in raw['OPC Tag']]
OPC = raw.copy()
OPC[['tag','Description']].head() # raw tags with information about the building are hidden for confidentiality reasons

Unnamed: 0_level_0,tag,Description
SN,Unnamed: 1_level_1,Unnamed: 2_level_1
1,AV1_3CLGVLV_TP_SP,Time Program Control Set Point
2,AV1_3VSD_TP_SP,Time Program Control Set Point
3,AV1_3CLGVLV_TP_ENA,Time Program Control
4,AV1_3CLGVLV_VSD_TP,Time Program
5,AV1_2FANSPEED,Fan Speed Monitoring


#### generating the ground truth of core information extraction

In [16]:
def locIndex(s): # manually identify the position of the core information in the string (just for this dataset)
    """Return the index in ``s`` where the core (measure) part of the tag starts.

    The tag is cut at every digit and underscore. The piece that precedes the
    first separator is ignored; among the remaining pieces the longest one
    (the first one on ties) is taken as the core information and its start
    position is returned.

    The position is taken from the piece itself rather than by searching for
    its text, so regex metacharacters in the tag are harmless and an identical
    substring earlier in the tag cannot be picked by mistake.

    Returns 0 when no such piece exists (no separator at all, or nothing but
    separators after the leading piece), i.e. the whole tag is the measure.
    """
    # runs of non-separator characters; a run starting at 0 is the leading piece
    candidates = [run for run in re.finditer('[^0123456789_]+', s) if run.start() > 0]
    if not candidates:
        return 0
    longest = max(candidates, key=lambda run: run.end() - run.start())  # first longest wins
    return longest.start()

# split every tag at the position given by locIndex: prefix -> 'loc', remainder -> 'measure'
OPC['loc'] = [tag[:locIndex(tag)] for tag in OPC['tag']]
OPC['measure'] = [tag[locIndex(tag):] for tag in OPC['tag']]
# the label of a measure is its rank of first appearance
unique = OPC['measure'].unique().tolist()
OPC['label'] = OPC['measure'].map(unique.index)
OPC.iloc[:,3:].tail() # the measure and label column be used to evaluate the results of the clustering based extraction

Unnamed: 0_level_0,tag,loc,measure,label
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
811,VPWQ1-6PMKWHR,VPWQ1-6,PMKWHR,65
812,VLTQ1-7PMCURRC,VLTQ1-7,PMCURRC,53
813,VLTQ1-1PMKWATT,VLTQ1-1,PMKWATT,59
814,VPWQ1-8PMVOLTBC,VPWQ1-8,PMVOLTBC,55
815,VPWQ1-12PMVOLTAB,VPWQ1-12,PMVOLTAB,54


#### vectorization of the BMS points

In [17]:
# Weighting matrix for the weighted Levenshtein distance: every substitution
# costs 1, except digit-for-digit substitutions which cost 0.25
# (a difference in digits is more likely to be relevant to the sensor location
# than to the sensor type and is therefore less important)
substitute_costs = np.ones((128, 128), dtype=np.float64)
first_digit, last_digit = ord('0'), ord('9')   # the ten digits are contiguous character codes
substitute_costs[first_digit:last_digit + 1, first_digit:last_digit + 1] = 0.25

In [18]:
OPCtag = raw.copy()
# NOTE: this rebinds `names` -- until here it held the list of EnergyPlus location names,
# from here on it is the array of BMS tags
names = np.asarray(OPCtag['tag']) # transform the tags into a array to apply the package for distance calculation 
# square matrix of pairwise weighted Levenshtein distances between all tags, multiplied by -1
weighted_lev = -1*np.array([[lev(w1,w2,substitute_costs=substitute_costs) for w1 in names] for w2 in names])
weighted_lev

array([[ -0.  ,  -5.  ,  -3.  , ..., -15.25, -15.  , -15.25],
       [ -5.  ,  -0.  ,  -8.  , ..., -13.25, -14.25, -14.25],
       [ -3.  ,  -8.  ,  -0.  , ..., -16.  , -16.  , -16.25],
       ...,
       [-15.25, -13.25, -16.  , ...,  -0.  ,  -7.25,  -8.  ],
       [-15.  , -14.25, -16.  , ...,  -7.25,  -0.  ,  -3.25],
       [-15.25, -14.25, -16.25, ...,  -8.  ,  -3.25,  -0.  ]])

#### clustering based extraction

In [19]:
# the function to extract the common substring for strings in one cluster 
def all_substr(data):
    """Greedily collect the substrings of ``data[0]`` that occur in every string of ``data``.

    Scanning ``data[0]`` from left to right, the longest substring starting at
    the current position that is contained in all strings is recorded and the
    scan resumes right after it; when nothing matches, the scan advances by one
    character. The substrings are returned in order of appearance.
    """
    reference = data[0]
    common = []
    start = 0
    while start < len(reference):
        remaining = len(reference) - start
        # widest window at `start` shared by all strings, 0 when there is none
        width = next(
            (w for w in range(remaining + 1, 0, -1)
             if all(reference[start:start + w] in other for other in data)),
            0,
        )
        if width:
            common.append(reference[start:start + width])
            start += width
        else:
            start += 1
    return common

In [20]:
# DBSCAN clustering on the vectorized BMS points, parameter defined intuitively
# NOTE(review): no `metric` argument is passed, so each row of weighted_lev is used as a
# feature vector rather than as precomputed distances -- confirm this is intended
density = sklearn.cluster.DBSCAN(eps=11, min_samples=2).fit(weighted_lev)
# store the clustering result in the dictionary "clusters" (cluster id -> array of member tags)
clusters = {cluster_id: names[density.labels_ == cluster_id]
            for cluster_id in np.unique(density.labels_)}

# extract the representative substring from each cluster and store in the dictionary "extraction" 
# (the last common substring; raises IndexError when the members of a cluster share no substring)
extraction = {cluster_id: all_substr(list(members))[-1]
              for cluster_id, members in clusters.items()}

# store the results in the dataframe
# (errors='ignore' keeps the cell re-runnable once 'Description' has already been dropped)
OPCtag = OPCtag.drop(columns=['Description'], errors='ignore')
OPCtag['label'] = density.labels_
OPCtag['measure'] = OPCtag['label'].map(extraction)
# the location is everything in front of the first occurrence of the extracted measure;
# computed row-wise so it does not depend on the SN index being exactly 1..n
OPCtag['loc'] = [tag[:tag.index(measure)] for tag, measure in zip(OPCtag['tag'], OPCtag['measure'])]

# Tokenization of BMS points using the X-gram algorithm

In [23]:
# function to create all the possible ways to clip the point tags
def razors(string):
    """Enumerate cut masks for ``string``.

    A mask has one entry per gap between two neighbouring characters
    (``len(string) - 1`` entries): 1 means "cut here", 0 means "keep joined".
    Masks are generated with 0, 1, ... up to 6 joined gaps (never more than the
    number of gaps), in the order produced by ``itertools.combinations``.
    """
    gaps = len(string) - 1
    masks = []
    for n_joined in range(min(gaps + 1, 7)):
        for joined in combinations(range(gaps), n_joined):
            masks.append([0 if pos in joined else 1 for pos in range(gaps)])
    return masks
# function to generate a list of X-grams of the BMS tag
def Xgrams(string):
    """Return every segmentation of ``string`` described by the masks of ``razors``.

    Spaces, underscores, commas and colons are removed first. For each mask the
    string is cut after every position flagged with a truthy value; the pieces
    of one segmentation are ordered longest first (stable for equal lengths).
    """
    string = re.sub('[ _,:]', '', string)
    segmentations = []
    for mask in razors(string):
        # cut positions, framed by the start and the end of the string
        bounds = [0] + [pos + 1 for pos, is_cut in enumerate(mask) if is_cut] + [len(string)]
        pieces = [string[left:right] for left, right in zip(bounds, bounds[1:])]
        # so that the longer substrings get matched first in the next step
        segmentations.append(sorted(pieces, key=len, reverse=True))
    return segmentations

# all candidate segmentations of every extracted measure
OPCtag['Xgrams'] = OPCtag['measure'].map(Xgrams)
OPCtag.iloc[:,2:].head()

Unnamed: 0_level_0,tag,label,measure,loc,Xgrams
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,AV1_3CLGVLV_TP_SP,0,CLGVLV_TP_SP,AV1_3,"[[C, L, G, V, L, V, T, P, S, P], [CL, G, V, L,..."
2,AV1_3VSD_TP_SP,1,VSD_TP_SP,AV1_3,"[[V, S, D, T, P, S, P], [VS, D, T, P, S, P], [..."
3,AV1_3CLGVLV_TP_ENA,2,CLGVLV_TP_ENA,AV1_3,"[[C, L, G, V, L, V, T, P, E, N, A], [CL, G, V,..."
4,AV1_3CLGVLV_VSD_TP,3,CLGVLV_VSD_TP,AV1_3,"[[C, L, G, V, L, V, V, S, D, T, P], [CL, G, V,..."
5,AV1_2FANSPEED,4,FANSPEED,AV1_2,"[[F, A, N, S, P, E, E, D], [FA, N, S, P, E, E,..."


# Defining the dictionary used to tokenize the EnergyPlus variables 

In [26]:
# function to add abbreviations into the dictionary
def updateABB(abb,full):
    """Add abbreviations to the global dictionary ``ABBdict``.

    ``abb`` and ``full`` are parallel lists: ``abb[k]`` is an abbreviation of
    the name ``full[k]``. A name may appear several times in ``full``; every
    one of its abbreviations is added. Names already present in ``ABBdict``
    get the abbreviation added to their existing set; unknown names are
    appended as new rows, in order of first appearance. Surplus entries of the
    longer list are ignored.

    Side effect: mutates the sets stored in ``ABBdict['abb']`` and, when new
    names are added, rebinds the global ``ABBdict`` to the extended dataframe.
    """
    global ABBdict
    # name -> the set object stored in the dataframe, so .add() updates the frame in place
    existing = dict(zip(ABBdict['name'], ABBdict['abb']))
    added = {}
    for short, name in zip(abb, full):
        if name in existing:
            existing[name].add(short)
        else:
            added.setdefault(name, set()).add(short)
    if added:
        # one concat for all new names (DataFrame.append no longer exists in pandas >= 2.0)
        new_rows = pd.DataFrame({'abb': list(added.values()), 'name': list(added.keys())})
        ABBdict = pd.concat([ABBdict, new_rows], ignore_index=True)

In [28]:
# initialize the dictionary using two abbreviation datasets grabbed from websites
dict1 = pd.read_csv('../HVACacronym.csv')
dict2 = pd.read_csv('../HVACacronym1.csv')

# lower-case the full names (column 'full' in the first file, 'name' in the second)
dict1['full']=dict1['full'].apply(lambda x: str.lower(x))
dict2['name']=dict2['name'].apply(lambda x: str.lower(x))

# stack both tables under a common 'name' column
ABBdict = pd.concat([dict1.rename({'full':'name'}, axis='columns'),dict2])
# wrap every abbreviation in a one-element list so that the group-wise sum below concatenates them
ABBdict['abb']= ABBdict[['abb']].values.tolist()
ABBdict = pd.DataFrame(ABBdict.groupby('name')['abb'].sum())
# the group key became the index; copy it back into a regular column
ABBdict['name']=ABBdict.index
ABBdict.rename(index=str, columns={0: "abb"},inplace=True)
# one set of abbreviations per name
ABBdict['abb']= ABBdict['abb'].apply(lambda x: set(x))

ABBdict.reset_index(drop=True,inplace=True)

In [29]:
# update the dictionary several times according to the case studies 
# (abbreviations for the same word cannot be added at the same time)
# (the dictionary will get more comprehensive over time)
# In every call below the two lists are parallel: the k-th abbreviation is meant for the k-th name.
abb = ['OC','TMP','TEMP','SA','STATIC','RA','RH','RM','BM','FLWR','RET','KWATT','KWHR','PM','ENGRY','FWD','PWR','AHU','FAN','SPEED','BYP','CLG','WATER','VLV','TP','SMK','FA','PM','BM','CURR','VOLT','PW']
full=['off coil','temperature','temperature','supply air','pressure','return air','relative humidity','room','btu meter','flow rate','return','power','energy','power meter','energy','supply','power','ahu','fan','speed','bypass','cooling','water','valve','time program','smoke','fresh air','power meter','btu meter','current','voltage','power']
updateABB(abb,full)

EPname = ['zone','air','energy','heating','cooling','air terminal','pump','boiler','condenser','setpoint','heat exchanger','humidifier','heater','water','people','window','infiltration','equipment','outdoor','drybulb','dewpoint','speed','angle','radiation','pressure','precipitation','occupant','lights','humidity ratio','facility','mass flow rate','interior','gas','office','room','level','extract','fan','supply','air loop','site','power','radiant','fan coil unit','indoor','heat pump','vav','vrf','return','inlet','outlet','relief air','rate']
EPabb = ['RM','A','KWHR','HT','CL','VAV','PMP','B','COND','SP','HX','HUMID','HTR','WTR','PPL','WD','INFIL','EQUIP','O','DB','DP','SPD','AGL','R','PR','P','OCC','LT','HR','FAC','FLWR','INTR','G','OFC','RM','LV','E','F','S','AHU','LOC','KWATT','R','FCU','I','CU','VAV','VRF','R','S','R','EA','KW']
updateABB(EPabb,EPname)

NODEname = ['supply side inlet','supply side outlet','coil air outlet','mixed inlet','splitter outlet','coil outlet','inlet','outlet','supply inlet']
NODEabb = ['R','S','OC','R','S','S','FWD','RET','R']
updateABB(NODEabb,NODEname)

testName = ['electricity','energy','power','demand','rate','coil air outlet','coil outlet']
testABB = ['KWHR','ENGRY','PWR','ENGRY','PWR','S','OC']
updateABB(testABB,testName)

iwName = ['dewpoint','relative humidity','humidity ratio','wind','direction','global','solar','zone','electricity','water','mass flow rate']
iwABB = ['DWP','HM','HM','W','DIREC','GLO','S','R','TOTKW','W','FL']
updateABB(iwABB,iwName)

# second batch under the same variable names (iwName/iwABB are overwritten)
iwName = ['humidity ratio','relative humidity','mass flow rate']
iwABB = ['HUM','HUM','FLO']
updateABB(iwABB,iwName)

sdeName = ['temperature','dewpoint','relative humidity','humidity ratio','pressure','barometric','wind','direction','solar','radiation','precipitation','electricity','energy','purchased','net','produced']
sdeABB = ['TEMPERATURE','DEW','HUMIDITY','HUMIDITY','PRESSURE','BAROMETRIC','WIND','DIRECTION','SOLAR','RADIATION','PRECIPITATION','POWER','ENERGY','RECEIVED','BALANCE','DELIVERED']
updateABB(sdeABB,sdeName)

# second batch under the same variable names (sdeABB/sdeName are overwritten)
sdeABB = ['KWH','SP']
sdeName = ['electricity','pressure']
updateABB(sdeABB,sdeName)

CSLabb = ['DEWPOINT','H','H','OCCUPANCY','USAGE','SUPPLY','CFM','STPT','ZN']
CSLname = ['dewpoint','humidity ratio','relative humidity','occupant','energy','inlet','flow rate','setpoint','zone']
updateABB(CSLabb,CSLname)

# second batch under the same variable names (CSLabb/CSLname are overwritten)
CSLabb = ['KCFM','SETPOINT','RETURN','RETURN','ZONE','SUPPLY','RETURN']
CSLname = ['flow rate','setpoint','supply inlet','outlet','zone','supply side outlet','return']
updateABB(CSLabb,CSLname)

CSL3abb = ['FLOW','RETURN']
CSL3name = ['flow rate','supply side inlet']
updateABB(CSL3abb,CSL3name)

# Fuzzy string matching based on the tokenization results

In [46]:
# the main function to propose the top mapping recommendations for the EnergyPlus variable
def match(s,OPCtag,th): # s: the EP variable to map to; OPCtag: the list of BMS point tags; th: the number of mappings given
    """Propose the best-matching BMS measures for one EnergyPlus variable.

    Parameters
    ----------
    s : str
        EnergyPlus variable name, words separated by single spaces.
    OPCtag : pandas.DataFrame
        BMS points with a 'measure' column (str) and an 'Xgrams' column
        (list of segmentations, each a list of substrings).
    th : int
        Number of top matches to return; values <= 0 return no match.

    Returns
    -------
    list
        ``[matches, details, ABBs]``: ``matches`` holds the measures of the
        ``th`` best entries (best last); ``details`` is the ascending list of
        unique ``(ratio, measure, segmentation)`` tuples; ``ABBs`` is the list
        of abbreviation sets found in the global ``ABBdict`` for the variable.
    """
    # tokenize the EP variable by looking up the words and phrases in the dictionary
    # (3-word phrases first, then 2-word phrases, then single words)
    words = s.split(' ')
    ngram = []
    for size in range(3,0,-1):
        for gram in ngrams(words,size):
            ngram.append(' '.join(gram))

    ABBs = []
    while ngram:
        phrase = ngram.pop(0)
        hits = ABBdict.loc[ABBdict['name'] == phrase,'abb'].values
        if len(hits) == 0:
            continue    # phrase is not in the dictionary
        ABBs.append(hits[0])
        # drop the pending candidates that are contained in the phrase just matched
        ngram = [other for other in ngram if other not in phrase]

    # go through the BMS points and evaluate the similarity by matched ratio
    details = []
    for i in OPCtag.index:
        segmentations = OPCtag.loc[i,'Xgrams']
        measure = OPCtag.loc[i,'measure']
        idx = 0 # initialize the index of the X-gram segmentation
        Nmatched = 0 # initialize the max number of matched characters of a BMS point
        r = 0 # initialize the matched ratio
        for j, segmentation in enumerate(segmentations):
            used = [0]*len(ABBs)    # so that a set of ABBs is only matched once
            count = 0
            for gram in segmentation:
                for c, candidates in enumerate(ABBs):
                    if used[c] == 0 and gram in candidates:
                        used[c] = 1
                        count += len(gram)
                        break
            if count > Nmatched: # count > 0 here, so sum(used) > 0: no division by zero
                Nmatched = count
                # harmonic average of ratio on both side
                r = 2/(len(ABBs)/sum(used) + len(measure)/Nmatched)
                idx = j
        if r >= .01:
            details.append((round(r,4),measure,tuple(segmentations[idx])))

    details = sorted(set(details))  # unique entries, ranked by the matched ratio
    # a plain [-th:] would return everything for th == 0
    matches = [entry[1] for entry in details][-th:] if th > 0 else []

    return [matches,details,ABBs]

In [52]:
# function to evaluate the mapping result
def check(result,th):
    """Label every row of ``result`` with the outcome of the mapping, in place.

    ``result`` needs a 'matches' column (list of proposed BMS measures, best
    last) and a 'truth' column (list of accepted measures; a NaN entry, an
    empty list or a scalar NaN/None means "this variable has no match").
    Only the last ``th`` proposals are compared with the truth.

    The column 'test' is written for every row:
      'TN'  - the variable doesn't have a match and no BMS point was mapped
      'FN'  - the variable has a match but BMS points weren't mapped
      'FP'  - the variable doesn't have a match but BMS points were wrongly mapped
      'TP'  - the variable has a match and BMS points were correctly mapped
      'FP1' - the variable has a match but BMS points were wrongly mapped

    Returns the same dataframe object that was passed in.
    """
    def _is_missing(value):
        # NaN is detected by value (NaN != NaN), not by identity with np.nan
        return value is None or (isinstance(value, float) and value != value)

    for i in result.index:
        matches = result.loc[i,'matches']
        truth = result.loc[i,'truth']
        if not isinstance(truth, (list, tuple, set)):
            truth = [truth]     # scalar cell, e.g. NaN for an input without ground-truth row
        no_truth = len(truth) == 0 or any(_is_missing(t) for t in truth)
        if len(matches) == 0:
            label = 'TN' if no_truth else 'FN'
        elif no_truth:
            label = 'FP'
        elif any(t in matches[-th:] for t in truth):
            label = 'TP'
        else:
            label = 'FP1'
        result.loc[i,'test'] = label
    return result

In [35]:
# read the ground truth from the manual mapping result
truth = pd.read_csv('../.csv') # file name removed for confidentiality reasons
# every row gets the list of accepted BMS measures: the main one, plus the alternative when there is one
# (pd.isna detects a missing alternative by value; the column is built in one assignment
#  instead of writing a list into single cells through .loc)
truth['truth'] = [
    [main] if pd.isna(alternative) else [main, alternative]
    for main, alternative in zip(truth['truth'], truth['alter'])
]
truth = truth.drop(columns=['alter'])
# truth

In [47]:
# store the mapping results in a dataframe 
# (rows are collected first and the dataframe is built once, instead of enlarging it
#  cell by cell with .loc and assigning list values to single cells)
ls = inEP['measure'].unique()
rows = []
for variable in ls:
    test = match(variable,OPCtag,5)     # [matches, details, ABBs]
    rows.append({'input': variable, 'abbs': test[2], 'matches': test[0]})
top5 = pd.DataFrame(rows, columns = ['input','abbs','matches'])

In [55]:
# attach the ground truth to every EP variable (matched on the 'input' column)
top5 = top5.join(truth.set_index('input'), on='input')
# check() writes the 'test' column into top5 itself and returns the same object,
# so t5 is another name for top5, not a copy
t5 = check(top5,5)
t5['test'].value_counts()

FP     410
TP      49
TN       4
FP1      3
Name: test, dtype: int64

### the three errors were caused by wrong measure extraction ("ATEMPSP" was extracted instead of "SATEMPSP")

In [49]:
# rows labelled 'FP1': the variable has a ground-truth match, but it is not among the proposed BMS points
top5[top5['test']=='FP1']

Unnamed: 0,input,abbs,matches,truth,test
94,demand side inlet 1 setpoint temperature,"[{ENGRY}, {FWD, S}, {SP}, {T, TEMP, TMP}]","[RATEMPSP, RMTEMPSP, SATEMP, ATEMPSP, TMPSP]",[SATEMPSP],FP1
100,supply side outlet 1 setpoint temperature,"[{S}, {SP}, {T, TEMP, TMP}]","[RATEMPSP, RMTEMPSP, SATEMP, ATEMPSP, TMPSP]",[SATEMPSP],FP1
148,supply side outlet setpoint temperature,"[{S}, {SP}, {T, TEMP, TMP}]","[RATEMPSP, RMTEMPSP, SATEMP, ATEMPSP, TMPSP]",[SATEMPSP],FP1


In [51]:
# numbers copied by hand from the value_counts output above: 49 TP out of 52 (= 49 TP + 3 FP1);
# they must be updated manually whenever the evaluation is re-run with different results
ACC_top5 = 49/52
ACC_top5

0.9423076923076923

In [57]:
# check the top-1 accuracy
# NOTE: check() relabels top5 in place and returns it, so after this call t1, t5 and top5
# are the same object and all carry the top-1 labels
t1 = check(top5,1)
t1['test'].value_counts()

FP     410
TP      27
FP1     25
TN       4
Name: test, dtype: int64

# implement the baseline method for comparison

#### tokenizing BMS points using N-gram

In [59]:
# character N-grams of every measure, for N = 1 .. 6 (capped by the measure length),
# listed by increasing N and, within one N, from left to right
OPCtag['ngram'] = [
    [''.join(gram) for size in range(1,min(7,len(measure)+1)) for gram in ngrams(measure,size)]
    for measure in OPCtag['measure']
]
OPCtag.iloc[:,2:].tail()

Unnamed: 0_level_0,tag,label,measure,loc,Xgrams,ngram
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
811,VPWQ1-6PMKWHR,88,PMKWHR,VPWQ1-6,"[[P, M, K, W, H, R], [PM, K, W, H, R], [MK, P,...","[P, M, K, W, H, R, PM, MK, KW, WH, HR, PMK, MK..."
812,VLTQ1-7PMCURRC,79,C,VLTQ1-7PM,[[C]],[C]
813,VLTQ1-1PMKWATT,104,PMKWATT,VLTQ1-1,"[[P, M, K, W, A, T, T], [PM, K, W, A, T, T], [...","[P, M, K, W, A, T, T, PM, MK, KW, WA, AT, TT, ..."
814,VPWQ1-8PMVOLTBC,89,PMVOLTBC,VPWQ1-8,"[[P, M, V, O, L, T, B, C], [PM, V, O, L, T, B,...","[P, M, V, O, L, T, B, C, PM, MV, VO, OL, LT, T..."
815,VPWQ1-12PMVOLTAB,80,PMVOLTAB,VPWQ1-12,"[[P, M, V, O, L, T, A, B], [PM, V, O, L, T, A,...","[P, M, V, O, L, T, A, B, PM, MV, VO, OL, LT, T..."


#### Fuzzy string matching based on N-gram and Jaccard similarity (identical to the function *match* except for the similarity criterion)

In [61]:
def onNgram(s,OPCtag,th):
    """Baseline matcher: rank BMS measures by Jaccard similarity on character N-grams.

    Parameters
    ----------
    s : str
        EnergyPlus variable name, words separated by single spaces.
    OPCtag : pandas.DataFrame
        BMS points with a 'measure' column (str) and an 'ngram' column
        (list of character N-grams of the measure).
    th : int
        Number of top matches to return; values <= 0 return no match.

    Returns
    -------
    list
        ``[matches, detail, ABBs]``: ``matches`` holds the measures of the
        ``th`` best entries (best last); ``detail`` is the ascending list of
        unique ``(similarity, measure)`` tuples; ``ABBs`` is the list of
        abbreviation sets found in the global ``ABBdict`` for the variable.
    """
    # tokenize the EP variable by looking up the words and phrases in the dictionary
    # (3-word phrases first, then 2-word phrases, then single words)
    words = s.split(' ')
    ngram = []
    for size in range(3,0,-1):
        for gram in ngrams(words,size):
            ngram.append(' '.join(gram))

    ABBs = []
    while ngram:
        phrase = ngram.pop(0)
        hits = ABBdict.loc[ABBdict['name'] == phrase,'abb'].values
        if len(hits) == 0:
            continue    # phrase is not in the dictionary
        ABBs.append(hits[0])
        # drop the pending candidates that are contained in the phrase just matched
        ngram = [other for other in ngram if other not in phrase]

    detail = []
    for i in OPCtag.index:
        point_grams = OPCtag.loc[i,'ngram']
        used = [0]*len(ABBs)    # so that a set of ABBs is only matched once
        count = 0
        for gram in point_grams:
            for c, candidates in enumerate(ABBs):
                if used[c] == 0 and gram in candidates:
                    used[c] = 1
                    count += 1
                    break
        if count >= 1:
            # Jaccard similarity
            r = count/(len(ABBs)+len(point_grams)-count)
            detail.append((round(r,4),OPCtag.loc[i,'measure']))

    detail = sorted(set(detail))    # unique entries, ranked by similarity
    # a plain [-th:] would return everything for th == 0
    matches = [entry[1] for entry in detail][-th:] if th > 0 else []

    return [matches,detail,ABBs]

In [62]:
# store the baseline mapping results in a dataframe
# (rows are collected first and the dataframe is built once, instead of enlarging it
#  cell by cell with .loc and assigning list values to single cells)
ls = inEP['measure'].unique()
rows = []
for variable in ls:
    test = onNgram(variable,OPCtag,5)   # [matches, detail, ABBs]
    rows.append({'input': variable, 'abbs': test[2], 'matches': test[0]})
top5 = pd.DataFrame(rows, columns = ['input','abbs','matches'])

In [63]:
# attach the ground truth to the baseline results (matched on the 'input' column)
top5 = top5.join(truth.set_index('input'), on='input')
# check() labels top5 in place and returns the same object, so t5 is another name for top5
t5 = check(top5,5)
t5['test'].value_counts()

FP     410
FP1     27
TP      25
TN       4
Name: test, dtype: int64

In [65]:
# top-1 evaluation of the baseline; this relabels top5 (and therefore t5) in place
t1 = check(top5,1)
t1['test'].value_counts()

FP     410
FP1     39
TP      13
TN       4
Name: test, dtype: int64