In [105]:
import rdflib
import pandas as pd
import json
import re
from functools import reduce
import os
from brick_parser import equipTagsetList as equip_tagsets,\
                         pointTagsetList as point_tagsets,\
                         locationTagsetList as location_tagsets

In [106]:
# Compound abbreviations and the pieces they are broken into: parse_sentence
# replaces a token equal to one of these names with the listed sub-tokens.
split_dict = dict(
    ztmp=['z', 'tmp'],
    zco=['z', 'co'],
    zco2=['z', 'co2'],
    airflow=['air', 'flow'],
    hwv=['hw', 'v'],
)

In [107]:
# Maps keywords found in raw point names to entity labels.
# NOTE(review): nothing in the visible cells reads this mapping, so how the
# keys are matched (whole token vs. substring) should be confirmed at the
# point of use.
entity_dict = {
    'vav': 'vav',
    'CMU': 'rightidentifier',
    'scsc': 'rightidentifier',
    'gates': 'building-ghc',
    'floor': 'floor',
    'fcu': 'fcu',
    'corridor': 'room-corridor',
    # Disabled entries, kept for reference:
    #'central': 'leftidentifier',
    #'west': 'leftidentifier',
    'room': 'room',
    'crac': 'crac',
    #'office': 'leftidentifier',
    # 'wystem' is a misspelling of 'system'. The legacy key is kept so any
    # existing lookup keeps working; the correctly spelled key is added so the
    # real phrase maps to the same label.
    'chilled water wystem': 'chilled_water_system',
    'chilled water system': 'chilled_water_system',
    'fan coil': 'fan_coil_unit',
}

In [108]:
def parse_sentence(sentence, splits=None):
    """Tokenise a raw string into lowercase word, number and symbol tokens.

    The input is lowercased and cut into runs of ASCII letters, runs of
    digits, and single characters of any other kind (whitespace, newlines and
    punctuation each become their own token). Every token that equals a key
    of ``splits`` is then replaced by the sub-tokens listed for that key.

    Args:
        sentence: The string to tokenise.
        splits: Optional mapping from a compound token to the list of
            sub-tokens that replace it. Defaults to the module-level
            ``split_dict``.

    Returns:
        A list of string tokens in their original order. An empty input
        yields an empty list.
    """
    if splits is None:
        splits = split_dict
    # Raw string: "\d" in a plain string literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    tokens = re.findall(r"([a-zA-Z]+|\d+|[^0-9a-z])", sentence.lower())
    parsed = []
    for token in tokens:
        # Expand every occurrence of a compound token, not only the first one.
        # The inserted pieces are not expanded again.
        parsed.extend(splits.get(token, [token]))
    return parsed

In [109]:
def _read_json(path):
    """Return the parsed contents of the JSON file at ``path``."""
    with open(path, 'r') as fp:
        return json.load(fp)


# Raw point table and the point-name -> point-type lookup.
raw_df = pd.read_csv('metadata/CMU_GHC.csv')
point_dict = _read_json('metadata/ghc_points.json')

# The label file is optional: start from an empty mapping when it is absent.
labelFilename = 'metadata/ghc_label_dict_justseparate.json'
labelListDict = _read_json(labelFilename) if os.path.isfile(labelFilename) else {}

# Equipment label -> its subclasses (used when building the ground truth).
equip_subclass_dict = _read_json('brick/equip_subclass_dict.json')

In [110]:
# Per-row accumulators keyed by srcid: token lists, character lists, and
# ground-truth labels.
sentence_dict, char_sentence_dict, truths_dict = {}, {}, {}
def lister(x):
    """Return the items of the iterable ``x`` as a new list."""
    return list(x)
def adder(x, y):
    """Return ``x + y`` (concatenation for lists and strings, sum for numbers)."""
    combined = x + y
    return combined
# For every row of the raw table, record the tokenised point name, its
# character-level form, and (for rows that have labels) the ground-truth tags.
for i, row in raw_df.iterrows():
    srcid = str(i)
    truths = list()
    bas_raw = row['bas_raw']
    # '/' separates path components in the raw name; it is turned into a
    # newline so the separator becomes its own token.
    sentence = parse_sentence(bas_raw.replace('/', '\n'))

    # Point type: look up the last path component, falling back to its last
    # two '_'-separated words.
    point_key = bas_raw.split('/')[-1].lower().replace(' ', '_')
    short_point_key = '_'.join(point_key.split('_')[-2:])
    if point_key in point_dict:
        point_type = point_dict[point_key]
    elif short_point_key in point_dict:
        point_type = point_dict[short_point_key]
    else:
        # Unknown point: report it and record no point type. Without this
        # reset the value from the previous row would be reused (or a
        # NameError raised on the very first row).
        print(point_key, '##', bas_raw)
        point_type = 'none'
    if point_type != 'none':
        truths.append(point_type)

    labels = labelListDict.get(srcid)
    if labels:
        # Equipment labels: a label counts as equipment when it has a
        # non-empty subclass entry. If another label of the same row forms a
        # known '<other>_<label>' subclass, the subclass is used instead (the
        # last matching one wins).
        for label in labels:
            subclasses = equip_subclass_dict.get(label)
            if not subclasses:
                continue
            equip = label
            for sub_label in labels:
                if sub_label == label:
                    continue
                candidate = sub_label + '_' + label
                if candidate in subclasses:
                    equip = candidate
            truths.append(equip)
        # Location labels: the part before the first '-' must be a known
        # location tagset.
        for label in labels:
            if label.split('-')[0] in location_tagsets:
                truths.append(label)
        # Only rows with labels get a ground-truth entry.
        truths_dict[srcid] = truths

    sentence_dict[srcid] = sentence
    # Flatten the tokens into a single list of characters. Unlike a reduce
    # without an initial value, this also handles an empty token list and
    # avoids repeated list concatenation.
    char_sentence_dict[srcid] = [char for token in sentence for char in token]

In [111]:
# Write the dictionaries built by the preceding cells as indented JSON files.
for out_path, payload in (
    ('metadata/ghc_sentence_dict_justseparate.json', sentence_dict),
    ('metadata/ghc_ground_truth.json', truths_dict),
    ('metadata/ghc_char_sentence_dict_justseparate.json', char_sentence_dict),
):
    with open(out_path, 'w') as fp:
        json.dump(payload, fp, indent=2)