# Goal: end to end inference and evaluation

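Given a spreadsheet of tag records (`test.xlsx`), make predictions, evaluate them against the labelled `thing` / `property` columns, and write the results out to Parquet and Excel files.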

In [1]:
import pandas as pd

data_path = "../make_data/test.xlsx"
# Ensure to include 'ships_idx' in the fields list
fields = ['ships_idx', 'tag_name', 'tag_description', 'thing', 'property']

# Load the dataset.
# `read_excel` takes no `encoding` argument (an .xlsx workbook is a zipped XML
# container, not a text file), so no encoding retry is attempted here.
df = pd.read_excel(data_path, usecols=fields)


# Row dropping is left disabled. NOTE(review): a later cell re-reads the same
# file and joins the predictions to it side by side, so the row count
# presumably has to stay unchanged - confirm before re-enabling.
# df = df.dropna().reset_index(drop=True)
selected_columns = ['thing', 'property', 'tag_description']
df[selected_columns] = df[selected_columns].astype("string")
df['ships_idx'] = df['ships_idx'].astype("Int64")
# Missing cells become empty strings so later string formatting never sees NA.
df = df.fillna('')

In [2]:
# Number of rows loaded from the test workbook.
len(df)

59091

# alternate slower tag_description processing strategy

In [3]:
import tqdm as tqdm

In [4]:
# Plain Python list of the descriptions; later cells overwrite entries of this
# list by position when inserting <attn> markers.
tag_desc= df['tag_description'].to_list()

In [5]:
def find_matching_strings(strings, diff_count):
    """Group equal-length strings that differ in only a few character positions.

    The list is scanned in order. Each string that has not yet been claimed by
    a family becomes an "anchor"; every later, unclaimed string of the same
    length that differs from the anchor in between 1 and ``diff_count``
    positions (inclusive) joins the anchor's family.

    Claiming is done by string *value*: once a string is in a family, every
    later occurrence of the same text is skipped, so exact duplicates of a
    family member are not added to any family.

    Args:
        strings: sequence of strings to compare.
        diff_count: maximum number of differing positions for a match.

    Returns:
        A list of families in anchor order. Each family is a list of
        ``(string, index)`` tuples, anchor first, then its matches in
        ascending index order. Strings without any match are omitted.
    """
    # Import the progress bar locally under its own name: a later cell runs
    # `from tqdm import tqdm`, which rebinds the global `tqdm` from the module
    # to the class and would make `tqdm.tqdm(...)` fail when this is re-run.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable):
            return iterable

    # Only equal-length strings can match, so bucket the indices by length
    # once and remember each index's position inside its bucket.
    by_length = {}
    rank = []
    for idx, text in enumerate(strings):
        bucket = by_length.setdefault(len(text), [])
        rank.append(len(bucket))
        bucket.append(idx)

    families = []
    claimed = set()
    for i in progress(range(len(strings))):
        anchor = strings[i]
        if anchor in claimed:
            continue
        family = []
        # Candidates: same-length strings that come after the anchor.
        for j in by_length[len(anchor)][rank[i] + 1:]:
            candidate = strings[j]
            if candidate in claimed:
                continue

            # Count differing positions, giving up once the limit is exceeded.
            mismatch_count = 0
            for left, right in zip(anchor, candidate):
                if left != right:
                    mismatch_count += 1
                    if mismatch_count > diff_count:
                        break
            if not 0 < mismatch_count <= diff_count:
                continue

            if not family:
                # First match for this anchor: the anchor itself joins too.
                family.append((anchor, i))
                claimed.add(anchor)
            family.append((candidate, j))
            claimed.add(candidate)

        if family:
            families.append(family)

    return families

In [6]:
# Group descriptions of equal length that differ in at most 2 character positions.
family_list = find_matching_strings(tag_desc, 2)

  0%|          | 31/59091 [00:00<03:19, 296.38it/s]

100%|██████████| 59091/59091 [00:56<00:00, 1052.72it/s]


In [7]:
import pickle

# Cache the families on disk so the matching pass does not have to be repeated.
# Mode 'wb' creates the file, or overwrites it if it already exists.
with open('family_list.pkl', 'wb') as pickle_out:
    pickle_out.write(pickle.dumps(family_list))

In [5]:
# Open the file in binary mode 
import pickle
# Restore the cached families. Only unpickle files this notebook wrote itself:
# unpickling data from an untrusted source can execute arbitrary code.
with open('family_list.pkl', 'rb') as pickle_in:
    family_list = pickle.loads(pickle_in.read())

In [6]:
# Number of families (groups of near-identical descriptions) found.
len(family_list)

7700

In [7]:
def find_diff_char(strings):
    """Return the character positions at which any member differs from the first.

    Args:
        strings: non-empty sequence of entries whose element 0 is the text
            (e.g. ``(text, index)`` tuples). The first entry's text is the
            reference; every text is indexed over the reference's length.

    Returns:
        A set of integer positions where at least one text differs from the
        reference text.
    """
    baseline = strings[0][0]
    return {
        pos
        for entry in strings
        for pos in range(len(baseline))
        if baseline[pos] != entry[0][pos]
    }


In [8]:
# function to insert <attn>
def mutate_list(tag_desc, family):
    """Wrap each differing character of a family in ``<attn>`` markers, in place.

    For every position at which the family members differ, the character at
    that position is rewritten as ``<attn>c<attn>`` in all members.

    Args:
        tag_desc: list of descriptions; the entry at each member's index is
            overwritten with the marked-up text.
        family: list of ``(text, index)`` entries; each entry is replaced by
            ``(marked_text, index)``.

    Returns:
        None. Both arguments are modified in place.
    """
    diff_total = len(find_diff_char(family))
    for nth in range(diff_total):
        # Every insertion lengthens the texts, so the differing positions are
        # recomputed on the current texts and the nth one (in ascending
        # order) is the next character to wrap.
        position = sorted(find_diff_char(family))[nth]
        for slot, entry in enumerate(family):
            text = entry[0]
            row = entry[1]
            marked = text[:position] + "<attn>" + text[position] + "<attn>" + text[position + 1:]
            tag_desc[row] = marked
            # Keep the family in sync so the next pass sees the new offsets.
            family[slot] = (marked, row)


    # for tuple in family:
    #     print(tag_desc[tuple[1]])



In [9]:
# check if it works
# mutate_list(tag_desc, family_list[0])
# family_list[0]

In [10]:
# Insert the <attn> markers for every family. mutate_list edits both tag_desc
# and the family entries in place, so this cell is not idempotent: running it
# a second time would wrap the already-marked characters again.
for family in family_list:
    mutate_list(tag_desc, family)

In [20]:
# Spot-check one family after marker insertion.
family_list[200]

[('<attn>E<attn><attn>C<attn><attn>S<attn> ABNORMAL', 847),
 ('<attn>P<attn><attn>M<attn><attn>S<attn> ABNORMAL', 4181),
 ('<attn>E<attn><attn>/<attn><attn>G<attn> ABNORMAL', 57840)]

In [22]:
# Spot-check a second family after marker insertion.
family_list[116]

[('M/E COMMON ALM FROM EICU-<attn>A<attn> ', 456),
 ('M/E COMMON ALM FROM EICU-<attn>B<attn> ', 457)]

In [15]:
# save result back to df
# `assign` attaches the list positionally (a length mismatch raises) and
# overwrites the column if it already exists, so re-running this cell neither
# appends a duplicate column nor depends on index alignment.
df = df.assign(
    tag_description_with_tokens=pd.array(tag_desc, dtype='string')
)

In [16]:
# construct dataset
from datasets import Dataset

def process_df(df):
    """Turn every row of ``df`` into a ``{'translation': {...}}`` record.

    Args:
        df: DataFrame with the columns ``ships_idx``,
            ``tag_description_with_tokens``, ``thing`` and ``property``.

    Returns:
        A list with one dict per row, in row order. Each dict has a single
        ``'translation'`` key holding the model input (``tag_description``),
        the marker-wrapped target string (``thing_property``) and the two
        reference answers as strings.
    """
    def as_record(row):
        # One nested record per row; the f-strings stringify the labels.
        return {
            'translation': {
                'ships_idx': row['ships_idx'],
                'tag_description': row['tag_description_with_tokens'],
                'thing_property': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
                'answer_thing': f"{row['thing']}",
                'answer_property': f"{row['property']}",
            }
        }

    records = []
    for _, row in df.iterrows():
        records.append(as_record(row))
    return records

# Wrap the per-row records in a `datasets.Dataset` for the inference pipeline.
test_dataset = Dataset.from_list(process_df(df))

In [17]:
from transformers.pipelines.pt_utils import KeyDataset
from transformers import pipeline
from tqdm import tqdm

# model_checkpoint = "train_tp_checkpoint_80/checkpoint-4640"
# model_checkpoint = "train_tp_checkpoint_40/checkpoint-2760"
# model_checkpoint = "checkpoint_attention_token_20/checkpoint-1380"
# Checkpoint directory (relative path) of the fine-tuned model to evaluate.
model_checkpoint = "checkpoint_attention_token_40/checkpoint-2760"

# model_checkpoint = "checkpoint_reference_20/checkpoint-1380"

from transformers import AutoTokenizer
# Start from the stock "t5-base" tokenizer, then register the task markers below.
# NOTE(review): `return_tensors` is normally an argument of the tokenizer call,
# not of `from_pretrained` - confirm it has any effect here.
tokenizer = AutoTokenizer.from_pretrained("t5-base", return_tensors="pt")
# Define additional special tokens
# NOTE(review): a later cell hard-codes ids 32100-32103 for the first four
# markers, so the order of this list presumably has to match the order used
# when the checkpoint was trained - confirm.
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<attn>"]
# Add the additional special tokens to the tokenizer
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
# tokenizer.add_special_tokens({'sep_token': "<SEP>"})


# return_tensors=True: downstream code reads `translation_token_ids` from each
# result instead of decoded text. device=0 presumably selects the first GPU.
pipe = pipeline("translation_XX_to_YY", model=model_checkpoint, tokenizer=tokenizer, return_tensors=True, max_length=64, device=0)

# check what token-ids the special tokens are
# tokenizer.encode("<THING_START><THING_END><PROPERTY_START><PROPERTY_END>")


In [18]:
def extract_seq(tokens, start_value, end_value):
    """Return the tokens strictly between a start marker and its end marker.

    Args:
        tokens: list of token ids.
        start_value: id of the opening marker; its first occurrence is used.
        end_value: id of the closing marker; the first occurrence *after* the
            opening marker is used.

    Returns:
        The (possibly empty) list of tokens between the two markers, or
        ``None`` when the start marker is missing or no end marker follows it.
    """
    try:
        start_id = tokens.index(start_value)
        # Look for the end marker only after the start marker. A stray end
        # marker earlier in the sequence would otherwise produce an empty
        # slice that looks like a valid, empty prediction.
        end_id = tokens.index(end_value, start_id + 1)
    except ValueError:
        return None

    return tokens[start_id + 1:end_id]


In [19]:
# problem, what if end tokens are not in?
def process_tensor_output(output):
    """Decode the predicted thing and property from one pipeline result.

    Args:
        output: one pipeline result; ``output[0]['translation_token_ids']``
            must provide ``.tolist()`` returning the generated token ids.

    Returns:
        ``(p_thing, p_property)``. Each is the decoded text between its
        marker pair, or ``None`` when ``extract_seq`` finds no such span.
    """
    token_ids = output[0]['translation_token_ids'].tolist()
    # Marker ids: 32100 = <THING_START>, 32101 = <THING_END>,
    #             32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
    spans = (
        extract_seq(token_ids, 32100, 32101),
        extract_seq(token_ids, 32102, 32103),
    )
    decoded = [
        None if span is None else tokenizer.decode(span, skip_special_tokens=True)
        for span in spans
    ]
    return decoded[0], decoded[1]

In [20]:
# Predictions are collected in dataset order, one entry per input row; an entry
# is None when the expected markers could not be extracted from the output.
p_thing_list = []
p_property_list = []
print("making inference on test set")
# The pipeline is fed only the `tag_description` field and run in batches of 256.
for out in tqdm(pipe(KeyDataset(test_dataset["translation"], "tag_description"), batch_size=256)):
    p_thing, p_property = process_tensor_output(out)
    p_thing_list.append(p_thing)
    p_property_list.append(p_property)
print("inference done")

making inference on test set


59091it [02:32, 388.59it/s]                    

inference done





In [21]:
# Reference labels in dataset order, compared position by position with the predictions.
answer_thing = [ item['answer_thing'] for item in test_dataset["translation"]]
answer_property = [ item['answer_property'] for item in test_dataset["translation"]]
def correctness_test(input, reference):
    """Compare predictions with references element by element.

    Args:
        input: list of predicted values.
        reference: list of expected values; must be the same length.

    Returns:
        A list of booleans, ``True`` where the prediction equals the
        reference at the same position.

    Raises:
        AssertionError: if the two lists differ in length.
    """
    assert len(reference) == len(input)
    return [
        True if predicted == expected else False
        for predicted, expected in zip(input, reference)
    ]

# compare with answer to evaluate correctness
thing_correctness = correctness_test(p_thing_list, answer_thing)
print("thing prediction accuracy", sum(thing_correctness)/len(thing_correctness))
property_correctness = correctness_test(p_property_list, answer_property)
print("property prediction accuracy", sum(property_correctness)/len(property_correctness))

# Collect predictions and per-row correctness flags into one frame.
# The mapping is deliberately not named `dict`: a cell-level variable with that
# name would shadow the builtin for the rest of the kernel session.
prediction_columns = {'p_thing': p_thing_list,
                      'p_property': p_property_list,
                      'p_thing_correct': thing_correctness,
                      'p_property_correct': property_correctness}
df_pred = pd.DataFrame(prediction_columns)

thing prediction accuracy 0.15027669188201248
property prediction accuracy 0.14665515899206308


In [None]:
# Display the prediction frame.
df_pred

In [13]:
# load dataset again, this time with all fields, except last 5 fields
data_path = f"../make_data/test.xlsx"
# Load the dataset

# Columns to carry through to the output file.
fields = ['thing',  'property', 'ships_idx', 'tag_name', 'equip_type_code', 'tag_description',
        'tx_period', 'tx_type', 'on_change_yn', 'scaling_const', 'signal_type', 'min',
        'max', 'unit', 'data_type', 'description', 'updated_time', 'status_code',
        'is_timeout']

df_orig = pd.read_excel(data_path, usecols=fields)
# try:
#     df_orig = pd.read_csv(data_path, usecols=fields, skipinitialspace=True)
# except UnicodeDecodeError:
#     df_orig = pd.read_csv(data_path, usecols=fields, skipinitialspace=True, encoding='ISO-8859-1')

# Only referenced by the disabled dropna on the next line.
columns_to_check = ['ships_idx', 'tag_name', 'tag_description', 'thing', 'property']
# df_orig = df_orig.dropna(subset=columns_to_check).reset_index(drop=True)



# combine prediction dataframe
# Side-by-side concat aligns rows on the index.
# NOTE(review): this assumes df_orig has exactly the rows (and order) the
# predictions were made on, i.e. neither read of the file drops rows - confirm.
df_final = pd.concat([df_orig, df_pred], axis=1)
# df_final.to_csv('test_with_predictions.csv')
# df_final.to_excel('test_with_predictions.xlsx')
df_final.to_parquet('test_with_predictions.parquet', index=None)

In [14]:
# Also export the combined frame to Excel. No `index` argument is passed here
# (unlike the parquet call), so pandas' default index handling applies.
df_final.to_excel('test_with_predictions.xlsx')