# Goal: end to end inference and evaluation

given a csv, make predictions and evaluate predictions, then return results in a csv

In [1]:
import pandas as pd

data_path = f"../make_data/test.xlsx"
# Ensure to include 'ships_idx' in the fields list
fields = ['ships_idx', 'tag_name', 'tag_description', 'thing', 'property']

# Load the dataset
try:
    df = pd.read_excel(data_path, usecols=fields)
except UnicodeDecodeError:
    df = pd.read_excel(data_path, usecols=fields, encoding='ISO-8859-1')


# df = df.dropna().reset_index(drop=True)
selected_columns = ['thing', 'property', 'tag_description']
df[selected_columns] = df[selected_columns].astype("string")
df['ships_idx'] = df['ships_idx'].astype("Int64")
df = df.fillna('')

In [2]:
len(df)

59091

# introduce the tag_description processing strategy

In [3]:
tag_desc_unsorted = df['tag_description'].to_list()

In [4]:
tag_desc= sorted(tag_desc_unsorted)

In [79]:
def check_num_of_plus_and_minus(diff_list):
    # Initialize counters for '+' and '-' signs
    plus_count = 0
    minus_count = 0

    # Iterate over each string in the list
    for item in diff_list:
        # Count the occurrences of '+' and '-' signs in the string
        plus_count += item[:1].count('+')
        minus_count += item[:1].count('-')
    if (plus_count == 1) and (minus_count == 1):
        return True
    else:
        return False

In [80]:
import difflib

# a simple rule is to use the list comprehension and find if the difference is only 2 characters, 
# this function returns all the indices that belong to a common set of sequences
def find_single_char_diff(tag_desc):
    seq_started = False
    seq_index = -1 # add 1 to become 0 when start of new seq
    output_list = []
    for i in range(0,len(tag_desc) - 2):
        case_a = tag_desc[i]
        case_b = tag_desc[i + 1]
        diff_list = [li for li in difflib.ndiff(case_a, case_b) if li[0] != ' ']
        # start of new sequence
        if check_num_of_plus_and_minus(diff_list) and (not seq_started):
            seq_started = True
            # create new nested list
            output_list.append([])
            # create new sequence index
            seq_index = seq_index + 1

        # check for end of sequence
        if (not check_num_of_plus_and_minus(diff_list)) and (seq_started):
            seq_started = False
            # add last element
            output_list[seq_index].append(i)

        # if still seq_started even after check for end of sequence, then add 
        if seq_started:
            output_list[seq_index].append(i)
    return output_list

obj_families_index = find_single_char_diff(tag_desc)

In [81]:
obj_families = [[tag_desc[i] for i in index_group] for index_group in obj_families_index]

In [82]:
obj_families

[[' CYL. LINER TEMP CYL 1 MAN', ' CYL. LINER TEMP CYL 2 MAN'],
 [' CYL. LINER TEMP CYL 3 EXH', ' CYL. LINER TEMP CYL 4 EXH'],
 [' EXH GAS TEMP AT TURBINE O TC 1',
  ' EXH GAS TEMP AT TURBINE O TC 2',
  ' EXH GAS TEMP AT TURBINE O TC 3',
  ' EXH GAS TEMP AT TURBINE O TC 4'],
 [' EXH GAS TEMP AT TURBINE OUT TC 1', ' EXH GAS TEMP AT TURBINE OUT TC 2'],
 [' EXH GAS TEMP AT TURBINE OUT TC 2', ' EXH GAS TEMP AT TURBINE OUT TC 3'],
 [' EXH GAS TEMP AT TURBINE OUT TC 3', ' EXH GAS TEMP AT TURBINE OUT TC 4'],
 [' NO.1 L.F.O BUNKER TK(P) TEMP', ' NO.1 L.F.O BUNKER TK(S) TEMP'],
 [' NO.2 G/E HEAVY LOAD BIT 03', ' NO.2 G/E HEAVY LOAD BIT 04'],
 [' NO.2 G/E PRIORITY BIT4',
  ' NO.2 G/E PRIORITY BIT5',
  ' NO.2 G/E PRIORITY BIT6',
  ' NO.2 G/E PRIORITY BIT7'],
 [' NO.2 L.F.O BUNKER TK(P) TEMP', ' NO.2 L.F.O BUNKER TK(S) TEMP'],
 [' TURBOCHARGER SPEED TC 1',
  ' TURBOCHARGER SPEED TC 2',
  ' TURBOCHARGER SPEED TC 3',
  ' TURBOCHARGER SPEED TC 4'],
 ['#1 G/E DOSING UREA FLOW HIGH', '#1 G/E DOSING UREA

In [83]:
def find_index_of_difference(str1, str2):
    # Iterate through the characters of the strings
    for i, (char1, char2) in enumerate(zip(str1, str2)):
        if char1 != char2:
            return i  # Return the index where the characters differ

    # If no difference found, return -1 (or any other suitable value)
    return -1

# perform attention token insertion for each word at the same index for a given family
# note: we are guaranteed that the tokens differ only at a position
for family in obj_families_index:
    # identify which position to insert the special token at
    position = find_index_of_difference(tag_desc[family[0]], tag_desc[family[1]])
    # then insert token into every word in the family
    for word_index in family:
        # Slice the original string into two parts at the specified position
        word = tag_desc[word_index]
        part1 = word[:position]
        part2 = word[position]
        part3 = word[position + 1:]
        tag_desc[word_index] = part1 + "<attn1>" + part2 + "<attn1>" + part3


# perform 2nd round of 1-char diff on the 1st round families

In [84]:
# take only the first char of each list
input_list = [family[0] for family in obj_families]
families_index_2 = find_single_char_diff(input_list)

In [85]:
obj_families = [[tag_desc[i] for i in index_group] for index_group in obj_families_index]
[obj_families[i] for i in families_index_2[10]]

[['AIR LOCK ALARM GTD OPEN OF NO. 1<attn1>A<attn1> HOLD  ',
  'AIR LOCK ALARM GTD OPEN OF NO. 1<attn1>F<attn1> HOLD  ',
  'AIR LOCK ALARM GTD OPEN OF NO. 1<attn1>M<attn1> HOLD  '],
 ['AIR LOCK ALARM GTD OPEN OF NO. 2<attn1>A<attn1> HOLD  ',
  'AIR LOCK ALARM GTD OPEN OF NO. 2<attn1>F<attn1> HOLD  ']]

In [86]:
print(families_index_2[0])
# grab the first word
family_2 = families_index_2[0]
# grab a family from the family of families
print(obj_families[family_2[0]])
# grab the first word of the chosen family
# print(obj_families[family_2[0]][0])

print(obj_families[family_2[1]])

[3, 4, 5]
[' EXH GAS TEMP AT TURBINE OUT TC <attn1>1<attn1>', ' EXH GAS TEMP AT TURBINE OUT TC <attn1>2<attn1>']
[' EXH GAS TEMP AT TURBINE OUT TC <attn1>2<attn1>', ' EXH GAS TEMP AT TURBINE OUT TC <attn1>3<attn1>']


In [88]:
families_index_2[197]

[2023]

In [89]:
families_index_2[195]

[1942, 1943]

In [90]:
# perform attention token insertion for each word at the same index for a given family
# note: we are guaranteed that the tokens differ only at a position
i = 0
for family_2 in families_index_2:
    print(i)
    i = i + 1
    # if (i == 197):
    #     break
    # identify which position to insert the special token at
    position = find_index_of_difference(obj_families[family_2[0]][0], obj_families[family_2[1]][0])
    # then insert token into every word in the family of families (family_2)
    for family_index in family_2:
        for word_index in obj_families_index[family_index]:
            # Slice the original string into two parts at the specified position
            word = tag_desc[word_index]
            part1 = word[:position]
            part2 = word[position]
            part3 = word[position + 1:]
            tag_desc[word_index] = part1 + "<attn2>" + part2 + "<attn2>" + part3


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196


In [91]:
obj_families = [[tag_desc[i] for i in index_group] for index_group in obj_families_index]


['1MGE EG TEMP CYL 0<attn2>1<attn2><attn1>A<attn1>', '1MGE EG TEMP CYL 0<attn2>1<attn2><attn1>B<attn1>']
['1MGE EG TEMP CYL 0<attn2>2<attn2><attn1>A<attn1>', '1MGE EG TEMP CYL 0<attn2>2<attn2><attn1>B<attn1>']
['1MGE EG TEMP CYL 0<attn2>3<attn2><attn1>A<attn1>', '1MGE EG TEMP CYL 0<attn2>3<attn2><attn1>B<attn1>']
['1MGE EG TEMP CYL 0<attn2>4<attn2><attn1>A<attn1>', '1MGE EG TEMP CYL 0<attn2>4<attn2><attn1>B<attn1>']


In [92]:

# you can print obj_families to check - it looks fine to me
for i in families_index_2[50]:
    print(obj_families[i])

['ECS INSULATION LEVEL - ACU<attn1><attn2>1<attn2><attn1>', 'ECS INSULATION LEVEL - ACU<attn1><attn2>2<attn2><attn1>']
['ECS INSULATION LEVEL - ACU<attn1><attn2>2<attn2><attn1>', 'ECS INSULATION LEVEL - ACU<attn1><attn2>3<attn2><attn1>']


In [3]:
# construct dataset
from datasets import Dataset

def process_df(df):
    output_list = [{
        'translation': {
            'ships_idx': row['ships_idx'],
            'tag_description': row['tag_description'],
            'thing_property': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
            'answer_thing': f"{row['thing']}",
            'answer_property':f"{row['property']}",
        }
    } for _, row in df.iterrows()]

    return output_list

test_dataset = Dataset.from_list(process_df(df))

In [7]:
from transformers.pipelines.pt_utils import KeyDataset
from transformers import pipeline
from tqdm import tqdm

# model_checkpoint = "train_tp_checkpoint_80/checkpoint-4640"
model_checkpoint = "train_tp_checkpoint_40/checkpoint-2760"

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base", return_tensors="pt")
# Define additional special tokens
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>"]
# Add the additional special tokens to the tokenizer
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
# tokenizer.add_special_tokens({'sep_token': "<SEP>"})


pipe = pipeline("translation_XX_to_YY", model=model_checkpoint, tokenizer=tokenizer, return_tensors=True, max_length=64, device=0)

# check what token-ids the special tokens are
# tokenizer.encode("<THING_START><THING_END><PROPERTY_START><PROPERTY_END>")


In [8]:
def extract_seq(tokens, start_value, end_value):
    if start_value not in tokens or end_value not in tokens:
        return None  # Or handle this case according to your requirements
    start_id = tokens.index(start_value)
    end_id = tokens.index(end_value)

    return tokens[start_id+1:end_id]


In [9]:
# problem, what if end tokens are not in?
def process_tensor_output(output):
    tokens = output[0]['translation_token_ids'].tolist()
    thing_seq = extract_seq(tokens, 32100, 32101) # 32100 = <THING_START>, 32101 = <THING_END>
    property_seq = extract_seq(tokens, 32102, 32103) # 32102 = <PROPERTY_START>, 32103 = <PROPERTY_END>
    p_thing = None
    p_property = None
    if (thing_seq is not None):
        p_thing =  tokenizer.decode(thing_seq)
    if (property_seq is not None):
        p_property =  tokenizer.decode(property_seq)
    return p_thing, p_property

In [10]:
p_thing_list = []
p_property_list = []
print("making inference on test set")
for out in tqdm(pipe(KeyDataset(test_dataset["translation"], "tag_description"), batch_size=256)):
    p_thing, p_property = process_tensor_output(out)
    p_thing_list.append(p_thing)
    p_property_list.append(p_property)
print("inference done")

making inference on test set


59091it [01:55, 510.10it/s]                    

inference done





In [12]:
answer_thing = [ item['answer_thing'] for item in test_dataset["translation"]]
answer_property = [ item['answer_property'] for item in test_dataset["translation"]]
def correctness_test(input, reference):
    assert(len(input) == len(reference))
    correctness_list = []
    for i in range(0,len(input)):
        if (input[i] == reference[i]):
            correctness_list.append(True)
        else:
            correctness_list.append(False)
    return correctness_list

# compare with answer to evaluate correctness
thing_correctness = correctness_test(p_thing_list, answer_thing)
print("thing prediction accuracy", sum(thing_correctness)/len(thing_correctness))
property_correctness = correctness_test(p_property_list, answer_property)
print("property prediction accuracy", sum(property_correctness)/len(property_correctness))

dict = {'p_thing': p_thing_list, 
        'p_property': p_property_list,
        'p_thing_correct': thing_correctness,
        'p_property_correct': property_correctness}
df_pred = pd.DataFrame(dict)

thing prediction accuracy 0.15518437663942056
property prediction accuracy 0.15032746103467534


In [13]:
# load dataset again, this time with all fields, except last 5 fields
data_path = f"../make_data/test.xlsx"
# Load the dataset

fields = ['thing',  'property', 'ships_idx', 'tag_name', 'equip_type_code', 'tag_description',
        'tx_period', 'tx_type', 'on_change_yn', 'scaling_const', 'signal_type', 'min',
        'max', 'unit', 'data_type', 'description', 'updated_time', 'status_code',
        'is_timeout']

df_orig = pd.read_excel(data_path, usecols=fields)
# try:
#     df_orig = pd.read_csv(data_path, usecols=fields, skipinitialspace=True)
# except UnicodeDecodeError:
#     df_orig = pd.read_csv(data_path, usecols=fields, skipinitialspace=True, encoding='ISO-8859-1')

columns_to_check = ['ships_idx', 'tag_name', 'tag_description', 'thing', 'property']
# df_orig = df_orig.dropna(subset=columns_to_check).reset_index(drop=True)



# combine prediction dataframe
df_final = pd.concat([df_orig, df_pred], axis=1)
# df_final.to_csv('test_with_predictions.csv')
# df_final.to_excel('test_with_predictions.xlsx')
df_final.to_parquet('test_with_predictions.parquet', index=None)

In [14]:
df_final.to_excel('test_with_predictions.xlsx')