In [120]:
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [121]:
# Single source of truth for the checkpoint id, so the tokenizer and the model
# are always loaded from the same checkpoint.
MODEL_NAME = "google/flan-t5-large"

# Load the tokenizer and the seq2seq model used by every prompt below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [122]:
import pm4py

# Read the XES event log and turn it into a DataFrame.
log = pm4py.read_xes("../input/event_logs/Road_Traffic_Fine_Management_Process.xes")
df = pm4py.convert_to_dataframe(log)

# Column names of the log, printed as one comma-separated line.
column_names = df.columns.tolist()
print(", ".join(column_names))

# Distinct values of the "concept:name" column; these are the candidate
# labels offered to the model in the prompts below.
event_names = df["concept:name"].unique().tolist()
print(", ".join(event_names))

parsing log, completed traces :: 100%|██████████| 150370/150370 [00:13<00:00, 11131.87it/s]


amount, org:resource, dismissal, concept:name, vehicleClass, totalPaymentAmount, lifecycle:transition, time:timestamp, article, points, case:concept:name, expense, notificationType, lastSent, paymentAmount, matricola
Create Fine, Send Fine, Insert Fine Notification, Add penalty, Send for Credit Collection, Payment, Insert Date Appeal to Prefecture, Send Appeal to Prefecture, Receive Result Appeal from Prefecture, Notify Result Appeal to Offender, Appeal to Judge


In [123]:
# Ask the model, one activity name at a time, whether it paraphrases the query.
# Works reasonably well.
# Problem: the model may answer "yes" for several activities.
for event_name in event_names:
    paraphrasing_prompt = f"notify fine to offender\n{event_name}\nAre these two sentences paraphrases of each other?"
    input_ids = tokenizer(paraphrasing_prompt, return_tensors="pt").input_ids
    generated = model.generate(input_ids)
    answer = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(f"{event_name}: {answer}")



Create Fine: yes
Send Fine: yes
Insert Fine Notification: yes
Add penalty: yes
Send for Credit Collection: no
Payment: no
Insert Date Appeal to Prefecture: no
Send Appeal to Prefecture: no
Receive Result Appeal from Prefecture: no
Notify Result Appeal to Offender: no
Appeal to Judge: no


In [124]:
# Zero-shot attempt: it returns a result, but not the correct one.
# Build the option list as "-<name>" entries separated by newlines.
options_text = "-" + "\n-".join(event_names)
prompt = f"Sentence:\nnotify fine to offender\nWhat is the most similar paraphrase?\nOPTIONS:\n{options_text}"
inputs = tokenizer(prompt, return_tensors="pt").input_ids
outputs = model.generate(inputs, max_length=1000)
# Only the first generated sequence is decoded and shown.
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

Send Fine


Próximo paso: probar few-shot con paráfrasis de cada nombre de actividad.
¿Se usa actualmente Sentence Transformers?

In [125]:
# Few-shot block appended to the prompts: one line per activity, formatted as
# "- <activity name>: <paraphrase>, <paraphrase>, <paraphrase>".
# Each label must be spelled exactly like the activity name in the event log,
# because predictions are compared to the expected name by exact string match
# (the log uses "Add penalty", with a lowercase "p").
examples = '''
- Create Fine: Generate the fine, Establish Fine, Create the fine
- Send Fine: Dispatch the fine, Transmit the fine, Forward the fine
- Insert Fine Notification: Enter the notification for the fine, Add the fine notice, Include the fine notification
- Add penalty: Impose the penalty, Levy the penalty, Apply the penalty
- Send for Credit Collection: Forward for debt collection, Send for payment recovery, Transfer for credit recovery
- Payment: Remittance, Settlement, Clearing of dues
- Insert Date Appeal to Prefecture: Enter the date of the appeal to the prefecture, Add the date for the prefecture appeal, Input the appeal date to the regional authority
- Send Appeal to Prefecture: Dispatch the appeal to the prefecture, Transmit the plea to the regional authority, Forward the appeal to the regional authority
- Receive Result Appeal from Prefecture: Obtain the result of the appeal from the prefecture, Receive the appeal outcome from the regional authority, Get the appeal decision from the regional authority
- Notify Result Appeal to Offender: Inform the offender of the appeal result, Advise the offender of the appeal outcome, Notify the appellant of the appeal decision
- Appeal to Judge: Contest before the judge, Challenge the decision to the judicial authority, Plead to the judge
'''

In [126]:
# Few-shot attempt: it occasionally returns the correct result.
# Same option list as the zero-shot prompt, followed by the paraphrase examples.
options_text = "-" + "\n-".join(event_names)
prompt = f"Sentence:\nfines with appeal\nWhat is the most similar activity name?\nOPTIONS:\n{options_text}\n\nEXAMPLES:\n{examples}"
inputs = tokenizer(prompt, return_tensors="pt").input_ids
outputs = model.generate(inputs, max_length=1000)
# Only the first generated sequence is decoded and shown.
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

Appeal to Judge


In [127]:
# Load the metrics test dataset (JSON).
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default text encoding.
with open('../input/metrics_dataset-traffic-test.json', 'r', encoding='utf-8') as f:
    training_data_a = json.load(f)

In [128]:
# Keep only the entries stored under the top-level 'metrics' key; the
# extraction cell below iterates over them one example at a time.
data = training_data_a['metrics']

In [129]:
# Maps a slot label found in an example's 'slots' to the gold-standard
# condition key(s) that may hold the matching activity. A list means the
# conditions are tried in order and the first ACTIVITY one is used.
slot_dict = {
    'TSE': 'from_condition',
    'TEE': 'to_condition',
    'CE': 'count_condition',
    'TBE': ['from_condition', 'to_condition']
}


def _collect_activity_pairs(examples, slot_to_conditions, dataset_key='traffic'):
    """Pair each slot text with the activity recorded in the gold standard.

    Args:
        examples: iterable of example dicts; each must have a 'slots' entry
            and may have a 'goldstandard' entry keyed by dataset name.
        slot_to_conditions: mapping from slot label to a condition key, or to
            a list of condition keys tried in order.
        dataset_key: key inside 'goldstandard' to read the conditions from.

    Returns:
        A tuple ``(slot_texts, gold_activities)`` of two parallel lists: the
        slot value from the example and the 'value' of the first matching
        condition whose 'attribute' is 'ACTIVITY'.

    Examples without a gold standard for ``dataset_key`` are skipped. A slot
    is skipped when one of its conditions is missing or malformed.
    """
    slot_texts = []
    gold_activities = []
    for example in examples:
        slots = example['slots']
        try:
            matching = example['goldstandard'][dataset_key]
        except (KeyError, TypeError):
            # No gold standard for this dataset: nothing to evaluate against.
            continue
        for slot in slots:
            if slot not in slot_to_conditions:
                continue
            condition_names = slot_to_conditions[slot]
            if not isinstance(condition_names, list):
                condition_names = [condition_names]
            try:
                for name in condition_names:
                    condition = matching[name]
                    if "attribute" in condition and condition['attribute'] == 'ACTIVITY':
                        # Read both values before appending so the two lists
                        # stay aligned if either lookup fails.
                        activity = condition['value']
                        slot_text = slots[slot]
                        slot_texts.append(slot_text)
                        gold_activities.append(activity)
                        # Only the first ACTIVITY condition is used per slot.
                        break
            except (KeyError, TypeError):
                # Missing or malformed condition: skip this slot. Only lookup
                # errors are caught, so KeyboardInterrupt and unrelated bugs
                # are no longer silently swallowed.
                continue
    return slot_texts, gold_activities


# parsing_values[i] is the slot text and real_matchings[i] its expected activity.
parsing_values, real_matchings = _collect_activity_pairs(data, slot_dict)

In [130]:
from sklearn.metrics import accuracy_score

# Zero-shot evaluation: for every slot text, ask the model for the most
# similar activity name and compare it with the expected one.

# The option list is identical for every query, so it is built once here
# instead of on every loop iteration.
options_text = "-" + "\n-".join(event_names)

predicteds = []
for parsing_value, expected_matching in zip(parsing_values, real_matchings):
    prompt = f"Sentence:\n{parsing_value}\nWhat is the most similar activity name?\nOPTIONS:\n{options_text}"
    inputs = tokenizer(prompt, return_tensors="pt").input_ids
    outputs = model.generate(inputs, max_length=1000)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predicteds.append(result)
    # Exact string comparison: any difference in the generated text counts as a miss.
    if result != expected_matching:
        print(f"Expected: {expected_matching}, Predicted: {result}")

# Last expression of the cell, so the notebook displays the accuracy.
accuracy_score(real_matchings, predicteds)

Expected: Insert Date Appeal to Prefecture, Predicted: Send Appeal to Prefecture
Expected: Payment, Predicted: Send Appeal to Prefecture
Expected: Notify Result Appeal to Offender, Predicted: Insert Fine Notification
Expected: Insert Fine Notification, Predicted: Send Fine
Expected: Insert Fine Notification, Predicted: Receive Result Appeal from Prefecture
Expected: Insert Date Appeal to Prefecture, Predicted: Send Appeal to Prefecture
Expected: Add penalty, Predicted: Create Fine
Expected: Add penalty, Predicted: Create Fine


0.6666666666666666

In [133]:
# With 3 examples

from sklearn.metrics import accuracy_score

# Few-shot evaluation: same loop as the zero-shot run, but the prompt also
# carries the paraphrase examples for every activity.

# The option list is identical for every query, so it is built once here
# instead of on every loop iteration.
options_text = "-" + "\n-".join(event_names)

predicteds = []
for parsing_value, expected_matching in zip(parsing_values, real_matchings):
    prompt = f"Sentence:\n{parsing_value}\nWhat is the most similar activity name?\n\nOPTIONS:\n{options_text}\n\nEXAMPLES:\n{examples}"
    inputs = tokenizer(prompt, return_tensors="pt").input_ids
    outputs = model.generate(inputs, max_length=1000)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predicteds.append(result)
    # Exact string comparison: any difference in the generated text counts as a miss.
    if result != expected_matching:
        print(f"Parsing: {parsing_value}")
        print(f"Expected: {expected_matching}, Predicted: {result}")
        print()

# Last expression of the cell, so the notebook displays the accuracy.
accuracy_score(real_matchings, predicteds)

Parsing: appeal
Expected: Insert Date Appeal to Prefecture, Predicted: Appeal to Judge

Parsing: notification
Expected: Notify Result Appeal to Offender, Predicted: Insert Fine Notification

Parsing: notify fine to offender
Expected: Insert Fine Notification, Predicted: Notify Result Appeal to Offender

Parsing: fines with appeal
Expected: Insert Date Appeal to Prefecture, Predicted: Appeal to Judge

Parsing: the prefecture to solve an appeal
Expected: Send Appeal to Prefecture, Predicted: Receive Result Appeal from Prefecture: Obtain the result of the appeal from the prefecture, Receive the appeal outcome from the regional authority, Get the appeal decision from the regional authority

Parsing: the prefecture
Expected: Send Appeal to Prefecture, Predicted: Receive Result Appeal from Prefecture: Obtain the result of the appeal from the prefecture, Receive the appeal outcome from the regional authority, Get the appeal decision from the regional authority



0.75

In [117]:
# Matching AttributeName and GBC with names of attributes
# NOTE: this rebinds `log` and `df`, which were first bound to the
# road-traffic log earlier in the notebook; from here on `df` is the
# DomesticDeclarations log, so the earlier cells cannot be re-run against
# `df` without reloading the traffic log first.
log = pm4py.read_xes("../input/event_logs/DomesticDeclarations.xes")
df = pm4py.convert_to_dataframe(log)

parsing log, completed traces :: 100%|██████████| 10500/10500 [00:00<00:00, 10830.98it/s]


case:Amount


In [119]:
# Candidate labels are the column names of the currently loaded DataFrame.
attribute_names = df.columns.tolist()

# Free-text value to be matched against the attribute names.
slot_value = "costs"
options_text = "-" + "\n-".join(attribute_names)
# NOTE(review): the EXAMPLES section reuses `examples`, which holds paraphrases
# of activity names rather than attribute names - confirm this is intended.
prompt = f"Sentence:\n{slot_value}\nWhat is the most similar attribute name?\n\nOPTIONS:\n{options_text}\n\nEXAMPLES:\n{examples}"
inputs = tokenizer(prompt, return_tensors="pt").input_ids
outputs = model.generate(inputs, max_length=1000)
# Only the first generated sequence is decoded and shown.
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

case:Amount
