In [1]:
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
with open('../input/metrics_dataset-traffic-test.json', 'r') as f:
    training_data_a = json.load(f)

with open('../input/metrics_dataset-domesticDeclarations.json', 'r') as f:
    training_data_b = json.load(f)

with open('../input/metrics_dataset.json', 'r') as f:
    training_data_c = json.load(f)

In [4]:
examples = "-time: average time from opened to closed, Elapsed time between an action requirement is notified until the action is performed\n-count: number of activities reopened, number of users within the black list\n-data: Average of declared costs detailed per trip, the value of state"

In [5]:
data = training_data_a['metrics']
data.extend(training_data_b['metrics'])
data.extend(training_data_c['metrics'])

real_labels = []
predicted_labels = []

counter = 0
for phrase in data:
    if phrase['type'] not in ['time', 'count', 'data']:
        continue
    counter += 1
    # print(f"Predicting {counter}/{len(data)}")
    real_labels.append(phrase['type'])

    # prompt = f"Sentence:\n{phrase['description']}\nIs this sentence time metric, count metric or data metric?\nOPTIONS:\n-time\n-count\n-data\n\nEXAMPLES\n{examples}"
    prompt = f"Given this sentence:\n{phrase['description']}\nChoose the correct label:\n-time\n-count\n-data\n\nEXAMPLES\n{examples}"
    # prompt = f"Given this sentence:\n{phrase['description']}\nClassify this sentence as:\n-time\n-count\n-data\n\nEXAMPLES\n{examples}"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    outputs = model.generate(input_ids)
    #chek if the output is correct, if not print the prompt, the real label and the predicted label
    if tokenizer.decode(outputs[0], skip_special_tokens=True) != phrase['type']:
        print(f"Prompt: {prompt}")
        print(f"Real label: {phrase['type']}")
        print(f"Predicted label: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")
        print("\n")

    predicted_labels.append(tokenizer.decode(outputs[0], skip_special_tokens=True))    



Prompt: Given this sentence:
Occasions in Numbers how often the Credit collection is needed 
Choose the correct label:
-time
-count
-data

EXAMPLES
-time: average time from opened to closed, Elapsed time between an action requirement is notified until the action is performed
-count: number of activities reopened, number of users within the black list
-data: Average of declared costs detailed per trip, the value of state
Real label: count
Predicted label: time


Prompt: Given this sentence:
Percentage of fines that are paid before notification
Choose the correct label:
-time
-count
-data

EXAMPLES
-time: average time from opened to closed, Elapsed time between an action requirement is notified until the action is performed
-count: number of activities reopened, number of users within the black list
-data: Average of declared costs detailed per trip, the value of state
Real label: count
Predicted label: time


Prompt: Given this sentence:
Number of fines with amounts above 100 euros
Choo

In [98]:
import pandas as pd
precision, recall, f1, _ = precision_recall_fscore_support(real_labels, predicted_labels, average='macro')
acc = accuracy_score(real_labels, predicted_labels)

df = pd.read_csv('./results.csv')
df = df.append({'type': 'text-classification', 'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall, 'prompt': prompt}, ignore_index=True)
df.to_csv('./results.csv', index=False)

ValueError: Found input variables with inconsistent numbers of samples: [2, 1]

In [59]:
from sklearn.metrics import classification_report

print(classification_report(real_labels, predicted_labels))

              precision    recall  f1-score   support

       count       0.97      0.90      0.93        39
        data       1.00      0.50      0.67         4
        time       0.88      1.00      0.93        35

    accuracy                           0.92        78
   macro avg       0.95      0.80      0.84        78
weighted avg       0.93      0.92      0.92        78



In [99]:
data = training_data_a['metrics']
data.extend(training_data_b['metrics'])
data.extend(training_data_c['metrics'])

In [100]:

real_labels = []
predicted_labels = []

counter = 0

slots_examples= '''
1. Sentence: Average time of an order grouped by customer
Tags: TBE (an order)

2. Sentence: Average time from opened to closed
Tags: TSE (opened), TEE (closed)

3. Sentence: Average time to resolve an incident due to a problem
Tags: TBE (to resolve an incident due to a problem) 

4. Sentence: Average real time from opening to resolution of the incident
Tags: TSE (opening), TEE (resolution of the incident)
'''

for phrase in data:
    if phrase['type'] not in ['time']:
        continue
    counter += 1
    print(f"Predicting {counter}/{len(data)}")
    
    output = ""
    for slot in phrase['slots']:
        if slot in ['TSE', 'TEE', 'TBE']:
            output += f"\n{slot}: {phrase['slots'][slot]}"
    
    if output == "":
        continue

    # prompt = f"Given the following sentence:\n{phrase['description']}\n\nIdentify the parts of the sentence that correspond to any of the following entities:\n-TSE (Time Start Element)\n-TEE (Time End Element)\n-TBE (Time Base Element)\n\nFor each entity, provide the corresponding text in the given sentence.\n\nFor example:\n{slots_examples}"
    prompt = f'''Identify in the following time metric sentence the FROM and TO elements:

    Sentence: {phrase['description']}

    Examples:
    {slots_examples}
    '''
    print(prompt)
    
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    outputs = model.generate(input_ids)
    print(output)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
    #predicted_labels.append(tokenizer.decode(outputs[0], skip_special_tokens=True)) 

Predicting 1/141
Identify in the given sentence time-related elements and assign the appropriate tags.

        Sentence: Time until the fine is sent

        Tags:
        TBE: Time base element
        TEE: Time end element
        TSE: Time start element

        Examples:
        
1. Sentence: Average time of an order grouped by customer
Tags: TBE (an order)

2. Sentence: Average time from opened to closed
Tags: TSE (opened), TEE (closed)

3. Sentence: Average time to resolve an incident due to a problem
Tags: TBE (to resolve an incident due to a problem) 

4. Sentence: Average real time from opening to resolution of the incident
Tags: TSE (opening), TEE (resolution of the incident)

    


KeyboardInterrupt: 