<a href="" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Common imports

In [25]:
import evaluate
import datasets
import numpy as np

# Loading the Instruct Dataset

In [26]:
# Load the saved dataset; the trailing bare expression displays its summary.
dataset_dir = './predicting_with_models/models_prediction_dataset'
dataset = datasets.load_from_disk(dataset_dir)
dataset

Dataset({
    features: ['REQID_ex', 'completion', 'query', 'class', 'task', 'text', 'label', 'mistral_ai_instruct_7b_chat_hf_preds', 'falcon_7b_base_preds', 'falcon_7b_instruct_preds', 'llama2_7b_chat_hf_preds', 'zephyr_7b_beta_preds', 'open_ai_gpt4', 'openai_compe_gpt4o_24_11_20'],
    num_rows: 34
})

In [27]:
# Reference texts for the metrics below: the 'completion' column of the dataset
# (the human-written requirements).

references = dataset['completion']

# Putting All Metrics Together

In [28]:
# Metrics already loaded by evaluate_model, keyed by the arguments given to evaluate.load.
_METRIC_CACHE = {}


def _load_metric(*load_args):
    """Return the metric for `load_args`, calling `evaluate.load` only on first use."""
    if load_args not in _METRIC_CACHE:
        _METRIC_CACHE[load_args] = evaluate.load(*load_args)
    return _METRIC_CACHE[load_args]


def evaluate_model(references, predictions):
    """Score `predictions` against `references` with five text-similarity metrics.

    The metrics are SacreBLEU, ROUGE, TER, BERTScore and FrugalScore. Each is
    loaded once per kernel session and reused on later calls, so evaluating
    several models in a row does not repeat `evaluate.load` for every model.

    Args:
        references: reference texts, forwarded to every metric as `references`.
        predictions: generated texts, forwarded to every metric as `predictions`.
            NOTE(review): presumably aligned one-to-one with `references`;
            this function does not check the lengths itself.

    Returns:
        dict with the keys 'bleu', 'rouge', 'ter', 'bert_score' and
        'frugal_score', each holding the value returned by that metric's
        `compute` call.
    """
    bleu = _load_metric('sacrebleu')
    rouge = _load_metric('rouge')
    ter = _load_metric("ter")
    bertscore = _load_metric('bertscore')
    frugalscore = _load_metric("frugalscore", "moussaKam/frugalscore_medium_roberta_bert-score")

    bleu_results = bleu.compute(predictions=predictions, references=references)
    rouge_results = rouge.compute(predictions=predictions, references=references)
    # TER is computed with the metric's default options; case_sensitive / ignore_punct
    # were noted as possible settings in the original cell but are not passed.
    ter_results = ter.compute(predictions=predictions, references=references)
    bertscore_results = bertscore.compute(
        predictions=predictions,
        references=references,
        model_type="xlm-mlm-en-2048",
        lang='en',
    )
    frugalscore_results = frugalscore.compute(
        predictions=predictions,
        references=references,
        batch_size=2,
        max_length=512,
        device="cpu",
    )
    return {
        'bleu': bleu_results,
        'rouge': rouge_results,
        'ter': ter_results,
        'bert_score': bertscore_results,
        'frugal_score': frugalscore_results,
    }

# Evaluating a Single Zephyr-Generated Requirement

In [29]:
# Row inspected in this section; named once instead of repeating the literal index.
sample_idx = 10
# Fetch the row once rather than indexing three separate full columns.
sample = dataset[sample_idx]

print(sample['query'])
print("*" * 30)
print("Original: ", sample['completion'])
print("Generated: ", sample['zephyr_7b_beta_preds'])

# evaluate_model is given one-element lists for this single example.
results = evaluate_model(
    references=[sample['completion']],
    predictions=[sample['zephyr_7b_beta_preds']],
)

print('\033[1m BLEU Score:\033[0m \t', results['bleu']['score'])
print('.' * 150)

for score, value in results['rouge'].items():
    print(f'\033[1m {score}:\033[0m \t', value)
print('.' * 150)

print('\033[1m TER Score:\033[0m \t', results['ter']['score'])
print('\033[1m TER #Edits:\033[0m \t', results['ter']['num_edits'])
print('\033[1m TER Ref. Length:\033[0m \t', results['ter']['ref_length'])
print('.' * 150)

# Name the three BERTScore lists explicitly instead of slicing off the last key
# (the non-numeric 'hashcode' entry in the evaluate BERTScore output).
for metric in ('precision', 'recall', 'f1'):
    pairwise_metric = results['bert_score'][metric]
    averaged_metric = np.mean(pairwise_metric)
    print(f'\033[1m {metric}:\033[0m \t', averaged_metric)
print('.' * 150)

pairwise_frugal_score = results['frugal_score']['scores']
averaged_frugal_score = np.mean(pairwise_frugal_score)
print('\033[1m FRUGAL Score:\033[0m \t', averaged_frugal_score)
print('.' * 150)

Give me an availability requirement that outlines yearly 24/7 uptime for the website for the project.
******************************
Original:  The website shall be available for use 24 hours per day 365 days per year.
Generated:  The website shall be available 24 hours a day, 365 days per year except for scheduled maintenance days.



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

[1m BLEU Score:[0m 	 34.6697783111003
......................................................................................................................................................
[1m rouge1:[0m 	 0.7878787878787877
[1m rouge2:[0m 	 0.5806451612903226
[1m rougeL:[0m 	 0.7272727272727272
[1m rougeLsum:[0m 	 0.7272727272727272
......................................................................................................................................................
[1m TER Score:[0m 	 60.0
[1m TER #Edits:[0m 	 9
[1m TER Ref. Length:[0m 	 15.0
......................................................................................................................................................
[1m precision:[0m 	 0.9161564707756042
[1m recall:[0m 	 0.9383234977722168
[1m f1:[0m 	 0.9271074533462524
......................................................................................................................................................
[1

# Evaluating Trained Models using Five NLP Human Correlation Metrics

## Evaluating GPT 4

In [30]:
# Score the GPT-4 predictions (column 'open_ai_gpt4') against the references.
results = evaluate_model(references=references, predictions=dataset['open_ai_gpt4'])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/34 [00:00<?, ? examples/s]

In [31]:
# Attach the FrugalScore values and the BERTScore F1 values from `results` to the
# dataset as two extra columns, to be used for the spider chart in the paper.
# NOTE(review): the two column names use different prefixes ('gpt_4_' vs 'gpt4_');
# they are kept unchanged because readers of the saved dataset may rely on them.

gpt4_bert_score = results['bert_score']['f1']
gpt4_frugal_score = results['frugal_score']['scores']
# Start from the original dataset, then add the columns in the same order as before.
dataset_for_spider_chart = dataset
for column_name, column_values in (
    ('gpt_4_frugal_score', gpt4_frugal_score),
    ('gpt4_bert_score', gpt4_bert_score),
):
    dataset_for_spider_chart = dataset_for_spider_chart.add_column(column_name, column_values)

In [32]:
# Print the scores held in `results` for the GPT-4 predictions.
print('\033[1m BLEU Score:\033[0m \t', results['bleu']['score'])
print('.' * 150)

for score, value in results['rouge'].items():
    print(f'\033[1m {score}:\033[0m \t', value)
print('.' * 150)

print('\033[1m TER Score:\033[0m \t', results['ter']['score'])
print('\033[1m TER #Edits:\033[0m \t', results['ter']['num_edits'])
print('\033[1m TER Ref. Length:\033[0m \t', results['ter']['ref_length'])
print('.' * 150)

# Name the three BERTScore lists explicitly instead of slicing off the last key
# (the non-numeric 'hashcode' entry in the evaluate BERTScore output).
for metric in ('precision', 'recall', 'f1'):
    pairwise_metric = results['bert_score'][metric]
    averaged_metric = np.mean(pairwise_metric)
    print(f'\033[1m {metric}:\033[0m \t', averaged_metric)
print('.' * 150)

pairwise_frugal_score = results['frugal_score']['scores']
averaged_frugal_score = np.mean(pairwise_frugal_score)
print('\033[1m FRUGAL Score:\033[0m \t', averaged_frugal_score)
print('.' * 150)

[1m BLEU Score:[0m 	 3.217775308187738
......................................................................................................................................................
[1m rouge1:[0m 	 0.21219613842714666
[1m rouge2:[0m 	 0.09392426726748879
[1m rougeL:[0m 	 0.16818299252459562
[1m rougeLsum:[0m 	 0.17634244553547063
......................................................................................................................................................
[1m TER Score:[0m 	 537.129300118624
[1m TER #Edits:[0m 	 4528
[1m TER Ref. Length:[0m 	 843.0
......................................................................................................................................................
[1m precision:[0m 	 0.8391319986651925
[1m recall:[0m 	 0.8900714709478266
[1m f1:[0m 	 0.8636762079070596
.....................................................................................................................................

## Evaluating GPT4o-2024-11-20 (latest as of 21-12-2024)

In [33]:
# Score the GPT-4o predictions (column 'openai_compe_gpt4o_24_11_20') against the references.
results = evaluate_model(
    references=references,
    predictions=dataset['openai_compe_gpt4o_24_11_20'],
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/34 [00:00<?, ? examples/s]

In [34]:
# Attach the FrugalScore values and the BERTScore F1 values from `results` to the
# dataset as two extra columns, to be used for the spider chart in the paper.
# The locals are named gpt4o_* so they no longer overwrite the gpt4_* variables
# that hold the GPT-4 scores; the column names are unchanged.
# NOTE(review): this cell extends dataset_for_spider_chart from itself, so running
# it a second time attempts to add the same column names again.

gpt4o_frugal_score = results['frugal_score']['scores']
gpt4o_bert_score = results['bert_score']['f1']
dataset_for_spider_chart = dataset_for_spider_chart.add_column('openai_compe_gpt4o_24_11_20_frugal_score', gpt4o_frugal_score)
dataset_for_spider_chart = dataset_for_spider_chart.add_column('openai_compe_gpt4o_24_11_20_bert_score', gpt4o_bert_score)

In [35]:
# Print the scores held in `results` for the GPT-4o predictions.
print('\033[1m BLEU Score:\033[0m \t', results['bleu']['score'])
print('.' * 150)

for score, value in results['rouge'].items():
    print(f'\033[1m {score}:\033[0m \t', value)
print('.' * 150)

print('\033[1m TER Score:\033[0m \t', results['ter']['score'])
print('\033[1m TER #Edits:\033[0m \t', results['ter']['num_edits'])
print('\033[1m TER Ref. Length:\033[0m \t', results['ter']['ref_length'])
print('.' * 150)

# Name the three BERTScore lists explicitly instead of slicing off the last key
# (the non-numeric 'hashcode' entry in the evaluate BERTScore output).
for metric in ('precision', 'recall', 'f1'):
    pairwise_metric = results['bert_score'][metric]
    averaged_metric = np.mean(pairwise_metric)
    print(f'\033[1m {metric}:\033[0m \t', averaged_metric)
print('.' * 150)

pairwise_frugal_score = results['frugal_score']['scores']
averaged_frugal_score = np.mean(pairwise_frugal_score)
print('\033[1m FRUGAL Score:\033[0m \t', averaged_frugal_score)
print('.' * 150)

[1m BLEU Score:[0m 	 2.0219287149549072
......................................................................................................................................................
[1m rouge1:[0m 	 0.1583864937577551
[1m rouge2:[0m 	 0.06966690199808939
[1m rougeL:[0m 	 0.1266496459627222
[1m rougeLsum:[0m 	 0.1347586325676159
......................................................................................................................................................
[1m TER Score:[0m 	 762.0403321470937
[1m TER #Edits:[0m 	 6424
[1m TER Ref. Length:[0m 	 843.0
......................................................................................................................................................
[1m precision:[0m 	 0.8187058673185461
[1m recall:[0m 	 0.883557461640414
[1m f1:[0m 	 0.8497847266056958
.......................................................................................................................................

## Evaluating Zephyr 7b beta

In [36]:
# Score the Zephyr 7B beta predictions (column 'zephyr_7b_beta_preds') against the references.
results = evaluate_model(references=references, predictions=dataset['zephyr_7b_beta_preds'])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/34 [00:00<?, ? examples/s]

In [37]:
# Attach the FrugalScore values and the BERTScore F1 values from `results` to the
# dataset as two extra columns, to be used for the spider chart in the paper.
# NOTE(review): this cell extends dataset_for_spider_chart from itself, so running
# it a second time attempts to add the same column names again.

zephyr_bert_score = results['bert_score']['f1']
zephyr_frugal_score = results['frugal_score']['scores']
for column_name, column_values in (
    ('zephyr_frugal_score', zephyr_frugal_score),
    ('zephyr_bert_score', zephyr_bert_score),
):
    dataset_for_spider_chart = dataset_for_spider_chart.add_column(column_name, column_values)

In [38]:
# Print the scores held in `results` for the Zephyr 7B beta predictions.
print('\033[1m BLEU Score:\033[0m \t', results['bleu']['score'])
print('.' * 150)

for score, value in results['rouge'].items():
    print(f'\033[1m {score}:\033[0m \t', value)
print('.' * 150)

print('\033[1m TER Score:\033[0m \t', results['ter']['score'])
print('\033[1m TER #Edits:\033[0m \t', results['ter']['num_edits'])
print('\033[1m TER Ref. Length:\033[0m \t', results['ter']['ref_length'])
print('.' * 150)

# Name the three BERTScore lists explicitly instead of slicing off the last key
# (the non-numeric 'hashcode' entry in the evaluate BERTScore output).
for metric in ('precision', 'recall', 'f1'):
    pairwise_metric = results['bert_score'][metric]
    averaged_metric = np.mean(pairwise_metric)
    print(f'\033[1m {metric}:\033[0m \t', averaged_metric)
print('.' * 150)

pairwise_frugal_score = results['frugal_score']['scores']
averaged_frugal_score = np.mean(pairwise_frugal_score)
print('\033[1m FRUGAL Score:\033[0m \t', averaged_frugal_score)
print('.' * 150)

[1m BLEU Score:[0m 	 12.264924145523484
......................................................................................................................................................
[1m rouge1:[0m 	 0.42851236439172186
[1m rouge2:[0m 	 0.20262459923753579
[1m rougeL:[0m 	 0.3674005948889094
[1m rougeLsum:[0m 	 0.3667809761480655
......................................................................................................................................................
[1m TER Score:[0m 	 108.89679715302492
[1m TER #Edits:[0m 	 918
[1m TER Ref. Length:[0m 	 843.0
......................................................................................................................................................
[1m precision:[0m 	 0.8904935454621035
[1m recall:[0m 	 0.8960548902259153
[1m f1:[0m 	 0.8930980773533091
.....................................................................................................................................

In [39]:
# Mistral 7B Instruct (this section has no markdown header of its own):
# score the 'mistral_ai_instruct_7b_chat_hf_preds' column against the references.
results = evaluate_model(
    references=references,
    predictions=dataset['mistral_ai_instruct_7b_chat_hf_preds'],
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/34 [00:00<?, ? examples/s]

In [40]:
# Attach the FrugalScore values and the BERTScore F1 values from `results` to the
# dataset as two extra columns, to be used for the spider chart in the paper.
# NOTE(review): this cell extends dataset_for_spider_chart from itself, so running
# it a second time attempts to add the same column names again.

mistralai_bert_score = results['bert_score']['f1']
mistralai_frugal_score = results['frugal_score']['scores']
for column_name, column_values in (
    ('mistralai_frugal_score', mistralai_frugal_score),
    ('mistralai_bert_score', mistralai_bert_score),
):
    dataset_for_spider_chart = dataset_for_spider_chart.add_column(column_name, column_values)

In [41]:
# Print the scores held in `results` for the Mistral 7B Instruct predictions.
print('\033[1m BLEU Score:\033[0m \t', results['bleu']['score'])
print('.' * 150)

for score, value in results['rouge'].items():
    print(f'\033[1m {score}:\033[0m \t', value)
print('.' * 150)

print('\033[1m TER Score:\033[0m \t', results['ter']['score'])
print('\033[1m TER #Edits:\033[0m \t', results['ter']['num_edits'])
print('\033[1m TER Ref. Length:\033[0m \t', results['ter']['ref_length'])
print('.' * 150)

# Name the three BERTScore lists explicitly instead of slicing off the last key
# (the non-numeric 'hashcode' entry in the evaluate BERTScore output).
for metric in ('precision', 'recall', 'f1'):
    pairwise_metric = results['bert_score'][metric]
    averaged_metric = np.mean(pairwise_metric)
    print(f'\033[1m {metric}:\033[0m \t', averaged_metric)
print('.' * 150)

pairwise_frugal_score = results['frugal_score']['scores']
averaged_frugal_score = np.mean(pairwise_frugal_score)
print('\033[1m FRUGAL Score:\033[0m \t', averaged_frugal_score)
print('.' * 150)

[1m BLEU Score:[0m 	 3.1655487335449854
......................................................................................................................................................
[1m rouge1:[0m 	 0.24405415489707774
[1m rouge2:[0m 	 0.1074544050386387
[1m rougeL:[0m 	 0.19902962294033172
[1m rougeLsum:[0m 	 0.21559164497381814
......................................................................................................................................................
[1m TER Score:[0m 	 487.4258600237248
[1m TER #Edits:[0m 	 4109
[1m TER Ref. Length:[0m 	 843.0
......................................................................................................................................................
[1m precision:[0m 	 0.8448562376639422
[1m recall:[0m 	 0.8912202593158273
[1m f1:[0m 	 0.8671604289728052
....................................................................................................................................

## Evaluating Falcon Base 7b

In [42]:
# Score the Falcon 7B base predictions (column 'falcon_7b_base_preds') against the references.
results = evaluate_model(references=references, predictions=dataset['falcon_7b_base_preds'])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/34 [00:00<?, ? examples/s]

In [43]:
# Attach the FrugalScore values and the BERTScore F1 values from `results` to the
# dataset as two extra columns, to be used for the spider chart in the paper.
# NOTE(review): this cell extends dataset_for_spider_chart from itself, so running
# it a second time attempts to add the same column names again.

falcon_base_bert_score = results['bert_score']['f1']
falcon_base_frugal_score = results['frugal_score']['scores']
for column_name, column_values in (
    ('falcon_base_frugal_score', falcon_base_frugal_score),
    ('falcon_base_bert_score', falcon_base_bert_score),
):
    dataset_for_spider_chart = dataset_for_spider_chart.add_column(column_name, column_values)

In [44]:
# Print the scores held in `results` for the Falcon 7B base predictions.
print('\033[1m BLEU Score:\033[0m \t', results['bleu']['score'])
print('.' * 150)

for score, value in results['rouge'].items():
    print(f'\033[1m {score}:\033[0m \t', value)
print('.' * 150)

print('\033[1m TER Score:\033[0m \t', results['ter']['score'])
print('\033[1m TER #Edits:\033[0m \t', results['ter']['num_edits'])
print('\033[1m TER Ref. Length:\033[0m \t', results['ter']['ref_length'])
print('.' * 150)

# Name the three BERTScore lists explicitly instead of slicing off the last key
# (the non-numeric 'hashcode' entry in the evaluate BERTScore output).
for metric in ('precision', 'recall', 'f1'):
    pairwise_metric = results['bert_score'][metric]
    averaged_metric = np.mean(pairwise_metric)
    print(f'\033[1m {metric}:\033[0m \t', averaged_metric)
print('.' * 150)

pairwise_frugal_score = results['frugal_score']['scores']
averaged_frugal_score = np.mean(pairwise_frugal_score)
print('\033[1m FRUGAL Score:\033[0m \t', averaged_frugal_score)
print('.' * 150)

[1m BLEU Score:[0m 	 2.263719074338972
......................................................................................................................................................
[1m rouge1:[0m 	 0.2387465003662007
[1m rouge2:[0m 	 0.07281206093697933
[1m rougeL:[0m 	 0.1989111532743889
[1m rougeLsum:[0m 	 0.20628888113524013
......................................................................................................................................................
[1m TER Score:[0m 	 290.98457888493476
[1m TER #Edits:[0m 	 2453
[1m TER Ref. Length:[0m 	 843.0
......................................................................................................................................................
[1m precision:[0m 	 0.8033859992728514
[1m recall:[0m 	 0.8234570236767039
[1m f1:[0m 	 0.8587363923297209
.....................................................................................................................................

## Evaluating Falcon Instruct 7b

In [45]:
# Score the Falcon 7B Instruct predictions (column 'falcon_7b_instruct_preds') against the references.
results = evaluate_model(references=references, predictions=dataset['falcon_7b_instruct_preds'])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/34 [00:00<?, ? examples/s]

In [46]:
# Attach the FrugalScore values and the BERTScore F1 values from `results` to the
# dataset as two extra columns, to be used for the spider chart in the paper.
# NOTE(review): this cell extends dataset_for_spider_chart from itself, so running
# it a second time attempts to add the same column names again.

falcon_bert_score = results['bert_score']['f1']
falcon_frugal_score = results['frugal_score']['scores']
for column_name, column_values in (
    ('falcon_frugal_score', falcon_frugal_score),
    ('falcon_bert_score', falcon_bert_score),
):
    dataset_for_spider_chart = dataset_for_spider_chart.add_column(column_name, column_values)

In [47]:
# Print the scores held in `results` for the Falcon 7B Instruct predictions.
print('\033[1m BLEU Score:\033[0m \t', results['bleu']['score'])
print('.' * 150)

for score, value in results['rouge'].items():
    print(f'\033[1m {score}:\033[0m \t', value)
print('.' * 150)

print('\033[1m TER Score:\033[0m \t', results['ter']['score'])
print('\033[1m TER #Edits:\033[0m \t', results['ter']['num_edits'])
print('\033[1m TER Ref. Length:\033[0m \t', results['ter']['ref_length'])
print('.' * 150)

# Name the three BERTScore lists explicitly instead of slicing off the last key
# (the non-numeric 'hashcode' entry in the evaluate BERTScore output).
for metric in ('precision', 'recall', 'f1'):
    pairwise_metric = results['bert_score'][metric]
    averaged_metric = np.mean(pairwise_metric)
    print(f'\033[1m {metric}:\033[0m \t', averaged_metric)
print('.' * 150)

pairwise_frugal_score = results['frugal_score']['scores']
averaged_frugal_score = np.mean(pairwise_frugal_score)
print('\033[1m FRUGAL Score:\033[0m \t', averaged_frugal_score)
print('.' * 150)

[1m BLEU Score:[0m 	 3.0469204599271507
......................................................................................................................................................
[1m rouge1:[0m 	 0.2837586315727416
[1m rouge2:[0m 	 0.1123260649543652
[1m rougeL:[0m 	 0.2351724241542409
[1m rougeLsum:[0m 	 0.23719027391774433
......................................................................................................................................................
[1m TER Score:[0m 	 303.7959667852906
[1m TER #Edits:[0m 	 2561
[1m TER Ref. Length:[0m 	 843.0
......................................................................................................................................................
[1m precision:[0m 	 0.8550121696556315
[1m recall:[0m 	 0.8839603189159843
[1m f1:[0m 	 0.8689493677195381
......................................................................................................................................

## Evaluating Llama2 7b 

In [48]:
# Score the Llama 2 7B chat predictions (column 'llama2_7b_chat_hf_preds') against the references.
results = evaluate_model(references=references, predictions=dataset['llama2_7b_chat_hf_preds'])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/34 [00:00<?, ? examples/s]

In [49]:
# Attach the FrugalScore values and the BERTScore F1 values from `results` to the
# dataset as two extra columns, to be used for the spider chart in the paper.
# NOTE(review): this cell extends dataset_for_spider_chart from itself, so running
# it a second time attempts to add the same column names again.

llama_bert_score = results['bert_score']['f1']
llama_frugal_score = results['frugal_score']['scores']
for column_name, column_values in (
    ('llama_frugal_score', llama_frugal_score),
    ('llama_bert_score', llama_bert_score),
):
    dataset_for_spider_chart = dataset_for_spider_chart.add_column(column_name, column_values)

In [50]:
# Print the scores held in `results` for the Llama 2 7B chat predictions.
print('\033[1m BLEU Score:\033[0m \t', results['bleu']['score'])
print('.' * 150)

for score, value in results['rouge'].items():
    print(f'\033[1m {score}:\033[0m \t', value)
print('.' * 150)

print('\033[1m TER Score:\033[0m \t', results['ter']['score'])
print('\033[1m TER #Edits:\033[0m \t', results['ter']['num_edits'])
print('\033[1m TER Ref. Length:\033[0m \t', results['ter']['ref_length'])
print('.' * 150)

# Name the three BERTScore lists explicitly instead of slicing off the last key
# (the non-numeric 'hashcode' entry in the evaluate BERTScore output).
for metric in ('precision', 'recall', 'f1'):
    pairwise_metric = results['bert_score'][metric]
    averaged_metric = np.mean(pairwise_metric)
    print(f'\033[1m {metric}:\033[0m \t', averaged_metric)
print('.' * 150)

pairwise_frugal_score = results['frugal_score']['scores']
averaged_frugal_score = np.mean(pairwise_frugal_score)
print('\033[1m FRUGAL Score:\033[0m \t', averaged_frugal_score)
print('.' * 150)

[1m BLEU Score:[0m 	 2.348697647210191
......................................................................................................................................................
[1m rouge1:[0m 	 0.2388015061693396
[1m rouge2:[0m 	 0.0940685502709552
[1m rougeL:[0m 	 0.18948837289602968
[1m rougeLsum:[0m 	 0.1985124547383369
......................................................................................................................................................
[1m TER Score:[0m 	 466.54804270462637
[1m TER #Edits:[0m 	 3933
[1m TER Ref. Length:[0m 	 843.0
......................................................................................................................................................
[1m precision:[0m 	 0.8162222627331229
[1m recall:[0m 	 0.8587226306690889
[1m f1:[0m 	 0.8597163996275734
......................................................................................................................................

In [51]:
# Write the dataset, with the score columns added for the spider chart, to disk.
spider_chart_dir = './dataset_for_spider_chart'
dataset_for_spider_chart.save_to_disk(spider_chart_dir)

Saving the dataset (0/1 shards):   0%|          | 0/34 [00:00<?, ? examples/s]