In [1]:
import evaluate
import datasets
import numpy as np

# Loading the Instruct Dataset

In [2]:
# Read the saved evaluation set (human references plus each model's predictions) from disk.
evaluation_set = datasets.load_from_disk(
    dataset_path='./evaluation_set_for_nlp_metrics/models_prediction_dataset'
)
evaluation_set

Dataset({
    features: ['REQID_ex', 'completion', 'query', 'class', 'task', 'text', 'label', 'mistral_ai_instruct_7b_chat_hf_preds', 'falcon_7b_base_preds', 'falcon_7b_instruct_preds', 'llama2_7b_chat_hf_preds', 'zephyr_7b_beta_preds', 'openai_compe_gpt4o_24_11_20', 'chatgpt4o_frugal_score', 'chatgpt4o_bert_score', 'zephyr_frugal_score', 'zephyr_bert_score', 'mistralai_frugal_score', 'mistralai_bert_score', 'falcon_base_frugal_score', 'falcon_base_bert_score', 'falcon_frugal_score', 'falcon_bert_score', 'llama_frugal_score', 'llama_bert_score'],
    num_rows: 34
})

In [3]:
# Human-written requirements: the reference texts every model's predictions are scored against

references = evaluation_set['completion']

# Utility Functions

In [4]:
# Metric modules already created by evaluate.load(), keyed by the load arguments.
# evaluate_model() is called once per model in this notebook; without this cache
# every call would build both metric modules again from scratch.
_LOADED_METRICS = {}


def _get_metric(*load_args):
    """Return the metric for ``evaluate.load(*load_args)``, creating it only once."""
    if load_args not in _LOADED_METRICS:
        _LOADED_METRICS[load_args] = evaluate.load(*load_args)
    return _LOADED_METRICS[load_args]


def evaluate_model(references, predictions):
    """Score model predictions against reference texts with BERTScore and FrugalScore.

    Args:
        references: Reference texts, one per prediction.
        predictions: Model outputs, aligned index-by-index with ``references``.

    Returns:
        dict with two entries:
            'bert_score': raw result of the ``bertscore`` metric
                (model ``xlm-mlm-en-2048``, ``lang='en'``).
            'frugal_score': raw result of the ``frugalscore`` metric
                (checkpoint ``moussaKam/frugalscore_medium_roberta_bert-score``,
                run on CPU with ``batch_size=2`` and ``max_length=512``).
    """
    # Setup BERT Score
    bertscore = _get_metric('bertscore')
    bertscore_results = bertscore.compute(
        predictions=predictions,
        references=references,
        model_type="xlm-mlm-en-2048",
        lang='en',
    )

    # Setup FRUGAL Score
    frugalscore = _get_metric("frugalscore", "moussaKam/frugalscore_medium_roberta_bert-score")
    frugalscore_results = frugalscore.compute(
        predictions=predictions,
        references=references,
        batch_size=2,
        max_length=512,
        device="cpu",
    )

    return {'bert_score': bertscore_results, 'frugal_score': frugalscore_results}

In [5]:
def display_model_evaluation_results(results):
    """Print the mean of each BERTScore metric and the mean FrugalScore.

    Args:
        results: Dict shaped like the return value of ``evaluate_model``.
            ``results['bert_score']`` maps metric names to lists of pairwise
            scores (and may also carry a ``'hashcode'`` entry, which is not
            printed); ``results['frugal_score']['scores']`` is a list of
            pairwise scores.
    """
    # Display BERT Score
    print('\033[1m BERT Score: \033[0m')
    for metric, pairwise_metric in results['bert_score'].items():
        # 'hashcode' identifies the scorer configuration and is not a score list.
        # Skip it by name: dropping "the last key" would hide a real metric
        # whenever 'hashcode' is missing or not in last position.
        if metric == 'hashcode':
            continue
        print(f'   {metric}:   ', np.mean(pairwise_metric))

    # Display FRUGAL
    print('\033[1m FRUGAL Score: \033[0m')
    print('   Score:   ', np.mean(results['frugal_score']['scores']))

# Evaluating ChatGPT-4o

In [6]:
# Score the GPT-4o predictions against the human-written references.
chat_gpt_4o_results = evaluate_model(
    references=references,
    predictions=evaluation_set['openai_compe_gpt4o_24_11_20'],
)

  return self.fget.__get__(instance, owner)()


Map:   0%|          | 0/34 [00:00<?, ? examples/s]

  trainer = Trainer(self.model, training_args, tokenizer=self.tokenizer)


In [7]:
# Print the averaged metrics for GPT-4o.
display_model_evaluation_results(results=chat_gpt_4o_results)

[1m BERT Score: [0m
   precision:    0.8187058673185461
   recall:    0.883557461640414
   f1:    0.8497847266056958
[1m FRUGAL Score: [0m
   Score:    0.865664145525764


In [8]:
# # Writing model results in new columns for paper visualization (kept commented out: the dataset loaded above already contains these two columns)

# evaluation_set = evaluation_set.add_column('chatgpt4o_frugal_score', chat_gpt_4o_results['frugal_score']['scores'])
# evaluation_set = evaluation_set.add_column('chatgpt4o_bert_score', chat_gpt_4o_results['bert_score']['f1'])

# Evaluating ReqBrain-zephyr-7b-beta

In [9]:
# Score the zephyr-7b-beta predictions against the human-written references.
zephyr_7b_beta_results = evaluate_model(
    references=references,
    predictions=evaluation_set['zephyr_7b_beta_preds'],
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/34 [00:00<?, ? examples/s]

  trainer = Trainer(self.model, training_args, tokenizer=self.tokenizer)


In [10]:
# Print the averaged metrics for zephyr-7b-beta.
display_model_evaluation_results(results=zephyr_7b_beta_results)

[1m BERT Score: [0m
   precision:    0.8904935454621035
   recall:    0.8960548902259153
   f1:    0.8930980773533091
[1m FRUGAL Score: [0m
   Score:    0.9120348762063419


In [11]:
# # Writing model results in new columns for paper visualization (kept commented out: the dataset loaded above already contains these two columns)

# evaluation_set = evaluation_set.add_column('zephyr_frugal_score', zephyr_7b_beta_results['frugal_score']['scores'])
# evaluation_set = evaluation_set.add_column('zephyr_bert_score', zephyr_7b_beta_results['bert_score']['f1'])

# Evaluating ReqBrain-Mistral-7B-Instruct-v0.2

In [12]:
# Score the Mistral-7B-Instruct predictions against the human-written references.
mistralai_7b_instruct_results = evaluate_model(
    references=references,
    predictions=evaluation_set['mistral_ai_instruct_7b_chat_hf_preds'],
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/34 [00:00<?, ? examples/s]

In [13]:
# Print the averaged metrics for Mistral-7B-Instruct.
display_model_evaluation_results(results=mistralai_7b_instruct_results)

[1m BERT Score: [0m
   precision:    0.8448562376639422
   recall:    0.8912202593158273
   f1:    0.8671604289728052
[1m FRUGAL Score: [0m
   Score:    0.888142417458927


In [14]:
# # Writing model results in new columns for paper visualization (kept commented out: the dataset loaded above already contains these two columns)

# evaluation_set = evaluation_set.add_column('mistralai_frugal_score', mistralai_7b_instruct_results['frugal_score']['scores'])
# evaluation_set = evaluation_set.add_column('mistralai_bert_score', mistralai_7b_instruct_results['bert_score']['f1'])

# Evaluating ReqBrain-falcon-7b

In [15]:
# Score the falcon-7b (base) predictions against the human-written references.
falcon_7b_results = evaluate_model(
    references=references,
    predictions=evaluation_set['falcon_7b_base_preds'],
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/34 [00:00<?, ? examples/s]

In [16]:
# Print the averaged metrics for falcon-7b (base).
display_model_evaluation_results(results=falcon_7b_results)

[1m BERT Score: [0m
   precision:    0.8033859992728514
   recall:    0.8234570236767039
   f1:    0.8587363923297209
[1m FRUGAL Score: [0m
   Score:    0.8855502184699563


In [17]:
# # Writing model results in new columns for paper visualization (kept commented out: the dataset loaded above already contains these two columns)

# evaluation_set = evaluation_set.add_column('falcon_base_frugal_score', falcon_7b_results['frugal_score']['scores'])
# evaluation_set = evaluation_set.add_column('falcon_base_bert_score', falcon_7b_results['bert_score']['f1'])

# Evaluating ReqBrain-falcon-7b-instruct

In [18]:
# Score the falcon-7b-instruct predictions against the human-written references.
falcon_7b_instruct_results = evaluate_model(
    references=references,
    predictions=evaluation_set['falcon_7b_instruct_preds'],
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/34 [00:00<?, ? examples/s]

In [19]:
# Print the averaged metrics for falcon-7b-instruct.
display_model_evaluation_results(results=falcon_7b_instruct_results)

[1m BERT Score: [0m
   precision:    0.8550121696556315
   recall:    0.8839603189159843
   f1:    0.8689493677195381
[1m FRUGAL Score: [0m
   Score:    0.8859289393705481


In [20]:
# # Writing model results in new columns for paper visualization (kept commented out: the dataset loaded above already contains these two columns)

# evaluation_set = evaluation_set.add_column('falcon_frugal_score', falcon_7b_instruct_results['frugal_score']['scores'])
# evaluation_set = evaluation_set.add_column('falcon_bert_score', falcon_7b_instruct_results['bert_score']['f1'])

# Evaluating ReqBrain-Llama-2-7b-chat-hf

In [21]:
# Score the Llama-2-7b-chat-hf predictions against the human-written references.
llama_2_7b_chat_hf_results = evaluate_model(
    references=references,
    predictions=evaluation_set['llama2_7b_chat_hf_preds'],
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/34 [00:00<?, ? examples/s]

In [22]:
# Print the averaged metrics for Llama-2-7b-chat-hf.
display_model_evaluation_results(results=llama_2_7b_chat_hf_results)

[1m BERT Score: [0m
   precision:    0.8162222627331229
   recall:    0.8587226306690889
   f1:    0.8597163996275734
[1m FRUGAL Score: [0m
   Score:    0.8812636768116671


In [23]:
# # Writing model results in new columns for paper visualization (kept commented out: the dataset loaded above already contains these two columns)

# evaluation_set = evaluation_set.add_column('llama_frugal_score', llama_2_7b_chat_hf_results['frugal_score']['scores'])
# evaluation_set = evaluation_set.add_column('llama_bert_score', llama_2_7b_chat_hf_results['bert_score']['f1'])

In [24]:
# # Saving the dataset with the extra per-model score columns used for the spider chart (kept commented out: the dataset on disk already contains them)

# evaluation_set.save_to_disk('./evaluation_set_for_nlp_metrics/models_prediction_dataset')