# Evaluation of Answer Generation Performance with BERTScore



In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install -q bert_score
!pip install -q faiss-gpu
!pip install -q langchain
!pip install -q sentence_transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━

In [2]:
import os
import pandas as pd
import numpy as np

from datasets import Dataset

In [23]:
import json
from datasets import load_metric

def load_dataset_from_csv(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def bertscore_(results_file_path, output_file_path, csv_file_path, bertscore,
               lang="en", model_type='microsoft/deberta-xlarge-mnli'):
    """Score generated answers against reference answers and save the result.

    Args:
        results_file_path: Path to a JSON file with the predictions. Its
            content is handed to the metric unchanged (presumably a list of
            answer strings in the same row order as the CSV -- TODO confirm).
        output_file_path: Path of the JSON file the metric output is written to.
        csv_file_path: Path to a CSV file whose ``Answer`` column holds the
            reference answers.
        bertscore: Metric object exposing
            ``compute(predictions=..., references=..., lang=..., model_type=...)``.
        lang: Language code forwarded to ``bertscore.compute``.
        model_type: Scoring model name forwarded to ``bertscore.compute``.

    Raises:
        ValueError: If the number of predictions differs from the number of
            reference answers.
    """
    # JSON is read and written as UTF-8 explicitly instead of relying on the
    # platform's locale encoding.
    with open(results_file_path, 'r', encoding='utf-8') as file:
        predictions = json.load(file)

    reference_df = load_dataset_from_csv(csv_file_path)
    ref_ans = reference_df['Answer'].tolist()
    # NOTE(review): empty CSV cells are read by pandas as NaN, not as strings;
    # confirm the Answer column has no missing values.

    # Fail early with a clear message, before the metric is invoked.
    if len(predictions) != len(ref_ans):
        raise ValueError(
            f"Number of predictions ({len(predictions)}) in {results_file_path} "
            f"does not match number of references ({len(ref_ans)}) in {csv_file_path}"
        )

    results = bertscore.compute(predictions=predictions, references=ref_ans, lang=lang, model_type=model_type)

    with open(output_file_path, 'w', encoding='utf-8') as file:
        json.dump(results, file)

    # Nothing is returned on purpose: the calling cells end with this call and
    # would otherwise echo the full score dictionary.
    print(f"Results saved to {output_file_path}")



In [21]:
# Path to the CSV with the reference answers; passed to bertscore_ as csv_file_path.
test_dataset = "/content/Test_dataset.csv"

# Loading this metric runs a loading script fetched from the Hub. Opt in
# explicitly: `datasets` warns that `trust_remote_code=True` will become
# mandatory for this call in its next major release.
bertscore = load_metric("bertscore", trust_remote_code=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [25]:
# Score the answers stored in Answer_HyDE_with_reranker.json against the reference CSV.
results_file_path = '/content/Answer_HyDE_with_reranker.json'
output_file_path = '/content/Bertscore_Answer_HyDE_with_reranker.json'

bertscore_(
    results_file_path=results_file_path,
    output_file_path=output_file_path,
    csv_file_path=test_dataset,
    bertscore=bertscore,
)


Results saved to /content/Bertscore_Answer_HyDE_with_reranker.json


In [26]:
# Score the answers stored in Answer_QA_RAG.json against the reference CSV.
results_file_path = '/content/Answer_QA_RAG.json'
output_file_path = '/content/Bertscore_Answer_QA_RAG.json'

bertscore_(
    results_file_path=results_file_path,
    output_file_path=output_file_path,
    csv_file_path=test_dataset,
    bertscore=bertscore,
)

Results saved to /content/Bertscore_Answer_QA_RAG.json


In [27]:
# Score the answers stored in Answer_Only_question.json against the reference CSV.
results_file_path = '/content/Answer_Only_question.json'
output_file_path = '/content/Bertscore_Answer_Only_question.json'

bertscore_(
    results_file_path=results_file_path,
    output_file_path=output_file_path,
    csv_file_path=test_dataset,
    bertscore=bertscore,
)

Results saved to /content/Bertscore_Answer_Only_question.json


In [28]:
# Score the answers stored in Answer_Multiquery_questions.json against the reference CSV.
results_file_path = '/content/Answer_Multiquery_questions.json'
output_file_path = '/content/Bertscore_Answer_Multiquery_questions.json'

bertscore_(
    results_file_path=results_file_path,
    output_file_path=output_file_path,
    csv_file_path=test_dataset,
    bertscore=bertscore,
)

Results saved to /content/Bertscore_Answer_Multiquery_questions.json


In [29]:
# Score the answers stored in Answer_Only_hypothetical_answer.json against the reference CSV.
results_file_path = '/content/Answer_Only_hypothetical_answer.json'
output_file_path = '/content/Bertscore_Answer_Only_hypothetical_answer.json'

bertscore_(
    results_file_path=results_file_path,
    output_file_path=output_file_path,
    csv_file_path=test_dataset,
    bertscore=bertscore,
)

Results saved to /content/Bertscore_Answer_Only_hypothetical_answer.json


#### Analyze

In [30]:
import matplotlib.pyplot as plt
import seaborn as sns

def load_json(filename):
    """Parse the JSON file at ``filename`` and return the decoded object.

    The file is opened as UTF-8 explicitly, so the result does not depend on
    the platform's default locale encoding.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)

# Read back the BERTScore output saved for each answer-generation variant.
Bertscore_Answer_HyDE_with_reranker = load_json(
    '/content/Bertscore_Answer_HyDE_with_reranker.json'
)
Bertscore_Answer_QA_RAG = load_json(
    '/content/Bertscore_Answer_QA_RAG.json'
)
Bertscore_Answer_Only_question = load_json(
    '/content/Bertscore_Answer_Only_question.json'
)
Bertscore_Answer_Multiquery_questions = load_json(
    '/content/Bertscore_Answer_Multiquery_questions.json'
)
Bertscore_Answer_Only_hypothetical_answer = load_json(
    '/content/Bertscore_Answer_Only_hypothetical_answer.json'
)

In [31]:
def calculate_statistics(data):
    """Average each BERTScore component.

    Args:
        data: Mapping with 'precision', 'recall' and 'f1' entries, each
            holding values accepted by ``np.mean``. Other keys are ignored.

    Returns:
        dict mapping 'precision', 'recall' and 'f1' to the ``np.mean`` of the
        corresponding entry.
    """
    return {
        metric: np.mean(data[metric])
        for metric in ('precision', 'recall', 'f1')
    }

# Mean precision / recall / F1 for every answer-generation variant.
Bertscore_Answer_HyDE_with_reranker_stats = calculate_statistics(
    Bertscore_Answer_HyDE_with_reranker
)
Bertscore_Answer_QA_RAG_stats = calculate_statistics(
    Bertscore_Answer_QA_RAG
)
Bertscore_Answer_Only_question_stats = calculate_statistics(
    Bertscore_Answer_Only_question
)
Bertscore_Answer_Multiquery_questions_stats = calculate_statistics(
    Bertscore_Answer_Multiquery_questions
)
Bertscore_Answer_Only_hypothetical_answer_stats = calculate_statistics(
    Bertscore_Answer_Only_hypothetical_answer
)

# One column per variant, one row per metric.
stats_df = pd.DataFrame(
    {
        'QA_RAG': Bertscore_Answer_QA_RAG_stats,
        'Multiquery_questions': Bertscore_Answer_Multiquery_questions_stats,
        'HyDE_with_reranker': Bertscore_Answer_HyDE_with_reranker_stats,
        'Only_question': Bertscore_Answer_Only_question_stats,
        'Only_hypothetical_answer': Bertscore_Answer_Only_hypothetical_answer_stats,
    }
)

# Displayed transposed: one row per variant, one column per metric.
stats_df.T

Unnamed: 0,precision,recall,f1
QA_RAG,0.550603,0.64538,0.591428
Multiquery_questions,0.532219,0.629463,0.572945
HyDE_with_reranker,0.539636,0.64139,0.582352
Only_question,0.540208,0.635625,0.580522
Only_hypothetical_answer,0.538593,0.642369,0.58268
