In [None]:
import pandas as pd
# Read the two tab-separated inputs: the test set (questions and reference
# answers) and the per-model generated answers.
test_df = pd.read_csv("test_dataset.csv", sep="\t")
res_df = pd.read_csv("result_df.csv", sep="\t")

In [None]:
# Display title of each model -> column holding its answers in res_df.
# The insertion order fixes the column order of answers_df.
_model_columns = {
    'Vicuna-7b': 'vicuna-7',
    'Llama2-13b': 'llama2-13',
    'Mistral-7b': 'mistral-7b',
    'Vicuna-13b': 'vicuna-13b',
    'Vicuna-33b': 'vicuna-33b',
    'Zephyr-7b': 'zephyr-7b',
}

# One row per question: the question, its reference answer, then one column
# per model with that model's generated answer.
answers_df = pd.DataFrame({
    'Questions': test_df['Question'],
    'True answers': test_df['Answers'],
    **{title: res_df[source] for title, source in _model_columns.items()},
})

# Calculate ROUGE, SacreBLEU, Perplexity, Toxicity

In [None]:
from evaluate import load

# Instantiate every `evaluate` module once; calculate_metrics reuses these
# module-level objects for each model.
sacrebleu = load("sacrebleu")
rouge = load("rouge")
perplexity = load("perplexity", module_type="metric")
toxicity = load("toxicity", module_type="measurement")

In [None]:
def calculate_metrics(predictions, references):
  """Score one model's predictions with the four loaded evaluation modules.

  Args:
    predictions: generated answers (callers in this notebook pass a column
      of res_df).
    references: reference answers aligned with `predictions` (callers pass
      test_df['Answers']).

  Returns:
    A 4-tuple `(sacrebleu, rouge, toxicity, perplexity)` containing the raw
    result of each module's `compute` call, in that order.
  """
  # SacreBLEU expects a list of references per prediction, so wrap each one.
  wrapped_references = [[reference] for reference in references]
  bleu_scores = sacrebleu.compute(predictions=predictions, references=wrapped_references)
  rouge_scores = rouge.compute(predictions=predictions, references=references)
  # Toxicity is aggregated as a ratio over all predictions.
  toxicity_scores = toxicity.compute(predictions=predictions, aggregation="ratio")
  # Perplexity is measured with GPT-2 as the scoring model.
  perplexity_scores = perplexity.compute(model_id='gpt2', predictions=predictions)

  return bleu_scores, rouge_scores, toxicity_scores, perplexity_scores

In [None]:
# Llama2-13b: score the 'llama2-13' answers against the reference answers.
llama2_res = calculate_metrics(res_df['llama2-13'], test_df['Answers'])

In [None]:
# Vicuna-7b: score the 'vicuna-7' answers against the reference answers.
vicuna7_res = calculate_metrics(res_df['vicuna-7'], test_df['Answers'])

In [None]:
# Vicuna-13b: score the 'vicuna-13b' answers against the reference answers.
vicuna_13b_res = calculate_metrics(res_df['vicuna-13b'], test_df['Answers'])

In [None]:
# Vicuna-33b: score the 'vicuna-33b' answers against the reference answers.
vicuna_33b_res = calculate_metrics(res_df['vicuna-33b'], test_df['Answers'])

In [None]:
# Mistral-7b: score the 'mistral-7b' answers against the reference answers.
mistral_7b_res = calculate_metrics(res_df['mistral-7b'], test_df['Answers'])

In [None]:
# Zephyr-7b: score the 'zephyr-7b' answers against the reference answers.
zephyr_7b_res = calculate_metrics(res_df['zephyr-7b'], test_df['Answers'])

In [None]:
# Collect the per-model metric tuples into one table and save it.
# Keeping each label next to its result in a single mapping guarantees that
# the index has exactly one name per row (a missing comma between two adjacent
# string literals silently concatenates them and yields too few labels).
results_by_model = {
    'vicuna_7b': vicuna7_res,
    'llama2_13b': llama2_res,
    'vicuna_13b': vicuna_13b_res,
    'vicuna_33b': vicuna_33b_res,
    'mistral_7b': mistral_7b_res,
    'zephyr_7b': zephyr_7b_res,
}
rows = list(results_by_model.values())
# Column order mirrors the tuple returned by calculate_metrics.
column_names = ['sacrebleu', 'rouge', 'toxic', 'perplexity']
row_names = list(results_by_model)
df_res = pd.DataFrame(rows, columns=column_names, index=row_names)
df_res.to_csv('metrics.csv', index=True)

# Calculate Cosine Similarity

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Sentence-embedding model ('all-MiniLM-L6-v2') used to encode the questions
# and answers for the cosine-similarity analysis.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Columns to compare with the question: everything after 'Questions', except
# the 'Similarity with ...' columns this cell itself adds. Filtering them out
# keeps the cell idempotent: re-running it would otherwise try to embed the
# float scores and create 'Similarity with Similarity with ...' columns.
similarity_prefix = 'Similarity with '
answer_columns = [
    name for name in answers_df.columns[1:]
    if not name.startswith(similarity_prefix)
]

# Encode every distinct text exactly once and look embeddings up by text.
text_data = pd.concat([answers_df['Questions'], answers_df[answer_columns].stack()]).unique()
text_embeddings = model.encode(text_data)
embedding_dict = dict(zip(text_data, text_embeddings))

# NOTE(review): each score compares the *question* with the text in `column`
# (reference answer or model answer), not the reference answer with the model
# answer -- confirm this is the intended comparison.
for column in answer_columns:
    similarities = [
        cosine_similarity([embedding_dict[question]], [embedding_dict[answer]])[0][0]
        for question, answer in zip(answers_df['Questions'], answers_df[column])
    ]
    answers_df[similarity_prefix + column] = similarities

In [None]:
# Keep only the similarity columns of the six models (the reference-answer
# similarity column is left out), in the order used by the plots.
_plotted_models = ['Vicuna-7b', 'Llama2-13b', 'Mistral-7b', 'Vicuna-13b', 'Vicuna-33b', 'Zephyr-7b']
cos_sim = answers_df[['Similarity with ' + title for title in _plotted_models]]

In [None]:
#Plot HeatMap

import plotly.graph_objects as go

# Column names look like 'Similarity with <model>'. Strip the whole prefix
# (16 characters including the trailing space) so tick labels carry no
# leading blank.
label_prefix = 'Similarity with '

fig = go.Figure(data=go.Heatmap(
    z=cos_sim,
    x=[el[len(label_prefix):] for el in cos_sim.columns],
    # One label per row: questions are numbered 1..len(cos_sim) inclusive.
    y=list(range(1, len(cos_sim) + 1)),
    colorscale='ice',
    zmin=0, zmax=1,
    colorbar_title='Cosine Similarity'
))
fig.update_layout(
    # The plotted scores are computed between each question and each model's
    # answer, so the title states exactly that.
    # NOTE(review): if reference-answer vs. prediction similarity was intended,
    # the similarity computation has to change, not just this title.
    title='Cosine Similarities between Questions and Model Answers',
    xaxis_title='LLMs',
    xaxis_tickangle=45,
    yaxis_title='Question Number',
    width=600,
    height=700,
    plot_bgcolor='white'
)
fig.show()

In [None]:
# Plot BoxPlots

# `px` supplies the colour palette. Import it here so the cell also works on a
# fresh kernel (Restart & Run All); otherwise it is only imported further down.
import plotly.express as px

fig = go.Figure()
colors = px.colors.qualitative.Plotly
# Column names look like 'Similarity with <model>'; strip the full prefix
# (including its trailing space) to get the bare model title.
label_prefix = 'Similarity with '

# One box per model, with all individual points drawn beside the box.
for i, col in enumerate(cos_sim.columns):
    fig.add_trace(go.Box(y=cos_sim[col],
                         name=col[len(label_prefix):],
                         boxpoints='all',
                         jitter=0.3,
                         pointpos=-1.8,
                         # Cycle through the palette if there are more models than colours.
                         marker=dict(color=colors[i % len(colors)], opacity=0.5)))
fig.update_layout(
    title='Boxplot of Cosine Similarities for Each Model',
    xaxis_title='Models',
    yaxis_title='Cosine Similarity',
    xaxis=dict(tickangle=45),
    width=1000,
    height=600
)
fig.show()

# LLM-as-a-Judge

In [None]:
# !pip install huggingface_hub datasets pandas tqdm -q

In [None]:
import re
import pandas as pd
from tqdm.auto import tqdm
from huggingface_hub import InferenceClient, notebook_login

# Register tqdm's pandas integration so DataFrame.progress_apply is available.
tqdm.pandas()
# Lift the column-width limit so long text cells are displayed in full.
pd.set_option("display.max_colwidth", None)

# Log this notebook session in to the Hugging Face Hub before the inference
# client is created.
notebook_login()

In [None]:
# Model repository used as the judge, and the inference client bound to it
# (timeout of 120 is passed straight to InferenceClient).
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
llm_client = InferenceClient(model=repo_id, timeout=120)

In [None]:
# Prompt template for the LLM judge. It is filled with str.format and expects
# three placeholders: {question}, {true_answer} and {alternative_answers}.
# The titles listed in the second sentence must stay in the same order as the
# answers passed in `alternative_answers`, because the judge maps answers to
# titles by position. The template ends with "Evaluation: " so that the
# model's completion starts directly with its rationale.
# NOTE(review): the text contains typos ("meantioned", "Alternarives" without a
# colon). They are part of the string sent to the model, so fixing them changes
# the judge input -- decide deliberately and re-run the judging if you do.
JUDGE_PROMPT = """
You will be given a set of user_question, true_answer and 6 alternative answers (alternative_answers) to this question.
The titles of these six answers in input order are: Llama2-13b, Mistral-7b, Vicuna-7b, Vicuna-13b, Vicuna-33b, Zephyr-7b, use these titles later
Your task is to evaluate and rank these six alternative answers for each entry. The ranking should be from best to worst based on how closely each answer matches the true answer in terms of accuracy, relevance, and completeness.

To accomplish this, you should:

Analyze the true answer to understand the core information and context.
Compare each generated answer against the true answer, assessing factors such as correctness, detail, and how well it addresses the user's question.
Rank the answers from 1 to 6, where 1 is the best answer that most accurately and completely reflects the true answer, and 6 is the least accurate and complete, like a list: 1st place - Mistral-7b, etc., use the titles of those answers that I have meantioned before.
Your output for each entry should clearly indicate the ranking of the generated answers.

Provide your feedback as follows:

Feedback:::
Evaluation: (your rationale for the ranking, as a text)
Total ranking: (your ranking, as a list of titles of answers from best to worst)

You MUST provide values for 'Evaluation:' and 'Total ranking:' in your answer.

Now here are the question, true answer, and alternative answers.

Question: {question}
Answer: {true_answer}
Alternarives {alternative_answers}

Provide your feedback.
Feedback:::
Evaluation: """

In [None]:
def _judge_row(row):
    """Ask the judge model to rank the six model answers of one answers_df row.

    Returns the raw text generated by the judge for that row.
    """
    # Order must match the list of titles given inside JUDGE_PROMPT.
    candidate_answers = (
        row["Llama2-13b"],
        row["Mistral-7b"],
        row["Vicuna-7b"],
        row["Vicuna-13b"],
        row["Vicuna-33b"],
        row["Zephyr-7b"],
    )
    judge_prompt = JUDGE_PROMPT.format(
        question=row["Questions"],
        true_answer=row["True answers"],
        alternative_answers=candidate_answers,
    )
    return llm_client.text_generation(prompt=judge_prompt, max_new_tokens=500)


# One judge call per row, with a tqdm progress bar.
answers_df["llm_judge_impr"] = answers_df.progress_apply(_judge_row, axis=1)

In [None]:
# Persist each question together with the judge's raw verdict (tab-separated,
# index included).
answers_df[["Questions", "llm_judge_impr"]].to_csv('judge.csv', sep='\t')

In [None]:
# Titles of the answers the judge ranked first, one entry per question.
first_place = []

def extract_judge_score(text, current, next_delimiter=None):
    """Return the text that follows the marker `current` in a judge response.

    Args:
        text: raw judge output to search.
        current: marker that precedes the wanted value (e.g. "1st place - ").
        next_delimiter: marker that ends the wanted value; when omitted (or
            empty) the value ends at the next newline.

    Returns:
        The stripped text between `current` and the end marker. If the end
        marker does not occur, everything up to the next occurrence of
        `current` (or the end of the text) is returned. If `current` does not
        occur at all -- the judge ignored the requested format -- an empty
        string is returned instead of raising IndexError, so one malformed
        response does not abort processing of all the others.
    """
    start_split = text.split(current)
    if len(start_split) < 2:
        # Marker not present: nothing to extract.
        return ""
    end_marker = next_delimiter if next_delimiter else "\n"
    return start_split[1].split(end_marker)[0].strip()

# For every judge response keep what stands between the "1st place" marker and
# the "2nd place" marker, i.e. the title ranked first.
delimiters = ["1st place - ", "2nd place - "]
first_marker, second_marker = delimiters
first_place.extend(
    extract_judge_score(judgement, first_marker, second_marker)
    for judgement in answers_df["llm_judge_impr"]
)

In [None]:
# Plot Pie Chart

import plotly.graph_objects as go
import plotly.express as px

from collections import Counter

# Normalise the extracted titles: trim whitespace and a trailing comma.
first_place = [title.strip().rstrip(',') for title in first_place]

# Count how often each title was ranked first. Counter keeps first-seen order,
# so labels and sizes line up exactly as with a hand-written counting loop.
# No top-level loop variable named `model` is used here: it would overwrite the
# SentenceTransformer instance of the same name and break a later re-run of
# the embedding cell.
model_counts = Counter(first_place)
labels = list(model_counts.keys())
sizes = list(model_counts.values())

fig = go.Figure(data=[go.Pie(labels=labels, values=sizes,
                             textinfo='label+percent',
                             insidetextorientation='radial')])

fig.update_layout(
    title_text='LLM-as-a-judge top 1',
    showlegend=True,
    width=600,
    height=600
)
fig.show()