In [66]:
# prompt: i have dataframe true and predicted texts, i need to apply blue, meteor and rouge scores
%%capture
!pip install nltk rouge-score sacrebleu

import nltk
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU
from nltk.translate.bleu_score import sentence_bleu
nltk.download('punkt')
nltk.download('wordnet')


# Example usage:
# Assuming you have a DataFrame called 'df' with 'true_text' and 'predicted_text' columns
# df = calculate_scores(df)
# print(df)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [73]:
from nltk.translate.bleu_score import sentence_bleu
def calculate_scores(df , true , pred):
  """Calculates BLEU, METEOR, and ROUGE scores for a dataframe.

  Args:
    df: A Pandas DataFrame containing 'true_text' and 'predicted_text' columns.

  Returns:
    A new DataFrame with BLEU, METEOR, and ROUGE scores appended.
  """

  bleu_scores = []
  meteor_scores = []
  rouge_scores = []
  bleu1_scores = []
  bleu2_scores = []
  bleu3_scores = []
  bleu4_scores = []

  scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

  for _, row in df.iterrows():
    true_text = row[true]
    predicted_text = row[pred]

    # reference = [['this', 'is', 'very, 'small', 'test']]
    # candidate = ['this', 'is', 'a', 'test']
    # sentence_bleu(reference, candidate)
    reference = [true_text.split()]
    candidate = predicted_text.split()
    bleu1 =  sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
    bleu2 = sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0))
    bleu3 = sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0))
    bleu4 = sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25))
    # bleu = sentence_bleu( [true_text.split()] , predicted_text.split() )
    bleu1_scores.append(bleu1)
    bleu2_scores.append(bleu2)
    bleu3_scores.append(bleu3)
    bleu4_scores.append(bleu4)

    # METEOR Score
    # print (nltk.translate.meteor_score.meteor_score(
    # ["this is an apple", "that is an apple"], "an apple on this tree"))
    # try:
    meteor_score = nltk.translate.meteor_score.single_meteor_score(true_text.split() , predicted_text.split() )
    meteor_scores.append(meteor_score)
    # except:
    #   print(32)
    #   meteor_scores.append(0)  # Handle potential errors


    # ROUGE Score
    rouge = scorer.score(true_text, predicted_text)
    rouge_scores.append(rouge)

  all_scored = {}
  all_scored['bleu1'] = bleu1_scores
  all_scored['bleu2'] = bleu2_scores
  all_scored['bleu3'] = bleu3_scores
  all_scored['bleu4'] = bleu4_scores
  all_scored['meteor'] = meteor_scores
  all_scored['rouge1'] = [score['rouge1'].fmeasure for score in rouge_scores]
  all_scored['rouge2'] = [score['rouge2'].fmeasure for score in rouge_scores]
  all_scored['rougeL'] = [score['rougeL'].fmeasure for score in rouge_scores]

  return all_scored

In [64]:
import pandas as pd
df = pd.read_csv('/content/llama_dpo_sft_samps300_urdu2eng.csv')
df.columns

Index(['trans', 'orginal_eng', 'Urdu', 'text', 'token_count',
       'untrianed_translation', 'trianed_translation', 'prompt', 'chosen',
       'rejected', 'dpo+sft_trans', 'dpo_sft_trans'],
      dtype='object')

In [86]:
all_scores = calculate_scores(df.tail(25) , 'orginal_eng' , 'trianed_translation')

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [85]:
# prompt: get last 25 rows of df, write a simple single line code

df.tail(25).shape

(25, 12)

In [87]:
len(all_scores['bleu2'])

25

In [88]:
import numpy as np
scores ={}
for key, value in all_scores.items():
    scores[key] = np.array(value).mean()




In [89]:
scores

{'bleu1': 0.2537656352702947,
 'bleu2': 0.0904179191702786,
 'bleu3': 0.0180240658981879,
 'bleu4': 0.003128094415461956,
 'meteor': 0.18298543641873904,
 'rouge1': 0.30372905380613135,
 'rouge2': 0.04523623316803908,
 'rougeL': 0.2005644198596401}

In [74]:
all_scores_dpo = calculate_scores(df, 'orginal_eng' , 'dpo_sft_trans')

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [77]:
scores_dpo ={}
for key, value in all_scores_dpo.items():
    scores_dpo[key] = np.array(value).mean()
scores_dpo , scores

({'bleu1': 0.25597558397333864,
  'bleu2': 0.1026252416733309,
  'bleu3': 0.03022041424994214,
  'bleu4': 0.01046593389246158,
  'meteor': 0.20991123979160528,
  'rouge1': 0.33529334500393526,
  'rouge2': 0.05451672941720152,
  'rougeL': 0.22437770225528167},
 {'bleu1': 0.2609844261461966,
  'bleu2': 0.10383926012482969,
  'bleu3': 0.028501688505826524,
  'bleu4': 0.010048530912659852,
  'meteor': 0.21083288725293767,
  'rouge1': 0.33664732938454867,
  'rouge2': 0.05701932631641834,
  'rougeL': 0.2225822267201796})

In [90]:
all_scores_gpt = calculate_scores(df.tail(25), 'orginal_eng' , 'trans')
scores_gpt ={}
for key, value in all_scores_gpt.items():
    scores_gpt[key] = np.array(value).mean()

scores_gpt , scores

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


({'bleu1': 0.2523791496767713,
  'bleu2': 0.09666527898920892,
  'bleu3': 0.032711989350160134,
  'bleu4': 0.006811041608997601,
  'meteor': 0.19673154688792718,
  'rouge1': 0.4068814969964297,
  'rouge2': 0.08161020613354937,
  'rougeL': 0.26654801825569724},
 {'bleu1': 0.2537656352702947,
  'bleu2': 0.0904179191702786,
  'bleu3': 0.0180240658981879,
  'bleu4': 0.003128094415461956,
  'meteor': 0.18298543641873904,
  'rouge1': 0.30372905380613135,
  'rouge2': 0.04523623316803908,
  'rougeL': 0.2005644198596401})

In [91]:
all_scores_ut = calculate_scores(df, 'orginal_eng' , 'untrianed_translation')
scores_ut ={}
for key, value in all_scores_ut.items():
    scores_ut[key] = np.array(value).mean()

scores_ut

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


{'bleu1': 0.1799328013890531,
 'bleu2': 0.042656479100743175,
 'bleu3': 0.006166470870192632,
 'bleu4': 0.001272398789375698,
 'meteor': 0.1439213395000858,
 'rouge1': 0.2951635731077196,
 'rouge2': 0.03502346122931948,
 'rougeL': 0.1822686491789083}