In [1]:
import csv
import os
import pickle

import pandas as pd
from datasets import load_dataset

In [2]:
# Paths of the pickled input/output pairs, one per model-size suffix.
file_paths = [
    f"input_output_pairs/input_output_pairs_{size_tag}"
    for size_tag in ("1b", "3b", "8b")
]

In [3]:
# Unpickle the three files in the order they appear in `file_paths`.
# pickle.load can execute arbitrary code, so only point this at trusted files.
loaded_outputs = []
for pickle_path in (file_paths[0], file_paths[1], file_paths[2]):
    with open(pickle_path, 'rb') as f:
        loaded_outputs.append(pickle.load(f))

outputs_1b, outputs_3b, outputs_8b = loaded_outputs

In [4]:
# Report how many wmt14 entries each model's outputs contain.
# The 8b label previously lacked its closing quote (['wmt14]); all three
# labels are now produced from one template so they cannot drift apart.
for size_tag, per_model_outputs in (("1b", outputs_1b), ("3b", outputs_3b), ("8b", outputs_8b)):
    print(f"Length of outputs_{size_tag}['wmt14']: {len(per_model_outputs['wmt14'])}")

Length of outputs_1b['wmt14']: 5000
Length of outputs_3b['wmt14']: 5000
Length of outputs_8b['wmt14]: 5000


In [5]:
# Pull the wmt14 input_text of every entry, separately for each model.
input_texts_1b, input_texts_3b, input_texts_8b = (
    [record['input_text'] for record in model_outputs['wmt14']]
    for model_outputs in (outputs_1b, outputs_3b, outputs_8b)
)

# Count the positions at which all three models were given the same input.
identical_input_text_count = sum(
    1
    for text_1b, text_3b, text_8b in zip(input_texts_1b, input_texts_3b, input_texts_8b)
    if text_1b == text_3b == text_8b
)

print(f"Number of identical input_text entries across all three: {identical_input_text_count}")

Number of identical input_text entries across all three: 5000


In [9]:
# Report how many cnn_dailymail entries each model's outputs contain.
for size_tag, per_model_outputs in (("1b", outputs_1b), ("3b", outputs_3b), ("8b", outputs_8b)):
    print(f"Length of outputs_{size_tag}['cnn_dailymail']: {len(per_model_outputs['cnn_dailymail'])}")

Length of outputs_1b['cnn_dailymail']: 5000
Length of outputs_3b['cnn_dailymail']: 5000
Length of outputs_8b['cnn_dailymail']: 5000


In [7]:
# Collect the input_text of the first 5000 cnn_dailymail entries per model.
cnn_inputs_per_model = [
    [record['input_text'] for record in model_outputs['cnn_dailymail'][:5000]]
    for model_outputs in (outputs_1b, outputs_3b, outputs_8b)
]
input_texts_1b_cnn, input_texts_3b_cnn, input_texts_8b_cnn = cnn_inputs_per_model

# Count the positions at which all three models were given the same article.
identical_input_text_count_cnn = sum(
    1
    for text_1b, text_3b, text_8b in zip(input_texts_1b_cnn, input_texts_3b_cnn, input_texts_8b_cnn)
    if text_1b == text_3b == text_8b
)

print(f"Number of identical input_text entries in cnn_dailymail across all three: {identical_input_text_count_cnn}")

Number of identical input_text entries in cnn_dailymail across all three: 5000


In [8]:
# Maximum number of cnn_dailymail entries kept per model.
CNN_DAILYMAIL_LIMIT = 5000

# Trim every model's cnn_dailymail list to the same length.
for model_outputs in (outputs_1b, outputs_3b, outputs_8b):
    model_outputs['cnn_dailymail'] = model_outputs['cnn_dailymail'][:CNN_DAILYMAIL_LIMIT]

# Persist the trimmed data back to the files it was loaded from.
# Opening the source file directly with 'wb' would empty it before the dump
# finishes, so a failure mid-write would destroy the only copy. Write to a
# sibling temp file first and swap it in with os.replace once it is complete.
for path, model_outputs in zip(file_paths, (outputs_1b, outputs_3b, outputs_8b)):
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'wb') as f:
        pickle.dump(model_outputs, f)
    os.replace(tmp_path, path)

print("Truncated datasets saved successfully.")

Truncated datasets saved successfully.


In [12]:
# Display the entry at index 4999 of the 8b model's cnn_dailymail outputs.
outputs_8b['cnn_dailymail'][4999]

{'input_text': '(Mental Floss) -- Have you ever wondered about the origins and namesakes of our favorite spreads, sauces, and dressings? Here are a few stories that you can use to regale your friends the next time you chow down. Peppers were imported from the Mexican state of Tabasco to make spicy Tabasco sauce, giving the condiment its name. 1. Thousand Island Dressing . Is the delicious dressing that gives a Reuben its tanginess named after an actual chain of islands? You bet it is. The Thousand Islands are an archipelago that sits in the Saint Lawrence River on the U.S.-Canada border, and there are actually 1,793 of them, some of which are so small that they contain nothing more than a single home. So why is the dressing named after an archipelago? No one\'s quite sure. Some people claim that early film star and vaudevillian May Irwin, who summered on the Thousand Islands, named it, while others contend that George Boldt, the famed proprietor of the Waldorf-Astoria, gave the dressin

In [13]:
# For every dataset key present in the 1b outputs, print the number of
# entries each model has under that key.
outputs_by_size = {"1b": outputs_1b, "3b": outputs_3b, "8b": outputs_8b}

for dataset_key in outputs_1b:
    print(f"{dataset_key}:")
    for size_tag, per_model_outputs in outputs_by_size.items():
        print(f"  Length of outputs_{size_tag}: {len(per_model_outputs[dataset_key])}")

wmt14:
  Length of outputs_1b: 5000
  Length of outputs_3b: 5000
  Length of outputs_8b: 5000
cnn_dailymail:
  Length of outputs_1b: 5000
  Length of outputs_3b: 5000
  Length of outputs_8b: 5000
gsm8k:
  Length of outputs_1b: 25000
  Length of outputs_3b: 5000
  Length of outputs_8b: 0


In [14]:
# Dataset keys that are left out of the merged file.
# NOTE(review): gsm8k is presumably excluded because the three model runs do
# not contain the same number of gsm8k entries -- confirm.
EXCLUDED_DATASETS = {"gsm8k"}

# Merge the per-model outputs into one record per input:
# {"input_text": ..., "1b": ..., "3b": ..., "8b": ...}
combined_outputs = {}

for dataset_key in outputs_1b.keys():
    if dataset_key in EXCLUDED_DATASETS:
        # Skip only this dataset. A `break` here would also silently drop
        # every dataset key that happens to come after it.
        continue

    entries_1b = outputs_1b[dataset_key]
    entries_3b = outputs_3b[dataset_key]
    entries_8b = outputs_8b[dataset_key]

    # Records are paired by position, so the three lists must line up.
    if not (len(entries_1b) == len(entries_3b) == len(entries_8b)):
        raise ValueError(
            f"Cannot combine '{dataset_key}': lengths differ "
            f"(1b={len(entries_1b)}, 3b={len(entries_3b)}, 8b={len(entries_8b)})"
        )

    combined_outputs[dataset_key] = [
        {
            "input_text": entry_1b["input_text"],
            # .get() yields None when a model's output is missing from a record.
            "1b": entry_1b.get("1b"),
            "3b": entry_3b.get("3b"),
            "8b": entry_8b.get("8b"),
        }
        for entry_1b, entry_3b, entry_8b in zip(entries_1b, entries_3b, entries_8b)
    ]

with open("input_output_pairs.pkl", 'wb') as f:
    pickle.dump(combined_outputs, f)

In [15]:
from datasets import load_dataset
import pickle

# Load the 'train' split of the reference datasets: the 'de-en' configuration
# of wmt14 and version '2.0.0' of abisee/cnn_dailymail.
wmt14_dataset = load_dataset('wmt14', 'de-en', split='train')
cnndailymail_dataset = load_dataset('abisee/cnn_dailymail', '2.0.0', split='train')

def match_labels(combined_outputs, wmt14_dataset, cnndailymail_dataset):
    """Attach a reference ``label`` to each combined entry, matching by position.

    Entry ``i`` of ``combined_outputs['wmt14']`` receives
    ``wmt14_dataset[i]['translation']['en']`` and entry ``i`` of
    ``combined_outputs['cnn_dailymail']`` receives
    ``cnndailymail_dataset[i]['highlights']``. A dataset key that is absent
    from ``combined_outputs`` is ignored, and entries whose index lies beyond
    the end of the reference dataset are left without a ``label``.

    The entry dicts are modified in place and the same ``combined_outputs``
    object is returned.

    NOTE(review): nothing here verifies that row ``i`` of a reference dataset
    is the row that entry ``i`` was generated from -- confirm the ordering.
    """
    def _attach(dataset_key, reference_rows, pick_label):
        # Label the entries under one dataset key from the matching rows.
        if dataset_key not in combined_outputs:
            return
        entries = combined_outputs[dataset_key]
        for position in range(min(len(entries), len(reference_rows))):
            entries[position]['label'] = pick_label(reference_rows[position])

    _attach('wmt14', wmt14_dataset, lambda row: row['translation']['en'])
    _attach('cnn_dailymail', cnndailymail_dataset, lambda row: row['highlights'])

    return combined_outputs

# Reload the merged outputs, add the reference labels, and write the result
# back to the same pickle.
with open("input_output_pairs.pkl", 'rb') as pickle_in:
    combined_outputs = pickle.load(pickle_in)

combined_outputs = match_labels(
    combined_outputs,
    wmt14_dataset,
    cnndailymail_dataset,
)

with open("input_output_pairs.pkl", 'wb') as pickle_out:
    pickle.dump(combined_outputs, pickle_out)

print("Updated combined_outputs saved with labels.")

Updated combined_outputs saved with labels.


In [17]:
import pickle
import pandas as pd

# Reload the labelled outputs from disk.
with open('input_output_pairs.pkl', 'rb') as pickle_in:
    combined_outputs = pickle.load(pickle_in)

# One column per dataset key; each cell holds that dataset's whole record dict.
df_combined_outputs = pd.DataFrame(data=combined_outputs)
df_combined_outputs.head()

Unnamed: 0,wmt14,cnn_dailymail
0,{'input_text': 'Wiederaufnahme der Sitzungsper...,"{'input_text': 'LONDON, England (Reuters) -- H..."
1,"{'input_text': 'Ich erkläre die am Freitag, de...",{'input_text': 'Editor's note: In our Behind t...
2,"{'input_text': 'Wie Sie feststellen konnten, i...","{'input_text': 'MINNEAPOLIS, Minnesota (CNN) -..."
3,{'input_text': 'Im Parlament besteht der Wunsc...,{'input_text': 'WASHINGTON (CNN) -- Doctors re...
4,{'input_text': 'Heute möchte ich Sie bitten - ...,{'input_text': '(CNN) -- The National Footbal...


In [21]:
# Reload the labelled outputs from disk.
with open('input_output_pairs.pkl', 'rb') as pickle_in:
    combined_outputs = pickle.load(pickle_in)

# Build one flat frame per dataset; fall back to an empty frame when the
# dataset key is missing from the pickle.
if 'wmt14' in combined_outputs:
    df_wmt14 = pd.DataFrame(combined_outputs['wmt14'])
else:
    df_wmt14 = pd.DataFrame()

if 'cnn_dailymail' in combined_outputs:
    df_cnn_dailymail = pd.DataFrame(combined_outputs['cnn_dailymail'])
else:
    df_cnn_dailymail = pd.DataFrame()

# Write each frame to its own CSV without the index column.
df_wmt14.to_csv('wmt14_outputs.csv', index=False)
df_cnn_dailymail.to_csv('cnn_dailymail_outputs.csv', index=False)

In [22]:
# Show the first rows of the wmt14 frame.
df_wmt14.head()

Unnamed: 0,input_text,1b,3b,8b,label
0,Wiederaufnahme der Sitzungsperiode,Wiederaufnahme der Sitzungsperiode\n\nThe Pres...,Reopening of the session\n\nPlease note that t...,Resumption of the session period\n\n Original ...,Resumption of the session
1,"Ich erkläre die am Freitag, dem 17. Dezember u...",I explain the interrupted session of the Europ...,I am explaining the interrupted parliamentary ...,I would like to inform you that the European P...,I declare resumed the session of the European ...
2,"Wie Sie feststellen konnten, ist der gefürchte...","As you can see, the feared ""Millennium Bug"" wa...","As you were able to determine, the feared ""Mil...","As you could see, the dreaded ""Millennium Bug""...","Although, as you will have seen, the dreaded '..."
3,Im Parlament besteht der Wunsch nach einer Aus...,In the parliament there is a wish for an expre...,"In the Parliament, there is a desire for a deb...",In the parliament there is a desire for a deba...,You have requested a debate on this subject in...
4,Heute möchte ich Sie bitten - das ist auch der...,Today I want to ask you - that's also the wish...,He today wants to ask - this is also the wish ...,I would like to ask you - this is also the wis...,"In the meantime, I should like to observe a mi..."


In [23]:
# Show the first rows of the cnn_dailymail frame.
df_cnn_dailymail.head()

Unnamed: 0,input_text,1b,3b,8b,label
0,"LONDON, England (Reuters) -- Harry Potter star...","Daniel Radcliffe, the star of the Harry Potter...","Daniel Radcliffe, star of the Harry Potter ser...","Daniel Radcliffe, who turns 18 on Monday, has ...",Harry Potter star Daniel Radcliffe gets £20M f...
1,Editor's note: In our Behind the Scenes series...,9th floor of the Miami-Dade pretrial detention...,A Miami-Dade pretrial detention facility has a...,"In Miami's ""forgotten floor,"" mentally ill inm...",Mentally ill inmates in Miami are housed on th...
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...",A devastating bridge collapse occurred in Minn...,55 people were rescued from the Mississippi Ri...,The Minneapolis bridge collapse sent cars plum...,"NEW: ""I thought I was going to die,"" driver sa..."
3,WASHINGTON (CNN) -- Doctors removed five small...,5 small polyps removed from President Bush's c...,President Bush had 5 small polyps removed from...,Doctors removed five small polyps from Preside...,"Five small polyps found during procedure; ""non..."
4,(CNN) -- The National Football League has ind...,The NFL has indefinitely suspended Atlanta Fal...,NFL quarterback Michael Vick has been indefini...,"Michael Vick, Atlanta Falcons quarterback, has...","NEW: NFL chief, Atlanta Falcons owner critical..."


In [43]:
import sys
import os

# Make the directory one level above the current working directory importable;
# the `analysis` package imported right after this is expected to live there.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Guard the append so re-running this cell does not keep adding the same
# entry to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from analysis import metrics

In [44]:
# Start from the exported CSV so this cell does not depend on earlier frames.
df_wmt14 = pd.read_csv('wmt14_outputs.csv')

# Score every non-null model output against the reference label, one list of
# BLEU scores per model column.
bleu_scores_by_model = {}
for model_column in ('1b', '3b', '8b'):
    column_scores = []
    for _, record in df_wmt14.iterrows():
        candidate = record[model_column]
        if pd.isnull(candidate):
            continue
        column_scores.append(metrics.calculate_bleu(candidate, record['label']))
    bleu_scores_by_model[model_column] = column_scores

bleu_scores_1b = bleu_scores_by_model['1b']
bleu_scores_3b = bleu_scores_by_model['3b']
bleu_scores_8b = bleu_scores_by_model['8b']

# Arithmetic mean per model; 0 when a model has no scored rows.
average_bleu_by_model = {
    model_column: (sum(scores) / len(scores) if scores else 0)
    for model_column, scores in bleu_scores_by_model.items()
}
avg_bleu_1b = average_bleu_by_model['1b']
avg_bleu_3b = average_bleu_by_model['3b']
avg_bleu_8b = average_bleu_by_model['8b']

for model_column in ('1b', '3b', '8b'):
    print(f"Average BLEU score for {model_column}: {average_bleu_by_model[model_column]}")

Average BLEU score for 1b: 0.4041277811656105
Average BLEU score for 3b: 0.41986096501637005
Average BLEU score for 8b: 0.45745371736690155


In [45]:
# Minimum BLEU score an output must reach to be labelled 1 (pass) when this
# value is handed to bleu_pass_fail.
threshold = 0.43

def bleu_pass_fail(generated_output, reference_output, threshold):
    """Return 1 if the output's BLEU score reaches ``threshold``, otherwise 0.

    The score comes from ``metrics.calculate_bleu(generated_output,
    reference_output)``. If either text is null (as judged by pandas), the
    result is 0 and no score is computed.
    """
    if pd.isnull(generated_output) or pd.isnull(reference_output):
        return 0

    score = metrics.calculate_bleu(generated_output, reference_output)
    if score >= threshold:
        return 1
    return 0

# Replace the model output columns of df_wmt14 with binary labels.
# The 8b column is set to 1 for every row; 3b and then 1b are set to 1 only
# when their BLEU score against the label reaches `threshold`.
# NOTE(review): this overwrites the text columns in place, so re-running this
# cell without reloading df_wmt14 would score the 0/1 labels, not the text.
df_wmt14['8b'] = 1
for model_column in ('3b', '1b'):
    df_wmt14[model_column] = df_wmt14.apply(
        lambda record, column=model_column: bleu_pass_fail(record[column], record['label'], threshold),
        axis=1,
    )

# Export only the input text and the three binary label columns.
output_csv = 'wmt14_bleu_threshold.csv'
label_columns = ['input_text', '1b', '3b', '8b']
df_wmt14[label_columns].to_csv(output_csv, index=False)

df_wmt14[label_columns].head()

Unnamed: 0,input_text,1b,3b,8b
0,Wiederaufnahme der Sitzungsperiode,0,0,1
1,"Ich erkläre die am Freitag, dem 17. Dezember u...",1,1,1
2,"Wie Sie feststellen konnten, ist der gefürchte...",0,0,1
3,Im Parlament besteht der Wunsch nach einer Aus...,0,1,1
4,Heute möchte ich Sie bitten - das ist auch der...,1,1,1
