In [None]:
###USED TO MOUNT GOOGLE DRIVE TO SAVE OUTPUT CSVs IF THE NOTEBOOK IS RUN ON GOOGLE COLAB

from google.colab import drive
import os

print("Mounting Google Drive...")
drive.mount('/content/drive')

DRIVE_PROJECT_PATH = "/content/drive/MyDrive/Deep_Learning_Project"

os.makedirs(DRIVE_PROJECT_PATH, exist_ok=True)
print(f"Project directory ensures at: {DRIVE_PROJECT_PATH}")
# ----------------------------------
!pip install datasets

In [None]:
###LOADS TED TALKS DATA SET AND ONLY TAKES THE ENGLISH AND SPANISH COLUMNS

import random
from datasets import load_dataset, DatasetDict
import os

# Download (or reuse the cached copy of) the multilingual TED talks corpus
# and keep a handle on its "train" split.
print("Loading dataset TankuVie/ted_talks_multilingual_parallel_corpus...")
raw_datasets = load_dataset("TankuVie/ted_talks_multilingual_parallel_corpus")
full_train = raw_datasets["train"]

# Folder below the Drive project directory, named for ByT5 checkpoints.
# NOTE(review): not referenced by any other cell visible here.
MODEL_OUTPUT_DIR = os.path.join(DRIVE_PROJECT_PATH, "byt5-ted-en-es-checkpoints")

def non_empty_example(example):
    """Tell whether a row carries usable text in both its "en" and "es" fields.

    A field counts as usable when it is a ``str`` that still contains
    characters after surrounding whitespace is stripped. A missing key is
    treated like an empty string.

    Args:
        example: Mapping-like row offering ``.get``.

    Returns:
        ``True`` when both fields are usable, ``False`` otherwise.
    """
    english = example.get("en", "")
    spanish = example.get("es", "")

    # Type check first: .strip() is only valid once both values are strings.
    if not (isinstance(english, str) and isinstance(spanish, str)):
        return False

    for text in (english, spanish):
        if text.strip() == "":
            return False
    return True

# Keep only rows where both the English and the Spanish text are non-blank.
filtered_train = full_train.filter(non_empty_example)

print("Total usable examples:", len(filtered_train))


print("Calculating maximum character lengths...")


# Per-row character counts for each language column, then their maxima.
en_lengths = list(map(len, filtered_train["en"]))
es_lengths = list(map(len, filtered_train["es"]))
max_en_length, max_es_length = max(en_lengths), max(es_lengths)

print("--- RESULTS ---")
print(f"Maximum character length in 'en' column: {max_en_length}")
print(f"Maximum character length in 'es' column: {max_es_length}")

In [None]:
###LOADS THE COMMON MISSPELLED_WORDS DATASET WHICH IS USED TO ADD MISSPELLINGS

import kagglehub
import pandas as pd
import os

# Fetch the Kaggle dataset; the returned value is used below as the directory
# that holds the downloaded files.
misspelling_dir_path = kagglehub.dataset_download("fazilbtopal/misspelled-words")

FILE_NAME = "misspelled.csv"
misspelling_file_path = os.path.join(misspelling_dir_path, FILE_NAME)

print("Path to dataset directory:", misspelling_dir_path)
print("Path to CSV file:", misspelling_file_path)

try:
    df_misspelled = pd.read_csv(misspelling_file_path)
except FileNotFoundError:
    # Report which file was expected, then let the error propagate so the
    # notebook does not continue without the data.
    print(f"Error: The file '{FILE_NAME}' was not found inside the downloaded directory.")
    raise
else:
    print("Successfully loaded the CSV file.")
    print("Head of DataFrame:")
    print(df_misspelled.head())

In [6]:
###CLEANS UP AND CREATES THE MISSPELLED WORD DICTIONARY
import re
import pandas as pd

# Collect every value of the 'input' column (the misspelling) under its
# 'label' (the word it is looked up by later).
misspelling_map = (
    df_misspelled.groupby('label')['input']
    .apply(list)
    .to_dict()
)

# Lower-cased lookup table: word -> list of lower-cased misspellings.
# Labels that differ only by case (e.g. "The" / "the") collapse onto the same
# key; their misspellings are merged so that a later group no longer silently
# replaces an earlier one.
MISSPELLED_DICT = {}
for label, variants in misspelling_map.items():
    if not isinstance(label, str):
        continue  # non-string labels cannot be matched against text

    # Non-string entries (e.g. missing values) are dropped.
    lowered_variants = [variant.lower() for variant in variants if isinstance(variant, str)]

    # Only create / extend an entry when there is at least one usable misspelling.
    if lowered_variants:
        MISSPELLED_DICT.setdefault(label.lower(), []).extend(lowered_variants)

print("Successfully created MISSPELLED_DICT with", len(MISSPELLED_DICT), "entries.")

Successfully created MISSPELLED_DICT with 7762 entries.


In [None]:
#This function generates corrupted text based on the misspelling dict and a corruption rate
def generate_corrupted_text(text_to_corrupt, misspelling_dict, corruption_rate, rng=None):
    """Return a copy of ``text_to_corrupt`` with some words replaced by misspellings.

    Each maximal run of word characters is lower-cased and looked up in
    ``misspelling_dict``. When the word has known misspellings, it is replaced
    by a randomly chosen one with probability ``corruption_rate``. Everything
    else (punctuation, whitespace, unknown words) is copied through unchanged.
    The replacement is inserted exactly as stored in the dictionary, so the
    original capitalisation of a replaced word is not carried over.

    Args:
        text_to_corrupt: The source string.
        misspelling_dict: Mapping of lower-cased word -> list of misspellings.
        corruption_rate: Probability of replacing a word that has misspellings.
        rng: Optional object providing ``random()`` and ``choice()`` (for
            example ``random.Random(seed)``) to make the result reproducible.
            Defaults to the global ``random`` module.

    Returns:
        The (possibly) corrupted string.
    """
    if rng is None:
        rng = random

    def maybe_misspell(match):
        """Pick the replacement for a single matched word."""
        word = match.group(0)
        candidates = misspelling_dict.get(word.lower())
        # Draw a random number only for words that can actually be corrupted.
        if candidates and rng.random() < corruption_rate:
            return rng.choice(candidates)
        return word

    # With a callable replacement, re.sub inserts the returned text verbatim.
    return re.sub(r'\w+', maybe_misspell, text_to_corrupt)

#Uses the generate_corrupted_text function to create multiple misspelled sentences with three corruption
#levels 0.1,0.2 and 0.3. This function also clips the length of all inputs/outputs to be max 256 characters, since
#this is the length that the models were fine tuned on
def corrupt_and_clip_example_multi_rate(example, misspelling_dict, max_length=256, corruption_rates=(0.1, 0.2, 0.3)):
    """Build one output row: clipped "en"/"es" text plus one misspelled "en" per rate.

    Args:
        example: Row providing the "en" and "es" entries.
        misspelling_dict: Mapping of lower-cased word -> list of misspellings,
            forwarded to ``generate_corrupted_text``.
        max_length: Maximum number of characters kept for every text field.
        corruption_rates: Iterable of corruption probabilities. Each rate adds
            a column named ``corruption_<rate>`` with '.' replaced by '_'
            (0.1 -> "corruption_0_1"). The default is an immutable tuple so it
            cannot be altered between calls.

    Returns:
        Dict with keys "en", "es" and one key per corruption rate.
    """

    def clip_text(text, length):
        """Return the first ``length`` characters of ``text``; non-strings become ""."""
        if not isinstance(text, str):
            return ""
        return text[:length]

    clipped_en = clip_text(example["en"], max_length)
    result = {
        "en": clipped_en,
        "es": clip_text(example["es"], max_length),
    }

    # Every rate corrupts the same clipped English sentence independently.
    for rate in corruption_rates:
        misspelled_sentence = generate_corrupted_text(clipped_en, misspelling_dict, rate)

        column_name = f"corruption_{rate}".replace('.', '_')  # Use 'corruption_0_1' for 0.1
        # Clip again: a replacement may be longer than the word it replaced.
        result[column_name] = clip_text(misspelled_sentence, max_length)

    return result


TARGET_RATES = [0.1, 0.2, 0.3]


def _corrupt_row(row):
    """Adapter for ``Dataset.map``: clip one row to 256 characters and add its misspelled variants."""
    return corrupt_and_clip_example_multi_rate(
        row,
        MISSPELLED_DICT,
        256,
        corruption_rates=TARGET_RATES
    )


# Row-by-row mapping (batched=False): each call receives a single example.
mutated_dataset_map = filtered_train.map(_corrupt_row, batched=False)

# Keep only the two language columns and the generated corruption columns.
new_corruption_columns = [f"corruption_{rate}".replace('.', '_') for rate in TARGET_RATES]
all_target_columns = ["en", "es", *new_corruption_columns]

mutated_dataset = mutated_dataset_map.select_columns(all_target_columns)

mutated_dataset_dict = DatasetDict({"train": mutated_dataset})

print("\n--- Mutated Dataset Preview ---")
print(mutated_dataset_dict)
print(mutated_dataset_dict["train"][0])

In [None]:
### HERE THE DATASET WITH CORRUPTED ENGLISH SENTENCES IS SAVED AS A CSV
dataset_to_save = mutated_dataset_dict["train"]
# Derive the path from DRIVE_PROJECT_PATH (as the translation runners do)
# instead of repeating the absolute Drive path, so the output location is
# configured in exactly one place.
output_path = os.path.join(DRIVE_PROJECT_PATH, "corrupted_dataset.csv")

print(f"Starting to save dataset to: {output_path}")

# index=False: do not write a row-index column to the CSV.
dataset_to_save.to_csv(output_path, index=False)

print("\nDataset successfully saved")

In [None]:
###IMPORTS OUR T5 and BYT5 MODELS

import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration

# Identifiers handed to `from_pretrained` for the two translation models.
MODEL_PATH_T5 = "InaMartini/t5-en-es-translation"
MODEL_PATH_BYT5 = "malinhauglandh/byt5-en-es-translation"

# Prefix prepended to every input sentence before tokenization.
TASK_PREFIX = "translate English to Spanish: "

# Upper bound used both for tokenizer truncation and for generation length.
MAX_LENGTH = 256


In [None]:
###THIS FUNCTION TAKES THE MODEL AS AN INPUT AND THE TEXT TO TRANSLATE
def translate_sentence(model, tokenizer, device, texts_to_translate):
    """Translate one sentence or a batch of sentences with beam search.

    Args:
        model: Seq2seq model exposing ``generate``; expected to already live
            on ``device``.
        tokenizer: Tokenizer matching ``model``.
        device: Device the encoded inputs are moved to (e.g. "cuda" or "cpu").
        texts_to_translate: A single string or an iterable of strings.

    Returns:
        A list of translated strings, one per input. A single string input
        still yields a one-element list; an empty input yields ``[]``.
    """
    if isinstance(texts_to_translate, str):
        texts_to_translate = [texts_to_translate]

    input_texts = [TASK_PREFIX + text for text in texts_to_translate]
    if not input_texts:
        # Nothing to translate; avoid calling the tokenizer with an empty batch.
        return []

    encoded = tokenizer(
        input_texts,
        return_tensors="pt",
        padding=True,
        max_length=MAX_LENGTH,
        truncation=True
    ).to(device)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=encoded["input_ids"],
            # Pass the mask explicitly so padded positions in a multi-sentence
            # batch are ignored by the encoder.
            attention_mask=encoded["attention_mask"],
            max_length=MAX_LENGTH,
            num_beams=4,
            do_sample=False,
            early_stopping=True,
        )

    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)

In [None]:
###RUNS THE TRANSLATION CODE ON THE ORIGINAL ENGLISH TEXT FOR A SPECIFIED MODEL
###SAVES THE OUTPUT TO A CSV
def model_runner(model_path, model_name, num_samples=1000):
    """Translate the uncorrupted English sentences with one model and save a CSV.

    Loads the tokenizer and model from ``model_path``, translates the "en"
    text of the first ``num_samples`` rows of ``mutated_dataset_dict["train"]``
    and writes the columns "en" / "es" to
    ``<DRIVE_PROJECT_PATH>/<model_name>_translated.csv``.

    Args:
        model_path: Identifier or path passed to ``from_pretrained``.
        model_name: Short name used as prefix of the output CSV file.
        num_samples: Number of leading rows to translate (default 1000);
            capped at the size of the split.

    Returns:
        None. Any exception raised while loading, translating or saving is
        caught and reported via ``print``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    try:
        print(f"Loading model from: {model_path}")
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = T5ForConditionalGeneration.from_pretrained(model_path)
        model.to(device)
        model.eval()

        train_split = mutated_dataset_dict["train"]
        total = min(num_samples, len(train_split))

        results = []
        for i in range(total):
            sentenceEn = train_split[i]['en']
            # translate_sentence returns a list with one entry per input;
            # unwrap it so the CSV cell holds the plain translation rather
            # than the text representation of a one-element list.
            translated_text = translate_sentence(model, tokenizer, device, sentenceEn)[0]
            results.append({"en": sentenceEn, "es": translated_text})
            print(f"{i}/{total}")

        df_results = pd.DataFrame(results)
        output_csv_path = os.path.join(DRIVE_PROJECT_PATH, f"{model_name}_translated.csv")
        df_results.to_csv(output_csv_path, index=False)
        print(f"\n✅ Translations saved to: {output_csv_path}")

    except Exception as e:
        # Best-effort reporting: the run is aborted but the notebook keeps going.
        print(f"\nFATAL ERROR: Could not load or run the model.")
        print(f"Please check your MODEL_PATH variable: {model_path}")
        print(f"Error details: {e}")

In [None]:
###RUNS THE TRANSLATION CODE ON THE MISSPELLED ENGLISH SENTENCES, ON ALL CORRUPTION LEVELS
###SAVES THE OUTPUT TO A CSV
def model_runner_misspelled(model_path, model_name, num_samples=1000):
    """Translate the misspelled English sentences at every corruption level and save a CSV.

    Loads the tokenizer and model from ``model_path``. For each of the first
    ``num_samples`` rows of ``mutated_dataset_dict["train"]`` it translates
    the columns "corruption_0_1", "corruption_0_2" and "corruption_0_3" and
    writes the input / translation pairs (columns ``en_corruption_<k>`` /
    ``es_corruption_<k>`` for k = 1, 2, 3) to
    ``<DRIVE_PROJECT_PATH>/<model_name>_corrupt_translated.csv``.

    Args:
        model_path: Identifier or path passed to ``from_pretrained``.
        model_name: Short name used as prefix of the output CSV file.
        num_samples: Number of leading rows to translate (default 1000);
            capped at the size of the split.

    Returns:
        None. Any exception raised while loading, translating or saving is
        caught and reported via ``print``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    try:
        print(f"Loading model from: {model_path}")
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = T5ForConditionalGeneration.from_pretrained(model_path)
        model.to(device)
        model.eval()

        train_split = mutated_dataset_dict["train"]
        total = min(num_samples, len(train_split))

        # (suffix of the CSV column names, source column in the dataset)
        corruption_levels = (
            ("1", "corruption_0_1"),
            ("2", "corruption_0_2"),
            ("3", "corruption_0_3"),
        )

        results = []
        for i in range(total):
            row = train_split[i]  # fetch the row once instead of once per column
            record = {}
            for suffix, column in corruption_levels:
                corrupted_en = row[column]
                record[f"en_corruption_{suffix}"] = corrupted_en
                # translate_sentence returns a one-element list for a single
                # string; store the plain translation in the CSV cell.
                record[f"es_corruption_{suffix}"] = translate_sentence(
                    model, tokenizer, device, corrupted_en
                )[0]
            results.append(record)
            print(f"{i}/{total}")

        df_results = pd.DataFrame(results)
        output_csv_path = os.path.join(DRIVE_PROJECT_PATH, f"{model_name}_corrupt_translated.csv")
        df_results.to_csv(output_csv_path, index=False)
        print(f"\n✅ Corrupted translations saved to: {output_csv_path}")

    except Exception as e:
        # Best-effort reporting: the run is aborted but the notebook keeps going.
        print(f"\nFATAL ERROR: Could not load or run the model.")
        print(f"Please check your MODEL_PATH variable: {model_path}")
        print(f"Error details: {e}")

In [None]:
#RUNS THE MODEL RUNNER_MISSPELLED ON T5
# Translates the misspelled English sentences with the T5 model; the runner
# saves its results as "t5_corrupt_translated.csv" under DRIVE_PROJECT_PATH.
model_runner_misspelled(MODEL_PATH_T5, "t5")

In [None]:
#RUNS THE MODEL RUNNER_MISSPELLED ON BYT5
# Translates the misspelled English sentences with the ByT5 model; the runner
# saves its results as "byt5_corrupt_translated.csv" under DRIVE_PROJECT_PATH.
model_runner_misspelled(MODEL_PATH_BYT5, "byt5")

In [None]:
#RUNS THE MODEL RUNNER ON BYT5
# Translates the uncorrupted English sentences with the ByT5 model; the runner
# saves its results as "byt5_translated.csv" under DRIVE_PROJECT_PATH.
model_runner(MODEL_PATH_BYT5, "byt5")

In [None]:
#RUNS THE MODEL RUNNER ON T5
# Translates the uncorrupted English sentences with the T5 model; the runner
# saves its results as "t5_translated.csv" under DRIVE_PROJECT_PATH.
model_runner(MODEL_PATH_T5,"t5")