In [6]:
from lxml import etree
import pandas as pd
import os
from pathlib import Path

In [1]:
import torch
torch.__version__

'2.9.0+cu130'

In [4]:
dir_path = Path("data/")

files = [f.name.split('.')[0] for f in dir_path.iterdir() if f.is_file()]
files

['en-de', 'en-es', 'en-fr', 'en-pt']

In [39]:
def get_data(file_name):
    # Path to TMX file
    tmx_path = f"data/{file_name}.tmx"

    # Parse the TMX XML
    tree = etree.parse(tmx_path)
    root = tree.getroot()

    rows = []
    name = file_name.split('.')[0].split('-')[1]

    if name == "pt":
        lang_name = "Portuguese"
    elif name == "fr":
        lang_name = "French"
    elif name == "es":
        lang_name = "Spanish"
    else :
        lang_name = "German"
    
    print(f"Creating the dataframe for {lang_name} language")
    # TMX content is inside <body>
    body = root.find("body")

    for tu in body.findall("tu"):
        en_text = None
        second_text = None

        for tuv in tu.findall("tuv"):
            # Language attribute (xml:lang or lang)
            lang = (
                tuv.attrib.get("{http://www.w3.org/XML/1998/namespace}lang")
                or tuv.attrib.get("lang")
            )
            seg = tuv.find("seg")
            if seg is None:
                continue

            if lang == "en":
                en_text = seg.text
            elif lang == name:
                second_text = seg.text

        # Only keep pairs where both languages exist
        if en_text and second_text:
            rows.append({
                "english": en_text,
                lang_name : second_text
            })

    # Create DataFrame
    return pd.DataFrame(rows)

In [41]:
data = {}
for f in files[::-1]:
    print(f"Starting for langauage {f}", flush=True)
    d = get_data(f)
    data[f] = d
    ## df3 = df2.merge(df, on='english', how='inner')
    print(f"Done with the language {f}")

Starting for langauage en-pt
Creating the dataframe for Portuguese language
Done with the language en-pt
Starting for langauage en-fr
Creating the dataframe for French language
Done with the language en-fr
Starting for langauage en-es
Creating the dataframe for Spanish language
Done with the language en-es
Starting for langauage en-de
Creating the dataframe for German language
Done with the language en-de


In [69]:
import re

def clean_text(text):
    if not isinstance(text, str):
        return text

    # Replace non-breaking space
    text = text.replace('\xa0', ' ')

    # Normalize dashes
    text = re.sub(r'[‐-–—]', '-', text)

    # Normalize quotes
    text = text.replace('“', '"').replace('”', '"')
    text = text.replace('‘', "'").replace('’', "'")

    # Remove replacement character
    text = text.replace('�', '')

    # Collapse whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [73]:
for i in data:
    print(f"starting for {i}")
    data[i] = data[i].map(clean_text)
    data[i] = data[i].sort_values("english")


starting for en-pt
starting for en-fr
starting for en-es
starting for en-de


In [71]:
for key, df in data.items():
    print(f"The lenght of langauge {key} is {len(df)}")

The lenght of langauge en-pt is 1048025
The lenght of langauge en-fr is 503436
The lenght of langauge en-es is 504981
The lenght of langauge en-de is 505369


In [74]:
base_df = min(data.values(), key=len)
len(base_df)

503436

In [135]:
df3 = base_df.merge(data['en-pt'], how='inner', on='english')

In [150]:
df3 = df3.drop_duplicates(subset=["english"])


In [155]:
df3.loc[454]

english       "(f) the following entries for the status of t...
French        "f) les indications suivantes relatives au sta...
Portuguese    "f) Os seguintes indicadores do estatuto das m...
Name: 454, dtype: object

In [None]:
base_df = min(data.values(), key=len)
len(base_df)

merged_df = base_df.copy()

for df in data.values():
    if df is not base_df:
        merged_df = merged_df.merge(df, on="english", how="inner")

In [156]:
len(merged_df)

113204494

In [58]:
merged_df.isna().sum()

english       0
French        0
Portuguese    0
Spanish       0
German        0
dtype: int64

In [157]:
merged_df = merged_df.drop_duplicates(subset=["english"])


In [158]:
len(merged_df)

363524

In [None]:
merged_df = merged_df.sort_values("english")


In [None]:
len(merged_df)

In [None]:
alums = set()
for col in merged_df.columns:
    for i in range(len(merged_df)):
        val = merged_df.iloc[i][col]
        for j in str(val):
            if not j.isalnum():
                alums.add(j)

len(alums)

In [163]:
merged_df.to_csv("data.csv", sep="\t", index=False)

In [None]:
maximum = 0
cols = list(merged_df.columns)
for _, rows in merged_df.iterrows():
    for col in cols:
        maximum = max( len(rows[col]), maximum)
maximum

ADDITIONAL AGREEMENT to the Agreement concerning products of the clock and watch industry between the European Economic Community and its Member States and the Swiss Confederation


In [None]:
import random

LANGS = {
    "english": "en_XX",
    "Portuguese": "pt_XX",
    "Spanish": "es_XX",
    "French": "fr_XX",
    "German": "de_DE",
}

pairs = []

for _, row in merged_df.iterrows():
    texts = row.to_dict()

    # English ↔ others
    for lang, code in LANGS.items():
        if lang != "english":
            pairs.append({
                "src_text": texts["english"],
                "tgt_text": texts[lang],
                "src_lang": "en_XX",
                "tgt_lang": code,
            })
            pairs.append({
                "src_text": texts[lang],
                "tgt_text": texts["english"],
                "src_lang": code,
                "tgt_lang": "en_XX",
            })

    # random cross-lingual pair
    others = [l for l in LANGS if l != "english"]
    l1, l2 = random.sample(others, 2)
    pairs.append({
        "src_text": texts[l1],
        "tgt_text": texts[l2],
        "src_lang": LANGS[l1],
        "tgt_lang": LANGS[l2],
    })


In [1]:
from format_dataset import get_final_dataframe

df = get_final_dataframe()

Starting for langauage en-pt
Done with the language en-pt
Starting for langauage en-fr
Done with the language en-fr
Starting for langauage en-es
Done with the language en-es
Starting for langauage en-de
Done with the language en-de


In [2]:
df.to_csv("data2.csv", sep="\t", index=False)

In [None]:
import pandas as pd
df = pd.read_csv('data2.csv', sep="\t")

In [3]:
len(df)

2442191

In [4]:
df.tail()

Unnamed: 0,source,target
2442186,<en_XX> Farmers who make use of this possibili...,"<de_DE> Betriebsinhaber, die von dieser Möglic..."
2442187,<en_XX> 2. the following Article 28a is inserted:,<de_DE> 2. Der folgende Artikel 28a wird einge...
2442188,<en_XX> Three months period provided for in Ar...,<de_DE> Drei-Monats-Zeitraum gemäß Artikel 51 ...
2442189,<en_XX> The Member States indicated in the Ann...,<de_DE> Die im Anhang aufgeführten Mitgliedsta...
2442190,<en_XX> 3. an Annex as set out in the Annex to...,<de_DE> 3. Der im Anhang zu dieser Verordnung ...


In [5]:
from tokenize_dataset import create_dataset_dict, map_dataset
dataset = create_dataset_dict(df)



In [6]:
dataset['train']

Dataset({
    features: ['source', 'target'],
    num_rows: 1953752
})

In [7]:
df.isna().sum()

source    0
target    0
dtype: int64

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 1953752
    })
    val: Dataset({
        features: ['source', 'target'],
        num_rows: 244219
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 244220
    })
})

In [8]:
from tokenize_dataset import map_dataset
tokenized_dataset = map_dataset(dataset)

The dataset is DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 1953752
    })
    val: Dataset({
        features: ['source', 'target'],
        num_rows: 244219
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 244220
    })
})


Map:   0%|          | 0/1953752 [00:00<?, ? examples/s]



TypeError: 'NoneType' object is not iterable

In [15]:
max_length = 250
# Preprocessing function
def preprocess_function(examples):

    inputs = examples["source"]
    targets = examples["target"]
    
    # Tokenize inputs with source language
    model_inputs = tokenizer(
        inputs,
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )
    
    # labels["input_ids"] = [
    #     [(l if l != tokenizer.pad_token_id else -100) for l in label]
    #     for label in labels["input_ids"]
    # ]

    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [None]:
from tokenize_dataset import initlized_tokenizer
tokenizer = initlized_tokenizer()

In [23]:
tokenizer.src_lang

'en_XX'

In [25]:
tokenizer.tgt_lang = 'de_DE'

In [22]:
a = "Betriebsinhaber, die von dieser"
b = "what is what the fuck"
tokenizer.decode(tokenizer.encode(a))

'en_XX Betriebsinhaber, die von dieser</s>'

In [17]:
len(tokenizer)

250054

In [26]:
tokenized_datasets =  dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names )


Map:   0%|          | 0/1953752 [00:00<?, ? examples/s]



KeyboardInterrupt: 

In [30]:
import tiktoken
from environment_variables import checkpoint
tiktoken.get_encoding("mbart-large-50-many-to-many-mmt")

ValueError: Unknown encoding mbart-large-50-many-to-many-mmt.
Plugins found: ['tiktoken_ext.openai_public']
tiktoken version: 0.8.0 (are you on latest?)

In [32]:
from datasets import load_from_disk
tokenized_datasets = load_from_disk('tokenized_dataset/French')

In [39]:
type(tokenized_datasets)

datasets.dataset_dict.DatasetDict

datasets.dataset_dict.DatasetDict

In [46]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 388317
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 48540
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 48540
    })
})

In [None]:
from datasets import DatasetDict

small_dataset = DatasetDict({
    "train": tokenized_datasets["train"].select(range(10,000)),
    "val": tokenized_datasets["val"].select(range(100)),
    "test": tokenized_datasets["test"].select(range(100)),
})

In [53]:
small_dataset['train'][99] == tokenized_datasets['train'][99]

True

In [65]:
a = 10000

In [55]:
len(tokenized_datasets['train'])

388317

In [57]:
round(10000 * 100 / 388317, 2)

2.58

In [58]:
len(tokenized_datasets['test'])

48540

In [60]:
round(2.58 * 48540 / 100)

1252

In [62]:
tokenized_datasets['test']

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 48540
})

In [68]:
if a < 0:
    print("Good")
else:
    assert f"What the fuck is going on"

## Version 2


In [1]:
import pandas as pd
data = pd.read_csv('data/data.csv', sep='\t')

In [2]:
data.head()

Unnamed: 0,english,French,Portuguese,Spanish,German
0,ADDITIONAL AGREEMENT to the Agreement concerni...,ACCORD COMPLÉMENTAIRE à l'«accord concernant l...,ACORDO COMPLEMENTAR ao «Acordo relativo aos Pr...,del Acuerdo relativo a los productos de reloje...,ZUSATZABKOMMEN zum Abkommen zwischen der Europ...
1,"THE SWISS FEDERAL COUNCIL,","LE CONSEIL FÉDÉRAL SUISSE,","O CONSELHO FEDERAL SUÍÇO,","EL CONSEJO FEDERAL SUIZO ,",DER SCHWEIZERISCHE BUNDESRAT
2,WHEREAS an Agreement concerning products of th...,CONSIDÉRANT qu'un accord concernant les produi...,"CONSIDERANDO que, em 30 de Julho de 1967, foi ...",CONSIDERANDO que el 30 de junio de 1967 se fir...,"IN ANBETRACHT dessen, daß am 30. Juni 1967 in ..."
3,WHEREAS it is necessary for the proper functio...,CONSIDÉRANT qu'il importe pour le bon fonction...,"CONSIDERANDO que, para o bom funcionamento des...","CONSIDERANDO que , para el buen funcionamiento...","IN ANBETRACHT dessen, daß es erforderlich ist,..."
4,WHEREAS the ordinance of the Swiss Federal Cou...,CONSIDÉRANT l'ordonnance du Conseil fédéral su...,CONSIDERANDO a Decisão do Conselho Federal Suí...,CONSIDERANDO que la Ordenanza del Consejo Fede...,IN ANBETRACHT der Verordnung des Schweizerisch...


In [3]:
langs = ['english', 'French', 'Portuguese', 'Spanish', 'German']
lang_codes = {
    "english": "en_XX",
    "Portuguese": "pt_XX",
    "Spanish": "es_XX",
    "French": "fr_XX",
    "German": "de_DE",
} # Adjust codes as needed

for i in langs:
    data[i] = data[i].apply(lambda x : f"<{lang_codes[i]}> {x}")

In [4]:
data.shape

(363524, 5)

In [5]:
data.head()

Unnamed: 0,english,French,Portuguese,Spanish,German
0,<en_XX> ADDITIONAL AGREEMENT to the Agreement ...,<fr_XX> ACCORD COMPLÉMENTAIRE à l'«accord conc...,<pt_XX> ACORDO COMPLEMENTAR ao «Acordo relativ...,<es_XX> del Acuerdo relativo a los productos d...,<de_DE> ZUSATZABKOMMEN zum Abkommen zwischen d...
1,"<en_XX> THE SWISS FEDERAL COUNCIL,","<fr_XX> LE CONSEIL FÉDÉRAL SUISSE,","<pt_XX> O CONSELHO FEDERAL SUÍÇO,","<es_XX> EL CONSEJO FEDERAL SUIZO ,",<de_DE> DER SCHWEIZERISCHE BUNDESRAT
2,<en_XX> WHEREAS an Agreement concerning produc...,<fr_XX> CONSIDÉRANT qu'un accord concernant le...,"<pt_XX> CONSIDERANDO que, em 30 de Julho de 19...",<es_XX> CONSIDERANDO que el 30 de junio de 196...,"<de_DE> IN ANBETRACHT dessen, daß am 30. Juni ..."
3,<en_XX> WHEREAS it is necessary for the proper...,<fr_XX> CONSIDÉRANT qu'il importe pour le bon ...,"<pt_XX> CONSIDERANDO que, para o bom funcionam...","<es_XX> CONSIDERANDO que , para el buen funcio...","<de_DE> IN ANBETRACHT dessen, daß es erforderl..."
4,<en_XX> WHEREAS the ordinance of the Swiss Fed...,<fr_XX> CONSIDÉRANT l'ordonnance du Conseil fé...,<pt_XX> CONSIDERANDO a Decisão do Conselho Fed...,<es_XX> CONSIDERANDO que la Ordenanza del Cons...,<de_DE> IN ANBETRACHT der Verordnung des Schwe...


In [6]:
from datasets import Dataset
import random

dataset = Dataset.from_pandas(data)
# List of languages and their mBART codes


def generate_pairs(example):
    pairs = []
    for src_lang in langs:
        for tgt_lang in langs:
            if src_lang != tgt_lang:
                pairs.append({
                    'translation': {
                        'src_text': example[src_lang],
                        'tgt_text': example[tgt_lang]
                    }
                })
    # Optional: Shuffle and sample to avoid redundancy/excess data
    random.shuffle(pairs)
    return {'pairs': pairs}  # Limit per example if dataset is large

# Flatten into a dataset of pairs
dataset = dataset.map(generate_pairs, remove_columns=dataset.column_names)


Map:   0%|          | 0/363524 [00:00<?, ? examples/s]

In [7]:
dataset

Dataset({
    features: ['pairs'],
    num_rows: 363524
})

In [8]:
#dataset['pairs'][0]

In [None]:
def pair_generator():
    for example in dataset:
        for pair in example['pairs']:
            yield pair

dataset_final = Dataset.from_generator(pair_generator)

KeyboardInterrupt: 

In [None]:
#dataset_final = Dataset.from_list([item for sublist in dataset['pairs'] for item in sublist])

In [None]:
# Split into train/test
dataset_final = dataset_final.train_test_split(test_size=0.1)

In [17]:
dataset.save_to_disk('data/')

Saving the dataset (0/3 shards):   0%|          | 0/3271716 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/363524 [00:00<?, ? examples/s]

In [24]:
dataset['test']['translation'][0]

{'src_lang': 'fr_XX',
 'src_text': "Statut des fonctionnaires des Communautés européennes, et notamment ses articles 20 et 71 et l'article 10 de son annexe VII.",
 'tgt_lang': 'es_XX',
 'tgt_text': 'Estatuto de los funcionarios de las Comunidades Europeas y, en particular, sus artículos 20 y 71 y el artículo 10 de su anexo VII.'}

In [18]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import get_peft_model, LoraConfig

model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Apply PEFT (LoRA config example; adjust ranks/modules as needed)
peft_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=8,  # LoRA rank
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"]  # For mBART, focus on attention layers
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # Should show ~0.1-1% trainable params

trainable params: 1,179,648 || all params: 612,059,136 || trainable%: 0.1927


In [25]:
# Filter out None or empty texts
def is_valid(example):
    trans = example['translation']
    return (
        trans['src_text'] is not None and len(trans['src_text'].strip()) > 0 and
        trans['tgt_text'] is not None and len(trans['tgt_text'].strip()) > 0
    )

dataset = dataset.filter(is_valid)

Filter:   0%|          | 0/3271716 [00:00<?, ? examples/s]

Filter:   0%|          | 0/363524 [00:00<?, ? examples/s]

In [26]:
max_length = 128  # Adjust as needed

def preprocess_function(examples):
    inputs = [ex['src_text'] for ex in examples['translation']]
    targets = [ex['tgt_text'] for ex in examples['translation']]
    src_langs = [ex['src_lang'] for ex in examples['translation']]
    tgt_langs = [ex['tgt_lang'] for ex in examples['translation']]
    
    # Tokenize sources (defaults to <s> ... </s>)
    model_inputs = tokenizer(
        inputs, 
        max_length=max_length, 
        truncation=True, 
        padding="max_length",  # Pad for batch consistency
        return_tensors="pt"
    )
    
    # Tokenize targets (defaults to <s> ... </s>, but we'll adjust BOS)
    # No as_target_tokenizer needed; we handle manually
    labels = tokenizer(
        targets, 
        max_length=max_length, 
        truncation=True, 
        padding="max_length", 
        return_tensors="pt"
    )["input_ids"]
    
    # Post-process to set language tokens per example
    batch_size = len(inputs)
    for i in range(batch_size):
        # For inputs: replace EOS (</s>, id=2) with src_lang token
        src_lang_id = tokenizer.convert_tokens_to_ids(src_langs[i])
        if model_inputs["input_ids"][i][-1] == tokenizer.eos_token_id:
            model_inputs["input_ids"][i][-1] = src_lang_id
        
        # For labels: replace BOS (<s>, id=0) with tgt_lang token
        tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_langs[i])
        if labels[i][0] == tokenizer.bos_token_id:
            labels[i][0] = tgt_lang_id
    
    model_inputs["labels"] = labels
    
    # No need for decoder_input_ids; trainer handles it
    # Convert back to lists if needed (but trainer accepts tensors)
    model_inputs = {k: v.tolist() for k, v in model_inputs.items()}
    
    return model_inputs

In [28]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/3271716 [00:00<?, ? examples/s]

KeyboardInterrupt: 