In [None]:
# from lxml import etree
import pandas as pd
import os
from pathlib import Path

In [None]:
import torch
torch.__version__

In [None]:
dir_path = Path("data/")

# Only TMX files are parsed downstream (get_data appends ".tmx" and splits the
# base name on "-"), so other files placed in data/ are skipped here instead of
# failing later.
files = [
    f.name.split('.')[0]
    for f in dir_path.iterdir()
    if f.is_file() and f.suffix == ".tmx"
]
files

In [None]:
def get_data(file_name):
    """Load one English/<other language> TMX file into a two-column DataFrame.

    Reads ``data/{file_name}.tmx``, visits every ``<tu>`` under ``<body>`` and
    keeps the units that have non-empty ``<seg>`` text for both English and the
    second language.

    Args:
        file_name: Base name of the TMX file without extension, shaped like
            ``"en-fr"``. The part after the first ``-`` is the language code
            that is matched against each ``<tuv>`` language attribute.

    Returns:
        pandas.DataFrame with an ``"english"`` column and a second column named
        after the other language ("Portuguese", "French", "Spanish", or
        "German" for any other code).

    Raises:
        IndexError: if ``file_name`` contains no ``-``.
    """
    # Imported locally from the standard library: the notebook's lxml import is
    # commented out, so `etree` would otherwise be undefined on a fresh kernel.
    import xml.etree.ElementTree as etree

    # Fully-qualified name of the xml:lang attribute as exposed by the parser.
    xml_lang_key = "{http://www.w3.org/XML/1998/namespace}lang"

    # Path to TMX file
    tmx_path = f"data/{file_name}.tmx"

    # Parse the TMX XML
    root = etree.parse(tmx_path).getroot()

    name = file_name.split('.')[0].split('-')[1]

    # Any code other than pt/fr/es is labelled German (unchanged default).
    lang_name = {
        "pt": "Portuguese",
        "fr": "French",
        "es": "Spanish",
    }.get(name, "German")

    print(f"Creating the dataframe for {lang_name} language")

    # TMX content is inside <body>
    body = root.find("body")

    rows = []
    for tu in body.findall("tu"):
        en_text = None
        second_text = None

        for tuv in tu.findall("tuv"):
            # Language attribute (xml:lang or lang)
            lang = tuv.attrib.get(xml_lang_key) or tuv.attrib.get("lang")
            seg = tuv.find("seg")
            if seg is None:
                continue

            if lang == "en":
                en_text = seg.text
            elif lang == name:
                second_text = seg.text

        # Only keep pairs where both languages exist (and are non-empty)
        if en_text and second_text:
            rows.append({
                "english": en_text,
                lang_name: second_text,
            })

    # Create DataFrame
    return pd.DataFrame(rows)

In [None]:
# Build one DataFrame per TMX base name, walking the file list back to front.
data = {}
for f in reversed(files):
    print(f"Starting for langauage {f}", flush=True)
    data[f] = get_data(f)
    print(f"Done with the language {f}")

In [None]:
import re

def clean_text(text):
    """Normalize whitespace, dashes and quotes in a single text value.

    Non-string inputs are returned unchanged. For strings: non-breaking spaces
    become regular spaces, the dash characters matched by the pattern become
    "-", curly quotes become straight quotes, the replacement character is
    removed, and runs of whitespace collapse to one space with the ends
    stripped.
    """
    # Anything that is not a str passes straight through.
    if not isinstance(text, str):
        return text

    # Non-breaking space -> regular space
    cleaned = text.replace('\xa0', ' ')

    # Dash variants -> ASCII hyphen
    cleaned = re.sub(r'[‐-–—]', '-', cleaned)

    # Curly quotes -> straight quotes, then drop the replacement character
    substitutions = (
        ('“', '"'),
        ('”', '"'),
        ('‘', "'"),
        ('’', "'"),
        ('�', ''),
    )
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)

    # Collapse whitespace runs and trim both ends
    return re.sub(r'\s+', ' ', cleaned).strip()


In [None]:
# Clean every cell of each language frame, then order the rows by the English text.
for lang_key in list(data):
    print(f"starting for {lang_key}")
    data[lang_key] = data[lang_key].map(clean_text).sort_values("english")


In [None]:
for key, df in data.items():
    print(f"The lenght of langauge {key} is {len(df)}")

In [None]:
base_df = min(data.values(), key=len)
len(base_df)

In [None]:
df3 = base_df.merge(data['en-pt'], how='inner', on='english')

In [None]:
df3 = df3.drop_duplicates(subset=["english"])


In [None]:
df3.loc[454]

In [None]:
# Start from the smallest language frame.
base_df = min(data.values(), key=len)

# De-duplicate on the join key BEFORE merging: an inner merge on a repeated
# "english" value produces every left x right combination, which grows with
# each additional language. Keeping the first occurrence per frame yields the
# same rows that a later drop_duplicates(subset=["english"]) on the merged
# result would keep.
merged_df = base_df.drop_duplicates(subset=["english"])

for df in data.values():
    # Identity check: skip the frame that was chosen as the base.
    if df is not base_df:
        merged_df = merged_df.merge(
            df.drop_duplicates(subset=["english"]),
            on="english",
            how="inner",
        )

# Every English sentence should now appear exactly once.
assert merged_df["english"].is_unique

In [None]:
len(merged_df)

In [None]:
merged_df.isna().sum()

In [None]:
merged_df = merged_df.drop_duplicates(subset=["english"])


In [None]:
len(merged_df)

In [None]:
merged_df = merged_df.sort_values("english")


In [None]:
len(merged_df)

In [None]:
# Collect every non-alphanumeric character that occurs anywhere in merged_df.
# Column values are iterated directly; indexing with .iloc[i][col] builds a
# whole row Series for every single cell and is far slower for the same result.
alums = set()
for col in merged_df.columns:
    for val in merged_df[col]:
        alums.update(ch for ch in str(val) if not ch.isalnum())

len(alums)

In [None]:
merged_df.to_csv("data.csv", sep="\t", index=False)

In [None]:
# Longest cell (in characters) across all columns of merged_df; 0 if there are no rows.
cols = list(merged_df.columns)
maximum = max(
    (len(rows[col]) for _, rows in merged_df.iterrows() for col in cols),
    default=0,
)
maximum

In [None]:
import random

# Column name in merged_df -> language code used for src_lang / tgt_lang.
LANGS = {
    "english": "en_XX",
    "Portuguese": "pt_XX",
    "Spanish": "es_XX",
    "French": "fr_XX",
    "German": "de_DE",
}

# Dedicated, seeded generator so re-running the cell yields the same
# cross-lingual pairs.
PAIR_SEED = 42
rng = random.Random(PAIR_SEED)

# Non-English columns; computed once because it does not depend on the row.
others = [l for l in LANGS if l != "english"]

pairs = []

for _, row in merged_df.iterrows():
    texts = row.to_dict()

    # English <-> others: one pair in each direction per language
    for lang in others:
        code = LANGS[lang]
        pairs.append({
            "src_text": texts["english"],
            "tgt_text": texts[lang],
            "src_lang": "en_XX",
            "tgt_lang": code,
        })
        pairs.append({
            "src_text": texts[lang],
            "tgt_text": texts["english"],
            "src_lang": code,
            "tgt_lang": "en_XX",
        })

    # One random cross-lingual pair between two distinct non-English languages
    l1, l2 = rng.sample(others, 2)
    pairs.append({
        "src_text": texts[l1],
        "tgt_text": texts[l2],
        "src_lang": LANGS[l1],
        "tgt_lang": LANGS[l2],
    })


In [None]:
from format_dataset import get_final_dataframe

df = get_final_dataframe()

In [None]:
df.to_csv("data2.csv", sep="\t", index=False)

In [None]:
import pandas as pd
df = pd.read_csv('data2.csv', sep="\t")

In [None]:
len(df)

In [None]:
df.tail()

In [None]:
from tokenize_dataset import create_dataset_dict, map_dataset
dataset = create_dataset_dict(df)

In [None]:
dataset['train']

In [None]:
df.isna().sum()

In [None]:
dataset

In [None]:
from tokenize_dataset import map_dataset
tokenized_dataset = map_dataset(dataset)

In [None]:
max_length = 250
# Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of source/target texts for seq2seq training.

    Args:
        examples: Batch mapping with "source" and "target" entries (lists of
            strings when used with ``map(..., batched=True)``).

    Returns:
        The tokenizer output for the sources, with the target token ids stored
        under "labels". Both sides are truncated and padded to ``max_length``.

    Notes:
        - Uses the module-level ``tokenizer`` and ``max_length``.
        - Only "source" and "target" are read; the language used on each side
          is whatever ``tokenizer.src_lang`` / ``tokenizer.tgt_lang`` are set to
          when this runs.
        - Padding positions in "labels" keep the tokenizer's pad id; they are
          not replaced with -100 here.
    """
    inputs = examples["source"]
    targets = examples["target"]

    # `text_target` tokenizes the targets in target mode and stores their ids
    # under "labels"; it replaces the deprecated `as_target_tokenizer()`
    # context manager.
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )
    return model_inputs


In [None]:
from tokenize_dataset import initlized_tokenizer
tokenizer = initlized_tokenizer()

In [None]:
tokenizer.src_lang

In [None]:
tokenizer.tgt_lang = 'de_DE'

In [None]:
a = "Betriebsinhaber, die von dieser"
b = "what is what the fuck"
tokenizer.decode(tokenizer.encode(a))

In [None]:
len(tokenizer)

In [None]:
tokenized_datasets =  dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names )


In [None]:
import tiktoken
# `checkpoint` is imported here but not used in this cell.
from environment_variables import checkpoint
# NOTE(review): the name passed below looks like a Hugging Face model id rather
# than a tiktoken encoding name; presumably get_encoding() will reject it —
# confirm, and remove this cell if it was only an experiment.
tiktoken.get_encoding("mbart-large-50-many-to-many-mmt")

In [None]:
from datasets import load_from_disk
tokenized_datasets = load_from_disk('tokenized_dataset/French')

In [None]:
type(tokenized_datasets)

In [None]:
tokenized_datasets

In [None]:
from datasets import DatasetDict

# Small subset for quick experiments: first 10,000 training rows and the first
# 100 rows of each evaluation split. Note that `range(10,000)` is parsed as
# `range(10, 0)` (an empty range), so the count is written as 10_000.
small_dataset = DatasetDict({
    "train": tokenized_datasets["train"].select(range(10_000)),
    "val": tokenized_datasets["val"].select(range(100)),
    "test": tokenized_datasets["test"].select(range(100)),
})

In [None]:
small_dataset['train'][99] == tokenized_datasets['train'][99]

In [None]:
a = 10000

In [None]:
len(tokenized_datasets['train'])

In [None]:
round(10000 * 100 / 388317, 2)

In [None]:
len(tokenized_datasets['test'])

In [None]:
round(2.58 * 48540 / 100)

In [None]:
tokenized_datasets['test']

In [None]:
# Scratch sanity check on `a`.
# NOTE(review): the `assert` below is given only a non-empty string, which is
# always truthy, so the else branch can never fail. If a failure was intended,
# use `raise AssertionError(...)` or `assert <condition>, "<message>"`; also
# confirm that `a < 0` is the condition that was meant.
if a < 0:
    print("Good")
else:
    assert f"What the fuck is going on"

## Version 2


In [None]:
import pandas as pd
data = pd.read_csv('data/data.csv', sep='\t')

In [None]:
data.head()

In [None]:
# Columns of `data` to tag, and the language code written in front of each cell.
langs = ['english', 'French', 'Portuguese', 'Spanish', 'German']
lang_codes = {
    "english": "en_XX",
    "Portuguese": "pt_XX",
    "Spanish": "es_XX",
    "French": "fr_XX",
    "German": "de_DE",
} # Adjust codes as needed

# Rewrite each cell as "<code> text". The columns are overwritten in place, so
# running this cell twice prefixes the text twice.
for column in langs:
    prefix = f"<{lang_codes[column]}> "
    data[column] = data[column].apply(lambda cell, prefix=prefix: prefix + f"{cell}")

In [None]:
data.shape

In [None]:
data.head()

In [None]:
final_dataframe = pd.DataFrame(columns=['src_text', 'tgt_text'])
final_dataframe.head()

In [None]:
data.iloc[0]['english']

In [None]:
langs

In [None]:
# from tqdm import tqdm

# for i in tqdm(range(len(data))):
#         for src_lang in langs:
#                 for tgt_lang in langs:
#                         if src_lang != tgt_lang:
#                                 final_dataframe['src_text'] = data.iloc[i][src_lang]
#                                 final_dataframe['tgt_text'] = data.iloc[i][tgt_lang]

In [None]:
from datasets import Dataset
import random

dataset = Dataset.from_pandas(data)
# List of languages and their mBART codes


def generate_pairs(example):
    """Build up to 10 shuffled translation pairs from one multilingual row.

    Every ordered combination of two different entries of the module-level
    ``langs`` list becomes a ``{'src_text', 'tgt_text'}`` dict; the list is
    shuffled with the global ``random`` state and cut to its first 10 items.

    Args:
        example: Mapping from language column name to that row's text.

    Returns:
        ``{'pairs': [...]}`` holding at most 10 pair dicts.
    """
    candidates = [
        {
            'src_text': example[source],
            'tgt_text': example[target],
        }
        for source in langs
        for target in langs
        if source != target
    ]
    # Optional: Shuffle and sample to avoid redundancy/excess data
    random.shuffle(candidates)
    # Limit per example if dataset is large
    return {'pairs': candidates[:10]}

# Flatten into a dataset of pairs
dataset = dataset.map(generate_pairs, remove_columns=dataset.column_names)


In [None]:
dataset

In [None]:
#dataset['pairs'][0]

In [None]:
def pair_generator():
    """Yield each pair stored under 'pairs' in every row of the module-level ``dataset``."""
    for row in dataset:
        yield from row['pairs']

dataset_final = Dataset.from_generator(pair_generator)

In [None]:
#dataset_final = Dataset.from_list([item for sublist in dataset['pairs'] for item in sublist])

In [None]:
len(dataset_final)

In [None]:
dataset_final

In [None]:
# Split into train/test (90/10). The seed is fixed so the same rows land in the
# same split on every run.
dataset_final = dataset_final.train_test_split(test_size=0.1, seed=42)

In [None]:
dataset_final.save_to_disk('data/final_dataset/')

In [None]:
dataset_final['test'][0]

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import get_peft_model, LoraConfig

# Load the pretrained tokenizer and seq2seq model for the given checkpoint name.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Apply PEFT (LoRA config example; adjust ranks/modules as needed)
peft_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=8,  # LoRA rank
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"]  # For mBART, focus on attention layers
)
# Wrap the base model with the LoRA configuration; `model` is rebound to the wrapped model.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # Should show ~0.1-1% trainable params

In [None]:
model

In [None]:
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "en_XX"

In [None]:
max_length = 128  # Adjust as needed

def preprocess_function(examples):
    """Tokenize a batch of 'src_text' / 'tgt_text' pairs for seq2seq training.

    Args:
        examples: Batch mapping with 'src_text' and 'tgt_text' entries (lists
            of strings when used with ``map(..., batched=True)``).

    Returns:
        The tokenizer output for the sources, with the target token ids stored
        under "labels". Both sides are truncated and padded to ``max_length``.

    Notes:
        Uses the module-level ``tokenizer`` and ``max_length``.
        NOTE(review): the texts in this notebook carry a literal "<code> "
        prefix while ``tokenizer.src_lang`` / ``tgt_lang`` are set separately;
        confirm how the tokenizer treats that prefix before training on it.
    """
    # `text_target` tokenizes the targets in target mode and stores their ids
    # under "labels"; it replaces the deprecated `as_target_tokenizer()`
    # context manager. Fixed-length padding keeps every example the same size.
    model_inputs = tokenizer(
        examples['src_text'],
        text_target=examples['tgt_text'],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )
    return model_inputs

# `dataset_final` is a DatasetDict here (result of train_test_split), whose
# `column_names` is a dict keyed by split; `remove_columns` needs the flat list
# of column names, taken from one split.
tokenized_datasets = dataset_final.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_final["train"].column_names,
)

In [None]:
from pathlib import Path
from environment_variables import raw_data_path
dir_path = Path(raw_data_path)

# Raw data files as Path objects, smallest first. `files` deliberately stays a
# list of Paths so that later cells can still read `.name` and `.stat()`.
files = sorted(
    (f for f in dir_path.iterdir() if f.is_file()),
    key=lambda x: x.stat().st_size,
)

# Part of each base file name after the first "-" (e.g. "en-fr.tmx" -> "fr").
lang_suffixes = [f.name.split('.')[0].split('-')[1] for f in files]
lang_suffixes

In [None]:
for f in files:
    print(f.name, f.stat().st_size)

In [None]:
import pandas as pd
merged_df = pd.DataFrame()


In [None]:
merged_df.empty

In [1]:
from format_dataset import get_final_dataframe

dataframe = get_final_dataframe()

2026-01-05 01:48:05,894 | INFO | root | Creating dataframe for German
2026-01-05 01:48:19,552 | INFO | root | Creating dataframe for French
2026-01-05 01:48:34,381 | INFO | root | Creating dataframe for Spanish
2026-01-05 01:48:49,287 | INFO | root | Creating dataframe for Portuguese


In [3]:
dataframe.head()

Unnamed: 0,english,German,French,Spanish,Portuguese
0,"""","""",»,»,».
216,""" ""EEA"" shall mean the European Economic Area ...",""" ""EWR"": der Europäische Wirtschaftsraum im Si...","""""EEE"": l'Espace économique européen tel que d...","""""EEE"": el Espacio Económico Europeo según se ...","""""EEE"": o Espaço Económico Europeu, conforme d..."
217,""" ""connected NCB"" shall mean an NCB real-time ...",""" ""angeschlossene NZB"": eine NZB, deren Echtze...","""""BCN connectée"": une BCN dont le système à rè...","""""BCN conectado"": el BCN cuyo sistema de liqui...","""""BCN ligado"": o sistema de liquidação por bru..."
218,""" ""finality"" or ""final"" shall mean that the se...",""" ""Endgültigkeit"" bzw. ""endgültig"": Die Abwick...","""""caractère définitif"" ou ""définitif"": le fait...","""""firmeza"" o ""firme"": que la liquidación de un...","""""Carácter definitivo"" ou ""irrevogável"": signi..."
219,""" ""inter-NCB accounts"" shall mean the accounts...",""" ""Inter-NZB-Konten"": die Verrechnungskonten, ...","""""comptes inter-BCN"": les comptes que les BCN ...","""""cuentas entre BCN"": las cuentas que cada BCN...","""""Contas inter-BCN"": as contas interbancárias ..."


In [4]:
len(dataframe)

363449

In [5]:
dataframe.to_csv('data/csv_data/data3.csv', sep='\t', index=False)

In [1]:
import pandas as pd
dataframe = pd.read_csv('./data/csv_data/data3.csv', sep='\t')

In [7]:
dataframe.head()

Unnamed: 0,english,German,French,Spanish,Portuguese
0,"""","""",»,»,».
216,""" ""EEA"" shall mean the European Economic Area ...",""" ""EWR"": der Europäische Wirtschaftsraum im Si...","""""EEE"": l'Espace économique européen tel que d...","""""EEE"": el Espacio Económico Europeo según se ...","""""EEE"": o Espaço Económico Europeu, conforme d..."
217,""" ""connected NCB"" shall mean an NCB real-time ...",""" ""angeschlossene NZB"": eine NZB, deren Echtze...","""""BCN connectée"": une BCN dont le système à rè...","""""BCN conectado"": el BCN cuyo sistema de liqui...","""""BCN ligado"": o sistema de liquidação por bru..."
218,""" ""finality"" or ""final"" shall mean that the se...",""" ""Endgültigkeit"" bzw. ""endgültig"": Die Abwick...","""""caractère définitif"" ou ""définitif"": le fait...","""""firmeza"" o ""firme"": que la liquidación de un...","""""Carácter definitivo"" ou ""irrevogável"": signi..."
219,""" ""inter-NCB accounts"" shall mean the accounts...",""" ""Inter-NZB-Konten"": die Verrechnungskonten, ...","""""comptes inter-BCN"": les comptes que les BCN ...","""""cuentas entre BCN"": las cuentas que cada BCN...","""""Contas inter-BCN"": as contas interbancárias ..."


In [8]:
from tokenize_dataset import create_dataset_dict

final_dataset = create_dataset_dict(dataframe)

2026-01-05 01:52:45,664 | INFO | datasets | PyTorch version 2.5.1+cu124 available.


Map:   0%|          | 0/363449 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [9]:
final_dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'src_lang', 'target', 'tgt_lang'],
        num_rows: 2907592
    })
    val: Dataset({
        features: ['source', 'src_lang', 'target', 'tgt_lang'],
        num_rows: 363449
    })
    test: Dataset({
        features: ['source', 'src_lang', 'target', 'tgt_lang'],
        num_rows: 363449
    })
})

In [13]:
from pprint import pprint
a = 56454
# Fetch the row first, then its fields: `split[a]` reads a single example,
# while `split['column'][a]` may load the whole column just to pick one element.
sample = final_dataset['test'][a]
pprint("src " + sample['source'])
pprint("tgt " + sample['target'])
pprint("src lang " + sample['src_lang'])
pprint("tgt lang " + sample['tgt_lang'])

('src Obvious formal errors such as typing errors on a proof of origin should '
 'not cause the document to be rejected if those errors are not such as to '
 'create doubts concerning the correctness of the statements made in the '
 'document.')
('tgt Los errores de forma manifiestos, tales como los errores de mecanografía '
 'en un documento de prueba del origen no implicarán el rechazo del documento '
 'si dichos errores no pueden suscitar dudas acerca de la exactitud de las '
 'declaraciones contenidas en dicho documento.')
'src lang en_XX'
'tgt lang es_XX'


In [16]:
final_dataset.save_to_disk('./temp_data')

Saving the dataset (0/3 shards):   0%|          | 0/2907592 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/363449 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/363449 [00:00<?, ? examples/s]

In [2]:
from datasets import load_from_disk
final_dataset = load_from_disk('./temp_data')

In [3]:
final_dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'src_lang', 'target', 'tgt_lang'],
        num_rows: 2907592
    })
    val: Dataset({
        features: ['source', 'src_lang', 'target', 'tgt_lang'],
        num_rows: 363449
    })
    test: Dataset({
        features: ['source', 'src_lang', 'target', 'tgt_lang'],
        num_rows: 363449
    })
})

In [5]:
from tokenize_dataset import map_dataset
from initialize_model import init_tokenizer
tokenizer = init_tokenizer()

In [None]:
tokenized_dataset = map_dataset(tokenizer=tokenizer, dataset=final_dataset)

Map:   0%|          | 0/2907592 [00:00<?, ? examples/s]



Map:   0%|          | 0/363449 [00:00<?, ? examples/s]

Map:   0%|          | 0/363449 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2907592
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 363449
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 363449
    })
})

In [None]:
from environment_variables import tokenized_data_path
tokenized_dataset.save_to_disk(tokenized_data_path)

Saving the dataset (0/19 shards):   0%|          | 0/2907592 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/363449 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/363449 [00:00<?, ? examples/s]

In [1]:
from datasets import load_from_disk
from environment_variables import tokenized_data_path
tokenized_dataset = load_from_disk(tokenized_data_path)

Loading dataset from disk:   0%|          | 0/19 [00:00<?, ?it/s]

In [4]:
from pprint import pprint
a = 56454
# Fetch the row first, then its fields: `split[a]` reads a single example,
# while `split['column'][a]` may load the whole column just to pick one element.
row = tokenized_dataset['test'][a]
src = row['input_ids']
tgt = row['labels']

In [2]:
from initialize_model import init_model

In [3]:
model = init_model(get_lora_model=False)

2026-01-08 15:46:05,482 | INFO | accelerate.utils.modeling | We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


In [1]:
# 1. Install vLLM
!pip install vllm





In [4]:
from vllm import LLM

# NOTE(review): `model` is the object returned by init_model(get_lora_model=False)
# in an earlier cell; presumably LLM(...) expects a model name or path rather
# than an already-loaded model object — confirm against the vLLM docs.
# NOTE(review): the recorded output for this cell is a ModuleNotFoundError for
# `vllm._C`, so the vLLM install itself also needs checking.
llm = LLM(model)

ModuleNotFoundError: No module named 'vllm._C'

In [5]:
'ab'[:-1]

'a'