In [3]:
from lxml import etree
import pandas as pd
import os
from pathlib import Path

In [1]:
import torch
torch.__version__

'2.9.0+cu130'

In [4]:
dir_path = Path("data/")

files = [f.name.split('.')[0] for f in dir_path.iterdir() if f.is_file()]
files

['en-de', 'en-es', 'en-fr', 'en-pt']

In [39]:
def get_data(file_name):
    # Path to TMX file
    tmx_path = f"data/{file_name}.tmx"

    # Parse the TMX XML
    tree = etree.parse(tmx_path)
    root = tree.getroot()

    rows = []
    name = file_name.split('.')[0].split('-')[1]

    if name == "pt":
        lang_name = "Portuguese"
    elif name == "fr":
        lang_name = "French"
    elif name == "es":
        lang_name = "Spanish"
    else :
        lang_name = "German"
    
    print(f"Creating the dataframe for {lang_name} language")
    # TMX content is inside <body>
    body = root.find("body")

    for tu in body.findall("tu"):
        en_text = None
        second_text = None

        for tuv in tu.findall("tuv"):
            # Language attribute (xml:lang or lang)
            lang = (
                tuv.attrib.get("{http://www.w3.org/XML/1998/namespace}lang")
                or tuv.attrib.get("lang")
            )
            seg = tuv.find("seg")
            if seg is None:
                continue

            if lang == "en":
                en_text = seg.text
            elif lang == name:
                second_text = seg.text

        # Only keep pairs where both languages exist
        if en_text and second_text:
            rows.append({
                "english": en_text,
                lang_name : second_text
            })

    # Create DataFrame
    return pd.DataFrame(rows)

In [41]:
data = {}
for f in files[::-1]:
    print(f"Starting for langauage {f}", flush=True)
    d = get_data(f)
    data[f] = d
    ## df3 = df2.merge(df, on='english', how='inner')
    print(f"Done with the language {f}")

Starting for langauage en-pt
Creating the dataframe for Portuguese language
Done with the language en-pt
Starting for langauage en-fr
Creating the dataframe for French language
Done with the language en-fr
Starting for langauage en-es
Creating the dataframe for Spanish language
Done with the language en-es
Starting for langauage en-de
Creating the dataframe for German language
Done with the language en-de


In [69]:
import re

def clean_text(text):
    if not isinstance(text, str):
        return text

    # Replace non-breaking space
    text = text.replace('\xa0', ' ')

    # Normalize dashes
    text = re.sub(r'[‐-–—]', '-', text)

    # Normalize quotes
    text = text.replace('“', '"').replace('”', '"')
    text = text.replace('‘', "'").replace('’', "'")

    # Remove replacement character
    text = text.replace('�', '')

    # Collapse whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [73]:
for i in data:
    print(f"starting for {i}")
    data[i] = data[i].map(clean_text)
    data[i] = data[i].sort_values("english")


starting for en-pt
starting for en-fr
starting for en-es
starting for en-de


In [71]:
for key, df in data.items():
    print(f"The lenght of langauge {key} is {len(df)}")

The lenght of langauge en-pt is 1048025
The lenght of langauge en-fr is 503436
The lenght of langauge en-es is 504981
The lenght of langauge en-de is 505369


In [74]:
base_df = min(data.values(), key=len)
len(base_df)

503436

In [135]:
df3 = base_df.merge(data['en-pt'], how='inner', on='english')

In [150]:
df3 = df3.drop_duplicates(subset=["english"])


In [155]:
df3.loc[454]

english       "(f) the following entries for the status of t...
French        "f) les indications suivantes relatives au sta...
Portuguese    "f) Os seguintes indicadores do estatuto das m...
Name: 454, dtype: object

In [None]:
base_df = min(data.values(), key=len)
len(base_df)

merged_df = base_df.copy()

for df in data.values():
    if df is not base_df:
        merged_df = merged_df.merge(df, on="english", how="inner")

In [156]:
len(merged_df)

113204494

In [58]:
merged_df.isna().sum()

english       0
French        0
Portuguese    0
Spanish       0
German        0
dtype: int64

In [157]:
merged_df = merged_df.drop_duplicates(subset=["english"])


In [158]:
len(merged_df)

363524

In [None]:
merged_df = merged_df.sort_values("english")


In [None]:
len(merged_df)

In [None]:
alums = set()
for col in merged_df.columns:
    for i in range(len(merged_df)):
        val = merged_df.iloc[i][col]
        for j in str(val):
            if not j.isalnum():
                alums.add(j)

len(alums)

In [163]:
merged_df.to_csv("data.csv", sep="\t", index=False)

In [None]:
maximum = 0
cols = list(merged_df.columns)
for _, rows in merged_df.iterrows():
    for col in cols:
        maximum = max( len(rows[col]), maximum)
maximum

ADDITIONAL AGREEMENT to the Agreement concerning products of the clock and watch industry between the European Economic Community and its Member States and the Swiss Confederation


In [None]:
import random

LANGS = {
    "english": "en_XX",
    "Portuguese": "pt_XX",
    "Spanish": "es_XX",
    "French": "fr_XX",
    "German": "de_DE",
}

pairs = []

for _, row in merged_df.iterrows():
    texts = row.to_dict()

    # English ↔ others
    for lang, code in LANGS.items():
        if lang != "english":
            pairs.append({
                "src_text": texts["english"],
                "tgt_text": texts[lang],
                "src_lang": "en_XX",
                "tgt_lang": code,
            })
            pairs.append({
                "src_text": texts[lang],
                "tgt_text": texts["english"],
                "src_lang": code,
                "tgt_lang": "en_XX",
            })

    # random cross-lingual pair
    others = [l for l in LANGS if l != "english"]
    l1, l2 = random.sample(others, 2)
    pairs.append({
        "src_text": texts[l1],
        "tgt_text": texts[l2],
        "src_lang": LANGS[l1],
        "tgt_lang": LANGS[l2],
    })


In [1]:
from format_dataset import get_final_dataframe

df = get_final_dataframe()

Starting for langauage en-pt
Done with the language en-pt
Starting for langauage en-fr
Done with the language en-fr
Starting for langauage en-es
Done with the language en-es
Starting for langauage en-de
Done with the language en-de


In [2]:
df.to_csv("data2.csv", sep="\t", index=False)

In [None]:
import pandas as pd
df = pd.read_csv('data2.csv', sep="\t")

In [3]:
len(df)

2442191

In [4]:
df.tail()

Unnamed: 0,source,target
2442186,<en_XX> Farmers who make use of this possibili...,"<de_DE> Betriebsinhaber, die von dieser Möglic..."
2442187,<en_XX> 2. the following Article 28a is inserted:,<de_DE> 2. Der folgende Artikel 28a wird einge...
2442188,<en_XX> Three months period provided for in Ar...,<de_DE> Drei-Monats-Zeitraum gemäß Artikel 51 ...
2442189,<en_XX> The Member States indicated in the Ann...,<de_DE> Die im Anhang aufgeführten Mitgliedsta...
2442190,<en_XX> 3. an Annex as set out in the Annex to...,<de_DE> 3. Der im Anhang zu dieser Verordnung ...


In [5]:
from tokenize_dataset import create_dataset_dict, map_dataset
dataset = create_dataset_dict(df)



In [6]:
dataset['train']

Dataset({
    features: ['source', 'target'],
    num_rows: 1953752
})

In [7]:
df.isna().sum()

source    0
target    0
dtype: int64

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 1953752
    })
    val: Dataset({
        features: ['source', 'target'],
        num_rows: 244219
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 244220
    })
})

In [8]:
from tokenize_dataset import map_dataset
tokenized_dataset = map_dataset(dataset)

The dataset is DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 1953752
    })
    val: Dataset({
        features: ['source', 'target'],
        num_rows: 244219
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 244220
    })
})


Map:   0%|          | 0/1953752 [00:00<?, ? examples/s]



TypeError: 'NoneType' object is not iterable

In [15]:
max_length = 250
# Preprocessing function
def preprocess_function(examples):

    inputs = examples["source"]
    targets = examples["target"]
    
    # Tokenize inputs with source language
    model_inputs = tokenizer(
        inputs,
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )
    
    # labels["input_ids"] = [
    #     [(l if l != tokenizer.pad_token_id else -100) for l in label]
    #     for label in labels["input_ids"]
    # ]

    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [None]:
from tokenize_dataset import initlized_tokenizer
tokenizer = initlized_tokenizer()

In [23]:
tokenizer.src_lang

'en_XX'

In [25]:
tokenizer.tgt_lang = 'de_DE'

In [22]:
a = "Betriebsinhaber, die von dieser"
b = "what is what the fuck"
tokenizer.decode(tokenizer.encode(a))

'en_XX Betriebsinhaber, die von dieser</s>'

In [17]:
len(tokenizer)

250054

In [26]:
tokenized_datasets =  dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names )


Map:   0%|          | 0/1953752 [00:00<?, ? examples/s]



KeyboardInterrupt: 

In [30]:
import tiktoken
from environment_variables import checkpoint
tiktoken.get_encoding("mbart-large-50-many-to-many-mmt")

ValueError: Unknown encoding mbart-large-50-many-to-many-mmt.
Plugins found: ['tiktoken_ext.openai_public']
tiktoken version: 0.8.0 (are you on latest?)

In [32]:
from datasets import load_from_disk
tokenized_datasets = load_from_disk('tokenized_dataset/French')

In [39]:
type(tokenized_datasets)

datasets.dataset_dict.DatasetDict

datasets.dataset_dict.DatasetDict

In [46]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 388317
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 48540
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 48540
    })
})

In [None]:
from datasets import DatasetDict

small_dataset = DatasetDict({
    "train": tokenized_datasets["train"].select(range(10,000)),
    "val": tokenized_datasets["val"].select(range(100)),
    "test": tokenized_datasets["test"].select(range(100)),
})

In [53]:
small_dataset['train'][99] == tokenized_datasets['train'][99]

True

In [65]:
a = 10000

In [55]:
len(tokenized_datasets['train'])

388317

In [57]:
round(10000 * 100 / 388317, 2)

2.58

In [58]:
len(tokenized_datasets['test'])

48540

In [60]:
round(2.58 * 48540 / 100)

1252

In [62]:
tokenized_datasets['test']

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 48540
})

In [68]:
if a < 0:
    print("Good")
else:
    assert f"What the fuck is going on"