In [1]:
import os
import re
import pandas as pd

In [2]:
# Load the tab-separated sentence file. Passing `names` makes pandas treat the
# first line as data and label the four columns; `usecols` keeps only the
# Spanish ("es") and Mapudungun ("map") columns instead of loading the two
# placeholder columns and discarding them afterwards.
df = pd.read_csv(
    "sentences.tsv",
    sep="\t",
    encoding="utf-8",
    names=["0", "es", "1", "map"],
    usecols=["es", "map"],
)

# Preview 20 random rows; the seed keeps the displayed sample reproducible
# across kernel restarts.
df.sample(20, random_state=42)


Unnamed: 0,es,map
18,Es el peor ladrón del pueblo porque siempre lo...,Rumel pengen mew fey doy weda weñefengey kara ...
216,¿Cuándo puedes venir?,Chumül pepi küpaymi?
83,¡Baila!,Pürünge!
343,Le gustan los dulces.,Kümentukey ta kochülu.
308,¿Son ellos japoneses?,Niponchengey am fey engün?
136,Devuélvame mi ropa.,Wüñolelaen ñi takuluwün.
23,Debería dormir más porque todas las tardes and...,Komke narantü ürkülen iñche feymew müley ñi do...
186,Cuando era joven...,Wechekalu iñche...
219,"Cuando estudio, escucho música con auriculares.","Chillkatuyüm iñche, allkütuken musika kümeallk..."
56,No.,Nu.


In [3]:
def parse_txt_file(filepath):
    """Parse one transcript file into a list of Mapudungun/Spanish pairs.

    The file is read as UTF-8, one stripped line at a time:

    * a line starting with ``M:`` adds its remainder to the current block's
      Mapudungun text;
    * a line starting with ``C:`` adds its remainder to the current block's
      Spanish (castellano) text;
    * a line of the form ``<id>:`` (letters, digits, ``_`` and ``-`` only)
      closes the current block and opens a new one;
    * every other line is ignored.

    The ``M:``/``C:`` prefixes are tested *before* the header pattern because a
    bare ``M:`` or ``C:`` line (an empty translation) also matches the header
    pattern and would otherwise be mistaken for a new block id.

    Args:
        filepath: Path of the text file to parse.

    Returns:
        A list with one dict per block, in file order. Each dict has the keys
        ``"map"`` and ``"es"``; the value is the block's collected lines joined
        with newlines and stripped (possibly an empty string).
    """
    header_re = re.compile(r"^([a-zA-Z0-9_-]+):$")

    def build_block(mapudungun_lines, castellano_lines):
        # Collapse the collected lines of one block into its final record.
        return {
            "map": "\n".join(mapudungun_lines).strip(),
            "es": "\n".join(castellano_lines).strip(),
        }

    blocks = []
    current_id = None
    current_mapudungun = []
    current_castellano = []

    with open(filepath, encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()

            if line.startswith("M:"):
                current_mapudungun.append(line[2:].strip())
                continue
            if line.startswith("C:"):
                current_castellano.append(line[2:].strip())
                continue

            match = header_re.match(line)
            if match:
                # A new header ends the previous block, if one was open.
                # Text collected before the first header is discarded.
                if current_id:
                    blocks.append(build_block(current_mapudungun, current_castellano))
                current_id = match.group(1)
                current_mapudungun = []
                current_castellano = []

    # The last block has no following header to close it.
    if current_id:
        blocks.append(build_block(current_mapudungun, current_castellano))

    return blocks

def crear_dataframe_desde_directorio(directorio):
    """Build one DataFrame from every ``.txt`` file directly inside a directory.

    Each matching file is parsed with ``parse_txt_file`` and the resulting
    blocks are concatenated in file-name order.

    Args:
        directorio: Path of the directory to scan (not recursive).

    Returns:
        A pandas DataFrame with the columns ``"map"`` and ``"es"``, one row per
        parsed block. The columns are present even when no block was found.
    """
    all_blocks = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and therefore the row index) reproducible across machines.
    for file in sorted(os.listdir(directorio)):
        if file.endswith(".txt"):
            all_blocks.extend(parse_txt_file(os.path.join(directorio, file)))

    # Explicit columns keep the schema stable for an empty result, so later
    # column access such as df["es"] does not fail.
    return pd.DataFrame(all_blocks, columns=["map", "es"])


# Corpus folder that is scanned for .txt files
directorio = "mapudungun-corpus//translation-clean"

# Parse every .txt file in that folder into a single DataFrame
df_map_corp = crear_dataframe_desde_directorio(directorio=directorio)

In [4]:
# Stack the corpus rows below the rows already in df and renumber the index.
# NOTE: df is rebuilt from itself, so running this cell a second time appends
# df_map_corp again.
df = pd.concat(objs=[df, df_map_corp], ignore_index=True)

In [5]:
# Remove every angle-bracket tag (<...>) from both text columns; the original
# note describes these tags as onomatopoeia markers.
for columna in ("es", "map"):
    df[columna] = df[columna].str.replace(r"<[^>]+>", "", regex=True)


In [6]:
# Write the tokenizer training corpus: for every row, the stripped "es" text
# and then the stripped "map" text, each terminated by a newline.
with open("corpus.txt", "w", encoding="utf-8") as corpus_file:
    for es, mapu in zip(df["es"], df["map"]):
        corpus_file.writelines(texto.strip() + "\n" for texto in (es, mapu))


In [7]:
from tokenizers import ByteLevelBPETokenizer

# Start from an untrained byte-level BPE tokenizer
tokenizer = ByteLevelBPETokenizer()

# Learn the vocabulary and merges from the corpus file written earlier
tokenizer.train(
    files="corpus.txt",
    vocab_size=8000,
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>"
    ]
)

# save_model expects the target directory to exist, so create it first;
# otherwise a run from a fresh checkout fails at the save step.
os.makedirs("bpe_tokenizer", exist_ok=True)

# Persist vocab.json and merges.txt; the returned file paths are the cell output
tokenizer.save_model("bpe_tokenizer")


['bpe_tokenizer\\vocab.json', 'bpe_tokenizer\\merges.txt']

In [None]:
# https://github.com/huggingface/tokenizers/tree/main/bindings/python

# Rebuild the tokenizer from the two files written by save_model
vocab_path = "bpe_tokenizer/vocab.json"
merges_path = "bpe_tokenizer/merges.txt"
tokenizer = ByteLevelBPETokenizer(vocab_path, merges_path)

# Smoke test on a short sample that also contains a newline
output = tokenizer.encode("Mari mari lamuen dos\n.")

# Token strings: in the recorded output a leading space shows up as 'Ġ' and
# the newline as 'Ċ'
print(output.tokens)

# Vocabulary ids, one per token above
print(output.ids)


['Mari', 'Ġmari', 'Ġlamuen', 'Ġdos', 'Ċ', '.']
[2775, 1383, 1310, 1143, 203, 18]


In [9]:
def _encode_ids(texto):
    """Return the token ids the trained tokenizer assigns to one text."""
    return tokenizer.encode(texto).ids


# Tokenize both language columns row by row
df["es_tok"] = df["es"].apply(_encode_ids)
df["map_tok"] = df["map"].apply(_encode_ids)

In [10]:
# Display the combined frame, now including the es_tok / map_tok token-id columns
df

Unnamed: 0,es,map,es_tok,map_tok
0,"Hola, como estas?","Marimari, chumleymi am?","[4942, 16, 545, 1938, 35]","[5052, 16, 758, 80, 615, 799, 35]"
1,Ayer me dolía la cabeza.,Wüya kutrani ñi longko.,"[4160, 273, 356, 7694, 321, 1080, 18]","[5013, 748, 1927, 439, 1227, 18]"
2,Me gusta dormir en la tarde.,Küpa umawtun narantü mew.,"[1346, 3112, 5886, 305, 321, 2937, 18]","[7090, 4816, 510, 7762, 1750, 2215, 18]"
3,Esta silla está hecha de madera.,Tüfachi wangko mamüll mew dewmangekey.,"[1403, 452, 790, 673, 3861, 326, 5905, 18]","[4973, 347, 494, 538, 1595, 2215, 1247, 3673, ..."
4,Te corté con un cuchillo.,Katrüeyu kiñe winu mew.,"[2539, 1038, 2990, 366, 418, 6610, 18]","[4141, 956, 534, 1370, 89, 2215, 18]"
...,...,...,...,...
88339,Sí.,May.,"[445, 18]","[498, 18]"
88340,"Sí, si.","May, may.","[445, 16, 452, 18]","[498, 16, 502, 18]"
88341,Ahora que esté muy bien pues hermano\nte vine ...,Feyta kümeleaymi pu peñi\npepapeyu tati.,"[1484, 332, 3825, 631, 579, 325, 503, 203, 313...","[1074, 3949, 897, 298, 509, 203, 542, 5963, 38..."
88342,Gracias sí pues gracias\nesos por venir a conv...,"Kürasia may nay kürasia,\nfey tañi ngütramkapa...","[5525, 820, 325, 2629, 203, 1725, 470, 3015, 2...","[1295, 289, 4348, 502, 3920, 392, 289, 4348, 1..."
