## MUSE

In [12]:
import pandas as pd
import re

# Define valid character patterns
en_pattern = re.compile(r'^[a-zA-Z]+$')
ko_pattern = re.compile(r'^[\uac00-\ud7a3]+$')
de_pattern = re.compile(r'^[a-zA-ZäöüÄÖÜß]+$')

In [82]:
wiki_en = pd.read_csv("/home/hyujang/multilingual-inner-lexicon/data/English_tokenizers_comparison.csv")
wiki_en_words = set(wiki_en["word"])
wiki_ko = pd.read_csv("/home/hyujang/multilingual-inner-lexicon/data/Korean_tokenizers_comparison.csv")
wiki_ko_words = set(wiki_ko["word"])
wiki_de = pd.read_csv("/home/hyujang/multilingual-inner-lexicon/data/German_tokenizers_comparison.csv")
wiki_de_words = set(wiki_de["word"])

### ko -> en

In [60]:
# Load the en–ko word pairs
ko_en_df = pd.read_csv("/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/ko-en.txt",
                 sep="\t", header=None, names=["ko", "en"], encoding="utf-8")

# Drop rows with missing values
ko_en_df = ko_en_df.dropna(subset=["ko", "en"])

# Ensure strings and filter with regex
ko_en_df = ko_en_df[ko_en_df['en'].astype(str).apply(lambda x: bool(en_pattern.fullmatch(x)))]
ko_en_df = ko_en_df[ko_en_df['ko'].astype(str).apply(lambda x: bool(ko_pattern.fullmatch(x)))]

# Group by the English word and collect all Korean translations into a list
ko_en_df = ko_en_df.groupby("ko")["en"].apply(list).reset_index()
ko_en_df
# grouped_df.to_csv("/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/ko-en.csv", index=False)

Unnamed: 0,ko,en
0,가게,"[shop, shops, stores, store]"
1,가격,"[prices, price, pricing]"
2,가격결정,[pricing]
3,가고시마,[kagoshima]
4,가공,[machining]
...,...,...
13667,힐튼,[hilton]
13668,힘내,[cheers]
13669,힘든,[tough]
13670,힘줄,"[tendons, tendon]"


In [91]:
ko_en_df[ko_en_df["ko"].apply(lambda de_word: de_word in wiki_ko_words)].sample(1000, random_state=2025).reset_index(drop=True).to_csv("/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/Korean-English_1000.csv", index=False)

### en -> ko

In [39]:
# Load the en–ko word pairs
en_ko_df = pd.read_csv("/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/en-ko.txt",
                 sep="\t", header=None, names=["en", "ko"], encoding="utf-8")

# Drop rows with missing values
en_ko_df = en_ko_df.dropna(subset=["en", "ko"])

# Ensure strings and filter with regex
en_ko_df = en_ko_df[en_ko_df['en'].astype(str).apply(lambda x: bool(en_pattern.fullmatch(x)))]
en_ko_df = en_ko_df[en_ko_df['ko'].astype(str).apply(lambda x: bool(ko_pattern.fullmatch(x)))]

# Group by the English word and collect all Korean translations into a list
en_ko_df = en_ko_df.groupby("en")["ko"].apply(list).reset_index()
en_ko_df
# grouped_df.to_csv("/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/ko-en.csv", index=False)

Unnamed: 0,en,ko
0,aachen,[아헨]
1,aalborg,[올보르]
2,aaron,"[에런, 아론]"
3,abandoned,[버림]
4,abbas,"[압바스, 아바스]"
...,...,...
15863,zoom,"[축소, 확대]"
15864,zoos,[동물원]
15865,zucchini,[호박]
15866,zulu,"[줄루어, 줄루]"


In [92]:
en_ko_df[en_ko_df["en"].apply(lambda de_word: de_word in wiki_en_words)].sample(1000, random_state=2025).reset_index(drop=True).to_csv("/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/English_Korean_1000.csv", index=False)

### en -> de

In [None]:
# Load the en–ko word pairs
en_de_df = pd.read_csv("/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/en-de.txt",
                 sep=" ", header=None, names=["en", "de"], encoding="utf-8")

# Drop rows with missing values
en_de_df = en_de_df.dropna(subset=["en", "de"])

# Ensure strings and filter with regex
en_de_df = en_de_df[en_de_df['en'].astype(str).apply(lambda x: bool(en_pattern.fullmatch(x)))]
en_de_df = en_de_df[en_de_df['de'].astype(str).apply(lambda x: bool(de_pattern.fullmatch(x)))]
en_de_df = en_de_df[en_de_df['en'] != en_de_df['de']]

# Group by the English word and collect all Korean translations into a list
en_de_df = en_de_df.groupby("en")["de"].apply(list).reset_index()
en_de_df
# grouped_df.to_csv("/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/ko-en.csv", index=False)

Unnamed: 0,en,de
0,aah,[ahh]
1,aaron,[harun]
2,ababa,[abeba]
3,abacus,[abakus]
4,abandon,[aufgeben]
...,...,...
29318,zurich,[zürich]
29319,zvereva,[swerawa]
29320,zvezda,[swesda]
29321,zvi,[zwi]


In [93]:
en_de_df[en_de_df["en"].apply(lambda de_word: de_word in wiki_en_words)].sample(1000, random_state=2025).reset_index(drop=True).to_csv("/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/English_German_1000.csv", index=False)

### de -> en

In [111]:
# Load the en–ko word pairs
de_en_df = pd.read_csv("/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/de-en.txt",
                 sep=" ", header=None, names=["de", "en"], encoding="utf-8")

# Drop rows with missing values
de_en_df = de_en_df.dropna(subset=["de", "en"])

# Ensure strings and filter with regex
de_en_df = de_en_df[de_en_df['en'].astype(str).apply(lambda x: bool(en_pattern.fullmatch(x)))]
de_en_df = de_en_df[de_en_df['de'].astype(str).apply(lambda x: bool(de_pattern.fullmatch(x)))]
de_en_df = de_en_df[de_en_df['en'] != de_en_df['de']]

# Group by the English word and collect all Korean translations into a list
de_en_df = de_en_df.groupby("de")["en"].apply(list).reset_index()
de_en_df
# grouped_df.to_csv("/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/ko-en.csv", index=False)

Unnamed: 0,de,en
0,aal,"[eel, eels]"
1,aale,[eels]
2,aalen,[eels]
3,aas,[carrion]
4,abakus,[abacus]
...,...,...
31104,übung,"[practice, drill, exercise]"
31105,übungen,"[exercises, tutorials, drills, exercise]"
31106,üppig,"[lavish, luscious, lush, luxuriant]"
31107,üppige,[lush]


In [112]:
de_en_df[de_en_df["de"].apply(lambda de_word: de_word in wiki_de_words)].sample(1000, random_state=2025).reset_index(drop=True).to_csv("/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/German_English_1000.csv", index=False)

### ko -> de (ko->en & en->de)

In [113]:
en_de_dict = dict(zip(en_de_df['en'], en_de_df['de']))
ko_de_df = ko_en_df.copy()
ko_de_df["de"] = ko_de_df["en"].apply(
    lambda en_list: [en_de_dict[en] for en in en_list if en in en_de_dict]
)
ko_de_df["de"] = ko_de_df["de"].apply(lambda nested_list: [item for sublist in nested_list for item in sublist])
ko_de_df["de"] = [list(set(l)) for l in ko_de_df["de"]]
ko_de_df = ko_de_df[ko_de_df["de"].str.len() > 0].reset_index(drop=True)

ko_de_df

Unnamed: 0,ko,en,de
0,가게,"[shop, shops, stores, store]","[filialen, geschäften, einkaufsmöglichkeiten, ..."
1,가격,"[prices, price, pricing]","[preise, preispolitik, preisen, preisliste, pr..."
2,가격결정,[pricing],"[preise, preispolitik, preisliste, preisgestal..."
3,가공,[machining],"[metallbearbeitung, bearbeitungen, bearbeitung]"
4,가구,"[households, household, furnished, furniture]","[haushalten, mobiliar, privathaushalte, hausha..."
...,...,...,...
9328,힐스,[hills],"[berge, hügel, hügeln]"
9329,힘내,[cheers],"[prost, jubel]"
9330,힘든,[tough],"[schwer, harte, hart, schwierig]"
9331,힘줄,"[tendons, tendon]","[sehne, sehnen]"


In [None]:
ko_de_df[ko_de_df["ko"].apply(lambda de_word: de_word in wiki_ko_words)][["ko","de"]].sample(1000, random_state=2025).reset_index(drop=True).to_csv("/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/Korean_German_1000.csv", index=False)

### de -> ko (de->en & en->ko)

In [63]:
en_ko_dict = dict(zip(en_ko_df['en'], en_ko_df['ko']))
de_ko_df = de_en_df.copy()
de_ko_df["ko"] = de_ko_df["en"].apply(
    lambda en_list: [en_ko_dict[en] for en in en_list if en in en_ko_dict]
)
de_ko_df["ko"] = de_ko_df["ko"].apply(lambda nested_list: [item for sublist in nested_list for item in sublist])
de_ko_df["ko"] = [list(set(l)) for l in de_ko_df["ko"]]
de_ko_df = de_ko_df[de_ko_df["ko"].str.len() > 0].reset_index(drop=True)

de_ko_df

Unnamed: 0,de,en,ko
0,aal,"[eel, eels]",[장어]
1,aale,[eels],[장어]
2,aalen,[eels],[장어]
3,abb,[fig],[무화과]
4,abbau,"[degradation, dismantling]",[해체]
...,...,...,...
16116,üblicherweise,"[usually, typically]","[보통, 일반적으로]"
16117,übrig,"[leftover, remaining]",[남음]
16118,übrige,"[remainder, rest]","[휴식, 나머지, 쉬어]"
16119,übung,"[practice, drill, exercise]","[연습, 드릴, 연습생, 훈련, 운동]"


In [116]:
de_ko_df[de_ko_df["de"].apply(lambda de_word: de_word in wiki_de_words)]

Unnamed: 0,de,en,ko
23,abfahren,[depart],[출발]
83,abkühlen,[cooling],"[냉방, 냉각]"
90,ablauf,"[sequence, expiry, expire, expiration]","[만료, 만기, 순서, 시퀀스]"
103,abnahme,"[decline, acceptance, decrease, decreasing]","[쇠퇴, 거부, 줄이기, 감소, 거절, 합격]"
106,abnehmer,"[buyer, customers, buyers, contractor]","[고객, 구매자]"
...,...,...,...
15964,östlich,[east],"[이스트, 동쪽, 동부]"
15971,über,"[on, about, via, over, above]","[대략, 비아, 소개]"
16064,übersetzung,"[translation, translate, translations]","[번역하기, 번역, 번역기]"
16075,übertragung,"[transmission, broadcast, delegation, transfer...","[위임, 방송, 변속기, 전송]"


In [119]:
de_ko_df_filtered = de_ko_df[de_ko_df["de"].apply(lambda de_word: de_word in wiki_de_words)]
if len(de_ko_df_filtered) < 1000:
    rows_needed = 1000 - len(de_ko_df_filtered)
    remaining_rows = de_ko_df[~de_ko_df.index.isin(de_ko_df_filtered.index)]
    additional_rows = remaining_rows.sample(n=rows_needed, random_state=2025)
    final_df = pd.concat([de_ko_df_filtered, additional_rows]).reset_index(drop=True)
final_df[["de","ko"]].to_csv("/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/German_Korean_1000.csv", index=False)

## Word2word

In [3]:
import pandas as pd
from word2word import Word2word

# Load bilingual dictionaries
en2ko = Word2word("en", "ko")
en2de = Word2word("en", "de")
ko2en = Word2word("ko", "en")
ko2de = Word2word("ko", "de")
de2en = Word2word("de", "en")
de2ko = Word2word("de", "ko")

# General function
def make_trilingual_dataset(dict1, dict2, lang_key, lang1, lang2):
    # dict1 and dict2 are from same source (e.g., en2ko and en2de)
    results = []

    # Shared source indices
    shared_indices = set(dict1.x2ys.keys()) & set(dict2.x2ys.keys())

    # Invert word2x to get index → word
    idx2word = {v: k for k, v in dict1.word2x.items()}

    for idx in shared_indices:
        try:
            word = idx2word[idx]  # source word
        except KeyError:
            continue  # skip if missing

        target1_indices = dict1.x2ys.get(idx, [])
        target2_indices = dict2.x2ys.get(idx, [])

        if not target1_indices or not target2_indices:
            continue

        # Convert indices to actual words
        words1 = [dict1.y2word.get(i) for i in target1_indices if i in dict1.y2word]
        words2 = [dict2.y2word.get(i) for i in target2_indices if i in dict2.y2word]

        # Final check
        if words1 and words2:
            results.append({
                lang_key: word,
                lang1: words1,
                lang2: words2
            })

    return pd.DataFrame(results)

# Create datasets
df_en = make_trilingual_dataset(en2ko, en2de, "en", "ko", "de")
df_ko = make_trilingual_dataset(ko2en, ko2de, "ko", "en", "de")
df_de = make_trilingual_dataset(de2en, de2ko, "de", "en", "ko")

# Save to CSV
df_en.to_csv("en-ko-de.csv", index=False)
df_ko.to_csv("ko-en-de.csv", index=False)
df_de.to_csv("de-en-ko.csv", index=False)

print("✅ Trilingual dictionaries saved.")


✅ Trilingual dictionaries saved.


In [4]:
print(len(en2ko.x2ys))
print(len(en2de.x2ys))
print(len(ko2en.x2ys))
print(len(ko2de.x2ys))
print(len(de2en.x2ys))
print(len(de2ko.x2ys))

105537
93868
87153
72939
141921
144716


In [None]:
import re
en_pattern = re.compile(r'^[a-zA-Z]+$')
de_pattern = re.compile(r'^[a-zA-ZäöüÄÖÜß]+$')
ko_pattern = re.compile(r'^[\uac00-\ud7a3]+$')

def filter_words(word_list, pattern):
    return [word for word in word_list if pattern.match(word)]

df_en = df_en[df_en["en"].apply(lambda word: bool(en_pattern.match(word)))]
df_en["de"] = df_en["de"].apply(lambda words: filter_words(words, de_pattern))
df_en["ko"] = df_en["ko"].apply(lambda words: filter_words(words, ko_pattern))
df_en = df_en[df_en["de"].str.len() > 0]
df_en = df_en[df_en["ko"].str.len() > 0]

df_en.reset_index(drop=True, inplace=True)

df_en

Unnamed: 0,en,ko,de
0,I,"[미안, 모르, 줄, 는지, 못, 지금, 잘, 해요]","[dachte, glaube, leid, hätte, hatte, wollte, h..."
1,you,"[널, 자네, 넌, 으면, 어떻게, 무슨, 줄, 는지, 잖아]","[Danke, kannst, willst, Warum, euch, sehen, wi..."
2,the,"[중, 입니다, 까지, 인, 오, 님, 던, 자]","[Danke, Wer, Wo, Warum, Hey, Ist, denn, hab, O..."
3,to,"[려고, 해야, 얘기, 까지, 필요, 던, 해요, 한테, 시]","[Welt, zur, wurde, am, alle, des, über, zum, Der]"
4,s,"[그건, 그게, 입니다, 무슨, 됐, 네요, 괜찮, 그래, 잖아]","[möchte, soll, wollen, gehen, sehen, wollte, z..."
...,...,...,...
75549,Mixin,"[세거, 이딴, 든, 훨씬, 마시, 한테, 니, 보다]","[Bettlaken, müsse, leeren, zufriedengeben, Bet..."
75550,Pruno,"[세거, 이딴, 든, 훨씬, 마시, 한테, 니, 보다]","[Fährt, Preise, Bandit, Cadillac, Bettpfanne, ..."
75551,sippin,"[세거, 이딴, 훨씬, 든, 마시]","[ausgeruht, ausgeschlafen, übernächtigt, Verbr..."
75552,sharecropping,"[소작, 싶, 아요, 않, 하지만, 난, 알, 죠, 고, 지]","[Danebenstehen, Freund, Ihrem, Mein, vorbei, m..."


## Tokenize

In [None]:
from transformers import AutoTokenizer
import json

def load_tokenizer(tokenizer_name):
    with open("/home/hyujang/multilingual-inner-lexicon/RQ1/config.json", "r") as f:
            config = json.load(f)
    token_key = config["tokenizers"][tokenizer_name]
    if token_key:
        with open("/home/hyujang/multilingual-inner-lexicon/user_config.json", "r") as f:
            user_config = json.load(f)
            token_value = user_config["huggingface_token"].get(token_key)
    else:
        token_value = None

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True, tokens=token_value)
    return tokenizer


In [None]:
babel_tokenizer = load_tokenizer("Tower-Babel/Babel-9B-Chat")