In [1]:
import random
import unicodedata

import pandas as pd
from jamo import h2j, j2h

# ENGLISH

### Random Split

In [2]:
def random_split(word, MIN_WORD_LEN):
    """Cut ``word`` into contiguous pieces at randomly chosen positions.

    Args:
        word: String to split.
        MIN_WORD_LEN: Integer used to limit the number of cuts; the number of
            cuts is drawn from ``1 .. min(4, len(word) - MIN_WORD_LEN - 1)``.

    Returns:
        A list of non-empty substrings that concatenate back to ``word``.
        Words of length 0 or 1 are returned as a single-element list.
    """
    if len(word) <= 1:
        return [word]
    max_splits = min(4, len(word) - MIN_WORD_LEN - 1)
    # Words too short for the bound above still get exactly one cut. This is an
    # explicit check rather than a bare ``except`` around ``randint`` so that
    # unrelated errors are not silently swallowed; ``randint`` is not called in
    # the fallback case, so the random stream matches the old behaviour.
    num_splits = random.randint(1, max_splits) if max_splits >= 1 else 1
    # Cut positions are distinct interior indices, so every piece is non-empty.
    split_points = sorted(random.sample(range(1, len(word)), num_splits))
    tokens = [word[i:j] for i, j in zip([0] + split_points, split_points + [None])]
    return tokens

In [18]:
# Tokenizer whose token-count column drives the filter below
# (the other options used in this notebook are "babel_9b" and "gemma_12b").
TOKENIZER = "llama_2_7b"
LANGUAGE = "English"
MIN_WORD_LEN = 3
RANDOM_SEED = 2025
random.seed(RANDOM_SEED)

df = pd.read_csv(f"/home/hyujang/multilingual-inner-lexicon/data/{LANGUAGE}_tokenizers_comparison.csv")

# Keep words the tokenizer encodes as one token and that are longer than
# MIN_WORD_LEN characters, then split each one at random positions.
if LANGUAGE == "English":
    df["word_len"] = df["word"].map(len)
    is_single_token = df[f"token_num_{TOKENIZER}"] == 1
    is_long_enough = df["word_len"] > MIN_WORD_LEN
    df = df[is_single_token & is_long_enough].reset_index(drop=True)
    df["splitted_tokens"] = df["word"].map(lambda w: random_split(w, MIN_WORD_LEN))
    # Distribution of the number of pieces per word.
    print(df["splitted_tokens"].map(len).value_counts())


splitted_tokens
2    2567
3     879
4     567
Name: count, dtype: int64


In [19]:
num_samples = 1000
num_quantiles = 5
# Bucket words by frequency; duplicate bin edges are dropped, so fewer than
# five buckets may come back.
quantile_codes, bins = pd.qcut(df['freq'], num_quantiles, labels=False, duplicates='drop', retbins=True)
df['freq_quantile'] = quantile_codes

num_quantiles = df['freq_quantile'].nunique()
samples_per_quantile = num_samples // num_quantiles

# Draw the same number of rows from every non-empty frequency bucket
# (or the whole bucket when it is smaller than the quota).
quantile_frames = (df[df['freq_quantile'] == q] for q in range(num_quantiles))
sampled = [
    frame.sample(min(len(frame), samples_per_quantile), replace=False, random_state=RANDOM_SEED)
    for frame in quantile_frames
    if len(frame) > 0
]

sampled_df = pd.concat(sampled, ignore_index=False)
sampled_df = sampled_df.drop_duplicates(subset=['word']).reset_index(drop=True)

model_name = "Llama-2-7b-chat-hf"
# Distribution of the number of pieces per sampled word.
print(sampled_df["splitted_tokens"].apply(len).value_counts())
output_columns = ['word', "splitted_tokens", "same_token_num", "same_token_num2", "freq", "freq_quantile", "word_len"]
sampled_df = sampled_df[output_columns]
sampled_df.to_csv(f"/home/hyujang/multilingual-inner-lexicon/data/RQ1/WordIdentity/single_token_splitted_{model_name}_{LANGUAGE}_2.csv", index=False)

splitted_tokens
2    645
3    223
4    132
Name: count, dtype: int64


### Typo

In [9]:
def introduce_typo(word, typo_type=None):
    """Apply one random character-level typo to ``word``.

    The first character is never touched: every edit position is drawn from
    index 1 onwards.

    Args:
        word: String to corrupt.
        typo_type: One of "substitution", "deletion", "insertion" or
            "transposition". When None, one of the first three is drawn at
            random ("transposition" is supported but never drawn by default).

    Returns:
        A ``(typo_word, typo_type)`` tuple. ``word`` is returned unchanged when
        ``typo_type`` is not recognised or when ``word`` is too short for the
        requested edit (fewer than 2 characters, or fewer than 3 for a
        transposition) instead of raising ``ValueError`` from ``randint``.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    if typo_type is None:
        typo_type = random.choice(["substitution", "deletion", "insertion"])

    # Positions start at 1, so an edit needs at least one character after the
    # first; swapping two neighbours after the first needs at least three.
    min_len = 3 if typo_type == "transposition" else 2
    if len(word) < min_len:
        return word, typo_type

    if typo_type == "substitution":
        position = random.randint(1, len(word) - 1)
        original_char = word[position]
        # Exclude the original letter so the result always differs.
        typo_char = random.choice([c for c in letters if c != original_char])
        return word[:position] + typo_char + word[position + 1:], typo_type
    elif typo_type == "deletion":
        position = random.randint(1, len(word) - 1)
        return word[:position] + word[position + 1:], typo_type
    elif typo_type == "insertion":
        # The new letter goes before index ``position``, i.e. never at the
        # very start or the very end of the word.
        position = random.randint(1, len(word) - 1)
        typo_char = random.choice(letters)
        return word[:position] + typo_char + word[position:], typo_type
    elif typo_type == "transposition":
        position = random.randint(1, len(word) - 2)
        return word[:position] + word[position + 1] + word[position] + word[position + 2:], typo_type
    else:
        return word, typo_type

In [10]:
# Tokenizer whose token-count column drives the filter below
# (the other options used in this notebook are "babel_9b" and "llama_2_7b").
TOKENIZER = "gemma_12b"
LANGUAGE = "English"
MIN_WORD_LEN = 4
RANDOM_SEED = 2025  # fixed so the generated typos are reproducible
random.seed(RANDOM_SEED)

df = pd.read_csv(f"/home/hyujang/multilingual-inner-lexicon/data/{LANGUAGE}_tokenizers_comparison.csv")
df["word_len"] = df["word"].apply(len)
is_single_token = df[f"token_num_{TOKENIZER}"] == 1
is_long_enough = df["word_len"] > MIN_WORD_LEN - 1
df = df[is_single_token & is_long_enough].reset_index(drop=True)

# One random typo per word; the helper returns (typo_word, typo_type).
typo_col = f"typo_tokens_{TOKENIZER}"
type_col = f"typo_type_{TOKENIZER}"
df[[typo_col, type_col]] = df["word"].apply(lambda w: pd.Series(introduce_typo(w, typo_type=None)))
df["typo_word_len"] = df[typo_col].apply(len)
# NOTE(review): words need length >= MIN_WORD_LEN, but typo'd strings need
# length > MIN_WORD_LEN, so the shortest words only survive after an
# insertion. Confirm this skew towards insertions is intended.
df = df[df["typo_word_len"] > MIN_WORD_LEN].reset_index(drop=True)

# Randomly split the typo'd string, then report typo-type and piece-count distributions.
df["splitted_typo_tokens"] = df[typo_col].apply(lambda t: random_split(t, MIN_WORD_LEN))
print(df[type_col].value_counts())
print(df["splitted_typo_tokens"].apply(len).value_counts())

df

typo_type_gemma_12b
insertion       2105
substitution    1618
deletion        1190
Name: count, dtype: int64
splitted_typo_tokens
2    3128
3     967
4     516
5     302
Name: count, dtype: int64


Unnamed: 0,word,tokens_babel_9b,token_num_babel_9b,tokens_gemma_12b,token_num_gemma_12b,tokens_llama_2_7b,token_num_llama_2_7b,avg_token_num,same_token_num,avg_token_num_rounded,avg_token_num2,same_token_num2,avg_token_num2_rounded,any_token_num_is_1,freq,word_len,typo_tokens_gemma_12b,typo_type_gemma_12b,typo_word_len,splitted_typo_tokens
0,year,['year'],1,['year'],1,['▁year'],1,1.000000,True,1,1.0,True,1,True,15118,4,yuear,insertion,5,"[yu, ear]"
1,people,['people'],1,['people'],1,['▁people'],1,1.000000,True,1,1.0,True,1,True,12856,6,peple,deletion,5,"[p, eple]"
2,player,['player'],1,['player'],1,['▁player'],1,1.000000,True,1,1.0,True,1,True,12022,6,pllayer,insertion,7,"[pll, a, yer]"
3,member,['member'],1,['member'],1,['▁member'],1,1.000000,True,1,1.0,True,1,True,8579,6,mmber,deletion,5,"[mmb, er]"
4,season,['season'],1,['season'],1,['▁season'],1,1.000000,True,1,1.0,True,1,True,8256,6,seasdn,substitution,6,"[seasd, n]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4908,classified,['classified'],1,['classified'],1,"['▁class', 'ified']",2,1.333333,False,1,1.0,True,1,True,1,10,cassified,deletion,9,"[cas, sifie, d]"
4909,modelo,['modelo'],1,['modelo'],1,['▁modelo'],1,1.000000,True,1,1.0,True,1,True,1,6,modelco,insertion,7,"[m, o, delco]"
4910,clave,['clave'],1,['clave'],1,"['▁cla', 've']",2,1.333333,False,1,1.0,True,1,True,1,5,caave,substitution,5,"[c, aave]"
4911,mito,"['mit', 'o']",2,['mito'],1,"['▁mit', 'o']",2,1.666667,False,2,1.5,False,2,True,1,4,mitzo,insertion,5,"[mit, zo]"


In [11]:
num_samples = 1000
num_quantiles = 5
# Bucket words by frequency; duplicate bin edges are dropped, so fewer than
# five buckets may come back.
quantile_codes, bins = pd.qcut(df['freq'], num_quantiles, labels=False, duplicates='drop', retbins=True)
df['freq_quantile'] = quantile_codes

num_quantiles = df['freq_quantile'].nunique()
samples_per_quantile = num_samples // num_quantiles

# Draw the same number of rows from every non-empty frequency bucket
# (or the whole bucket when it is smaller than the quota).
quantile_frames = (df[df['freq_quantile'] == q] for q in range(num_quantiles))
sampled = [
    frame.sample(min(len(frame), samples_per_quantile), replace=False, random_state=RANDOM_SEED)
    for frame in quantile_frames
    if len(frame) > 0
]

sampled_df = pd.concat(sampled, ignore_index=False)
sampled_df = sampled_df.drop_duplicates(subset=['word']).reset_index(drop=True)
# Typo-type distribution within the sample.
print(sampled_df[f"typo_type_{TOKENIZER}"].value_counts())
output_columns = [
    'word', f"typo_tokens_{TOKENIZER}", "splitted_typo_tokens", f"typo_type_{TOKENIZER}",
    "same_token_num", "same_token_num2", "freq", "freq_quantile", "word_len", "typo_word_len",
]
sampled_df = sampled_df[output_columns]
sampled_df.to_csv(f"/home/hyujang/multilingual-inner-lexicon/data/RQ1/WordIdentity/single_token_typos_{TOKENIZER}_{LANGUAGE}.csv", index=False)

typo_type_gemma_12b
insertion       416
substitution    346
deletion        238
Name: count, dtype: int64


# GERMAN

### Random Split

In [2]:
def random_split(word, MIN_WORD_LEN):
    """Cut ``word`` into contiguous pieces at randomly chosen positions.

    Args:
        word: String to split.
        MIN_WORD_LEN: Integer used to limit the number of cuts; the number of
            cuts is drawn from ``1 .. min(4, len(word) - MIN_WORD_LEN - 1)``.

    Returns:
        A list of non-empty substrings that concatenate back to ``word``.
        Words of length 0 or 1 are returned as a single-element list.
    """
    if len(word) <= 1:
        return [word]
    max_splits = min(4, len(word) - MIN_WORD_LEN - 1)
    # Words too short for the bound above still get exactly one cut. This is an
    # explicit check rather than a bare ``except`` around ``randint`` so that
    # unrelated errors are not silently swallowed; ``randint`` is not called in
    # the fallback case, so the random stream matches the old behaviour.
    num_splits = random.randint(1, max_splits) if max_splits >= 1 else 1
    # Cut positions are distinct interior indices, so every piece is non-empty.
    split_points = sorted(random.sample(range(1, len(word)), num_splits))
    tokens = [word[i:j] for i, j in zip([0] + split_points, split_points + [None])]
    return tokens

In [13]:
# Tokenizer whose token-count column drives the filter below
# (the other options used in this notebook are "babel_9b" and "llama_2_7b").
TOKENIZER = "gemma_12b"
LANGUAGE = "German"
MIN_WORD_LEN = 3
RANDOM_SEED = 2025
random.seed(RANDOM_SEED)

df = pd.read_csv(f"/home/hyujang/multilingual-inner-lexicon/data/{LANGUAGE}_tokenizers_comparison.csv")

# Keep German words the tokenizer encodes as one token and that are longer
# than MIN_WORD_LEN characters, then split each one at random positions.
if LANGUAGE == "German":
    df["word_len"] = df["word"].map(len)
    is_single_token = df[f"token_num_{TOKENIZER}"] == 1
    is_long_enough = df["word_len"] > MIN_WORD_LEN
    df = df[is_single_token & is_long_enough].reset_index(drop=True)
    df["splitted_tokens"] = df["word"].map(lambda w: random_split(w, MIN_WORD_LEN))
    # Distribution of the number of pieces per word.
    print(df["splitted_tokens"].map(len).value_counts())

splitted_tokens
2    1827
3     449
4     245
5     129
Name: count, dtype: int64


In [4]:
num_samples = 1000
num_quantiles = 5
# Bucket words by frequency; duplicate bin edges are dropped, so fewer than
# five buckets may come back.
quantile_codes, bins = pd.qcut(df['freq'], num_quantiles, labels=False, duplicates='drop', retbins=True)
df['freq_quantile'] = quantile_codes

num_quantiles = df['freq_quantile'].nunique()
samples_per_quantile = num_samples // num_quantiles

# Draw the same number of rows from every non-empty frequency bucket
# (or the whole bucket when it is smaller than the quota).
quantile_frames = (df[df['freq_quantile'] == q] for q in range(num_quantiles))
sampled = [
    frame.sample(min(len(frame), samples_per_quantile), replace=False, random_state=RANDOM_SEED)
    for frame in quantile_frames
    if len(frame) > 0
]
# The index is deliberately left untouched here: the top-up step identifies
# already-sampled rows by their index labels in df.
sampled_df = pd.concat(sampled, ignore_index=False).drop_duplicates(subset=['word'])

# Small buckets can leave the sample short; top it up from the unsampled rows.
if len(sampled_df) < num_samples:
    remaining = num_samples - len(sampled_df)
    other_df = df.drop(sampled_df.index, errors='ignore')
    print(f"remaining: {remaining}, other_df: {len(other_df)}")
    additional_samples = other_df.sample(min(len(other_df), remaining), replace=False, random_state=RANDOM_SEED)
    combined = pd.concat([sampled_df, additional_samples])
    sampled_df = combined.drop_duplicates(subset=['word']).reset_index(drop=True)
    print(f"sampled_df: {len(sampled_df)}")

# Distribution of the number of pieces per sampled word.
print(sampled_df["splitted_tokens"].apply(len).value_counts())
output_columns = ['word', "splitted_tokens", "same_token_num", "same_token_num2", "freq", "freq_quantile", "word_len"]
sampled_df = sampled_df[output_columns]
sampled_df.to_csv(f"/home/hyujang/multilingual-inner-lexicon/data/RQ1/WordIdentity/single_token_splitted_{TOKENIZER}_{LANGUAGE}.csv", index=False)

splitted_tokens
2    693
3    166
4     92
5     49
Name: count, dtype: int64


### Typo

In [5]:
def introduce_typo_german(word, typo_type=None):
    """Apply one random character-level typo to a German ``word``.

    Same scheme as the English helper, but replacement/inserted letters are
    drawn from an alphabet that also contains ä, ö, ü and ß. The first
    character is never touched: every edit position is drawn from index 1
    onwards.

    Args:
        word: String to corrupt.
        typo_type: One of "substitution", "deletion", "insertion" or
            "transposition". When None, one of the first three is drawn at
            random.

    Returns:
        A ``(typo_word, typo_type)`` tuple. ``word`` is returned unchanged when
        ``typo_type`` is not recognised or when ``word`` is too short for the
        requested edit (fewer than 2 characters, or fewer than 3 for a
        transposition) instead of raising ``ValueError`` from ``randint``.
    """
    # Include German-specific characters
    letters = 'abcdefghijklmnopqrstuvwxyzäöüß'

    if typo_type is None:
        typo_type = random.choice(["substitution", "deletion", "insertion"])

    # Positions start at 1, so an edit needs at least one character after the
    # first; swapping two neighbours after the first needs at least three.
    min_len = 3 if typo_type == "transposition" else 2
    if len(word) < min_len:
        return word, typo_type

    if typo_type == "substitution":
        position = random.randint(1, len(word) - 1)
        original_char = word[position]
        # Exclude the original letter so the result always differs.
        typo_char = random.choice([c for c in letters if c != original_char])
        return word[:position] + typo_char + word[position + 1:], typo_type

    elif typo_type == "deletion":
        position = random.randint(1, len(word) - 1)
        return word[:position] + word[position + 1:], typo_type

    elif typo_type == "insertion":
        # The new letter goes before index ``position``, i.e. never at the
        # very start or the very end of the word.
        position = random.randint(1, len(word) - 1)
        typo_char = random.choice(letters)
        return word[:position] + typo_char + word[position:], typo_type

    elif typo_type == "transposition":
        position = random.randint(1, len(word) - 2)
        return word[:position] + word[position + 1] + word[position] + word[position + 2:], typo_type

    else:
        return word, typo_type

In [6]:
# Tokenizer whose token-count column drives the filter below
# (the other options used in this notebook are "babel_9b" and "llama_2_7b").
TOKENIZER = "gemma_12b"
LANGUAGE = "German"
MIN_WORD_LEN = 4
RANDOM_SEED = 2025  # fixed so the generated typos are reproducible
random.seed(RANDOM_SEED)

df = pd.read_csv(f"/home/hyujang/multilingual-inner-lexicon/data/{LANGUAGE}_tokenizers_comparison.csv")
df["word_len"] = df["word"].apply(len)
is_single_token = df[f"token_num_{TOKENIZER}"] == 1
is_long_enough = df["word_len"] > MIN_WORD_LEN - 1
df = df[is_single_token & is_long_enough].reset_index(drop=True)

# One random typo per word; the helper returns (typo_word, typo_type).
typo_col = f"typo_tokens_{TOKENIZER}"
type_col = f"typo_type_{TOKENIZER}"
df[[typo_col, type_col]] = df["word"].apply(lambda w: pd.Series(introduce_typo_german(w, typo_type=None)))
df["typo_word_len"] = df[typo_col].apply(len)
# NOTE(review): words need length >= MIN_WORD_LEN, but typo'd strings need
# length > MIN_WORD_LEN, so the shortest words only survive after an
# insertion. Confirm this skew towards insertions is intended.
df = df[df["typo_word_len"] > MIN_WORD_LEN].reset_index(drop=True)

# Randomly split the typo'd string, then report typo-type and piece-count distributions.
df["splitted_typo_tokens"] = df[typo_col].apply(lambda t: random_split(t, MIN_WORD_LEN))
print(df[type_col].value_counts())
print(df["splitted_typo_tokens"].apply(len).value_counts())

df

typo_type_gemma_12b
insertion       907
substitution    601
deletion        435
Name: count, dtype: int64
splitted_typo_tokens
2    1386
3     321
4     153
5      83
Name: count, dtype: int64


Unnamed: 0,word,tokens_babel_9b,token_num_babel_9b,tokens_gemma_12b,token_num_gemma_12b,tokens_llama_2_7b,token_num_llama_2_7b,avg_token_num,same_token_num,avg_token_num_rounded,avg_token_num2,same_token_num2,avg_token_num2_rounded,any_token_num_is_1,freq,word_len,typo_tokens_gemma_12b,typo_type_gemma_12b,typo_word_len,splitted_typo_tokens
0,Jahr,"['J', 'ahr']",2,['Jahr'],1,['▁Jahr'],1,1.333333,False,1,1.5,False,2,True,31379,4,Juahr,insertion,5,"[Juah, r]"
1,Weblinks,"['We', 'bl', 'inks']",3,['Weblinks'],1,['▁Weblinks'],1,1.666667,False,2,2.0,False,2,True,12234,8,Welinks,deletion,7,"[W, e, links]"
2,Stadt,"['St', 'adt']",2,['Stadt'],1,['▁Stadt'],1,1.333333,False,1,1.5,False,2,True,8535,5,Sltadt,insertion,6,"[Slt, adt]"
3,Literatur,"['Liter', 'atur']",2,['Literatur'],1,['▁Literatur'],1,1.333333,False,1,1.5,False,2,True,7820,9,Liteatur,deletion,8,"[Lit, eatur]"
4,Geschichte,"['G', 'esch', 'ichte']",3,['Geschichte'],1,['▁Geschichte'],1,1.666667,False,2,2.0,False,2,True,6754,10,Geschicdte,substitution,10,"[Gesc, hicdte]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1938,Flower,"['Fl', 'ower']",2,['Flower'],1,"['▁F', 'lower']",2,1.666667,False,2,1.5,False,2,True,1,6,Fflower,insertion,7,"[F, flowe, r]"
1939,Number,['Number'],1,['Number'],1,['▁Number'],1,1.000000,True,1,1.0,True,1,True,1,6,Nucmber,insertion,7,"[Nucmbe, r]"
1940,Finance,['Finance'],1,['Finance'],1,"['▁Fin', 'ance']",2,1.333333,False,1,1.0,True,1,True,1,7,Financbe,insertion,8,"[Fin, an, c, be]"
1941,Finish,['Finish'],1,['Finish'],1,"['▁Fin', 'ish']",2,1.333333,False,1,1.0,True,1,True,1,6,Fiwish,substitution,6,"[F, iwish]"


In [8]:
num_samples = 1000
num_quantiles = 5
# Bucket words by frequency; duplicate bin edges are dropped, so fewer than
# five buckets may come back.
df['freq_quantile'], bins = pd.qcut(df['freq'], num_quantiles, labels=False, duplicates='drop', retbins=True)

num_quantiles = df['freq_quantile'].nunique()
samples_per_quantile = num_samples // num_quantiles

# Draw the same number of rows from every non-empty frequency bucket
# (or the whole bucket when it is smaller than the quota).
sampled = []
for quantile in range(num_quantiles):
    quantile_df = df[df['freq_quantile'] == quantile]
    if len(quantile_df) > 0:
        sampled.append(quantile_df.sample(min(len(quantile_df), samples_per_quantile),
                                          replace=False, random_state=RANDOM_SEED))

# Keep df's original index labels for now. Resetting the index here (as this
# cell used to) made `df.drop(sampled_df.index)` below remove the first
# len(sampled_df) rows of df instead of the rows that were actually sampled,
# so the top-up pool excluded the wrong rows and could re-draw sampled words.
sampled_df = pd.concat(sampled, ignore_index=False).drop_duplicates(subset=['word'])

# Small buckets can leave the sample short; top it up from the unsampled rows.
if len(sampled_df) < num_samples:
    remaining = num_samples - len(sampled_df)
    other_df = df.drop(sampled_df.index, errors='ignore')
    print(f"remaining: {remaining}, other_df: {len(other_df)}")
    additional_samples = other_df.sample(min(len(other_df), remaining), replace=False, random_state=RANDOM_SEED)
    sampled_df = pd.concat([sampled_df, additional_samples]).drop_duplicates(subset=['word'])
    print(f"sampled_df: {len(sampled_df)}")

# Only now renumber the rows, once index labels are no longer needed.
sampled_df = sampled_df.reset_index(drop=True)

# Typo-type distribution within the sample.
print(sampled_df[f"typo_type_{TOKENIZER}"].value_counts())
sampled_df = sampled_df[['word', f"typo_tokens_{TOKENIZER}", "splitted_typo_tokens", f"typo_type_{TOKENIZER}", "same_token_num", "same_token_num2", "freq", "freq_quantile", "word_len", "typo_word_len"]]

sampled_df.to_csv(f"/home/hyujang/multilingual-inner-lexicon/data/RQ1/WordIdentity/single_token_typos_{TOKENIZER}_{LANGUAGE}.csv", index=False)

typo_type_gemma_12b
insertion       452
substitution    318
deletion        230
Name: count, dtype: int64


# KOREAN

In [None]:
def count_jamos(word):
    """Return the length of ``h2j(word)``, i.e. the jamo count of ``word``."""
    return len(h2j(word))

def split_jamos(word):
    """Return the characters of ``h2j(word)`` as a list, one jamo per element."""
    return [jamo for jamo in h2j(word)]

def join_jamos(jamos):
    """Compose a sequence of conjoining jamo back into Hangul syllables.

    The previous implementation passed the whole joined string to ``j2h``,
    which expects separate lead/vowel/tail arguments and therefore raised
    ``TypeError``. Unicode NFC normalisation performs the same composition
    and also handles sequences spanning several syllables.

    Args:
        jamos: Iterable of strings, expected to be U+11xx conjoining jamo
            such as ``h2j`` / ``split_jamos`` produce.

    Returns:
        The composed string. Characters that cannot be composed (for example
        a lone final consonant) are passed through unchanged.
    """
    return unicodedata.normalize("NFC", ''.join(jamos))

def random_split_korean(word, MIN_JAMO_LEN):
    """Split a Korean ``word`` at random positions of its jamo sequence.

    Args:
        word: String to split; it is first decomposed with ``split_jamos``.
        MIN_JAMO_LEN: Integer used to limit the number of cuts; the number of
            cuts is drawn from ``1 .. min(4, len(jamos) - MIN_JAMO_LEN)``.

    Returns:
        A list of strings of jamo that concatenate to the decomposed word.
        When the word decomposes into at most one character, ``[word]`` is
        returned as is.
    """
    jamos = split_jamos(word)
    if len(jamos) <= 1:
        return [word]
    max_splits = min(4, len(jamos) - MIN_JAMO_LEN)
    # Fall back to a single cut when the word is too short for the bound
    # above (mirrors random_split); previously randint raised ValueError here.
    num_splits = random.randint(1, max_splits) if max_splits >= 1 else 1
    # Cut positions are distinct interior indices, so every piece is non-empty.
    split_points = sorted(random.sample(range(1, len(jamos)), num_splits))
    jamo_tokens = [jamos[i:j] for i, j in zip([0] + split_points, split_points + [None])]
    return [''.join(token) for token in jamo_tokens]

# def random_split_korean(word, min_jamo_len):
#     if not word or len(word) == 1:
#         return [word]

#     syllables = list(word)
#     jamo_lengths = [count_jamos(syl) for syl in syllables]

#     # Accumulate positions ensuring min_jamo_len per segment
#     valid_indices = []
#     total = 0
#     for i in range(1, len(syllables)):
#         total += jamo_lengths[i - 1]
#         if total >= min_jamo_len:
#             valid_indices.append(i)

#     if not valid_indices:
#         return [word]

#     num_splits = random.randint(1, min(4, len(valid_indices)))
#     split_points = sorted(random.sample(valid_indices, num_splits))

#     segments = [syllables[i:j] for i, j in zip([0] + split_points, split_points + [None])]
#     return [''.join(seg) for seg in segments]


### Random Split

In [79]:
# Tokenizer whose token-count column drives the filter below
# (the other options used in this notebook are "gemma_12b" and "llama_2_7b").
TOKENIZER = "babel_9b"
LANGUAGE = "Korean"
MIN_JAMO_LEN = 2
RANDOM_SEED = 2025
random.seed(RANDOM_SEED)

df = pd.read_csv(f"/home/hyujang/multilingual-inner-lexicon/data/{LANGUAGE}_tokenizers_comparison.csv")
# Length in characters (syllable blocks) and length in decomposed jamo.
df["word_len"] = df["word"].map(len)
df['jamo_len'] = df['word'].map(count_jamos)
# Keep single-token words that decompose into more than MIN_JAMO_LEN jamo.
is_single_token = df[f"token_num_{TOKENIZER}"] == 1
is_long_enough = df["jamo_len"] > MIN_JAMO_LEN
df = df[is_single_token & is_long_enough].reset_index(drop=True)
df

Unnamed: 0,word,tokens_babel_9b,token_num_babel_9b,tokens_gemma_12b,token_num_gemma_12b,tokens_llama_2_7b,token_num_llama_2_7b,avg_token_num,same_token_num,avg_token_num_rounded,avg_token_num2,same_token_num2,avg_token_num2_rounded,any_token_num_is_1,freq,word_len,jamo_len
0,역,['ìĹŃ'],1,['역'],1,"['▁', '역']",2,1.333333,False,1,1.0,True,1,True,16297,1,3
1,팀,['íĮĢ'],1,['팀'],1,"['▁', '<0xED>', '<0x8C>', '<0x80>']",4,2.000000,False,2,1.0,True,1,True,5551,1,3
2,말,['ë§Ĳ'],1,['말'],1,"['▁', '<0xEB>', '<0xA7>', '<0x90>']",4,2.000000,False,2,1.0,True,1,True,5115,1,3
3,군,['êµ°'],1,['군'],1,"['▁', '군']",2,1.333333,False,1,1.0,True,1,True,4356,1,3
4,전,['ìłĦ'],1,['전'],1,"['▁', '전']",2,1.333333,False,1,1.0,True,1,True,4316,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
830,도록,['ëıĦë¡Ŀ'],1,['도록'],1,"['▁', '도', '<0xEB>', '<0xA1>', '<0x9D>']",5,2.333333,False,2,1.0,True,1,True,1,2,5
831,엤,['ìĹ¤'],1,"['<0xEC>', '<0x97>', '<0xA4>']",3,"['▁', '<0xEC>', '<0x97>', '<0xA4>']",4,2.666667,False,3,2.0,False,2,True,1,1,3
832,둡,['ëĳ¡'],1,['둡'],1,"['▁', '<0xEB>', '<0x91>', '<0xA1>']",4,2.000000,False,2,1.0,True,1,True,1,1,3
833,둣,['ëĳ£'],1,['둣'],1,"['▁', '<0xEB>', '<0x91>', '<0xA3>']",4,2.000000,False,2,1.0,True,1,True,1,1,3


In [83]:
# Tokenizer whose token-count column drives the filter below
# (the other options used in this notebook are "gemma_12b" and "llama_2_7b").
TOKENIZER = "babel_9b"
LANGUAGE = "Korean"
MIN_JAMO_LEN = 2
RANDOM_SEED = 2025
random.seed(RANDOM_SEED)

df = pd.read_csv(f"/home/hyujang/multilingual-inner-lexicon/data/{LANGUAGE}_tokenizers_comparison.csv")

# Keep single-token words that decompose into more than MIN_JAMO_LEN jamo,
# then split each one at random positions of its jamo sequence.
if LANGUAGE == "Korean":
    df["word_len"] = df["word"].map(len)
    df['jamo_len'] = df['word'].map(count_jamos)
    is_single_token = df[f"token_num_{TOKENIZER}"] == 1
    is_long_enough = df["jamo_len"] > MIN_JAMO_LEN
    df = df[is_single_token & is_long_enough].reset_index(drop=True)
    split_col = f"splitted_tokens_{TOKENIZER}"
    df[split_col] = df["word"].map(lambda w: random_split_korean(w, MIN_JAMO_LEN))
    # Distribution of the number of pieces per word.
    print(df[split_col].map(len).value_counts())


splitted_tokens_babel_9b
2    744
3     57
4     28
5      6
Name: count, dtype: int64


In [84]:
# Show the column of random jamo splits for the selected tokenizer.
df[f"splitted_tokens_{TOKENIZER}"]

0            [여, ᆨ]
1            [ᄐ, ᅵᆷ]
2            [마, ᆯ]
3            [ᄀ, ᅮᆫ]
4            [저, ᆫ]
           ...      
830    [도, ᄅ, ᅩ, ᆨ]
831          [에, ᆻ]
832          [ᄃ, ᅮᆸ]
833          [ᄃ, ᅮᆺ]
834          [예, ᆯ]
Name: splitted_tokens_babel_9b, Length: 835, dtype: object

### Typo

In [None]:
import random
from jamo import h2j, j2h, hangul_to_jamo, jamo_to_hangul

# Jamo tables for the three positions of a Hangul syllable block. Their sizes
# (19 / 21 / 28) match the factors used by decompose_syllable and
# compose_syllable, and introduce_korean_syllable_typo uses them to bound the
# indices it draws.
# CHO: initial consonants (19 entries).
CHO = ['ㄱ','ㄲ','ㄴ','ㄷ','ㄸ','ㄹ','ㅁ','ㅂ','ㅃ','ㅅ','ㅆ','ㅇ','ㅈ','ㅉ','ㅊ','ㅋ','ㅌ','ㅍ','ㅎ']
# JUN: medial vowels (21 entries).
JUN = ['ㅏ','ㅐ','ㅑ','ㅒ','ㅓ','ㅔ','ㅕ','ㅖ','ㅗ','ㅘ','ㅙ','ㅚ','ㅛ','ㅜ','ㅝ','ㅞ','ㅟ','ㅠ','ㅡ','ㅢ','ㅣ']
# JON: final consonants (28 entries); index 0 is the empty string, meaning "no final".
JON = ['','ㄱ','ㄲ','ㄳ','ㄴ','ㄵ','ㄶ','ㄷ','ㄹ','ㄺ','ㄻ','ㄼ','ㄽ','ㄾ','ㄿ','ㅀ','ㅁ','ㅂ','ㅄ','ㅅ','ㅆ','ㅇ','ㅈ','ㅊ','ㅋ','ㅌ','ㅍ','ㅎ']

def decompose_syllable(s):
    """Split one precomposed Hangul syllable into its three jamo indices.

    Args:
        s: A single character in the range U+AC00..U+D7A3.

    Returns:
        ``(cho, jun, jon)``: indices of the initial consonant (0-18), medial
        vowel (0-20) and final consonant (0-27, where 0 means no final).

    Raises:
        ValueError: If ``s`` is not a precomposed Hangul syllable. Without
            this check the arithmetic below silently yields meaningless
            (even negative) indices for other characters.
        TypeError: If ``s`` is not a single character (raised by ``ord``).
    """
    code = ord(s) - 0xAC00
    if not 0 <= code < 19 * 21 * 28:
        raise ValueError(f"not a precomposed Hangul syllable: {s!r}")
    # Syllables are laid out as ((cho * 21) + jun) * 28 + jon from U+AC00.
    cho, remainder = divmod(code, 21 * 28)
    jun, jon = divmod(remainder, 28)
    return cho, jun, jon

def compose_syllable(cho, jun, jon):
    """Build the Hangul syllable for the given initial/medial/final indices.

    The syllable is the character at offset ``(cho * 21 + jun) * 28 + jon``
    from U+AC00.
    """
    syllable_offset = (cho * 21 + jun) * 28 + jon
    return chr(0xAC00 + syllable_offset)

def introduce_korean_syllable_typo(word, typo_type=None):
    """Apply one random jamo-level typo to a single syllable of ``word``.

    One character of ``word`` is picked uniformly at random. If it is a
    precomposed Hangul syllable, it is decomposed into initial/medial/final
    indices, edited according to ``typo_type`` and recomposed.

    Args:
        word: String to corrupt.
        typo_type: "substitution", "deletion", "insertion" or "transposition".
            When None, one of the four is drawn at random.

            * substitution: replace the initial, medial or final (chosen at
              random) with a different entry of the matching table.
            * deletion: drop the final consonant. The initial and medial
              cannot be removed from a precomposed syllable.
            * insertion: add a random final consonant to a syllable that has
              none.
            * transposition: swap the initial and final consonants, when each
              is valid in the other position.

    Returns:
        A ``(typo_word, typo_type)`` tuple. ``word`` comes back unchanged when
        it is empty, when the picked character is not a Hangul syllable, or
        when the requested edit does not apply to the picked syllable (for
        example deletion on a syllable without a final).
    """
    if typo_type is None:
        typo_type = random.choice(["substitution", "deletion", "insertion", "transposition"])

    chars = list(word)
    if not chars:
        return word, typo_type

    idx = random.randint(0, len(chars) - 1)
    c = chars[idx]
    # Skip non-Hangul explicitly: the index arithmetic only makes sense for
    # precomposed syllables (U+AC00..U+D7A3). The former bare ``except`` never
    # fired for ordinary characters, so e.g. Latin letters were rewritten.
    if not 0xAC00 <= ord(c) <= 0xD7A3:
        return word, typo_type
    cho, jun, jon = decompose_syllable(c)

    if typo_type == "substitution":
        part = random.choice(['cho', 'jun', 'jon'])
        if part == 'cho':
            cho = random.choice([i for i in range(len(CHO)) if i != cho])
        elif part == 'jun':
            jun = random.choice([i for i in range(len(JUN)) if i != jun])
        elif part == 'jon':
            jon = random.choice([i for i in range(len(JON)) if i != jon])
    elif typo_type == "deletion":
        # Only the final can be removed; previously two of three random
        # choices were silent no-ops that still reported a "deletion".
        jon = 0
    elif typo_type == "insertion":
        if jon == 0:
            jon = random.randint(1, len(JON) - 1)  # Add a final
    elif typo_type == "transposition":
        # CHO and JON are indexed differently, so swap by jamo character
        # rather than by raw index. Skip the swap when the final is not a
        # valid initial (e.g. a compound final) or vice versa.
        if jon != 0:
            initial_jamo, final_jamo = CHO[cho], JON[jon]
            if final_jamo in CHO and initial_jamo in JON:
                cho, jon = CHO.index(final_jamo), JON.index(initial_jamo)

    chars[idx] = compose_syllable(cho, jun, jon)
    return ''.join(chars), typo_type
