In [None]:
import os
import json
import pandas as pd
import random

# Project root used to locate the RQ1 configuration file.
BASE_DIR = "/home/hyujang/multilingual-inner-lexicon"

# JSON is UTF-8 by specification; pass the encoding explicitly so the read does not
# depend on the platform's default locale encoding.
with open(os.path.join(BASE_DIR, "RQ1/config.json"), "r", encoding="utf-8") as f:
    CONFIG = json.load(f)

# Configuration variables
# Short tokenizer key (as used in the `token_num_<key>` CSV columns) -> model name used in output file names.
model_name_map = {
    "llama_2_7b": "Llama-2-7b-chat-hf",
    "babel_9b": "Babel-9B-Chat",
    "gemma_12b": "gemma-3-12b-it"
}

# NOTE(review): MIN_WORD_LEN and MIN_JAMO_LEN are not referenced by any cell visible in this notebook.
MIN_WORD_LEN = 3
MIN_JAMO_LEN = 2
# Minimum corpus frequency a word needs to be kept (read from the shared config).
MIN_WORD_FREQ = CONFIG["min_freq"]

# Target sample size per (language, tokenizer) pair and number of frequency strata.
NUM_SAMPLES = 1000
NUM_QUANTILES = CONFIG["num_quantiles"]

# Seeds Python's `random` module; the pandas sampling in `sample_by_freq` passes
# RANDOM_SEED explicitly as `random_state`.
RANDOM_SEED = CONFIG["seed"]
random.seed(RANDOM_SEED)


In [None]:
def sample_by_freq(df, num_samples=None, num_quantiles=None, seed=None):
    """Draw a frequency-stratified sample of words from ``df``.

    Rows are binned into quantiles of the ``freq`` column, an equal number of rows
    is sampled from every non-empty bin, and, if that falls short of the target,
    the sample is topped up with rows drawn from the rest of ``df``.

    Args:
        df: DataFrame with at least ``word`` and ``freq`` columns. It should have a
            unique index (callers in this notebook pass a freshly reset index),
            because already-sampled rows are excluded from the top-up by index
            label. A ``freq_quantile`` column is added to (or overwritten on)
            ``df`` in place.
        num_samples: Target number of rows; defaults to the notebook-level
            ``NUM_SAMPLES``.
        num_quantiles: Requested number of frequency quantiles; defaults to the
            notebook-level ``NUM_QUANTILES``. Fewer bins may result because
            duplicate bin edges are dropped.
        seed: ``random_state`` for every sampling call; defaults to the
            notebook-level ``RANDOM_SEED``.

    Returns:
        DataFrame of at most ``num_samples`` rows (fewer if ``df`` is smaller),
        unique by ``word``, including the ``freq_quantile`` column, with a fresh
        0..n-1 index.

    Raises:
        ValueError: If ``df`` is empty or no row could be assigned a quantile.
    """
    # Resolve defaults lazily so the notebook constants remain the single source of truth.
    if num_samples is None:
        num_samples = NUM_SAMPLES
    if num_quantiles is None:
        num_quantiles = NUM_QUANTILES
    if seed is None:
        seed = RANDOM_SEED

    if df.empty:
        # Fail with a clear message instead of an unrelated error further down
        # (with no quantile labels the per-quantile division would divide by zero).
        raise ValueError("sample_by_freq received an empty DataFrame; check the upstream filters.")

    df['freq_quantile'] = pd.qcut(df['freq'], num_quantiles, labels=False, duplicates='drop')

    # Iterate over the labels that actually occur. qcut can leave an interior bin
    # empty, in which case the labels are not the contiguous range 0..k-1 and
    # looping over range(k) would silently skip the highest-frequency bins.
    quantile_labels = sorted(df['freq_quantile'].dropna().unique())
    if not quantile_labels:
        raise ValueError("sample_by_freq could not assign any row to a frequency quantile.")
    samples_per_quantile = num_samples // len(quantile_labels)

    sampled = []
    for label in quantile_labels:
        quantile_df = df[df['freq_quantile'] == label]
        sampled.append(
            quantile_df.sample(min(len(quantile_df), samples_per_quantile), replace=False, random_state=seed)
        )
    # Keep the original index labels so the top-up step can exclude rows already chosen.
    sampled_df = pd.concat(sampled, ignore_index=False).drop_duplicates(subset=['word'])

    if len(sampled_df) < num_samples:
        # Small bins or the integer division left a shortfall: fill it from the unsampled rows.
        remaining = num_samples - len(sampled_df)
        other_df = df.drop(sampled_df.index, errors='ignore')
        print(f"remaining: {remaining}, other_df: {len(other_df)}")
        additional_samples = other_df.sample(min(len(other_df), remaining), replace=False, random_state=seed)
        print(f"additional_samples: {len(additional_samples)}")
        sampled_df = pd.concat([sampled_df, additional_samples]).drop_duplicates(subset=['word'])

    print(f"sampled_df: {len(sampled_df)}")

    return sampled_df.reset_index(drop=True)

## ENGLISH

In [None]:
# Frequency-stratified sample of English words that the babel_9b tokenizer splits into several tokens.
LANGUAGE = "English"
TOKENIZER = "babel_9b"
MODEL_NAME = model_name_map[TOKENIZER]

# Derive file locations from BASE_DIR rather than repeating the absolute project path.
token_col = f"token_num_{TOKENIZER}"
comparison_csv = os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv")
output_csv = os.path.join(BASE_DIR, "data", "RQ1", "WordIdentity", f"multi_token_{MODEL_NAME}_{LANGUAGE}.csv")

# Keep the first row per word and store len() of each word, without in-place mutation.
df = (
    pd.read_csv(comparison_csv)
    .drop_duplicates(subset=["word"], keep="first")
    .assign(word_len=lambda frame: frame["word"].apply(len))
)
# Keep words at or above the minimum frequency whose token count for this tokenizer exceeds 1.
df = df[(df["freq"] >= MIN_WORD_FREQ) & (df[token_col] > 1)].reset_index(drop=True)
print(df[token_col].value_counts())

sampled_df = sample_by_freq(df)
print(sampled_df[token_col].value_counts(normalize=True))
# The export is disabled for this configuration in the original notebook; uncomment to write the sample.
# sampled_df.to_csv(output_csv, index=False)

In [None]:
# Frequency-stratified sample of English words that the gemma_12b tokenizer splits into several tokens.
LANGUAGE = "English"
TOKENIZER = "gemma_12b"
MODEL_NAME = model_name_map[TOKENIZER]

# Derive file locations from BASE_DIR rather than repeating the absolute project path.
token_col = f"token_num_{TOKENIZER}"
comparison_csv = os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv")
output_csv = os.path.join(BASE_DIR, "data", "RQ1", "WordIdentity", f"multi_token_{MODEL_NAME}_{LANGUAGE}.csv")

# Keep the first row per word and store len() of each word, without in-place mutation.
df = (
    pd.read_csv(comparison_csv)
    .drop_duplicates(subset=["word"], keep="first")
    .assign(word_len=lambda frame: frame["word"].apply(len))
)
# Keep words at or above the minimum frequency whose token count for this tokenizer exceeds 1.
df = df[(df["freq"] >= MIN_WORD_FREQ) & (df[token_col] > 1)].reset_index(drop=True)
print(df[token_col].value_counts())

sampled_df = sample_by_freq(df)
print(sampled_df[token_col].value_counts(normalize=True))
sampled_df.to_csv(output_csv, index=False)

In [None]:
# Frequency-stratified sample of English words that the llama_2_7b tokenizer splits into several tokens.
LANGUAGE = "English"
TOKENIZER = "llama_2_7b"
MODEL_NAME = model_name_map[TOKENIZER]

# Derive file locations from BASE_DIR rather than repeating the absolute project path.
token_col = f"token_num_{TOKENIZER}"
comparison_csv = os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv")
output_csv = os.path.join(BASE_DIR, "data", "RQ1", "WordIdentity", f"multi_token_{MODEL_NAME}_{LANGUAGE}.csv")

# Keep the first row per word and store len() of each word, without in-place mutation.
df = (
    pd.read_csv(comparison_csv)
    .drop_duplicates(subset=["word"], keep="first")
    .assign(word_len=lambda frame: frame["word"].apply(len))
)
# Keep words at or above the minimum frequency whose token count for this tokenizer exceeds 1.
df = df[(df["freq"] >= MIN_WORD_FREQ) & (df[token_col] > 1)].reset_index(drop=True)
print(df[token_col].value_counts())

sampled_df = sample_by_freq(df)
print(sampled_df[token_col].value_counts(normalize=True))
sampled_df.to_csv(output_csv, index=False)

## KOREAN

In [None]:
# Frequency-stratified sample of Korean words that the babel_9b tokenizer splits into several tokens.
LANGUAGE = "Korean"
TOKENIZER = "babel_9b"
MODEL_NAME = model_name_map[TOKENIZER]

# Derive file locations from BASE_DIR rather than repeating the absolute project path.
token_col = f"token_num_{TOKENIZER}"
comparison_csv = os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv")
output_csv = os.path.join(BASE_DIR, "data", "RQ1", "WordIdentity", f"multi_token_{MODEL_NAME}_{LANGUAGE}.csv")

# Keep the first row per word and store len() of each word, without in-place mutation.
df = (
    pd.read_csv(comparison_csv)
    .drop_duplicates(subset=["word"], keep="first")
    .assign(word_len=lambda frame: frame["word"].apply(len))
)
# Keep words at or above the minimum frequency whose token count for this tokenizer exceeds 1.
df = df[(df["freq"] >= MIN_WORD_FREQ) & (df[token_col] > 1)].reset_index(drop=True)
print(df[token_col].value_counts())

sampled_df = sample_by_freq(df)
print(sampled_df[token_col].value_counts(normalize=True))
print(sampled_df["word_len"].value_counts(normalize=True))
sampled_df.to_csv(output_csv, index=False)

In [None]:
# Frequency-stratified sample of Korean words that the gemma_12b tokenizer splits into several tokens.
LANGUAGE = "Korean"
TOKENIZER = "gemma_12b"
MODEL_NAME = model_name_map[TOKENIZER]

# Derive file locations from BASE_DIR rather than repeating the absolute project path.
token_col = f"token_num_{TOKENIZER}"
comparison_csv = os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv")
output_csv = os.path.join(BASE_DIR, "data", "RQ1", "WordIdentity", f"multi_token_{MODEL_NAME}_{LANGUAGE}.csv")

# Keep the first row per word and store len() of each word, without in-place mutation.
df = (
    pd.read_csv(comparison_csv)
    .drop_duplicates(subset=["word"], keep="first")
    .assign(word_len=lambda frame: frame["word"].apply(len))
)
# Keep words at or above the minimum frequency whose token count for this tokenizer exceeds 1.
df = df[(df["freq"] >= MIN_WORD_FREQ) & (df[token_col] > 1)].reset_index(drop=True)
print(df[token_col].value_counts())

sampled_df = sample_by_freq(df)
print(sampled_df[token_col].value_counts(normalize=True))
print(sampled_df["word_len"].value_counts(normalize=True))
sampled_df.to_csv(output_csv, index=False)

In [None]:
# Frequency-stratified sample of Korean words that the llama_2_7b tokenizer splits into several tokens.
LANGUAGE = "Korean"
TOKENIZER = "llama_2_7b"
MODEL_NAME = model_name_map[TOKENIZER]

# Derive file locations from BASE_DIR rather than repeating the absolute project path.
token_col = f"token_num_{TOKENIZER}"
comparison_csv = os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv")
output_csv = os.path.join(BASE_DIR, "data", "RQ1", "WordIdentity", f"multi_token_{MODEL_NAME}_{LANGUAGE}.csv")

# Keep the first row per word and store len() of each word, without in-place mutation.
df = (
    pd.read_csv(comparison_csv)
    .drop_duplicates(subset=["word"], keep="first")
    .assign(word_len=lambda frame: frame["word"].apply(len))
)
# Keep words at or above the minimum frequency; the token-count threshold is
# adjusted to > 2 for llama_2_7b (other Korean cells use > 1).
df = df[(df["freq"] >= MIN_WORD_FREQ) & (df[token_col] > 2)].reset_index(drop=True)
print(df[token_col].value_counts())

sampled_df = sample_by_freq(df)
print(sampled_df[token_col].value_counts(normalize=True))
print(sampled_df["word_len"].value_counts(normalize=True))
sampled_df.to_csv(output_csv, index=False)

## GERMAN

In [None]:
# Frequency-stratified sample of German words that the babel_9b tokenizer splits into several tokens.
LANGUAGE = "German"
TOKENIZER = "babel_9b"
MODEL_NAME = model_name_map[TOKENIZER]

# Derive file locations from BASE_DIR rather than repeating the absolute project path.
token_col = f"token_num_{TOKENIZER}"
comparison_csv = os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv")
output_csv = os.path.join(BASE_DIR, "data", "RQ1", "WordIdentity", f"multi_token_{MODEL_NAME}_{LANGUAGE}.csv")

# Keep the first row per word and store len() of each word, without in-place mutation.
df = (
    pd.read_csv(comparison_csv)
    .drop_duplicates(subset=["word"], keep="first")
    .assign(word_len=lambda frame: frame["word"].apply(len))
)
# The previous filter required `token_num > 1` AND `token_num < 2`, which no
# whole-number token count can satisfy, so the frame was always empty and the
# sampling step failed. Use the same `> 1` rule as the other babel_9b cells.
# NOTE(review): confirm that no upper bound (e.g. `<= 2`) or a `> 2` threshold was intended here.
df = df[(df["freq"] >= MIN_WORD_FREQ) & (df[token_col] > 1)].reset_index(drop=True)
print(df[token_col].value_counts())

sampled_df = sample_by_freq(df)
print(sampled_df[token_col].value_counts(normalize=True))
print(sampled_df["word_len"].value_counts(normalize=True))
# The export is disabled for this configuration in the original notebook; uncomment to write the sample.
# sampled_df.to_csv(output_csv, index=False)

In [None]:
# Frequency-stratified sample of German words that the gemma_12b tokenizer splits into several tokens.
LANGUAGE = "German"
TOKENIZER = "gemma_12b"
MODEL_NAME = model_name_map[TOKENIZER]

# Derive file locations from BASE_DIR rather than repeating the absolute project path.
token_col = f"token_num_{TOKENIZER}"
comparison_csv = os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv")
output_csv = os.path.join(BASE_DIR, "data", "RQ1", "WordIdentity", f"multi_token_{MODEL_NAME}_{LANGUAGE}.csv")

# Keep the first row per word and store len() of each word, without in-place mutation.
df = (
    pd.read_csv(comparison_csv)
    .drop_duplicates(subset=["word"], keep="first")
    .assign(word_len=lambda frame: frame["word"].apply(len))
)
# Keep words at or above the minimum frequency whose token count for this tokenizer exceeds 2.
# NOTE(review): the original comment attributed the > 2 threshold to llama_2_7b,
# although this cell uses gemma_12b; confirm the threshold is intended here too.
df = df[(df["freq"] >= MIN_WORD_FREQ) & (df[token_col] > 2)].reset_index(drop=True)
print(df[token_col].value_counts())

sampled_df = sample_by_freq(df)
print(sampled_df[token_col].value_counts(normalize=True))
print(sampled_df["word_len"].value_counts(normalize=True))
sampled_df.to_csv(output_csv, index=False)

In [None]:
# Frequency-stratified sample of German words that the llama_2_7b tokenizer splits into several tokens.
LANGUAGE = "German"
TOKENIZER = "llama_2_7b"
MODEL_NAME = model_name_map[TOKENIZER]

# Derive file locations from BASE_DIR rather than repeating the absolute project path.
token_col = f"token_num_{TOKENIZER}"
comparison_csv = os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv")
output_csv = os.path.join(BASE_DIR, "data", "RQ1", "WordIdentity", f"multi_token_{MODEL_NAME}_{LANGUAGE}.csv")

# Keep the first row per word and store len() of each word, without in-place mutation.
df = (
    pd.read_csv(comparison_csv)
    .drop_duplicates(subset=["word"], keep="first")
    .assign(word_len=lambda frame: frame["word"].apply(len))
)
# Keep words at or above the minimum frequency; the token-count threshold is
# adjusted to > 2 for llama_2_7b.
df = df[(df["freq"] >= MIN_WORD_FREQ) & (df[token_col] > 2)].reset_index(drop=True)
print(df[token_col].value_counts())

sampled_df = sample_by_freq(df)
print(sampled_df[token_col].value_counts(normalize=True))
print(sampled_df["word_len"].value_counts(normalize=True))
sampled_df.to_csv(output_csv, index=False)

# TESTS

In [None]:
import sys
import os
# Disabled alternative setup: add the parent (RQ1) directory to the import path
# and use the WordNonword classifier instead of LogitLens.
# sys.path.append(os.path.abspath("../"))
# from ..WordNonword.classification import WordNonwordClassifier
# NOTE(review): `logitlens` is a project-local module; it must be importable from the notebook's working directory.
from logitlens import LogitLens

# Model whose tokenizer is inspected below; exactly one assignment is active,
# the commented-out lines are the alternatives.
# model_name = "google/gemma-3-12b-it"
# model_name = "google/gemma-3-12b-pt"
model_name = "meta-llama/Llama-2-7b-chat-hf"
# model_name = "Tower-Babel/Babel-9B-Chat"
# Despite the variable name, this is a LogitLens instance; later cells use only its `.tokenizer`.
word_nonword_cls = LogitLens("English", model_name) # language is not used in the model name, but it is required by the class

In [None]:
import ast

# The token column is parsed with literal_eval, so each cell presumably holds the
# string form of a token list; take the first row and decode only its first token.
# NOTE(review): the column holds babel_9b tokens while the loaded tokenizer is
# whatever `model_name` selected above; confirm this pairing is intended.
first_row_tokens = ast.literal_eval(df['tokens_babel_9b'][0])
leading_token = first_row_tokens[0]
word_nonword_cls.tokenizer.convert_tokens_to_string([leading_token])

In [None]:
# No call parentheses: this cell displays the attribute itself (presumably a bound
# method) rather than invoking it. Leftover inspection aid; it has no effect on the data.
word_nonword_cls.tokenizer.convert_tokens_to_string