In [5]:
import os
import json
import pandas as pd
import random

# Project root; the config file and all data paths used below live underneath it.
BASE_DIR = "/home/hyujang/multilingual-inner-lexicon"

# Shared RQ1 settings are read from a JSON file (keys used here: min_freq, num_quantiles, seed).
config_path = os.path.join(BASE_DIR, "RQ1/config.json")
with open(config_path) as config_file:
    CONFIG = json.load(config_file)

# Short tokenizer key -> model name that is embedded in the output CSV file names.
model_name_map = dict(
    llama_2_7b="Llama-2-7b-chat-hf",
    babel_9b="Babel-9B-Chat",
    gemma_12b="gemma-3-12b-it",
)

# Filtering thresholds.
# NOTE(review): MIN_WORD_LEN and MIN_JAMO_LEN are not referenced by the cells visible here.
MIN_WORD_LEN = 3
MIN_JAMO_LEN = 2
MIN_WORD_FREQ = CONFIG["min_freq"]

# Sampling setup: target sample size and number of frequency quantiles to stratify over.
NUM_SAMPLES = 1000
NUM_QUANTILES = CONFIG["num_quantiles"]

# Seed Python's global RNG; the same seed is also passed to pandas sampling in sample_by_freq.
RANDOM_SEED = CONFIG["seed"]
random.seed(RANDOM_SEED)


In [6]:
def sample_by_freq(df, num_samples=None, num_quantiles=None, random_state=None):
    """Draw a frequency-stratified sample of words from ``df``.

    Rows are binned into quantiles of the ``freq`` column, an equal number of
    rows is sampled from every bin, and, if that yields fewer than
    ``num_samples`` rows, the shortfall is topped up with rows drawn from the
    not-yet-sampled remainder. The input frame is left untouched.

    Args:
        df: DataFrame with at least a ``word`` and a ``freq`` column.
        num_samples: Target number of rows. Defaults to the notebook-level
            ``NUM_SAMPLES``.
        num_quantiles: Number of frequency quantiles requested from
            ``pd.qcut``; duplicate bin edges are dropped, so fewer bins may
            result. Defaults to the notebook-level ``NUM_QUANTILES``.
        random_state: Seed passed to every ``DataFrame.sample`` call.
            Defaults to the notebook-level ``RANDOM_SEED``.

    Returns:
        A new DataFrame (fresh 0..n-1 index) holding the sampled rows plus a
        ``freq_quantile`` column with each row's quantile label. It has at
        most ``num_samples`` rows and no repeated ``word`` values.
    """
    # Resolve defaults lazily so existing one-argument calls keep using the notebook config.
    if num_samples is None:
        num_samples = NUM_SAMPLES
    if num_quantiles is None:
        num_quantiles = NUM_QUANTILES
    if random_state is None:
        random_state = RANDOM_SEED

    # Nothing to bin or sample: return an empty result with the expected extra column.
    if df.empty:
        print("sampled_df: 0")
        return df.assign(freq_quantile=pd.Series(dtype="float64")).reset_index(drop=True)

    # Work on a copy so the caller's frame does not silently gain a 'freq_quantile' column.
    binned = df.copy()
    binned['freq_quantile'] = pd.qcut(binned['freq'], num_quantiles, labels=False, duplicates='drop')
    n_bins = binned['freq_quantile'].nunique()
    samples_per_quantile = num_samples // n_bins

    # Same number of rows per bin (capped at the bin size); iterate over the labels that
    # actually occur so a gap in the label sequence cannot hide the top bins.
    sampled = []
    for quantile in sorted(binned['freq_quantile'].dropna().unique()):
        quantile_df = binned[binned['freq_quantile'] == quantile]
        if len(quantile_df) > 0:
            sampled.append(quantile_df.sample(min(len(quantile_df), samples_per_quantile), replace=False, random_state=random_state))
    # Keep the original index labels: they are needed below to exclude already-sampled rows.
    sampled_df = pd.concat(sampled, ignore_index=False).drop_duplicates(subset=['word'])

    # Integer division (or small bins) can leave a shortfall; fill it from the unsampled rows.
    if len(sampled_df) < num_samples:
        remaining = num_samples - len(sampled_df)
        other_df = binned.drop(sampled_df.index, errors='ignore')
        print(f"remaining: {remaining}, other_df: {len(other_df)}")
        additional_samples = other_df.sample(min(len(other_df), remaining), replace=False, random_state=random_state)
        print(f"additional_samples: {len(additional_samples)}")
        sampled_df = pd.concat([sampled_df, additional_samples]).drop_duplicates(subset=['word'])

    print(f"sampled_df: {len(sampled_df)}")

    return sampled_df.reset_index(drop=True)

## ENGLISH

In [None]:
LANGUAGE = "English"
TOKENIZER = "babel_9b"
MODEL_NAME = model_name_map[TOKENIZER]
token_col = f"token_num_{TOKENIZER}"  # column holding each word's token count for this tokenizer

# Build both paths from BASE_DIR instead of repeating the absolute project root.
input_csv = os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv")
output_csv = os.path.join(BASE_DIR, "data", "RQ1", "WordIdentity", f"multi_token_{MODEL_NAME}_{LANGUAGE}.csv")

# Load, keep the first row per word, and record each word's character length
# (chained, so no frame is modified in place).
df = (
    pd.read_csv(input_csv)
    .drop_duplicates(subset=["word"], keep="first")
    .assign(word_len=lambda frame: frame["word"].apply(len))
)
# Keep sufficiently frequent words that this tokenizer splits into more than one token.
df = df[(df["freq"] >= MIN_WORD_FREQ) & (df[token_col] > 1)].reset_index(drop=True)
print(df[token_col].value_counts())

# Frequency-stratified sample; report its token-count distribution and save it.
sampled_df = sample_by_freq(df)
print(sampled_df[token_col].value_counts(normalize=True))
sampled_df.to_csv(output_csv, index=False)

token_num_babel_9b
2    8714
3    3666
4     680
5      86
6      21
7       4
8       1
Name: count, dtype: int64
sampled_df: 1000
token_num_babel_9b
2    0.629
3    0.317
4    0.048
5    0.006
Name: proportion, dtype: float64


: 

In [42]:
LANGUAGE = "English"
TOKENIZER = "gemma_12b"
MODEL_NAME = model_name_map[TOKENIZER]
token_col = f"token_num_{TOKENIZER}"  # column holding each word's token count for this tokenizer

# Build both paths from BASE_DIR instead of repeating the absolute project root.
input_csv = os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv")
output_csv = os.path.join(BASE_DIR, "data", "RQ1", "WordIdentity", f"multi_token_{MODEL_NAME}_{LANGUAGE}.csv")

# Load, keep the first row per word, and record each word's character length
# (chained, so no frame is modified in place).
df = (
    pd.read_csv(input_csv)
    .drop_duplicates(subset=["word"], keep="first")
    .assign(word_len=lambda frame: frame["word"].apply(len))
)
# Keep sufficiently frequent words that this tokenizer splits into more than one token.
df = df[(df["freq"] >= MIN_WORD_FREQ) & (df[token_col] > 1)].reset_index(drop=True)
print(df[token_col].value_counts())

# Frequency-stratified sample; report its token-count distribution and save it.
sampled_df = sample_by_freq(df)
print(sampled_df[token_col].value_counts(normalize=True))
sampled_df.to_csv(output_csv, index=False)

token_num_gemma_12b
2    8543
3    2178
4     262
5      29
6       3
7       1
Name: count, dtype: int64
sampled_df: 1000
token_num_gemma_12b
2    0.779
3    0.187
4    0.029
5    0.004
7    0.001
Name: proportion, dtype: float64


In [41]:
LANGUAGE = "English"
TOKENIZER = "llama_2_7b"
MODEL_NAME = model_name_map[TOKENIZER]
token_col = f"token_num_{TOKENIZER}"  # column holding each word's token count for this tokenizer

# Build both paths from BASE_DIR instead of repeating the absolute project root.
input_csv = os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv")
output_csv = os.path.join(BASE_DIR, "data", "RQ1", "WordIdentity", f"multi_token_{MODEL_NAME}_{LANGUAGE}.csv")

# Load, keep the first row per word, and record each word's character length
# (chained, so no frame is modified in place).
df = (
    pd.read_csv(input_csv)
    .drop_duplicates(subset=["word"], keep="first")
    .assign(word_len=lambda frame: frame["word"].apply(len))
)
# Keep sufficiently frequent words that this tokenizer splits into more than one token.
df = df[(df["freq"] >= MIN_WORD_FREQ) & (df[token_col] > 1)].reset_index(drop=True)
print(df[token_col].value_counts())

# Frequency-stratified sample; report its token-count distribution and save it.
sampled_df = sample_by_freq(df)
print(sampled_df[token_col].value_counts(normalize=True))
sampled_df.to_csv(output_csv, index=False)

token_num_llama_2_7b
2    7650
3    3885
4    1016
5     207
6      44
7       7
8       2
9       1
Name: count, dtype: int64
sampled_df: 1000
token_num_llama_2_7b
2    0.595
3    0.304
4    0.080
5    0.014
6    0.004
8    0.002
7    0.001
Name: proportion, dtype: float64


## KOREAN

In [40]:
LANGUAGE = "Korean"
TOKENIZER = "babel_9b"
MODEL_NAME = model_name_map[TOKENIZER]
token_col = f"token_num_{TOKENIZER}"  # column holding each word's token count for this tokenizer

# Build both paths from BASE_DIR instead of repeating the absolute project root.
input_csv = os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv")
output_csv = os.path.join(BASE_DIR, "data", "RQ1", "WordIdentity", f"multi_token_{MODEL_NAME}_{LANGUAGE}.csv")

# Load, keep the first row per word, and record each word's character length
# (chained, so no frame is modified in place).
df = (
    pd.read_csv(input_csv)
    .drop_duplicates(subset=["word"], keep="first")
    .assign(word_len=lambda frame: frame["word"].apply(len))
)
# Keep sufficiently frequent words that this tokenizer splits into more than one token.
df = df[(df["freq"] >= MIN_WORD_FREQ) & (df[token_col] > 1)].reset_index(drop=True)
print(df[token_col].value_counts())

# Frequency-stratified sample; report token-count and word-length distributions, then save.
sampled_df = sample_by_freq(df)
print(sampled_df[token_col].value_counts(normalize=True))
print(sampled_df['word_len'].value_counts(normalize=True))
sampled_df.to_csv(output_csv, index=False)

token_num_babel_9b
2    13927
3     6238
4     1559
5      353
6       91
7       25
8       20
9        1
Name: count, dtype: int64
sampled_df: 1000
token_num_babel_9b
2    0.620
3    0.284
4    0.077
5    0.015
6    0.002
7    0.001
8    0.001
Name: proportion, dtype: float64
word_len
2    0.595
3    0.300
4    0.086
5    0.015
6    0.003
8    0.001
Name: proportion, dtype: float64


In [39]:
LANGUAGE = "Korean"
TOKENIZER = "gemma_12b"
MODEL_NAME = model_name_map[TOKENIZER]
token_col = f"token_num_{TOKENIZER}"  # column holding each word's token count for this tokenizer

# Build both paths from BASE_DIR instead of repeating the absolute project root.
input_csv = os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv")
output_csv = os.path.join(BASE_DIR, "data", "RQ1", "WordIdentity", f"multi_token_{MODEL_NAME}_{LANGUAGE}.csv")

# Load, keep the first row per word, and record each word's character length
# (chained, so no frame is modified in place).
df = (
    pd.read_csv(input_csv)
    .drop_duplicates(subset=["word"], keep="first")
    .assign(word_len=lambda frame: frame["word"].apply(len))
)
# Keep sufficiently frequent words that this tokenizer splits into more than one token.
df = df[(df["freq"] >= MIN_WORD_FREQ) & (df[token_col] > 1)].reset_index(drop=True)
print(df[token_col].value_counts())

# Frequency-stratified sample; report token-count and word-length distributions, then save.
sampled_df = sample_by_freq(df)
print(sampled_df[token_col].value_counts(normalize=True))
print(sampled_df['word_len'].value_counts(normalize=True))
sampled_df.to_csv(output_csv, index=False)

token_num_gemma_12b
2     14327
3      6144
4      1293
5       227
6        47
7        11
8         5
10        1
9         1
Name: count, dtype: int64
sampled_df: 1000
token_num_gemma_12b
2    0.681
3    0.240
4    0.064
5    0.012
6    0.002
7    0.001
Name: proportion, dtype: float64
word_len
2    0.626
3    0.263
4    0.088
5    0.018
6    0.003
7    0.001
8    0.001
Name: proportion, dtype: float64


In [38]:
LANGUAGE = "Korean"
TOKENIZER = "llama_2_7b"
MODEL_NAME = model_name_map[TOKENIZER]
token_col = f"token_num_{TOKENIZER}"  # column holding each word's token count for this tokenizer

# Build both paths from BASE_DIR instead of repeating the absolute project root.
input_csv = os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv")
output_csv = os.path.join(BASE_DIR, "data", "RQ1", "WordIdentity", f"multi_token_{MODEL_NAME}_{LANGUAGE}.csv")

# Load, keep the first row per word, and record each word's character length
# (chained, so no frame is modified in place).
df = (
    pd.read_csv(input_csv)
    .drop_duplicates(subset=["word"], keep="first")
    .assign(word_len=lambda frame: frame["word"].apply(len))
)
# Threshold adjusted to >2 for llama_2_7b (the other Korean cells use >1).
# NOTE(review): presumably because every llama_2_7b token list in this file starts with a
# standalone '▁' token (see the token inspection cell further down) — confirm.
df = df[(df["freq"] >= MIN_WORD_FREQ) & (df[token_col] > 2)].reset_index(drop=True)
print(df[token_col].value_counts())

# Frequency-stratified sample; report token-count and word-length distributions, then save.
sampled_df = sample_by_freq(df)
print(sampled_df[token_col].value_counts(normalize=True))
print(sampled_df['word_len'].value_counts(normalize=True))
sampled_df.to_csv(output_csv, index=False)

token_num_llama_2_7b
5     7086
7     3947
3     3322
6     2742
8     2154
4     1849
10     663
9      607
11     325
12     100
13      71
14      49
15      23
17       7
16       6
19       2
21       1
25       1
Name: count, dtype: int64
sampled_df: 1000
token_num_llama_2_7b
5     0.317
7     0.169
3     0.158
6     0.120
8     0.086
4     0.074
10    0.027
9     0.022
11    0.016
12    0.003
13    0.003
14    0.003
15    0.002
Name: proportion, dtype: float64
word_len
2    0.609
3    0.265
4    0.069
1    0.026
5    0.019
6    0.008
7    0.003
8    0.001
Name: proportion, dtype: float64


In [11]:
import ast
import pandas as pd
# NOTE(review): LANGUAGE is inherited from whichever sampling cell ran last; the recorded
# output of the following cells shows Korean tokens, so it was presumably "Korean" — confirm.
df = pd.read_csv(os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv"))
# The token column is parsed from its string form back into Python lists.
df['tokens_llama_2_7b'] = df['tokens_llama_2_7b'].apply(ast.literal_eval)
# Count the first token of every word: all tokens start with '▁'
df['tokens_llama_2_7b'].str[0].value_counts()


tokens_llama_2_7b
▁    60595
Name: count, dtype: int64

In [12]:
# Token list of the row labelled 0 (label-based lookup, same as df[col][0]).
df.loc[0, 'tokens_llama_2_7b']

['▁', '역']

In [16]:
# Decode only the leading token of the first row's token list back into text.
# NOTE(review): word_nonword_cls is created in the TESTS section further down, so that cell must run first.
leading_token = df['tokens_llama_2_7b'][0][0]
word_nonword_cls.tokenizer.convert_tokens_to_string([leading_token])

''

## GERMAN

In [37]:
LANGUAGE = "German"
TOKENIZER = "babel_9b"
MODEL_NAME = model_name_map[TOKENIZER]
token_col = f"token_num_{TOKENIZER}"  # column holding each word's token count for this tokenizer

# Build both paths from BASE_DIR instead of repeating the absolute project root.
input_csv = os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv")
output_csv = os.path.join(BASE_DIR, "data", "RQ1", "WordIdentity", f"multi_token_{MODEL_NAME}_{LANGUAGE}.csv")

# Load, keep the first row per word, and record each word's character length
# (chained, so no frame is modified in place).
df = (
    pd.read_csv(input_csv)
    .drop_duplicates(subset=["word"], keep="first")
    .assign(word_len=lambda frame: frame["word"].apply(len))
)
# Keep sufficiently frequent words with more than 2 tokens.
# NOTE(review): this line was previously commented "Adjusted to >2 for llama_2_7b", which
# does not match this cell's tokenizer (babel_9b) — confirm that >2 is intended here.
df = df[(df["freq"] >= MIN_WORD_FREQ) & (df[token_col] > 2)].reset_index(drop=True)
print(df[token_col].value_counts())

# Frequency-stratified sample; report token-count and word-length distributions, then save.
sampled_df = sample_by_freq(df)
print(sampled_df[token_col].value_counts(normalize=True))
print(sampled_df['word_len'].value_counts(normalize=True))
sampled_df.to_csv(output_csv, index=False)

token_num_babel_9b
4     10118
3      8932
5      7174
6      3489
7      1399
8       392
9       135
10       39
11        6
12        3
Name: count, dtype: int64
sampled_df: 1000
token_num_babel_9b
4     0.350
3     0.270
5     0.229
6     0.101
7     0.033
8     0.012
9     0.004
10    0.001
Name: proportion, dtype: float64
word_len
10    0.123
9     0.109
11    0.099
12    0.097
13    0.097
15    0.076
8     0.072
14    0.059
16    0.051
7     0.044
17    0.044
18    0.027
19    0.025
6     0.023
5     0.011
21    0.008
22    0.008
20    0.008
4     0.005
24    0.004
23    0.004
28    0.002
26    0.002
25    0.001
29    0.001
Name: proportion, dtype: float64


In [36]:
LANGUAGE = "German"
TOKENIZER = "gemma_12b"
MODEL_NAME = model_name_map[TOKENIZER]
token_col = f"token_num_{TOKENIZER}"  # column holding each word's token count for this tokenizer

# Build both paths from BASE_DIR instead of repeating the absolute project root.
input_csv = os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv")
output_csv = os.path.join(BASE_DIR, "data", "RQ1", "WordIdentity", f"multi_token_{MODEL_NAME}_{LANGUAGE}.csv")

# Load, keep the first row per word, and record each word's character length
# (chained, so no frame is modified in place).
df = (
    pd.read_csv(input_csv)
    .drop_duplicates(subset=["word"], keep="first")
    .assign(word_len=lambda frame: frame["word"].apply(len))
)
# Keep sufficiently frequent words with more than 2 tokens.
# NOTE(review): this line was previously commented "Adjusted to >2 for llama_2_7b", which
# does not match this cell's tokenizer (gemma_12b) — confirm that >2 is intended here.
df = df[(df["freq"] >= MIN_WORD_FREQ) & (df[token_col] > 2)].reset_index(drop=True)
print(df[token_col].value_counts())

# Frequency-stratified sample; report token-count and word-length distributions, then save.
sampled_df = sample_by_freq(df)
print(sampled_df[token_col].value_counts(normalize=True))
print(sampled_df['word_len'].value_counts(normalize=True))
sampled_df.to_csv(output_csv, index=False)

token_num_gemma_12b
3     12866
4      9306
5      3888
6      1106
7       262
8        48
9         8
10        2
Name: count, dtype: int64
sampled_df: 1000
token_num_gemma_12b
3    0.492
4    0.313
5    0.143
6    0.040
7    0.007
8    0.005
Name: proportion, dtype: float64
word_len
11    0.130
12    0.115
10    0.106
9     0.097
14    0.084
13    0.078
15    0.065
8     0.050
18    0.045
17    0.044
16    0.041
7     0.033
19    0.025
6     0.024
20    0.018
21    0.012
5     0.008
22    0.007
24    0.005
28    0.002
25    0.002
26    0.002
23    0.002
27    0.002
4     0.002
29    0.001
Name: proportion, dtype: float64


In [33]:
LANGUAGE = "German"
TOKENIZER = "llama_2_7b"
MODEL_NAME = model_name_map[TOKENIZER]
token_col = f"token_num_{TOKENIZER}"  # column holding each word's token count for this tokenizer

# Build both paths from BASE_DIR instead of repeating the absolute project root.
input_csv = os.path.join(BASE_DIR, "data", f"{LANGUAGE}_tokenizers_comparison.csv")
output_csv = os.path.join(BASE_DIR, "data", "RQ1", "WordIdentity", f"multi_token_{MODEL_NAME}_{LANGUAGE}.csv")

# Load, keep the first row per word, and record each word's character length
# (chained, so no frame is modified in place).
df = (
    pd.read_csv(input_csv)
    .drop_duplicates(subset=["word"], keep="first")
    .assign(word_len=lambda frame: frame["word"].apply(len))
)
# Threshold adjusted to >2 for llama_2_7b: keep sufficiently frequent words with more than 2 tokens.
df = df[(df["freq"] >= MIN_WORD_FREQ) & (df[token_col] > 2)].reset_index(drop=True)
print(df[token_col].value_counts())

# Frequency-stratified sample; report token-count and word-length distributions, then save.
sampled_df = sample_by_freq(df)
print(sampled_df[token_col].value_counts(normalize=True))
print(sampled_df['word_len'].value_counts(normalize=True))
sampled_df.to_csv(output_csv, index=False)

token_num_llama_2_7b
3     11449
4      9848
5      5172
6      1960
7       601
8       179
9        35
10       16
11        2
12        1
Name: count, dtype: int64
sampled_df: 1000
token_num_llama_2_7b
3     0.416
4     0.335
5     0.177
6     0.048
7     0.017
8     0.006
10    0.001
Name: proportion, dtype: float64
word_len
10    0.127
11    0.123
13    0.111
12    0.094
9     0.087
15    0.080
8     0.065
14    0.061
17    0.041
16    0.038
7     0.035
18    0.030
19    0.026
6     0.022
21    0.015
20    0.015
22    0.010
5     0.007
23    0.006
24    0.003
26    0.002
28    0.001
4     0.001
Name: proportion, dtype: float64


# TESTS

In [1]:
import sys
import os
# Add the RQ1 directory to the path
# sys.path.append(os.path.abspath("../"))
# from ..WordNonword.classification import WordNonwordClassifier
from logitlens import LogitLens

# Checkpoint to load. Alternatives that were tried in this cell:
#   google/gemma-3-12b-it, google/gemma-3-12b-pt, Tower-Babel/Babel-9B-Chat
model_name = "meta-llama/Llama-2-7b-chat-hf"

# The language argument is required by the class but is not used in the model name.
word_nonword_cls = LogitLens("English", model_name)

Using device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
import ast
# Parse the first row's stored token list, then decode only its leading token back into text.
first_row_tokens = ast.literal_eval(df['tokens_babel_9b'][0])
word_nonword_cls.tokenizer.convert_tokens_to_string([first_row_tokens[0]])

'선'

In [15]:
# NOTE(review): this is a bare method reference (nothing is called), yet the recorded output
# below is a list of tokens — the cell was presumably edited after it last ran; confirm or remove.
word_nonword_cls.tokenizer.convert_tokens_to_string

['ìĦł', 'ìĪĺ']