# Multi-token words

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Directory containing the word/non-word CSV files
data_dir = "/home/hyujang/multilingual-inner-lexicon/data/RQ1/WordNonword"

# Columns every file must provide: "label" is used to keep real words only,
# "token_num" and "freq" are read by the plotting code.
required_columns = {"token_num", "freq", "label"}

# Take every file in the directory except those ending with "2token.csv"
files = [f for f in os.listdir(data_dir) if not f.endswith("2token.csv")]
# files = [f for f in os.listdir(data_dir) if f.endswith("2token.csv")]
files.sort()
# Maps "model (language)" -> DataFrame restricted to rows labelled "realword"
data_dict = {}

# Process each file
for file in files:
    # The 2nd and 3rd "_"-separated fields of the file name are taken as
    # model and language; the extension is cut off the language field.
    parts = file.split("_")
    model = parts[1]
    language = parts[2].split(".")[0]

    # Load the CSV file
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)

    # Skip files lacking any column used below. "label" is part of the check
    # because the row filter on the next lines would otherwise raise KeyError.
    missing_columns = required_columns.difference(df.columns)
    if missing_columns:
        print(f"Skipping file {file} due to missing columns: {sorted(missing_columns)}")
        continue
    df = df[df["label"]=="realword"]
    # Store the processed data
    key = f"{model} ({language})"
    data_dict[key] = df

# Display order: languages are the major sort key, models the minor one
lang_order = ["English", "German", "Korean"]
model_order = ["Babel-9B-Chat", "gemma-3-12b-it", "Llama-2-7b-chat-hf"]


def _grid_rank(label):
    """Return the (language rank, model rank) pair for a "model (language)" label."""
    pieces = label.split(" (")
    language_name = pieces[1][:-1]  # strip the closing ")"
    model_name = pieces[0]
    return lang_order.index(language_name), model_order.index(model_name)


# Sort the labels, then rebuild the mapping so iteration follows that order
ordered_keys = sorted(data_dict, key=_grid_rank)
ordered_data_dict = {label: data_dict[label] for label in ordered_keys}

# Create a 3x3 grid of subplots, one panel per "model (language)" entry
num_subplots = len(ordered_data_dict)
rows, cols = 3, 3  # Adjust rows and columns as needed
fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(20, 10), sharex=True, sharey=True)

# Flatten axes for easier iteration
axes = axes.flatten()

for ax, (key, df) in zip(axes, ordered_data_dict.items()):
    # Filter data for token lengths 2, 3, and 4
    filtered_df = df[df["token_num"].isin([2, 3, 4])]

    # Create a boxplot of frequency per token length
    sns.boxplot(data=filtered_df, x="token_num", y="freq", ax=ax, palette="Set2")
    ax.set_yscale("log")
    ax.set_title(f"{key}", fontsize=20)
    ax.set_xlabel("Token Length", labelpad=20, fontsize=16)
    ax.set_xticklabels("")  # tick labels are replaced by the "(n=...)" texts below
    ax.set_ylabel("Frequency", fontsize=16)
    ax.tick_params(axis='both', which='major', labelsize=14)
    ax.grid(axis="y", linestyle="--", alpha=0.7)

    # Calculate dataset size for each token length (groupby yields ascending keys)
    token_sizes = filtered_df.groupby("token_num").size()

    # Add dataset size below each boxplot.
    # x: the boxes sit at 0, 1, 2, ... in ascending category order, so the
    #    running index is used instead of "token - 2", which is only right when
    #    the categories are exactly 2, 3, 4 with none missing.
    # y: given as a fraction of the axes height via the blended x-axis
    #    transform, so the text stays just under the axis regardless of the
    #    log scale and of limit changes caused by the other shared-y panels.
    for position, (token, size) in enumerate(token_sizes.items()):
        ax.text(
            x=position,
            y=-0.05,
            s=f"{token} (n={size})",
            ha="center",
            va="top",
            fontsize=15,
            transform=ax.get_xaxis_transform(),
        )

# Remove unused axes
for unused_ax in axes[len(ordered_data_dict):]:
    fig.delaxes(unused_ax)

# Adjust layout for more space between subplots and labels
# plt.tight_layout(pad=3.0)  # Increase padding between subplots
fig.subplots_adjust(bottom=0.1, top=0.95, hspace=0.35, wspace=0.1)  # Add more space between rows and adjust margins
# fig.text(0.5, 0.04, "Sentence length", ha="center", fontsize=13)

plt.show()

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Directory containing the word/non-word CSV files
data_dir = "/home/hyujang/multilingual-inner-lexicon/data/RQ1/WordNonword"

# Columns every file must provide: "label" is used to keep real words only,
# "token_num" and "freq" are read by the plotting code.
required_columns = {"token_num", "freq", "label"}

# Take only the files ending with "2token.csv"
# files = [f for f in os.listdir(data_dir) if not f.endswith("2token.csv")]
files = [f for f in os.listdir(data_dir) if f.endswith("2token.csv")]
files.sort()
# Maps "model (language)" -> DataFrame restricted to rows labelled "realword"
data_dict = {}

# Process each file
for file in files:
    # The 2nd and 3rd "_"-separated fields of the file name are taken as
    # model and language; the language field is cut at the first "." and
    # then at the first "-".
    parts = file.split("_")
    model = parts[1]
    language = parts[2].split(".")[0].split("-")[0]

    # Load the CSV file
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)

    # Skip files lacking any column used below. "label" is part of the check
    # because the row filter on the next lines would otherwise raise KeyError.
    missing_columns = required_columns.difference(df.columns)
    if missing_columns:
        print(f"Skipping file {file} due to missing columns: {sorted(missing_columns)}")
        continue
    df = df[df["label"]=="realword"]
    # Store the processed data
    key = f"{model} ({language})"
    data_dict[key] = df

# Display order: languages are the major sort key, models the minor one
lang_order = ["English", "German", "Korean"]
model_order = ["Babel-9B-Chat", "gemma-3-12b-it", "Llama-2-7b-chat-hf"]


def _grid_rank(label):
    """Return the (language rank, model rank) pair for a "model (language)" label."""
    pieces = label.split(" (")
    language_name = pieces[1][:-1]  # strip the closing ")"
    model_name = pieces[0]
    return lang_order.index(language_name), model_order.index(model_name)


# Sort the labels, then rebuild the mapping so iteration follows that order
ordered_keys = sorted(data_dict, key=_grid_rank)
ordered_data_dict = {label: data_dict[label] for label in ordered_keys}

# Create a 3x3 grid of subplots, one panel per "model (language)" entry
num_subplots = len(ordered_data_dict)
rows, cols = 3, 3  # Adjust rows and columns as needed
fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(20, 10), sharex=True, sharey=True)

# Flatten axes for easier iteration
axes = axes.flatten()

for ax, (key, df) in zip(axes, ordered_data_dict.items()):
    # Keep only the rows whose token length is 2
    filtered_df = df[df["token_num"].isin([2])]

    sns.boxplot(data=filtered_df, x="token_num", y="freq", ax=ax, palette="Set2")
    ax.set_yscale("log")
    ax.set_title(f"{key}", fontsize=20)
    ax.set_xlabel("Token Length", labelpad=20, fontsize=16)
    ax.set_xticklabels("")  # tick labels are replaced by the "(n=...)" texts below
    ax.set_ylabel("Frequency", fontsize=16)
    ax.tick_params(axis='both', which='major', labelsize=14)
    ax.grid(axis="y", linestyle="--", alpha=0.7)

    # Calculate dataset size for each token length (groupby yields ascending keys)
    token_sizes = filtered_df.groupby("token_num").size()

    # Add dataset size below each boxplot.
    # x: the boxes sit at 0, 1, 2, ... in ascending category order, so the
    #    running index is used rather than an offset derived from the token
    #    length itself.
    # y: given as a fraction of the axes height via the blended x-axis
    #    transform, so the text stays just under the axis regardless of the
    #    log scale and of limit changes caused by the other shared-y panels.
    for position, (token, size) in enumerate(token_sizes.items()):
        ax.text(
            x=position,
            y=-0.05,
            s=f"{token} (n={size})",
            ha="center",
            va="top",
            fontsize=15,
            transform=ax.get_xaxis_transform(),
        )


# Remove unused axes
for unused_ax in axes[len(ordered_data_dict):]:
    fig.delaxes(unused_ax)

# Adjust layout for more space between subplots and labels
plt.tight_layout(pad=3.0)  # Increase padding between subplots
fig.subplots_adjust(bottom=0.1, top=0.95, hspace=0.35, wspace=0.1)  # Add more space between rows and adjust margins

plt.show()

# Single-token words

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
# Directory containing the files
data_dir = "/home/hyujang/multilingual-inner-lexicon/data/RQ1/WordIdentity"

# Columns every file must provide: "splitted_tokens" is parsed to obtain the
# token count, "freq" is read by the plotting code.
required_columns = {"splitted_tokens", "freq"}

# Keep only the files whose name contains "single_token_splitted"
files = [f for f in os.listdir(data_dir) if "single_token_splitted" in f]

# Maps "model (language)" -> DataFrame with an added "token_num" column
data_dict = {}

# Process each file
for file in files:
    # The third-from-last and second-from-last "_"-separated fields of the
    # file name are taken as model and language.
    parts = file.split("_")
    model = parts[-3]
    language = parts[-2]

    # Load the CSV file
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)

    # Validate BEFORE deriving token_num: once the column has been assigned a
    # check for "token_num" can never fail, and a missing "splitted_tokens"
    # would raise KeyError instead of skipping the file.
    missing_columns = required_columns.difference(df.columns)
    if missing_columns:
        print(f"Skipping file {file} due to missing columns: {sorted(missing_columns)}")
        continue

    # Each cell is parsed with literal_eval and its length becomes the token count
    df["token_num"] = df["splitted_tokens"].apply(literal_eval).apply(len)
    # df = df[df["label"]=="realword"]
    # Store the processed data
    key = f"{model} ({language})"
    data_dict[key] = df

# Display order: languages are the major sort key, models the minor one
lang_order = ["English", "German"]
model_order = ["Babel-9B-Chat", "gemma-3-12b-it", "Llama-2-7b-chat-hf"]


def _grid_rank(label):
    """Return the (language rank, model rank) pair for a "model (language)" label."""
    pieces = label.split(" (")
    language_name = pieces[1][:-1]  # strip the closing ")"
    model_name = pieces[0]
    return lang_order.index(language_name), model_order.index(model_name)


# Sort the labels, then rebuild the mapping so iteration follows that order
ordered_keys = sorted(data_dict, key=_grid_rank)
ordered_data_dict = {label: data_dict[label] for label in ordered_keys}

# Create a grid of subplots with languages as rows and models as columns
rows, cols = len(lang_order), len(model_order)
fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(20, 10), sharex=True, sharey=True)

# Flatten axes for easier iteration
axes = axes.flatten()

for ax, (key, df) in zip(axes, ordered_data_dict.items()):
    # No token-length filter here: every token count in the file is plotted
    # filtered_df = df[df["token_num"].isin([2, 3, 4])]

    # Create a boxplot of frequency per token count
    sns.boxplot(data=df, x="token_num", y="freq", ax=ax, palette="Set2")
    ax.set_yscale("log")
    ax.set_title(f"{key}", fontsize=20)
    ax.set_xlabel("Token Length", labelpad=20, fontsize=16)
    ax.set_xticklabels("")  # tick labels are replaced by the "(n=...)" texts below
    ax.set_ylabel("Frequency", fontsize=16)
    ax.tick_params(axis='both', which='major', labelsize=14)
    ax.grid(axis="y", linestyle="--", alpha=0.7)


    # Calculate dataset size for each token length (groupby yields ascending keys)
    token_sizes = df.groupby("token_num").size()

    # Add dataset size below each boxplot.
    # x: the boxes sit at 0, 1, 2, ... in ascending category order, so the
    #    running index is used instead of "token - 2"; the data is unfiltered
    #    here, so token counts need not start at 2 or be contiguous.
    # y: given as a fraction of the axes height via the blended x-axis
    #    transform, so the text stays just under the axis regardless of the
    #    log scale and of limit changes caused by the other shared-y panels.
    for position, (token, size) in enumerate(token_sizes.items()):
        ax.text(
            x=position,
            y=-0.05,
            s=f"{token} (n={size})",
            ha="center",
            va="top",
            fontsize=15,
            transform=ax.get_xaxis_transform(),
        )


# Remove unused axes
for unused_ax in axes[len(ordered_data_dict):]:
    fig.delaxes(unused_ax)

# Adjust layout for more space between subplots and labels
plt.tight_layout(pad=3.0)  # Increase padding between subplots
fig.subplots_adjust(bottom=0.3, top=0.95, hspace=0.3)  # Add more space between rows and adjust margins
# fig.text(0.5, 0.04, "Token Length", ha="center", fontsize=14)

plt.show()

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
# Directory containing the files
data_dir = "/home/hyujang/multilingual-inner-lexicon/data/RQ1/WordIdentity"

# Columns every file must provide: "splitted_typo_tokens" is parsed to obtain
# the token count, "freq" is read by the plotting code.
required_columns = {"splitted_typo_tokens", "freq"}

# Keep only the files whose name contains "single_token_typos"
files = [f for f in os.listdir(data_dir) if "single_token_typos" in f]

# Maps "model (language)" -> DataFrame with an added "token_num" column
data_dict = {}

# Process each file
for file in files:
    # The third-from-last and second-from-last "_"-separated fields of the
    # file name are taken as model and language.
    parts = file.split("_")
    model = parts[-3]
    language = parts[-2]

    # Load the CSV file
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)

    # Validate BEFORE deriving token_num: once the column has been assigned a
    # check for "token_num" can never fail, and a missing
    # "splitted_typo_tokens" would raise KeyError instead of skipping the file.
    missing_columns = required_columns.difference(df.columns)
    if missing_columns:
        print(f"Skipping file {file} due to missing columns: {sorted(missing_columns)}")
        continue

    # Each cell is parsed with literal_eval and its length becomes the token count
    df["token_num"] = df["splitted_typo_tokens"].apply(literal_eval).apply(len)
    # df = df[df["label"]=="realword"]
    # Store the processed data
    key = f"{model} ({language})"
    data_dict[key] = df

# Display order: languages are the major sort key, models the minor one
lang_order = ["English", "German"]
model_order = ["Babel-9B-Chat", "gemma-3-12b-it", "Llama-2-7b-chat-hf"]


def _grid_rank(label):
    """Return the (language rank, model rank) pair for a "model (language)" label."""
    pieces = label.split(" (")
    language_name = pieces[1][:-1]  # strip the closing ")"
    model_name = pieces[0]
    return lang_order.index(language_name), model_order.index(model_name)


# Sort the labels, then rebuild the mapping so iteration follows that order
ordered_keys = sorted(data_dict, key=_grid_rank)
ordered_data_dict = {label: data_dict[label] for label in ordered_keys}

# Create a grid of subplots with languages as rows and models as columns
rows, cols = len(lang_order), len(model_order)
fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(20, 10), sharex=True, sharey=True)

# Flatten axes for easier iteration
axes = axes.flatten()

for ax, (key, df) in zip(axes, ordered_data_dict.items()):
    # No token-length filter here: every token count in the file is plotted
    # filtered_df = df[df["token_num"].isin([2, 3, 4])]

    # Create a boxplot of frequency per token count
    sns.boxplot(data=df, x="token_num", y="freq", ax=ax, palette="Set2")
    ax.set_yscale("log")
    ax.set_title(f"{key}", fontsize=20)
    ax.set_xlabel("Token Length", labelpad=20, fontsize=16)
    ax.set_xticklabels("")  # tick labels are replaced by the "(n=...)" texts below
    ax.set_ylabel("Frequency", fontsize=16)
    ax.tick_params(axis='both', which='major', labelsize=14)
    ax.grid(axis="y", linestyle="--", alpha=0.7)


    # Calculate dataset size for each token length (groupby yields ascending keys)
    token_sizes = df.groupby("token_num").size()

    # Add dataset size below each boxplot.
    # x: the boxes sit at 0, 1, 2, ... in ascending category order, so the
    #    running index is used instead of "token - 2"; the data is unfiltered
    #    here, so token counts need not start at 2 or be contiguous.
    # y: given as a fraction of the axes height via the blended x-axis
    #    transform, so the text stays just under the axis regardless of the
    #    log scale and of limit changes caused by the other shared-y panels.
    for position, (token, size) in enumerate(token_sizes.items()):
        ax.text(
            x=position,
            y=-0.05,
            s=f"{token} (n={size})",
            ha="center",
            va="top",
            fontsize=15,
            transform=ax.get_xaxis_transform(),
        )


# Remove unused axes
for unused_ax in axes[len(ordered_data_dict):]:
    fig.delaxes(unused_ax)

# Adjust layout for more space between subplots and labels
plt.tight_layout(pad=3.0)  # Increase padding between subplots
fig.subplots_adjust(bottom=0.3, top=0.95, hspace=0.3)  # Add more space between rows and adjust margins
# fig.text(0.5, 0.04, "Token Length", ha="center", fontsize=14)

plt.show()

In [None]:
data_dir = "/home/hyujang/multilingual-inner-lexicon/data/RQ1/WordIdentity"

# Keep only the files whose name contains "single_token_typos"
files = [f for f in os.listdir(data_dir) if "single_token_typos" in f]

# Reset the shared dictionary; this cell only prints and stores nothing in it
data_dict = {}

# Print a count/percentage table of typo types for each file
for file in files:
    # The third-from-last and second-from-last "_"-separated fields of the
    # file name are taken as model and language.
    parts = file.split("_")
    model = parts[-3]
    language = parts[-2]

    # Load the CSV file
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)
    if "typo_type" not in df.columns:
        print(f"The column 'typo_type' does not exist in the dataset ({file}).")
        continue

    # Count each typo type once and derive the percentages from those counts
    typo_type_counts = df["typo_type"].value_counts()
    typo_type_percentages = typo_type_counts / typo_type_counts.sum() * 100

    # Combine counts and percentages into a DataFrame
    typo_stats = pd.DataFrame({
        "Count": typo_type_counts,
        "Percentage": typo_type_percentages
    })

    # Name the model and language so each table can be attributed to its file
    print(f"Typo Type Statistics for {model} ({language}):")
    print(typo_stats)

# attention

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# ---------- config ----------
# data_dir holds the attention-weight CSVs, ref_dir the reference CSVs they are merged with
data_dir = "/home/hyujang/multilingual-inner-lexicon/output/RQ1/ComponentAnalysis/attention_weights2"
ref_dir  = "/home/hyujang/multilingual-inner-lexicon/data/RQ1/ComponentAnalysis"

# Files look like: Babel-9B-Chat_English_1token.csv
lang_order  = ["English", "German", "Korean"]
model_order = ["Babel-9B-Chat", "gemma-3-12b-it", "Llama-2-7b-chat-hf"]


# ---------- load & merge ----------
def _csv_names_containing(tag):
    """List the ``.csv`` file names in ``data_dir`` whose name contains *tag*."""
    return [name for name in os.listdir(data_dir) if tag in name and name.endswith(".csv")]


files_1token = _csv_names_containing("1token")
files_2token = _csv_names_containing("2token")

# Both map "Model (Language)" -> merged DataFrame
data_dict_1token = dict()
data_dict_2token = dict()

def load_and_merge(files, data_dict):
    """Merge each CSV in *files* with its reference CSV and store the result.

    Each file name is stripped of ".csv" and split on "_"; the first two
    fields are taken as model and language. The file is read from the
    module-level ``data_dir`` and left-joined on ``word`` with
    ``<model>_<language>_wiki_noun_frequencies_context.csv`` from the
    module-level ``ref_dir``, which contributes the ``sentence_length`` and
    ``original_frequency`` columns.

    Args:
        files: Iterable of CSV file names located in ``data_dir``.
        data_dict: Dictionary updated in place; merged frames are stored under
            the key ``"<model> (<language>)"``.

    Returns:
        None.

    Names with fewer than three "_"-separated fields are skipped silently.
    A missing reference file, a missing ``word`` column, or a reference file
    lacking one of the merged columns is reported with a ``[warn]`` message
    and the file is skipped.
    """
    ref_columns = ["word", "sentence_length", "original_frequency"]

    for file in files:
        parts = file.replace(".csv", "").split("_")
        if len(parts) < 3:
            continue
        model, language = parts[0], parts[1]

        # read primary
        df = pd.read_csv(os.path.join(data_dir, file))

        # read reference for merge (provides sentence_length)
        ref_path = os.path.join(ref_dir, f"{model}_{language}_wiki_noun_frequencies_context.csv")
        if not os.path.exists(ref_path):
            print(f"[warn] Missing ref CSV: {ref_path}")
            continue
        ref_df = pd.read_csv(ref_path)

        if "word" not in df.columns or "word" not in ref_df.columns:
            print(f"[warn] 'word' column missing in {file} or its ref; skipped.")
            continue

        # Selecting absent columns would raise KeyError; follow the same
        # warn-and-skip convention as the other checks instead.
        missing_ref_columns = [col for col in ref_columns if col not in ref_df.columns]
        if missing_ref_columns:
            print(f"[warn] Ref CSV {ref_path} lacks columns {missing_ref_columns}; skipped.")
            continue

        # Left join keeps every row of the primary file; words without a
        # reference entry get NaN in the merged columns.
        data_dict[f"{model} ({language})"] = df.merge(
            ref_df[ref_columns],
            on="word",
            how="left"
        )

# Fill both dictionaries in place
load_and_merge(files_1token, data_dict_1token)
load_and_merge(files_2token, data_dict_2token)

# ---------- figure layout ----------
n_rows, n_cols = len(lang_order), len(model_order)
# squeeze=False makes subplots return a 2-D array of Axes for every grid
# shape, so axes[r][c] works without special-casing single rows/columns.
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(15, 10), sharex=False, sharey=True, squeeze=False
)

# compute a global max sentence length to align bins across all panels
all_lengths = []
for df in list(data_dict_1token.values()) + list(data_dict_2token.values()):
    if "sentence_length" in df.columns:
        all_lengths.extend(df["sentence_length"].dropna().astype(int).tolist())
max_len = max(all_lengths) if all_lengths else 0
# Integer bin edges 1..max_len+1 (one bin per length); fall back to 10 bins without data
bins = range(1, max_len + 2) if max_len > 0 else 10

# map for quick lookup
def key_for(model, lang):
    """Build the "model (language)" label used as key in the data dictionaries."""
    return "{} ({})".format(model, lang)

# ---------- plot ----------
# (source dictionary, legend label, bar colour) for the two histograms overlaid per panel
series_specs = (
    (data_dict_1token, "single-token words", "r"),
    (data_dict_2token, "two-token words", "b"),
)

for r, lang in enumerate(lang_order):
    for c, model in enumerate(model_order):
        ax = axes[r][c]
        key = key_for(model, lang)

        for frames, series_label, bar_color in series_specs:
            if key not in frames:
                continue
            lengths = frames[key]["sentence_length"].dropna().astype(int)
            if len(lengths) == 0:
                continue
            ax.hist(lengths, bins=bins, edgecolor="black", alpha=0.5, label=series_label, color=bar_color)

        # Add legend and tidy grid
        ax.legend(loc="upper right", fontsize=16)
        ax.grid(axis="y", linestyle="--", alpha=0.5)

# column titles (models) on the top row only
for top_ax, model in zip(axes[0], model_order):
    top_ax.set_title(model, fontsize=20, pad=10)

# row labels (languages) as the y-axis label of the left column only
for row_axes, lang in zip(axes, lang_order):
    row_axes[0].set_ylabel(f"{lang} Count", fontsize=20)

# shared labels
plt.tight_layout(pad=0.6, w_pad=0.3, h_pad=0.3)  # smaller pads = less gap
fig.subplots_adjust(bottom=0.1, top=0.95, hspace=0.1)  # Add more space between rows and adjust margins
fig.text(0.5, 0.04, "Sentence length", ha="center", fontsize=18)

plt.show()

# Word translation pairs

korean → german: 3408

korean → english: 1690

german → english: 2096

german → korean: 2210

english → german: 1868

english → korean: 1341