## visualize all 36 results

In [None]:
import unicodedata
import re

def normalize(text, lang="German"):
    """Return a case- and accent-insensitive form of ``text`` for matching.

    Steps, in order: compose the string to NFC, lowercase it and strip
    surrounding whitespace, expand the German letters (``ß`` -> ``ss``,
    ``ä``/``ö``/``ü`` -> ``ae``/``oe``/``ue``), then decompose with NFKD and
    drop every remaining combining mark.

    Args:
        text: String to normalize.
        lang: Accepted for call-site compatibility; it is not consulted, so
            the German expansions are applied whatever the language is.

    Returns:
        The normalized string.
    """
    # Compose first: a decomposed umlaut ("a" + U+0308) would otherwise skip
    # the "ä" -> "ae" replacement below and collapse to a bare "a", so the
    # same word could normalize differently depending on its Unicode form.
    text = unicodedata.normalize("NFC", text)
    text = text.lower().strip()
    text = text.replace("ß", "ss")
    text = text.replace("ä", "ae").replace("ö", "oe").replace("ü", "ue")
    # Remove the accents that are left (e.g. "é" -> "e").
    text = unicodedata.normalize("NFKD", text)
    return "".join(c for c in text if not unicodedata.combining(c))

def is_match(output, targets, lang="German"):
    """Tell whether ``output`` matches any of the reference ``targets``.

    Both sides are passed through ``normalize``; a target counts as matched
    when one normalized string is a substring of the other (which also covers
    exact equality).

    Args:
        output: Generated text to check. Anything that is not a string (for
            example the NaN pandas yields for an empty CSV field) is treated
            as "no match" instead of raising.
        targets: Iterable of accepted reference strings.
        lang: Forwarded to ``normalize``.

    Returns:
        True if at least one target matches, otherwise False.
    """
    if not isinstance(output, str):
        return False

    norm_output = normalize(output, lang)
    if not norm_output:
        # "" is a substring of every string, so an empty generation would
        # otherwise be counted as a hit for any target.
        return False

    for target in targets:
        norm_target = normalize(target, lang)
        if not norm_target:
            # Same reasoning in the other direction: an empty target would
            # match every output.
            continue
        # Exact match or substring match
        if norm_target in norm_output or norm_output in norm_target:
            return True
    return False

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval

# Define models, languages, and prompt versions
models = ["Tower-Babel/Babel-9B-Chat", "google/gemma-3-12b-it", "meta-llama/Llama-2-7b-chat-hf"]
language_pairs = [("English", "Korean"), ("Korean", "English"), ("English", "German"), ("German", "English"), ("German", "Korean"), ("Korean", "German")]
# Consecutive entries of language_pairs are the two directions of one pair;
# each such group of two shares a subplot.
pair_groups = [language_pairs[k:k + 2] for k in range(0, len(language_pairs), 2)]

# Define colors and line styles for visualization
# (not applied by the plot call below, which uses matplotlib's default cycle)
pair_colors = {
    ("English", "Korean"): "#1f77b4",  # Blue
    ("Korean", "English"): "#ff7f0e",  # Orange
    ("English", "German"): "#2ca02c",  # Green
    ("German", "English"): "#d62728",  # Red
    ("German", "Korean"): "#9467bd",  # Purple
    ("Korean", "German"): "#8c564b",  # Brown
}
version_colors = {
    0: "#1f77b4",  # Blue
    1: "#ff7f0e",  # Orange
    2: "#2ca02c",  # Green
    3: "#8c564b",  # Brown
}

prompt_styles = {
    "EnglishPrompt": "--",
    "KoreanPrompt": "-",
    "GermanPrompt": ":"
}

lang_abbr = {
    "English": "En",
    "Korean": "Ko",
    "German": "De"
}
promptversion_abbr = {
    "EnglishPrompt": "En",
    "KoreanPrompt": "Ko",
    "GermanPrompt": "De"
}

# Create subplots: one row per language-pair group, one column per model.
# The grid shape has to agree with the axes[j, i] indexing used below;
# squeeze=False keeps `axes` two-dimensional even for a single row or column.
fig, axes = plt.subplots(len(pair_groups), len(models), figsize=(20, 15), sharex=True, sharey=True, squeeze=False)
# fig.suptitle("PatchScope Retrieval Rate per Layer (Models & Language Pairs)", fontsize=20)

# Parsed MUSE dictionaries keyed by (src, tgt). They do not depend on the
# model or the prompt language, so each file is read and parsed only once.
reference_cache = {}

for i, model in enumerate(models):
    model_short = model.split("/")[-1]
    for j, pair_group in enumerate(pair_groups):
        ax = axes[j, i]
        for src, tgt in pair_group:
            prompt_versions = [f"{src}Prompt", f"{tgt}Prompt"]
            for prompt_version in prompt_versions:
                try:
                    # Load dataset (source word -> list of accepted translations)
                    if (src, tgt) not in reference_cache:
                        data_df = pd.read_csv(f"/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/{src}_{tgt}_1000.csv")
                        data_df[tgt] = data_df[tgt].apply(literal_eval)
                        reference_cache[(src, tgt)] = dict(zip(data_df[src], data_df[tgt]))
                    data_dict = reference_cache[(src, tgt)]

                    # Load output
                    output_path = f"/home/hyujang/multilingual-inner-lexicon/output/RQ2/PatchScope/num_token_20/{model_short}_{src}_to_{tgt}_{prompt_version}_withOriginalCode.csv"
                    output_df = pd.read_csv(output_path)
                    output_df["patchscope_result"] = output_df["patchscope_result"].str.replace("\n", " ")

                    # Compute retrieval rate: a row is a hit when any accepted
                    # translation occurs verbatim in the decoded text.
                    output_df['retrieved'] = output_df.apply(
                        lambda row: any(value in str(row['patchscope_result']) for value in data_dict.get(row['word'], [])),
                        axis=1
                    )
                    retrieval_rate_by_layer = output_df.groupby('layer')['retrieved'].mean()

                    # Plot retrieval rate
                    ax.plot(
                        retrieval_rate_by_layer.index,
                        retrieval_rate_by_layer.values,
                        marker='o',
                        label=f"{lang_abbr[src]}→{lang_abbr[tgt]} ({promptversion_abbr[prompt_version]})",
                        alpha=0.7
                    )
                except FileNotFoundError:
                    print(f"Data file for {model_short} ({src}-{tgt}, {prompt_version}) not found. Skipping...")

        # Set subplot title and labels
        ax.set_title(f"{model_short}: {pair_group[0][0]}↔{pair_group[0][1]}", fontsize=20)
        if j == len(pair_groups) - 1:  # bottom row of the grid
            ax.set_xlabel("Layer", fontsize=16)
        if i == 0:  # leftmost column of the grid
            ax.set_ylabel("Translation Success Rate", fontsize=16)
        ax.grid(True)
        ax.tick_params(axis='both', which='major', labelsize=14)
        # Skip the legend when every file for this subplot was missing.
        if ax.get_legend_handles_labels()[0]:
            ax.legend(title="Lang Pair (Prompt Lang)", fontsize=14)

# Adjust layout and show plot
plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Leaves a top margin for the (currently disabled) suptitle
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval

# Define models, languages, and prompt versions
models = ["Tower-Babel/Babel-9B-Chat", "google/gemma-3-12b-it", "meta-llama/Llama-2-7b-chat-hf"]
languages = ["English", "Korean", "German"]
language_pairs = [(src, tgt) for src in languages for tgt in languages if src != tgt]

# Create subplots: one row per model, one column per directed language pair.
# squeeze=False keeps `axes` two-dimensional even for a single row or column.
fig, axes = plt.subplots(len(models), len(language_pairs), figsize=(25, 15), sharex=True, sharey=True, squeeze=False)
fig.suptitle("PatchScope Retrieval Rate per Layer (Models & Language Pairs)", fontsize=20)

# Parsed MUSE dictionaries keyed by (src, tgt). They do not depend on the
# model or the prompt language, so each file is read and parsed only once.
reference_cache = {}

for i, model in enumerate(models):
    model_short = model.split("/")[-1]
    for j, (src, tgt) in enumerate(language_pairs):
        ax = axes[i, j]
        prompt_versions = [f"{src}Prompt", f"{tgt}Prompt"]
        for prompt_version in prompt_versions:
            try:
                # Load dataset (source word -> list of accepted translations)
                if (src, tgt) not in reference_cache:
                    data_df = pd.read_csv(f"/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/{src}_{tgt}_1000.csv")
                    data_df[tgt] = data_df[tgt].apply(literal_eval)
                    reference_cache[(src, tgt)] = dict(zip(data_df[src], data_df[tgt]))
                data_dict = reference_cache[(src, tgt)]

                # Load output
                output_path = f"/home/hyujang/multilingual-inner-lexicon/output/RQ2/PatchScope/num_token_20/{model_short}_{src}_to_{tgt}_{prompt_version}_withOriginalCode.csv"
                output_df = pd.read_csv(output_path)
                output_df["patchscope_result"] = output_df["patchscope_result"].str.replace("\n", " ")

                # Apply matching: normalized substring match against every
                # accepted translation of the row's source word.
                output_df['retrieved'] = output_df.apply(
                    lambda row: is_match(row['patchscope_result'], data_dict.get(row['word'], []), lang=tgt),
                    axis=1
                )

                # Compute retrieval rate
                retrieval_rate_by_layer = output_df.groupby('layer')['retrieved'].mean()

                # Plot retrieval rate
                ax.plot(
                    retrieval_rate_by_layer.index,
                    retrieval_rate_by_layer.values,
                    marker='o',
                    label=f"{prompt_version}",
                    alpha=0.5,
                )
            except FileNotFoundError:
                print(f"Data file for {model_short} ({src}-{tgt}, {prompt_version}) not found. Skipping...")

        # Set subplot title and labels
        ax.set_title(f"{model_short}: {src}-{tgt}")
        if i == len(models) - 1:  # bottom row of the grid
            ax.set_xlabel("Layer")
        if j == 0:  # leftmost column of the grid
            ax.set_ylabel("Translation Success Rate")
        ax.grid(True)
        # Skip the legend when every file for this subplot was missing.
        if ax.get_legend_handles_labels()[0]:
            ax.legend(title="Prompt Version")

# Adjust layout and show plot
plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust layout for title
plt.show()

In [None]:
# models = ["Tower-Babel/Babel-9B-Chat", "google/gemma-3-12b-it", "meta-llama/Llama-2-7b-chat-hf"]

# Inspect one configuration in detail: a single model, language pair and prompt.
src, tgt = "Korean", "English"
prompt_version = "EnglishPrompt"
model_short = "google/gemma-3-12b-it".split("/")[-1]

# Gold dictionary: source word -> list of accepted target-language words.
data_df = pd.read_csv(f"/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/{src}_{tgt}_1000.csv")
data_df[tgt] = data_df[tgt].apply(literal_eval)
data_dict = {word: translations for word, translations in zip(data_df[src], data_df[tgt])}

# Load output and flatten multi-line generations onto one line.
output_path = f"/home/hyujang/multilingual-inner-lexicon/output/RQ2/PatchScope/num_token_20/{model_short}_{src}_to_{tgt}_{prompt_version}_withOriginalCode.csv"
output_df = pd.read_csv(output_path)
output_df["patchscope_result"] = output_df["patchscope_result"].str.replace("\n", " ")


def _row_is_retrieved(row):
    """Check one output row against the accepted translations of its word."""
    accepted = data_dict.get(row['word'], [])
    return is_match(row['patchscope_result'], accepted, lang=tgt)


# Per-row hit flag, plus the accepted translations for manual inspection.
output_df['retrieved'] = output_df.apply(_row_is_retrieved, axis=1)
output_df["target"] = output_df["word"].apply(lambda word: data_dict.get(word, []))



In [None]:
# Ten most frequent decoded strings in output_df, most common first.
answer_counts = output_df["patchscope_result"].value_counts()
answer_counts.sort_values(ascending=False).head(10)

In [None]:
# Second run for comparison: reuses model_short, src, tgt and data_dict from
# the session and only switches the prompt language.
prompt_version = "KoreanPrompt"
output_path = f"/home/hyujang/multilingual-inner-lexicon/output/RQ2/PatchScope/num_token_20/{model_short}_{src}_to_{tgt}_{prompt_version}_withOriginalCode.csv"
output_df_2 = pd.read_csv(output_path)
output_df_2["patchscope_result"] = output_df_2["patchscope_result"].str.replace("\n", " ")


def _row_is_retrieved_2(row):
    """Check one output row against the accepted translations of its word."""
    accepted = data_dict.get(row['word'], [])
    return is_match(row['patchscope_result'], accepted, lang=tgt)


# Per-row hit flag, plus the accepted translations for manual inspection.
output_df_2['retrieved'] = output_df_2.apply(_row_is_retrieved_2, axis=1)
output_df_2["target"] = output_df_2["word"].apply(lambda word: data_dict.get(word, []))



In [None]:
# Ten most frequent decoded strings in output_df_2, most common first.
answer_counts_2 = output_df_2["patchscope_result"].value_counts()
answer_counts_2.sort_values(ascending=False).head(10)

In [None]:
# Place both runs next to each other (column-wise) and write them to Excel.
# NOTE(review): the file is called "not_retrieved" but no filtering on the
# `retrieved` column happens here - confirm that exporting all rows is intended.
side_by_side = pd.concat([output_df, output_df_2], axis=1)
side_by_side.to_excel(
    "/home/hyujang/multilingual-inner-lexicon/output/RQ2/PatchScope/num_token_20/not_retrieved.xlsx",
    index=False,
)

In [None]:
# Save only the rows for which no accepted translation was found.
missed_rows = output_df.loc[~output_df["retrieved"]]
missed_rows.to_csv(
    "/home/hyujang/multilingual-inner-lexicon/output/RQ2/PatchScope/num_token_20/not_retrieved.csv",
    index=False,
)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval

# Models and every ordered (source, target) pair of distinct languages.
models = ["Tower-Babel/Babel-9B-Chat", "google/gemma-3-12b-it", "meta-llama/Llama-2-7b-chat-hf"]
languages = ["English", "Korean", "German"]
language_pairs = [(first, second) for first in languages for second in languages if first != second]

# Reporting knobs: frequency threshold and size of the "top answers" bucket.
n = 100
top_k = 10

# Create subplots
# NOTE(review): nothing in this cell draws on these axes; the statistics are
# only printed. Kept so the cell behaves exactly as before.
fig, axes = plt.subplots(len(models), len(language_pairs), figsize=(25, 15), sharex=True, sharey=True)
fig.suptitle("PatchScope Retrieval Rate per Layer (Models & Language Pairs)", fontsize=20)


def _row_is_match(row):
    """Check one output row against the accepted translations of its word."""
    return is_match(row['patchscope_result'], data_dict.get(row['word'], []), lang=tgt)


for i, model in enumerate(models):
    model_short = model.split("/")[-1]
    for j, (src, tgt) in enumerate(language_pairs):
        ax = axes[i, j]
        prompt_versions = [f"{src}Prompt", f"{tgt}Prompt"]
        for prompt_version in prompt_versions:
            # Gold dictionary: source word -> list of accepted translations.
            data_df = pd.read_csv(f"/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/{src}_{tgt}_1000.csv")
            data_df[tgt] = data_df[tgt].apply(literal_eval)
            data_dict = dict(zip(data_df[src], data_df[tgt]))

            # Model output, with multi-line generations flattened.
            output_path = f"/home/hyujang/multilingual-inner-lexicon/output/RQ2/PatchScope/num_token_20/{model_short}_{src}_to_{tgt}_{prompt_version}_withOriginalCode.csv"
            output_df = pd.read_csv(output_path)
            output_df["patchscope_result"] = output_df["patchscope_result"].str.replace("\n", " ")

            # Per-row hit flag (not used by the statistics printed below).
            output_df['retrieved'] = output_df.apply(_row_is_match, axis=1)

            print(f"Processing {model_short} ({src}-{tgt}, {prompt_version})...")

            # How often each distinct decoded string occurs.
            value_counts = output_df["patchscope_result"].value_counts()

            count_over_n = value_counts.gt(n).sum()
            print(f"→ {count_over_n} unique answers occurred more than {n} times.")

            total = len(output_df)
            unique = output_df["patchscope_result"].nunique()
            duplicate_ratio = 1 - unique / total
            print(f"→ {duplicate_ratio:.2%} of outputs are duplicates.")

            top_k_total = value_counts.iloc[:top_k].sum()
            print(f"→ Top {top_k} answers account for {top_k_total / total:.2%} of all outputs.")

            print()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval
from scipy.stats import entropy

# Models and every ordered (source, target) pair of distinct languages.
models = ["Tower-Babel/Babel-9B-Chat", "google/gemma-3-12b-it", "meta-llama/Llama-2-7b-chat-hf"]
languages = ["English", "Korean", "German"]
language_pairs = [(first, second) for first in languages for second in languages if first != second]

# One dict per (model, pair, prompt) is collected here and turned into a table.
summary_rows = []
TOP_K = 10
THRESHOLD = 100

# Create subplots
fig, axes = plt.subplots(len(models), len(language_pairs), figsize=(25, 15), sharex=True, sharey=True)
fig.suptitle("PatchScope Retrieval Rate per Layer (Models & Language Pairs)", fontsize=20)

for i, model in enumerate(models):
    model_short = model.split("/")[-1]
    for j, (src, tgt) in enumerate(language_pairs):
        ax = axes[i, j]
        prompt_versions = [f"{src}Prompt", f"{tgt}Prompt"]

        for prompt_version in prompt_versions:
            try:
                # Gold dictionary: source word -> list of accepted translations.
                data_df = pd.read_csv(f"/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/{src}_{tgt}_1000.csv")
                data_df[tgt] = data_df[tgt].apply(literal_eval)
                data_dict = dict(zip(data_df[src], data_df[tgt]))

                # Model output, with multi-line generations flattened.
                output_path = f"/home/hyujang/multilingual-inner-lexicon/output/RQ2/PatchScope/num_token_20/{model_short}_{src}_to_{tgt}_{prompt_version}_withOriginalCode.csv"
                output_df = pd.read_csv(output_path)
                output_df["patchscope_result"] = output_df["patchscope_result"].str.replace("\n", " ")

                # Repetition statistics over the distinct decoded strings.
                value_counts = output_df["patchscope_result"].value_counts()
                total = len(output_df)
                unique = value_counts.count()
                duplicate_ratio = 1 - unique / total
                count_over_threshold = value_counts.gt(THRESHOLD).sum()
                top_k_total = value_counts.iloc[:TOP_K].sum()
                top_k_ratio = top_k_total / total
                # Empirical distribution of the outputs and its entropy in bits.
                probabilities = value_counts / value_counts.sum()
                ent = entropy(probabilities, base=2)

                row_summary = {
                    "Model": model_short,
                    "Source": src,
                    "Target": tgt,
                    "Prompt": prompt_version,
                    "Total Outputs": total,
                    "Unique Outputs": unique,
                    "Duplicate Ratio": f"{duplicate_ratio:.2%}",
                    f"# Outputs >{THRESHOLD}": count_over_threshold,
                    f"Top {TOP_K} %": f"{top_k_ratio:.2%}",
                    "Entropy (bits)": round(ent, 2),
                }
                summary_rows.append(row_summary)

                # Placeholder: every row is marked as not retrieved, so the
                # curve drawn below is flat at zero unless real matching
                # logic is substituted here.
                output_df['retrieved'] = False

                # Plot the per-layer mean of the `retrieved` flag.
                if 'layer' in output_df.columns:
                    retrieval_rate_by_layer = output_df.groupby('layer')['retrieved'].mean()
                    ax.plot(
                        retrieval_rate_by_layer.index,
                        retrieval_rate_by_layer.values,
                        marker='o',
                        label=f"{prompt_version}",
                        alpha=0.5,
                    )

            except FileNotFoundError:
                print(f"File not found for {model_short} ({src}-{tgt}, {prompt_version})")

        ax.set_title(f"{model_short}: {src}-{tgt}")
        if i == len(models) - 1:
            ax.set_xlabel("Layer")
        if j == 0:
            ax.set_ylabel("Translation Success Rate")
        ax.grid(True)
        ax.legend(title="Prompt Version")

# Plot formatting
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

# Build the summary table, ordered by model, language pair and prompt.
summary_df = pd.DataFrame(summary_rows).sort_values(by=["Model", "Source", "Target", "Prompt"])
print(summary_df)

# Save to CSV
# summary_df.to_csv("output_repetition_summary.csv", index=False)


In [None]:
# Write the repetition summary to a CSV file (path is relative to the
# current working directory); the row index is not written.
summary_csv_name = "output_repetition_summary.csv"
summary_df.to_csv(summary_csv_name, index=False)

## visualize all for a single language pair

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval

SOURCE_LANGUAGE = "Korean"
TARGET_LANGUAGE = "English"

# Models to compare and the two prompt languages tried for this pair.
models = ["Tower-Babel/Babel-9B-Chat", "google/gemma-3-12b-it", "meta-llama/Llama-2-7b-chat-hf"]
prompt_versions = [f"{SOURCE_LANGUAGE}Prompt", f"{TARGET_LANGUAGE}Prompt"]

# Colour encodes the model, line style encodes the prompt language.
model_colors = {
    "Babel-9B-Chat": "#66c2a5",
    "gemma-3-12b-it": "#fc8d62",
    "Llama-2-7b-chat-hf": "#e78ac3"
}
prompt_styles = {
    f"{SOURCE_LANGUAGE}Prompt": "--",
    f"{TARGET_LANGUAGE}Prompt": "-"
}


def _has_gold_translation(row):
    """True when any accepted translation occurs verbatim in the decoded text."""
    decoded = str(row['patchscope_result'])
    return any(value in decoded for value in data_dict.get(row['word'], []))


plt.figure(figsize=(10, 6))

# One curve per (model, prompt version); missing files are reported and skipped.
for model in models:
    model_short = model.split("/")[-1]
    for prompt_version in prompt_versions:
        try:
            # Gold dictionary: source word -> list of accepted translations.
            data_df = pd.read_csv(f"/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/{SOURCE_LANGUAGE}_{TARGET_LANGUAGE}_1000.csv")
            data_df[TARGET_LANGUAGE] = data_df[TARGET_LANGUAGE].apply(literal_eval)
            data_dict = dict(zip(data_df[SOURCE_LANGUAGE], data_df[TARGET_LANGUAGE]))

            # Model output, with multi-line generations flattened.
            output_df = pd.read_csv(f"/home/hyujang/multilingual-inner-lexicon/output/RQ2/PatchScope/{model_short}_{SOURCE_LANGUAGE}_to_{TARGET_LANGUAGE}_{prompt_version}.csv")
            output_df["patchscope_result"] = output_df["patchscope_result"].str.replace("\n"," ")

            output_df['retrieved'] = output_df.apply(_has_gold_translation, axis=1)

            # Share of hits per layer.
            retrieval_rate_by_layer = output_df.groupby('layer')['retrieved'].mean()

            curve_color = model_colors.get(model_short, "black")
            curve_style = prompt_styles.get(prompt_version, "-")
            plt.plot(
                retrieval_rate_by_layer.index,
                retrieval_rate_by_layer.values,
                marker='o',
                label=f"{model_short} ({prompt_version})",
                color=curve_color,
                linestyle=curve_style,
            )
        except FileNotFoundError:
            print(f"Data file for {model_short} with {prompt_version} not found. Skipping...")

# Add labels, title, and legend
plt.xlabel("Layer")
plt.ylabel("Translation Success Rate")
plt.title("PatchScope Retrieval Rate per Layer (Models & Prompt Versions)")
plt.grid(True)
plt.legend(title="Model & Prompt Version")
plt.tight_layout()
plt.show()

## visualize a single result

In [None]:
def _has_gold_translation(row):
    """True when any accepted translation occurs verbatim in the decoded text."""
    decoded = str(row['patchscope_result'])
    return any(value in decoded for value in data_dict.get(row['word'], []))


# Flag each row of the current output_df using the current data_dict.
output_df['retrieved'] = output_df.apply(_has_gold_translation, axis=1)

In [None]:
# Load the KoreanPrompt result file for the currently selected model and
# language pair, then list how often each decoded string occurs.
result_csv = f"/home/hyujang/multilingual-inner-lexicon/output/RQ2/PatchScope/{MODEL_NAME}_{SOURCE_LANGUAGE}_to_{TARGET_LANGUAGE}_KoreanPrompt_withOriginalCode.csv"
output_df = pd.read_csv(result_csv)
output_df["patchscope_result"].value_counts()

In [None]:
# Display the current contents of output_df.
output_df

In [None]:
# Decoded strings of the rows flagged as retrieved.
output_df.loc[output_df["retrieved"], "patchscope_result"]

In [None]:
import pandas as pd
from ast import literal_eval

SOURCE_LANGUAGE = "English"
TARGET_LANGUAGE = "German"
# Other models that can be selected here:
#   "meta-llama/Llama-2-7b-chat-hf", "Tower-Babel/Babel-9B-Chat"
# Only the part after the organisation prefix appears in the file names.
MODEL_NAME = "google/gemma-3-12b-it".split("/")[-1]

# Gold dictionary: source word -> list of accepted translations.
data_df = pd.read_csv(f"/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/{SOURCE_LANGUAGE}_{TARGET_LANGUAGE}_1000.csv")
data_df[TARGET_LANGUAGE] = data_df[TARGET_LANGUAGE].apply(literal_eval)
data_dict = dict(zip(data_df[SOURCE_LANGUAGE], data_df[TARGET_LANGUAGE]))

# Model output for the English prompt, with multi-line generations flattened.
output_df = pd.read_csv(f"/home/hyujang/multilingual-inner-lexicon/output/RQ2/PatchScope/{MODEL_NAME}_{SOURCE_LANGUAGE}_to_{TARGET_LANGUAGE}_EnglishPrompt_withOriginalCode.csv")
output_df["patchscope_result"] = output_df["patchscope_result"].str.replace("\n"," ")


def _has_gold_translation(row):
    """True when any accepted translation occurs verbatim in the decoded text."""
    decoded = str(row['patchscope_result'])
    return any(value in decoded for value in data_dict.get(row['word'], []))


output_df['retrieved'] = output_df.apply(_has_gold_translation, axis=1)

# Overall share of hits across all rows.
retrieval_rate = output_df['retrieved'].mean()

# Distinct source words that were retrieved at least once.
en_prompt_true_list = output_df.loc[output_df['retrieved'], "word"].unique()
print(en_prompt_true_list)

import matplotlib.pyplot as plt
# Share of hits per layer.
retrieval_rate_by_layer = output_df.groupby('layer')['retrieved'].mean()

# Plot the retrieval rate
plt.figure(figsize=(8, 5))
plt.plot(
    retrieval_rate_by_layer.index,
    retrieval_rate_by_layer.values,
    marker='o',
    linestyle='-',
    color='b',
)
plt.xlabel("Layer")
plt.ylabel("Translation Success Rate")
plt.title(f"PatchScope Retrieval Rate per Layer ({MODEL_NAME}, {SOURCE_LANGUAGE} to {TARGET_LANGUAGE})")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
from ast import literal_eval

SOURCE_LANGUAGE = "Korean"
TARGET_LANGUAGE = "English"
# Other models that can be selected here:
#   "meta-llama/Llama-2-7b-chat-hf", "Tower-Babel/Babel-9B-Chat"
# Only the part after the organisation prefix appears in the file names.
MODEL_NAME = "google/gemma-3-12b-it".split("/")[-1]

# Gold dictionary: source word -> list of accepted translations.
data_df = pd.read_csv(f"/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/{SOURCE_LANGUAGE}_{TARGET_LANGUAGE}_1000.csv")
data_df[TARGET_LANGUAGE] = data_df[TARGET_LANGUAGE].apply(literal_eval)
data_dict = dict(zip(data_df[SOURCE_LANGUAGE], data_df[TARGET_LANGUAGE]))

# Model output for the Korean prompt (v2 file), with multi-line generations flattened.
output_df = pd.read_csv(f"/home/hyujang/multilingual-inner-lexicon/output/RQ2/PatchScope/{MODEL_NAME}_{SOURCE_LANGUAGE}_to_{TARGET_LANGUAGE}_KoreanPrompt_withOriginalCode_v2.csv")
output_df["patchscope_result"] = output_df["patchscope_result"].str.replace("\n"," ")


def _has_gold_translation(row):
    """True when any accepted translation occurs verbatim in the decoded text."""
    decoded = str(row['patchscope_result'])
    return any(value in decoded for value in data_dict.get(row['word'], []))


output_df['retrieved'] = output_df.apply(_has_gold_translation, axis=1)

# Overall share of hits across all rows.
retrieval_rate = output_df['retrieved'].mean()

# Distinct source words that were retrieved at least once.
# NOTE(review): the variable is named en_prompt_* although this cell loads the
# KoreanPrompt file - confirm whether the name is a leftover.
en_prompt_true_list = output_df.loc[output_df['retrieved'], "word"].unique()
print(en_prompt_true_list)

import matplotlib.pyplot as plt
# Share of hits per layer.
retrieval_rate_by_layer = output_df.groupby('layer')['retrieved'].mean()

# Plot the retrieval rate
plt.figure(figsize=(8, 5))
plt.plot(
    retrieval_rate_by_layer.index,
    retrieval_rate_by_layer.values,
    marker='o',
    linestyle='-',
    color='b',
)
plt.xlabel("Layer")
plt.ylabel("Translation Success Rate")
plt.title(f"PatchScope Retrieval Rate per Layer ({MODEL_NAME}, {SOURCE_LANGUAGE} to {TARGET_LANGUAGE})")
plt.grid(True)
plt.tight_layout()
plt.show()

## visualize specific files

In [None]:
MODEL_NAME = "gemma-3-12b-it"

# Result files to overlay in one figure. SOURCE_LANGUAGE / TARGET_LANGUAGE are
# taken from the session (set by an earlier cell).
output_paths = [f"/home/hyujang/multilingual-inner-lexicon/output/RQ2/PatchScope/{MODEL_NAME}_{SOURCE_LANGUAGE}_to_{TARGET_LANGUAGE}_KoreanPrompt.csv",
                f"/home/hyujang/multilingual-inner-lexicon/output/RQ2/PatchScope/{MODEL_NAME}_{SOURCE_LANGUAGE}_to_{TARGET_LANGUAGE}_KoreanPrompt_v2.csv",
                # f"/home/hyujang/multilingual-inner-lexicon/output/RQ2/PatchScope/{MODEL_NAME}_{SOURCE_LANGUAGE}_to_{TARGET_LANGUAGE}_v3.csv"
                ]

# Gold dictionary: source word -> list of accepted translations. It is the
# same for every result file, so it is loaded once before the loop.
# The file name uses "_" between the languages, as in every other cell of
# this notebook (this cell previously used "-").
data_df = pd.read_csv(f"/home/hyujang/multilingual-inner-lexicon/data/RQ2/MUSE/{SOURCE_LANGUAGE}_{TARGET_LANGUAGE}_1000.csv")
data_df[f"{TARGET_LANGUAGE}"] = data_df[f"{TARGET_LANGUAGE}"].apply(literal_eval)
data_dict = dict(zip(data_df[f"{SOURCE_LANGUAGE}"], data_df[f"{TARGET_LANGUAGE}"]))

plt.figure(figsize=(10, 6))

for output_path in output_paths:
    output_df = pd.read_csv(output_path)
    output_df["patchscope_result"] = output_df["patchscope_result"].str.replace("\n"," ")
    # A row is a hit when any accepted translation occurs verbatim in the
    # decoded text.
    output_df['retrieved'] = output_df.apply(
        lambda row: any(value in str(row['patchscope_result']) for value in data_dict.get(row['word'], [])),
        axis=1
    )

    retrieval_rate_by_layer = output_df.groupby('layer')['retrieved'].mean()

    # Use the file name without ".csv" as the legend label. It is built
    # outside an f-string because reusing the enclosing quote character inside
    # an f-string is a SyntaxError before Python 3.12.
    run_label = output_path.split('/')[-1].replace(".csv", "")
    plt.plot(
        retrieval_rate_by_layer.index,
        retrieval_rate_by_layer.values,
        marker='o',
        label=run_label,
    )


plt.xlabel("Layer")
plt.ylabel("Translation Success Rate")
plt.title(f"PatchScope Retrieval Rate per Layer ({MODEL_NAME}, {SOURCE_LANGUAGE} to {TARGET_LANGUAGE})")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()