In [None]:
%load_ext autoreload
%autoreload 2

import os, json
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

In [2]:
# Intervention variants; each one has its own results file per label and model.
intervention_methods = ["ablation", "actadd"]

# Display name -> run directory holding that model's evaluation outputs.
model_dirs = {
    "Qwen 1.8B": "runs/Qwen-1_8B-chat",
    "Qwen 7B": "runs/Qwen-7B-chat",
    "Gemma 2B": "runs/gemma-2b-it",
    "Gemma 7B": "runs/gemma-7b-it",
    "Yi 6B": "runs/yi-6b-chat",
    "Llama2 7B": "runs/Llama-2-7b-chat-hf",
    "Llama2 13B": "runs/Llama-2-13b-chat-hf",
    "Llama3 8B": "runs/Meta-Llama-3-8B-Instruct",
}

# Marker colour per instruction label.
label_colors = dict(
    overall='#7f7f7f',
    harmful='#ff7f0e',
    harmless='#1f77b4',
    unsafe="#d62728",
    safe="#2ca02c",
)

# Marker / bar colour per intervention (including the un-intervened baseline).
intervention_colors = dict(
    baseline='#1f77b4',
    actadd='#2ca02c',
    ablation='#d62728',
)

In [5]:
def _load_json(path):
    """Parse the JSON file at ``path``, closing the file handle afterwards."""
    with open(path) as fh:
        return json.load(fh)


def load_results(labels, filename_prefixes, results_dir="magnitude_eval"):
    """Collect per-model evaluation results into two long-format DataFrames.

    For every model in ``model_dirs`` and every ``(label, prefix)`` pair, reads
    ``<model_dir>/<results_dir>/<prefix>_<intervention>.json`` for each entry of
    ``intervention_methods`` and ``<model_dir>/<results_dir>/<prefix>_baseline.json``.
    Each file is expected to hold a list of records; a record's position in the
    list becomes its ``instruction_id``.

    Args:
        labels: Label names to store in the ``label`` column.
        filename_prefixes: File-name prefixes, parallel to ``labels``.
        results_dir: Sub-directory of each model directory that holds the files.

    Returns:
        ``(df, baseline_df)`` where ``df`` has the columns ``model``,
        ``instruction_id``, ``label``, ``intervention``, ``refusal_score`` and
        ``baseline_df`` has ``model``, ``instruction_id``, ``label``,
        ``projection_ratio``, ``refusal_score``.

    Raises:
        ValueError: If ``labels`` and ``filename_prefixes`` differ in length.
    """
    if len(labels) != len(filename_prefixes):
        raise ValueError(
            f"labels ({len(labels)}) and filename_prefixes ({len(filename_prefixes)}) must have the same length"
        )

    # Rows are accumulated in plain lists and turned into DataFrames once at the
    # end; appending to a DataFrame row by row copies the frame on every insert.
    intervention_rows = []
    baseline_rows = []
    for model, _dir in model_dirs.items():
        for label, prefix in zip(labels, filename_prefixes):
            for intervention in intervention_methods:
                results = _load_json(os.path.join(_dir, f"{results_dir}/{prefix}_{intervention}.json"))
                intervention_rows.extend(
                    [model, j, label, intervention, x["refusal_score"]] for j, x in enumerate(results)
                )

            results = _load_json(os.path.join(_dir, f"{results_dir}/{prefix}_baseline.json"))
            baseline_rows.extend(
                [model, j, label, x["projection_ratio"], x["refusal_score"]] for j, x in enumerate(results)
            )

    df = pd.DataFrame(
        intervention_rows, columns=["model", "instruction_id", "label", "intervention", "refusal_score"]
    )
    baseline_df = pd.DataFrame(
        baseline_rows, columns=["model", "instruction_id", "label", "projection_ratio", "refusal_score"]
    )
    return df, baseline_df

In [6]:
# Harmful / harmless test-set results (intervened and baseline).
harm_df, harm_baseline_df = load_results(
    labels=["harmful", "harmless"],
    filename_prefixes=["harmful_test", "harmless_test"],
)
# XSTest unsafe / safe results (intervened and baseline).
xstest_df, xstest_baseline_df = load_results(
    labels=["unsafe", "safe"],
    filename_prefixes=["xstest_unsafe", "xstest_safe"],
)

In [7]:
def plot_proj_ratio(baseline_df, model, labels, width=500, height=300):
    """Scatter baseline refusal score against projection ratio for one model.

    Draws one marker trace per entry of ``labels`` (coloured via
    ``label_colors``) from the rows of ``baseline_df`` belonging to ``model``,
    then displays the figure. Nothing is returned.
    """
    model_rows = baseline_df[baseline_df.model == model].sort_values("instruction_id")

    # Frame, grid and zero-line styling applied identically to both axes.
    axis_style = dict(
        title_font=dict(size=15), title_standoff=5, zeroline=True, zerolinecolor='darkgrey',
        mirror=True, showgrid=True, gridcolor='darkgrey', showline=True, linewidth=1, linecolor='darkgrey',
    )

    fig = go.Figure()
    for label in labels:
        label_rows = model_rows[model_rows["label"] == label]
        trace = go.Scatter(
            x=label_rows["projection_ratio"].tolist(),
            y=label_rows["refusal_score"].tolist(),
            name=label,
            mode="markers",
            marker_color=label_colors[label],
            marker_size=7,
        )
        fig.add_trace(trace)

    fig.update_layout(
        width=width,
        height=height,
        title=f"Model: {model}",
        margin=dict(l=20, r=10, t=30, b=20),
        font=dict(size=15),
        legend_title_font=dict(size=14),
        title_font=dict(size=14),
        plot_bgcolor='white',
    )
    fig.update_yaxes(title_text="Refusal Score", **axis_style)
    fig.update_xaxes(title_text="Projection Ratio", **axis_style)
    fig.update_annotations(font_size=14)
    fig.show()


def plot_intervention(baseline_df, df, model, labels, width=900, height=300, marker_size=5.5):
    """Compare baseline and intervened refusal scores for one model.

    Draws a 1x2 grid with one panel per entry of ``labels`` (exactly two are
    expected, since the subplot titles use ``labels[0]`` and ``labels[1]``).
    Each panel shows the baseline refusal score and the refusal score under one
    intervention, both plotted against the baseline projection ratio. Labels in
    the harmless/safe group use the ``"actadd"`` results; every other label
    uses ``"ablation"``. The figure is displayed; nothing is returned.

    Args:
        baseline_df: Baseline rows with ``model``, ``instruction_id``, ``label``,
            ``projection_ratio`` and ``refusal_score`` columns.
        df: Intervened rows with ``model``, ``instruction_id``, ``label``,
            ``intervention`` and ``refusal_score`` columns.
        model: Model name to select in both frames.
        labels: The two label values to plot, left to right.
        width, height: Figure size in pixels.
        marker_size: Marker size used for all traces.

    Raises:
        ValueError: If the baseline and intervened selections for a label have
            different lengths and therefore cannot be paired point by point.
    """
    # Labels plotted with "actadd". "safe" (not "unsafe") belongs here, matching
    # the notebook's label_mapping which groups safe with harmless.
    actadd_labels = ("Harmless/Safe", "harmless", "safe")

    fig = make_subplots(rows=1, cols=2, subplot_titles=[f"{labels[0]} Instructions", f"{labels[1]} Instructions"], shared_yaxes=True)

    for i, label in enumerate(labels):
        intervention = "actadd" if label in actadd_labels else "ablation"

        # Baseline x-values and intervened y-values are paired by position, so
        # both selections must end up in the same order. instruction_id can
        # repeat within a label (e.g. after several datasets are merged under
        # one label), and the default sort does not keep tied rows in their
        # original order; a stable sort does, for both frames alike.
        baseline = baseline_df[
            (baseline_df.model == model) & (baseline_df["label"] == label)
        ].sort_values("instruction_id", kind="stable")
        intervened = df[
            (df.model == model) & (df["label"] == label) & (df["intervention"] == intervention)
        ].sort_values("instruction_id", kind="stable")

        if len(intervened) != len(baseline):
            raise ValueError(
                f"Cannot pair results for model {model!r}, label {label!r}: "
                f"{len(baseline)} baseline rows vs {len(intervened)} {intervention!r} rows"
            )

        projection_ratio = baseline["projection_ratio"].tolist()

        # The baseline legend entry is shown once (first panel only).
        fig.add_trace(go.Scatter(
            x=projection_ratio, y=baseline["refusal_score"].tolist(), name="baseline",
            mode="markers", marker_color=intervention_colors["baseline"], showlegend=(i == 0), marker_size=marker_size
        ), row=1, col=i+1)

        # Each panel's intervention trace always gets its own legend entry.
        fig.add_trace(go.Scatter(
            x=projection_ratio, y=intervened["refusal_score"].tolist(), name=intervention,
            mode="markers", marker_color=intervention_colors[intervention], showlegend=True, marker_size=marker_size
        ), row=1, col=i+1)

    fig.update_layout(
        width=width, height=height, title=f"Model: {model}", title_x=0.45, margin=dict(l=20, r=10, t=40, b=20),
        font=dict(size=14), legend_title_font=dict(size=14), title_font=dict(size=14), plot_bgcolor='white',
    )
    fig.update_yaxes(
        title_text="Refusal Score", title_font=dict(size=15), title_standoff=5, row=1)
    fig.update_yaxes(zeroline=True, zerolinecolor='darkgrey', mirror=True, showline=True, linecolor='darkgrey', linewidth=1, row=1, showticklabels=True)
    fig.update_xaxes(
        title_text="Projection Ratio", title_font=dict(size=15), title_standoff=5, row=1,
        zeroline=True, zerolinecolor='darkgrey', mirror=True, showline=True, linewidth=1, linecolor='darkgrey'
    )
    fig.update_annotations(font_size=14)
    fig.show()

## Projection Ratio vs. Refusal Score Without Intervention

In [None]:
# Stack the harmful/harmless and XSTest baselines, keeping the original
# four labels, and draw one scatter plot per model.
baseline_df = pd.concat([harm_baseline_df, xstest_baseline_df])
for model in model_dirs.keys():
    plot_proj_ratio(
        baseline_df,
        model,
        labels=["harmful", "harmless", "safe", "unsafe"],
        width=550,
        height=350,
    )

## Intervention Results

In [8]:
# Collapse the four dataset labels into two groups so that the harm test set
# and XSTest share a panel in the intervention plots.
label_mapping = {
    "safe": "Harmless/Safe",
    "harmless": "Harmless/Safe",
    "harmful": "Harmful/Unsafe",
    "unsafe": "Harmful/Unsafe",
}
baseline_df = pd.concat([harm_baseline_df, xstest_baseline_df]).assign(
    label=lambda frame: frame["label"].map(label_mapping)
)
intervention_df = pd.concat([harm_df, xstest_df]).assign(
    label=lambda frame: frame["label"].map(label_mapping)
)
for model in model_dirs.keys():
    plot_intervention(
        baseline_df,
        intervention_df,
        model,
        labels=["Harmful/Unsafe", "Harmless/Safe"],
        width=900,
        height=300,
    )

python(75917) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


## Model Refusal Rate on XSTest

In [None]:
# Combine the XSTest baseline rows (tagged intervention="baseline") with the
# intervened rows so that all three conditions can be grouped together.
temp = xstest_baseline_df[["model", "instruction_id", "refusal_score", "label"]].copy()
temp["intervention"] = "baseline"
temp = pd.concat((temp, xstest_df))

# A row counts as a refusal when its refusal score is positive. The rate is
# expressed as a percentage (0-100) so that the values agree with the
# "Score (%)" column name and the "Refusal Rate (%)" axis title.
# .to_numpy() sidesteps index alignment: the concatenated index has duplicates.
temp = (
    temp.assign(refused=(temp["refusal_score"] > 0).to_numpy())
    .groupby(["intervention", "label", "model"])["refused"]
    .mean()
    .mul(100)
    .to_frame("Score (%)")
    .reset_index()
)

fig = px.bar(temp, x="model", y="Score (%)", color="intervention", pattern_shape="label", width=800, height=300, color_discrete_map=intervention_colors)
fig.update_layout(
    barmode='group',
    plot_bgcolor='white', font=dict(size=14),
    margin=dict(l=20, r=20, t=40, b=20),
    showlegend=True,
    legend_title_text=None,
    bargroupgap=0,
    bargap=0.25
)
fig.update_xaxes(
    mirror=True, zeroline=True, zerolinecolor='darkgrey', tickfont=dict(size=14),
    showline=True, linewidth=1, linecolor='darkgrey',
    title_text=None,
)
fig.update_yaxes(
    mirror=True, zeroline=True, zerolinecolor='darkgrey',
    title_font=dict(size=15), tickfont=dict(size=14),
    showline=True, linewidth=1, linecolor='darkgrey',
    title_text="Refusal Rate (%)",
    range=[0, 100]
)
fig.show()