# ELEPHANT – (clean)

**This notebook was cleaned for sharing:** all cell outputs and execution counts were removed.

## How to upload to GitHub
### Option A — Upload in the GitHub UI (fast)
1. Open the repo: `https://github.com/limorgu/elephant_replication`
2. Click **Add file → Upload files**
3. Drag this `.ipynb` file in
4. Click **Commit changes**

### Option B — Push with git (recommended)
```bash
cd /path/to/elephant_replication
mkdir -p notebooks
mv elephant_10examples_clean.ipynb notebooks/
git add notebooks/elephant_10examples_clean.ipynb
git commit -m "Add cleaned notebook"
git push
```


In [None]:
ls "/content/drive/MyDrive/Colab Notebooks/elephant/"

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Print the current working directory.
pwd

In [None]:
# Recursively copy the "elephant" project folder from Drive into the current working directory.
cp -r "/content/drive/MyDrive/Colab Notebooks/elephant/" .

In [None]:
cd elphant

Run Stage 0 - run the LLMs to create responses

In [None]:
# Run get_responses_models.py with --model openai on the OEQ sample prompts (column "prompt");
# results are written to ./output_data/OEQ_responses_openai.csv.
# NOTE(review): the output column is literally named "get_responses_models.py", which looks like
# a paste slip. The OpenAI scorer cell reads the same name via --response_column, so rename both
# together or leave both alone.
!python get_responses_models.py \
  --input_file ./sample_datasets/OEQ_sample.csv \
  --input_column prompt \
  --output_file ./output_data/OEQ_responses_openai.csv \
  --output_column get_responses_models.py \
  --model openai


In [None]:
# Run get_responses_models.py with --model grock on the OEQ sample prompts; responses go to the
# grock_response column of ./output_data/OEQ_responses_grock2.csv.
!python get_responses_models.py \
--input_file ./sample_datasets/OEQ_sample.csv \
--input_column prompt \
--output_file ./output_data/OEQ_responses_grock2.csv \
--output_column grock_response \
--model grock

In [None]:
# Run get_responses_models.py with --model claude on the OEQ sample prompts; responses go to the
# claud_response column (spelled that way here and in the scorer cell) of
# ./output_data/OEQ_responses_claude.csv.
!python get_responses_models.py \
--input_file ./sample_datasets/OEQ_sample.csv \
--input_column prompt \
--output_file ./output_data/OEQ_responses_claude.csv \
--output_column claud_response \
--model claude

In [None]:
# Same grock run as the earlier grock cell, but writing to ./output_data/OEQ_responses_grock.csv.
# NOTE(review): none of the visible cells reads OEQ_responses_grock.csv (the scorer uses
# OEQ_responses_grock2.csv) — confirm whether this second run is still needed.
!python get_responses_models.py \
--input_file ./sample_datasets/OEQ_sample.csv \
--input_column prompt \
--output_file ./output_data/OEQ_responses_grock.csv \
--output_column grock_response \
--model grock

In [None]:
# Side A: FLIP
# Run get_responses_models.py with --model openai on the AITA-NTA-FLIP sample prompts; responses
# go to the openai_response column of ./output_data/AITA_NTA_FLIP_response.csv.
!python get_responses_models.py \
  --input_file ./sample_datasets/AITA-NTA-FLIP_sample.csv \
  --input_column prompt \
  --output_file ./output_data/AITA_NTA_FLIP_response.csv \
  --output_column openai_response \
  --model openai

In [None]:
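# Side B: Original
# TODO: this cell is an empty placeholder — add the matching get_responses_models.py run for the original (non-flipped) AITA data.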

Stage 1 - Run the scorer

In [None]:
# Run sycophancy_scorers.py on the OpenAI OEQ responses. --response_column repeats the column
# name used as --output_column when those responses were generated.
# presumably --output_column_tag becomes the suffix of the score columns (a later cell reads
# validation_gpt4o / indirectness_gpt4o / framing_gpt4o) — TODO confirm against the script.
!python sycophancy_scorers.py \
  --input_file /content/elephant/output_data/OEQ_responses_openai.csv \
  --prompt_column prompt \
  --response_column get_responses_models.py \
  --output_column_tag gpt4o

In [None]:
# Run sycophancy_scorers.py on the grock OEQ responses (grock_response column of
# OEQ_responses_grock2.csv), tagging the output columns with "grock_score".
!python sycophancy_scorers.py \
  --input_file /content/elephant/output_data/OEQ_responses_grock2.csv \
  --prompt_column prompt \
  --response_column grock_response \
  --output_column_tag grock_score

In [None]:
# Run sycophancy_scorers.py on the claude OEQ responses (claud_response column of
# OEQ_responses_claude.csv), tagging the output columns with "claude_score".
!python sycophancy_scorers.py \
  --input_file /content/elephant/output_data/OEQ_responses_claude.csv \
  --prompt_column prompt \
  --response_column claud_response \
  --output_column_tag claude_score

In [None]:
#Step 1: Compute moral sycophancy for AITA

!python sycophancy_scorers.py \
  --input_file /content/elphant/output_data/AITA_NTA_FLIP_response.csv \
  --prompt_column prompt \
  --response_column openai_response \
  --output_column_tag openai_fliped_score

In [None]:
# score sycophancy
# NOTE(review): ./output_data/OEQ_responses.csv and its gpt_response column are not produced by
# any visible cell, and this run reuses the "gpt4o" tag of the OpenAI scorer cell above —
# confirm whether this is a leftover from an earlier version of the notebook.
!python sycophancy_scorers.py \
  --input_file ./output_data/OEQ_responses.csv \
  --prompt_column prompt \
  --response_column gpt_response \
  --output_column_tag gpt4o


Stage 2 - compare to human

In [None]:
import pandas as pd


In [None]:
# Load the table whose "<metric>_<model>" columns are summarised by the cells below.
# NOTE(review): OEQ_responses_2.csv is not written by any visible cell, and this Drive folder is
# spelled "elphant" while the setup cells use "elephant" — confirm the path.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/elphant/output_data/OEQ_responses_2.csv")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy
import seaborn as sns
# Seaborn theme setup, applied before any plotting.
# NOTE(review): sns.set() runs after set_context("paper") and set_style() is called twice, so
# later calls may override earlier ones — confirm which settings actually take effect before
# simplifying. Keys such as "font.scale" may not be recognised style parameters — TODO verify.
sns.set_context("paper")
sns.set(font_scale = 2.2)
sns.set_style("white", {
    "font.family": "sans-serif",
    "font.serif": ['Helvetica'],
    "font.scale": 2.2
})
sns.set_style("ticks", {"xtick.major.size": 4,
                        "ytick.major.size": 4})

def apply_style(ax, levels=(0.2, 0.4, 0.6, 0.8)):
    """Hide the top and right spines of ``ax`` and draw faint dashed horizontal guide lines.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to restyle in place (anything exposing ``spines`` and ``axhline`` works).
    levels : iterable of float, optional
        y positions of the dashed guide lines. Defaults to 0.2, 0.4, 0.6 and 0.8;
        pass an empty tuple to draw none.

    Returns
    -------
    None
    """
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    for level in levels:
        ax.axhline(y=level, linestyle='--', color='black', linewidth=1, alpha=0.3)

# Extract model and metric info from columns
# Builds one record per "<metric>_<model>" column of df: the mean of its integer-cast numeric
# values and 1.96 * standard error as the 95% CI half-width.
import scipy.stats  # explicit submodule import so scipy.stats.sem does not rely on lazy loading

data = []
for col in df:
    for metric in ['validation', 'indirectness', 'framing']:
        prefix = metric + '_'
        # Prefix match: only "<metric>_<model>" columns give a meaningful model label once the
        # prefix is stripped; a column that merely contains the metric name is ignored.
        if not col.startswith(prefix):
            continue
        values = pd.to_numeric(df[col], errors='coerce').dropna().astype(int).values
        # A column with no numeric values would only contribute NaN statistics, so skip it.
        if len(values) > 0:
            data.append({
                'model': col.replace(prefix, ''),
                'metric': metric,
                'mean': values.mean(),
                'CI': 1.96 * scipy.stats.sem(values),
                'col': col,
            })
        break  # a column belongs to at most one metric

In [None]:
# Turn the list of per-column summary records into a table and display it.
plot_df = pd.DataFrame(data)
plot_df

In [None]:
# Grouped bar chart: one group per metric, one bar per model, error bars from the 'CI' column.
metrics = ['validation', 'indirectness', 'framing']

models = plot_df['model'].unique()
x = np.arange(len(metrics))
width = 0.1

fig, ax = plt.subplots(figsize=(15, 5))

# Plot grouped bars for each model within each metric
for i, model in enumerate(models):
    # Rows for this model, ordered like `metrics` so the bars line up with the tick labels.
    model_df = plot_df[plot_df['model'] == model].set_index('metric').loc[metrics]
    ax.bar(
        x + i * width,  # shift each model's bars sideways within the metric group
        model_df['mean'],
        width,
        yerr=model_df['CI'],
        label=model,
        hatch='\\' if i == 0 else None,  # only the first model's bars are hatched
    )

apply_style(ax)
ax.set_xticks(x + width * (len(models) - 1) / 2)  # centre each tick under its group of bars
ax.set_xticklabels([name.capitalize() for name in metrics])
ax.set_ylabel("Mean Score")
ax.set_title("ELEPHANT Metrics of Social Sycophancy on OEQ")

# Legend anchored by its upper-left corner at (0.97, 1.05) in axes coordinates.
ax.legend(bbox_to_anchor=(0.97, 1.05), loc='upper left', borderaxespad=0, fontsize=20)
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import pandas as pd

# Difference between the 'gpt4o' rows of plot_df and the 'human' baseline, per metric.
# CI columns hold 95% half-widths (CI = 1.96 * SE), so they are converted back to standard
# errors, combined in quadrature, and scaled up again for the difference.
z = 1.96

# Human baseline, renamed so its columns can sit next to the model's own mean/CI.
human = plot_df.query("model == 'human'")[["metric", "mean", "CI"]]
human = human.rename(columns={"mean": "mean_human", "CI": "CI_human"})

# Attach the baseline to every gpt4o row; validate="m:1" rejects duplicate human rows per metric.
tmp = plot_df.query("model == 'gpt4o'")
tmp = tmp.merge(human, on="metric", how="left", validate="m:1")

se_model, se_human = tmp["CI"] / z, tmp["CI_human"] / z
se_diff = np.sqrt(se_model**2 + se_human**2)

tmp = tmp.assign(
    mean_diff=tmp["mean"] - tmp["mean_human"],
    CI_diff=z * se_diff,
)
tmp = tmp.assign(
    lower=tmp["mean_diff"] - tmp["CI_diff"],
    upper=tmp["mean_diff"] + tmp["CI_diff"],
)

# Final table: difference columns first, then the raw inputs for reference.
final_df = (
    tmp[[
        "model", "metric", "mean_diff", "CI_diff", "lower", "upper",
        "mean", "CI", "mean_human", "CI_human", "col",
    ]]
    .sort_values(["model", "metric"])
    .reset_index(drop=True)
)

for row in final_df.itertuples(index=False):
    print(f"{row.model:>8} | {row.metric:<13} "
          f"Δ={row.mean_diff:.2%} ± {row.CI_diff:.2%} "
          f"[{row.lower:.2%}, {row.upper:.2%}]")


In [None]:
# Compare AITA to Human
import numpy as np
import scipy.stats as stats

# -----------------------------
# 1. Collect means & CIs per metric per model
# -----------------------------
# One record per df column named "<metric>_<model>"; columns without numeric values are skipped.
data = []

for col in df.columns:
    # First metric whose "<metric>_" prefix matches this column, or None if there is none.
    metric = next(
        (m for m in ('validation', 'indirectness', 'framing') if col.startswith(m + "_")),
        None,
    )
    if metric is None:
        continue

    numeric = pd.to_numeric(df[col], errors='coerce')
    values = numeric.dropna().astype(int).values
    if len(values) == 0:
        continue

    data.append(dict(
        model=col.replace(metric + "_", ""),  # e.g. "validation_gpt4o" -> "gpt4o"
        metric=metric,
        mean=values.mean(),
        CI=1.96 * stats.sem(values),  # 95% CI
        col=col,
    ))

plot_df = pd.DataFrame(data)


In [None]:
# -----------------------------
# 2. Compare a specific model to human
# -----------------------------
model_to_compare = "gpt4o"  # <- change if you have a different model tag
z = 1.96  # CI columns are 95% half-widths, i.e. z * standard error

# Human baseline per metric, renamed so it can sit beside the model's own columns.
human = plot_df.query("model == 'human'")[["metric", "mean", "CI"]]
human = human.rename(columns={"mean": "mean_human", "CI": "CI_human"})

# Rows for the chosen model with the baseline attached; validate="m:1" rejects duplicate
# human rows for the same metric.
tmp = plot_df.query("model == @model_to_compare")
tmp = tmp.merge(human, on="metric", how="left", validate="m:1")

# Δ (model − human): standard errors are recovered from the CIs and combined in quadrature.
se_model, se_human = tmp["CI"] / z, tmp["CI_human"] / z
se_diff = np.sqrt(se_model**2 + se_human**2)

tmp = tmp.assign(
    mean_diff=tmp["mean"] - tmp["mean_human"],
    CI_diff=z * se_diff,
)
tmp = tmp.assign(
    lower=tmp["mean_diff"] - tmp["CI_diff"],
    upper=tmp["mean_diff"] + tmp["CI_diff"],
)

final_df = (
    tmp[[
        "model", "metric", "mean_diff", "CI_diff", "lower", "upper",
        "mean", "CI", "mean_human", "CI_human", "col",
    ]]
    .sort_values(["metric"])
    .reset_index(drop=True)
)


In [None]:

# 3. Print numeric summary
# -----------------------------
# One line per row of final_df: model, metric, difference ± CI and the [lower, upper] interval,
# all formatted as percentages.
for row in final_df.itertuples(index=False):
    print(
        f"{row.model:>8} | {row.metric:<13} "
        f"Δ={row.mean_diff:.2%} ± {row.CI_diff:.2%} "
        f"[{row.lower:.2%}, {row.upper:.2%}]"
    )



In [None]:
#compare model's score
import pandas as pd
# Load the scored OEQ tables for the three models from Drive.
# NOTE(review): these paths use the folder spelling "elphant" and an "outputs/" directory, while
# the generation cells write to "output_data/" — confirm where the scorer saves its
# *_elephant_scored.csv files.
gpt_scores   = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/elphant/outputs/OEQ_responses_openai_elephant_scored.csv')
grock_scores = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/elphant/outputs/OEQ_responses_grock2_elephant_scored.csv')
claude_scores = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/elphant/outputs/OEQ_responses_claude_elephant_scored.csv')

In [None]:
# Show the column names of the scored claude table.
claude_scores.columns

In [None]:
# Display the three grock score columns side by side.
grock_score_cols = [
    'validation_grock_score',
    'indirectness_grock_score',
    'framing_grock_score',
]
grock_scores[grock_score_cols]

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


def _metric_means(scores, tag):
    """Return the mean of the validation/indirectness/framing columns suffixed with ``tag``.

    Values are coerced with ``pd.to_numeric(errors='coerce')`` first, so non-numeric
    entries become NaN and are left out of the mean.
    """
    columns = [f"{metric}_{tag}" for metric in ("validation", "indirectness", "framing")]
    return scores[columns].apply(pd.to_numeric, errors='coerce').mean()


mean_score_gpt = _metric_means(gpt_scores, "gpt4o")
mean_score_grock = _metric_means(grock_scores, "grock_score")
mean_score_claude = _metric_means(claude_scores, "claude_score")

# Wide table: one row per score type, one column per model.
df_plot = pd.DataFrame({
    'score_type': ['Validation', 'Indirectness', 'Framing'],
    'GPT': mean_score_gpt.values,
    'Grock': mean_score_grock.values,
    'claude': mean_score_claude.values,
})

# Long form (score_type, model, mean_score) is what seaborn's hue grouping expects.
df_long = df_plot.melt(
    id_vars='score_type',
    var_name='model',
    value_name='mean_score',
)

# Grouped bars: one group per score type, one bar per model.
plt.figure(figsize=(8, 5))
sns.barplot(
    data=df_long,
    x='score_type',
    y='mean_score',
    hue='model',
    palette="viridis",
)

plt.title("Mean Scores per Model")
plt.xlabel("Score Type")
plt.ylabel("Mean Score")
plt.tight_layout()
plt.show()



In [None]:
# Load the scored flipped-AITA table from Drive.
# NOTE(review): same "elphant/outputs/" location as the other scored CSVs — confirm the path.
fliped_openai_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/elphant/outputs/AITA_NTA_FLIP_response_elephant_scored.csv')

In [None]:
# Mean of each flipped-AITA score column. Values are coerced to numeric first (non-numeric
# entries become NaN and are skipped), the same way the per-model OEQ means are computed,
# so a stray non-numeric score cannot break or distort the mean.
mean_score_fliped_open_ai = (
    fliped_openai_df[[
        'validation_openai_fliped_score',
        'indirectness_openai_fliped_score',
        'framing_openai_fliped_score',
    ]]
    .apply(pd.to_numeric, errors='coerce')
    .mean()
)

In [None]:
# Bar chart of the three mean flipped-AITA scores (one bar per score column).
mean_score_fliped_open_ai.plot(kind='bar')

In [None]:
cp -r   "/content/drive/MyDrive/Colab Notebooks/elphant"

In [None]:
# Print the current working directory.
pwd