In [1]:
import os
import pandas as pd
import pickle
import numpy as np

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

plt.rcParams['figure.dpi'] = 600
plt.rcParams['savefig.dpi'] = 600

In [2]:
explainer_map = {'conceptshap': 'ConceptX', 'conceptshap_r': 'ConceptX-R', 'conceptshap_a': 'ConceptX-A', 'conceptshap': 'ConceptX', 'tokenshap': 'TokenSHAP', 'random': 'Random'}
explainer_order = ["Random", "TokenSHAP", "ConceptX", "ConceptX-R", "ConceptX-A"]
MODEL_NAMES = {"gpt2": "GPT-2", "gemma":"Gemma-2-2B", "gpt4o-mini": "GPT-4o-mini", "llama-3-3b": "Llama-3.2-3B"}

In [3]:
save_dir = "/cluster/home/kamara/conceptx"
seed_ = 0

In [4]:
folder_path = os.path.join(save_dir, f"results/classification")  # Replace with your folder path

# Initialize a list to store the dataframes
df_list = []

# Loop through each file in the folder
for root, dirs, files in os.walk(folder_path):
    for file in files:
        full_path = os.path.join(root, file)
        parts = file.split('_')
        print("parts: ", parts)
        
        df = pd.read_csv(full_path)

        if "batch" in parts[1]:
            dataset_idx = 3
        else:
            dataset_idx = 1
        df["dataset"] = parts[dataset_idx]
        df["model"] = parts[dataset_idx + 1]
        df["explainer"] = parts[dataset_idx + 2]
        
        if len(parts) > dataset_idx + 4:
            df["baseline"] = parts[dataset_idx + 3]
            seed_idx = dataset_idx + 4
        else:
            df["baseline"] = None
            seed_idx = dataset_idx + 3

        df["seed"] = parts[seed_idx].split(".")[0]
        df_list.append(df)

# Concatenate all dataframes in the list
df_all = pd.concat(df_list, ignore_index=True)
df_all


parts:  ['classification', 'sentiment', 'llama-3-3b', 'conceptshap', '0.csv']
parts:  ['classification', 'sentiment', 'llama-3-3b', 'conceptshap', 'aspect', '0.csv']
parts:  ['classification', 'sentiment', 'llama-3-3b', 'random', '0.csv']
parts:  ['classification', 'sentiment', 'llama-3-3b', 'tokenshap', '0.csv']


Unnamed: 0,id,input,p0,p_highest,p_label,aspect,highest_token,label,dataset,model,explainer,baseline,seed
0,4,That sucks to hear. I hate days like that,0.934348,0.931191,0.931191,negative,sucks,sucks,sentiment,llama-3-3b,conceptshap,,0
1,8,Is sad when people`s phones are dead,0.916236,0.878405,0.515679,negative,dead,sad,sentiment,llama-3-3b,conceptshap,,0
2,22,gonna have a lazy day today,0.56593,0.172573,0.172573,negative,lazy,lazy,sentiment,llama-3-3b,conceptshap,,0
3,24,http://twitpic.com/3ernb - Loving The Jumper,0.830895,0.838044,0.073611,positive,Jumper,Loving,sentiment,llama-3-3b,conceptshap,,0
4,31,LUCY IS HATING ON MY GWEG!,0.921375,0.79402,0.133989,negative,GWEG,HATING,sentiment,llama-3-3b,conceptshap,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2282,3008,Class The 50d is supposed to come today,0.098482266,0.132293,0.058549,positive,Class,come,sentiment,llama-3-3b,tokenshap,,0
2283,3012,Your sweetie is very lucky too liberty,0.9661167,0.962924,0.959541,positive,very,sweetie,sentiment,llama-3-3b,tokenshap,,0
2284,3032,Its helping me the way i think,0.84456706,0.60768,0.010549,positive,way,helping,sentiment,llama-3-3b,tokenshap,,0
2285,3035,benson Woohoo Well Done I started Macarenaing too,0.97537756,0.963633,0.91461,positive,too,Woohoo,sentiment,llama-3-3b,tokenshap,,0


In [9]:
# Update 'explainer' column based on 'baseline' conditions
df_all["explainer"] = df_all["explainer"] + df_all["baseline"].map({"reference": "_r", "aspect": "_a"}).fillna("")
# Convert relevant columns to numeric
df_all["p0"] = pd.to_numeric(df_all["p0"], errors="coerce")
df_all["p_highest"] = pd.to_numeric(df_all["p_highest"], errors="coerce")
df_all["p_label"] = pd.to_numeric(df_all["p_label"], errors="coerce")


In [10]:
# Choose model and dataset
model_name = "llama-3-3b"
dataset = "sentiment"

In [11]:
df_all

Unnamed: 0,id,input,p0,p_highest,p_label,aspect,highest_token,label,dataset,model,explainer,baseline,seed
0,4,That sucks to hear. I hate days like that,0.934348,0.931191,0.931191,negative,sucks,sucks,sentiment,llama-3-3b,conceptshap,,0
1,8,Is sad when people`s phones are dead,0.916236,0.878405,0.515679,negative,dead,sad,sentiment,llama-3-3b,conceptshap,,0
2,22,gonna have a lazy day today,0.565930,0.172573,0.172573,negative,lazy,lazy,sentiment,llama-3-3b,conceptshap,,0
3,24,http://twitpic.com/3ernb - Loving The Jumper,0.830895,0.838044,0.073611,positive,Jumper,Loving,sentiment,llama-3-3b,conceptshap,,0
4,31,LUCY IS HATING ON MY GWEG!,0.921375,0.794020,0.133989,negative,GWEG,HATING,sentiment,llama-3-3b,conceptshap,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2282,3008,Class The 50d is supposed to come today,0.098482,0.132293,0.058549,positive,Class,come,sentiment,llama-3-3b,tokenshap,,0
2283,3012,Your sweetie is very lucky too liberty,0.966117,0.962924,0.959541,positive,very,sweetie,sentiment,llama-3-3b,tokenshap,,0
2284,3032,Its helping me the way i think,0.844567,0.607680,0.010549,positive,way,helping,sentiment,llama-3-3b,tokenshap,,0
2285,3035,benson Woohoo Well Done I started Macarenaing too,0.975378,0.963633,0.914610,positive,too,Woohoo,sentiment,llama-3-3b,tokenshap,,0


In [12]:
df = df_all[df_all["p0"]>0.5]

# Compute differences
df["p0_p_highest_diff"] = df["p0"] - df["p_highest"]
df["p0_p_label_diff"] = df["p0"] - df["p_label"]

# Compute statistics per explainer
stats = df.groupby("explainer")[["p0_p_highest_diff", "p0_p_label_diff"]].agg(["mean", "var"])

# Print or save results
print(stats)

                p0_p_highest_diff           p0_p_label_diff          
                             mean       var            mean       var
explainer                                                            
conceptshap              0.133623  0.054725        0.358734  0.091630
conceptshap_a_a          0.132195  0.052321        0.356777  0.091995
random                   0.103507  0.037143        0.365913  0.093298
tokenshap                0.122349  0.040676        0.364951  0.093533


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["p0_p_highest_diff"] = df["p0"] - df["p_highest"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["p0_p_label_diff"] = df["p0"] - df["p_label"]
