In [1]:
import pandas as pd
import glob

# Sample Overlaps

In [9]:
from src.utils import read_file

In [None]:
test23 = read_file("data/NewsEmp2023/WASSA23_essay_level_test.tsv")
dev24 = read_file("data/NewsEmp2024/trac3_EMP_dev.csv")
print(f"2023 test samples: {len(test23)}")

# Essays whose full text occurs in both the 2023 test file and the 2024 dev file.
common_samples = set(test23["essay"]) & set(dev24["essay"])
print(f"Common samples: {len(common_samples)}")

# 2023 test rows whose essay is NOT one of the exact matches.
is_exact_match = test23["essay"].isin(common_samples)
remaining_samples = test23[~is_exact_match]
print(f"Remaining samples: {len(remaining_samples)}")

# Being super sure there is no overlap: repeat the comparison on essay
# prefixes only, so texts that differ somewhere after the first
# `first_n_char` characters would still be reported as shared.
first_n_char = 25
remaining_prefixes = {sample[:first_n_char] for sample in remaining_samples["essay"]}
dev_prefixes = {sample[:first_n_char] for sample in dev24["essay"]}
still_common = remaining_prefixes & dev_prefixes
print(f"Still common samples: {len(still_common)}")

2023 test samples: 100
Common samples: 44
Remaining samples: 56
Still common samples: 0


In [26]:
# Inspect which columns the 2023 test frame provides.
test23.columns

Index(['conversation_id', 'article_id', 'essay', 'speaker_id', 'gender',
       'education', 'race', 'age', 'income', 'speaker_number', 'split',
       'essay_id'],
      dtype='object')

# Noisy Samples

In [2]:
import pandas as pd


In [3]:
def get_top_diff(file_tsv, n=10):
    """Rank the rows of a tab-separated file by |empathy - llm_empathy|.

    Args:
        file_tsv: Path to a tab-separated file that has ``empathy`` and
            ``llm_empathy`` columns.
        n: Number of top rows to print.

    Returns:
        The full frame with an added ``diff`` column (absolute difference
        between the two scores), sorted by ``diff`` in descending order.
        The first ``n`` rows are also printed.
    """
    scored = pd.read_csv(file_tsv, sep="\t")
    ranked = scored.assign(
        diff=(scored["empathy"] - scored["llm_empathy"]).abs()
    ).sort_values(by="diff", ascending=False)
    print(ranked.head(n))
    return ranked

In [30]:
# Rank the rows of the 2024 train *_llama.tsv file by |empathy - llm_empathy|
# and print the ten largest disagreements.
df = get_top_diff("data/NewsEmp2024/trac3_EMP_train_llama.tsv")

     conversation_id  article_id person_id  \
487              488         395      p031   
594               95         233      p038   
940              441           7      p022   
981              482         292      p031   
393              394         270      p010   
860              361         163      p022   
472              473         297      p024   
91                92         233      p022   
825              326          89      p024   
4                  5          35      p022   

                                                 essay   empathy  \
487  This is so sad and tragic. The most selfish th...  7.000000   
594  This is sad. So many young women just used by ...  1.000000   
940  After reading the article, my heart just break...  1.000000   
981  It's amazing that people still debate the issu...  7.000000   
393  I read the article on the China mining disaste...  7.000000   
860  I feel really bad for the girl that lost her e...  1.000000   
472  Anytime a tr

In [32]:
# Print the untruncated essay for row label 393, one of the top-ranked
# disagreements in the listing above.
print(df.loc[393, "essay"])

I read the article on the China mining disaster.   There were 33 miners trapped in the mine.  Only two of them survived.  Officials stated whoever was responsible would be punished.   Smaller mines were shut down immediately until further notice.   China has always been known for the deadliest mining.  


In [4]:
# Same ranking for the 2024 train *_gpt.tsv file.
# NOTE: this rebinds `df`, so the llama ranking is no longer reachable by name.
df = get_top_diff("data/NewsEmp2024/trac3_EMP_train_gpt.tsv")

     conversation_id  article_id person_id  \
393              394         270      p010   
981              482         292      p031   
940              441           7      p022   
487              488         395      p031   
904              405          92      p022   
825              326          89      p024   
91                92         233      p022   
860              361         163      p022   
998              499         103      p068   
263              264          34      p022   

                                                 essay  empathy  \
393  I read the article on the China mining disaste...      7.0   
981  It's amazing that people still debate the issu...      7.0   
940  After reading the article, my heart just break...      1.0   
487  This is so sad and tragic. The most selfish th...      7.0   
904  After reading the article, you just can't help...      1.0   
825  When crimes like this happened it is always go...      6.0   
91   What happened in th

# Consistency of LLM Annotation

In [15]:
import krippendorff
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from src.utils import read_file


In [None]:
def annotation_consistency() -> pd.DataFrame:
    """Compare the ``llm_empathy`` scores of essays found in both the 2024 and 2023 files.

    Reads the 2024 train file plus the 2023 train and dev files (all
    tab-separated), stacks the two 2023 files on the columns they share,
    de-duplicates each side on ``essay`` and inner-joins the two sides on the
    essay text. The 2024 score is exposed as ``llm_empathy_1`` and the 2023
    score as ``llm_empathy_2``.

    Printed along the way: the frame shapes, whether every other shared column
    is identical across the two sides, the Pearson correlation (PCC), the
    concordance correlation (CCC) and the RMSE between the two score columns,
    and the mean / standard deviation of their absolute difference.

    The metrics are computed with numpy only, so the function runs with the
    imports this notebook already has. With fewer than two shared essays the
    correlation values are NaN.

    Returns:
        The merged frame, one row per shared essay, with ``llm_empathy_1``,
        ``llm_empathy_2`` and ``llm_diff`` (their absolute difference).
        Shared columns that are identical on both sides are collapsed back to
        their original name; differing ones keep the ``_x`` (2024) and ``_y``
        (2023) suffixes.
    """
    tr_24 = pd.read_csv("data/NewsEmp2024/trac3_EMP_train_llama.tsv", sep="\t")
    tr_23 = pd.read_csv("data/NewsEmp2023/WASSA23_essay_level_with_labels_train_llama.tsv", sep="\t")
    dv_23 = pd.read_csv("data/NewsEmp2023/WASSA23_essay_level_dev_llama.tsv", sep="\t")

    print(tr_24.shape, tr_23.shape, dv_23.shape)

    # Stack 2023 train and dev on the columns both files provide.
    common_cols = tr_23.columns.intersection(dv_23.columns)
    tr_dv_23 = pd.concat([tr_23[common_cols], dv_23[common_cols]], ignore_index=True)
    print(tr_dv_23.shape)

    # One row per essay on each side, so the join below is one-to-one.
    tr_24 = tr_24.drop_duplicates(subset=["essay"])
    tr_dv_23 = tr_dv_23.drop_duplicates(subset=["essay"])
    print(tr_24.shape, tr_dv_23.shape)

    tr_24 = tr_24.rename(columns={"llm_empathy": "llm_empathy_1"})
    tr_dv_23 = tr_dv_23.rename(columns={"llm_empathy": "llm_empathy_2"})

    merged_df = pd.merge(tr_24, tr_dv_23, on="essay", how="inner", validate="one_to_one")

    # Every other column present on both sides was suffixed _x / _y by the
    # merge; collapse the pair back into one column when the values agree.
    common_cols = tr_24.columns.intersection(tr_dv_23.columns)
    for col in common_cols:
        if col == "essay":
            continue

        left_col, right_col = col + "_x", col + "_y"
        if merged_df[left_col].equals(merged_df[right_col]):
            print(f"{col} is equal")
            merged_df = merged_df.drop(columns=right_col).rename(columns={left_col: col})
        else:
            print(f"{col} is not equal")

    llm_empathy_1 = merged_df["llm_empathy_1"].to_numpy(dtype=float)
    llm_empathy_2 = merged_df["llm_empathy_2"].to_numpy(dtype=float)

    # Pearson correlation between the two annotation rounds.
    pcc = float(np.corrcoef(llm_empathy_1, llm_empathy_2)[0, 1])

    # Concordance correlation: 2*cov / (var_1 + var_2 + (mean_1 - mean_2)^2).
    # NOTE(review): sample (n-1) variances are used here; confirm this is the
    # estimator wanted for reporting.
    cov = np.cov(llm_empathy_1, llm_empathy_2)
    mean_gap = llm_empathy_1.mean() - llm_empathy_2.mean()
    ccc = float(2.0 * cov[0, 1] / (cov[0, 0] + cov[1, 1] + mean_gap**2))

    rmse = float(np.sqrt(np.mean((llm_empathy_1 - llm_empathy_2) ** 2)))

    pcc = round(pcc, 3)
    ccc = round(ccc, 3)
    rmse = round(rmse, 3)

    print(f"PCC: {pcc}, CCC: {ccc}, RMSE: {rmse}")

    merged_df["llm_diff"] = np.abs(merged_df["llm_empathy_1"] - merged_df["llm_empathy_2"])
    mean_diff = round(float(merged_df["llm_diff"].mean()), 3)
    std_diff = round(float(merged_df["llm_diff"].std()), 3)
    print(f"Difference - Mean: {mean_diff}, Std: {std_diff}")

    return merged_df

In [19]:
# Load the 2024 train file in its *_gpt.tsv and *_llama.tsv variants.
gpt = read_file("data/NewsEmp2024/trac3_EMP_train_gpt.tsv")
llama = read_file("data/NewsEmp2024/trac3_EMP_train_llama.tsv")


In [29]:
# Load the *_llama.tsv variants of the 2023 train, 2023 dev and 2024 train files.
# tr24 is read from the same path as `llama`; it is kept as a separate
# frame because its score column is renamed further down.
tr23 = read_file("data/NewsEmp2023/WASSA23_essay_level_with_labels_train_llama.tsv")
dv23 = read_file("data/NewsEmp2023/WASSA23_essay_level_dev_llama.tsv")
tr24 = read_file("data/NewsEmp2024/trac3_EMP_train_llama.tsv")

In [30]:
# Restrict both 2023 frames to the columns they share, then stack them.
common_col = set(tr23.columns).intersection(set(dv23.columns))
# Select through a sorted list: iterating a set of strings can yield a
# different order in every kernel session (string hashing is randomised),
# which made the column order of `intra1` non-reproducible.
shared_cols = sorted(common_col)
tr23 = tr23[shared_cols]
dv23 = dv23[shared_cols]
intra1 = pd.concat([tr23, dv23])

In [31]:
# Give each side's LLM score its own column name so both survive the join:
# "_1" is the stacked 2023 train+dev frame, "_2" is the 2024 train frame.
intra1 = intra1.rename(columns={"llm_empathy": "llm_empathy_1"})
tr24 = tr24.rename(columns={"llm_empathy": "llm_empathy_2"})

# Keep only essays whose text occurs on both sides.
merged_df = intra1.merge(tr24, how="inner", on="essay")

In [20]:
def _measure_consistency(x, y):
    """Print agreement statistics between two paired score columns.

    Args:
        x: First score column; must provide ``to_numpy()``.
        y: Second score column, paired with ``x`` by position.

    Prints a dict with:
        kr_alpha: Krippendorff's alpha at the interval level, treating ``x``
            and ``y`` as two raters.
        mean_diff: Mean absolute error between ``x`` and ``y``.
        std: Standard deviation of the absolute differences.

    Nothing is returned.
    """
    first = x.to_numpy()
    second = y.to_numpy()

    scores = dict(
        kr_alpha=krippendorff.alpha((first, second), level_of_measurement="interval"),
        mean_diff=mean_absolute_error(first, second),
        std=np.std(np.abs(first - second)),
    )
    print(scores)

    

In [32]:
# Agreement between the llm_empathy scores stored in the 2023 files (_1) and
# in the 2024 file (_2) for the essays found in both.
_measure_consistency(merged_df["llm_empathy_1"], merged_df["llm_empathy_2"])

{'kr_alpha': 0.9911124565274333, 'mean_diff': 0.10303030303030304, 'std': 0.20653715145300666}


In [24]:
# Consistency on the 2024 train file, in three comparisons:
#   1. `empathy` vs `llm_empathy` in the llama file
#   2. `empathy` vs `llm_empathy` in the gpt file
#   3. gpt `llm_empathy` vs llama `llm_empathy` (the inter-LLM comparison)
# NOTE(review): `empathy` is presumably the original dataset label - confirm.
# Comparison 3 pairs rows by position, so it assumes both files list the
# essays in the same order - TODO confirm.
_measure_consistency(llama["empathy"], llama["llm_empathy"])
_measure_consistency(gpt["empathy"], gpt["llm_empathy"])
_measure_consistency(gpt["llm_empathy"], llama["llm_empathy"])

{'kr_alpha': 0.27089326729430385, 'mean_diff': 1.7153761904761904, 'std': 1.3416888117410164}
{'kr_alpha': 0.19190195858747383, 'mean_diff': 1.8091785714285715, 'std': 1.2709864220828826}
{'kr_alpha': 0.7960720891694217, 'mean_diff': 0.7848166666666667, 'std': 0.7025072318726217}
