In [2]:
import numpy as np
import pandas as pd
from scipy.stats import bernoulli
from matplotlib import pyplot as plt

In [5]:
# Read the CSV, taking the column names from its first row.
hoyle_data = pd.read_csv(
    "data/hoyle_automated_2021.csv",
    header=0,
)
# Preview the first five rows, transposed so every column reads as a row.
# This looks like a wide file of human ratings; the automatic coherence
# measures should take one unique value per dataset/model/topic_idx.
hoyle_data.iloc[:5].T

Unnamed: 0,0,1,2,3,4
dataset,wikitext,wikitext,wikitext,wikitext,wikitext
model,mallet,mallet,mallet,mallet,mallet
topic_idx,0,0,0,0,0
human_idx,0,1,2,3,4
c_npmi_10_full,0.098871,0.098871,0.098871,0.098871,0.098871
c_npmi_10_nytimes_full,0.089354,0.089354,0.089354,0.089354,0.089354
c_npmi_10_test,0.09923,0.09923,0.09923,0.09923,0.09923
c_npmi_10_train,0.092744,0.092744,0.092744,0.092744,0.092744
c_npmi_10_val,0.038283,0.038283,0.038283,0.038283,0.038283
c_uci_full,0.912214,0.912214,0.912214,0.912214,0.912214


In [15]:
# Sanity check: count the distinct values each automatic measure takes
# within a single dataset/model/topic_idx group.
# Spoiler: the largest count is 1 for every measure.
measures = [col for col in hoyle_data.columns if col.startswith("c_")]
measures.append("u_mass_full")
topic_groups = hoyle_data.groupby(["dataset", "model", "topic_idx"])
measure_cnt = topic_groups.agg(dict.fromkeys(measures, "nunique"))
# Report the largest per-group distinct count of each measure.
for name, worst in measure_cnt.max().items():
    print(f"{name} : {worst}")

c_npmi_10_full : 1
c_npmi_10_nytimes_full : 1
c_npmi_10_test : 1
c_npmi_10_train : 1
c_npmi_10_val : 1
c_uci_full : 1
c_v_full : 1
c_v_nytimes_full : 1
c_v_test : 1
c_v_train : 1
c_v_val : 1
c_npmi_10_wikitext_full : 1
c_v_wikitext_full : 1
u_mass_full : 1


In [4]:
# Removing automated measures (every "c_*" column plus "u_mass_full").
# Only columns that are actually present are selected, so re-running this
# cell after the measures are already gone is a no-op instead of raising
# KeyError on the hard-coded "u_mass_full" label.
automated_cols = [
    c for c in hoyle_data.columns
    if c[:2] == "c_" or c == "u_mass_full"
]
hoyle_data = hoyle_data.drop(columns=automated_cols)
# Preview the remaining columns, transposed for readability.
hoyle_data.head(5).T

Unnamed: 0,0,1,2,3,4
dataset,wikitext,wikitext,wikitext,wikitext,wikitext
model,mallet,mallet,mallet,mallet,mallet
topic_idx,0,0,0,0,0
human_idx,0,1,2,3,4
scores_raw,2,3,1,3,3
confidences_raw,1,1,1,1,1
task,ratings,ratings,ratings,ratings,ratings


In [21]:
# Tally which raw score values occur in each task, to confirm the score
# ranges are as expected.
# Spoiler: they are.
for task_name in ("intrusions", "ratings"):
    task_rows = hoyle_data["task"] == task_name
    print(hoyle_data.loc[task_rows, "scores_raw"].value_counts())

1    5555
0    2245
Name: scores_raw, dtype: int64
3    2761
2    1318
1     421
Name: scores_raw, dtype: int64


In [23]:
# Total raw score and total raw confidence per topic index.
# NOTE(review): grouping is on topic_idx alone, so rows from every task,
# dataset and model are pooled into the same sum - confirm that is intended.
sum_spec = {col: ["sum"] for col in ("scores_raw", "confidences_raw")}
hoyle_data.groupby("topic_idx").agg(sum_spec)

Unnamed: 0_level_0,scores_raw,confidences_raw
Unnamed: 0_level_1,sum,sum
topic_idx,Unnamed: 1_level_2,Unnamed: 2_level_2
0,305,212
1,361,204
2,349,226
3,337,228
4,320,217
5,302,240
6,367,239
7,315,227
8,310,219
9,370,241


In [51]:
# Keep only the intrusion-task rows that belong to topic index 0.
is_intrusion = hoyle_data["task"].eq("intrusions")
is_topic_zero = hoyle_data["topic_idx"].eq(0)
test = hoyle_data[is_intrusion & is_topic_zero]

In [57]:
# Show the rows of the subset whose human_idx is 0.
test.loc[test["human_idx"].eq(0)]

Unnamed: 0,dataset,model,topic_idx,human_idx,scores_raw,confidences_raw,task
4500,wikitext,mallet,0,0,1,1,intrusions
5800,wikitext,dvae,0,0,0,0,intrusions
7100,wikitext,etm,0,0,0,1,intrusions
8400,nytimes,mallet,0,0,1,1,intrusions
9700,nytimes,dvae,0,0,0,1,intrusions
11000,nytimes,etm,0,0,1,1,intrusions
