In [2]:
import pandas as pd
import numpy as np

### Intrusion raw data
- Columns represent corpus/model/topic combinations (named `<corpus>_<model>_<topic>`)
- Rows represent crowdworkers (raters)

Each entry holds two pieces of information, written as `rating,confidence`,
where a rating of 6 is correct and a confidence of 7 is correct

Source: https://github.com/ahoho/topics/blob/dev/data/human/CreateJointFile.py

In [6]:
# Read the raw intrusion ratings of both corpora; the first CSV column becomes
# the row index (later used as the rater id).
wikitext, nytimes = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("data/wikitext_intrusion.csv", "data/nytimes_intrusion.csv")
)
# The sanity-check cells that follow inspect `corpus`; wikitext is selected here.
corpus = wikitext

In [7]:
# How many ratings did each rater (row) give? Count the non-missing cells per
# row, then tally how many raters share each count.
ratings_per_rater = corpus.count(axis="columns")
ratings_per_rater.value_counts()

38    160
dtype: int64

In [8]:
# How many ratings did each topic (column) receive? Count the non-missing cells
# per column, then tally how many topics share each count, in ascending order.
ratings_per_topic = corpus.count(axis="rows")
ratings_per_topic.value_counts().sort_index()

26     1
28     2
30     2
31     4
32     5
33     4
34     4
35     5
36    10
37     8
38     6
39    11
40    10
41    18
42     8
43     7
44     9
45     6
46     7
47     6
48     6
49     4
50     1
52     2
53     3
56     1
dtype: int64

In [5]:
# Sanity check on the column names: drop the corpus prefix, keep (model, topic),
# and look at the mean topic id per model. A mean of 24.5 is what ids 0-49
# would give (it is consistent with 50 topics per model, not a proof of it).
model_topic_pairs = [col_name.split("_")[1:] for col_name in corpus.columns]
check = pd.DataFrame(model_topic_pairs, columns=["model", "topic"]).astype({"topic": "int"})
check.groupby("model").agg({"topic": "mean"})

Unnamed: 0_level_0,topic
model,Unnamed: 1_level_1
dvae,24.5
etm,24.5
mallet,24.5


In [6]:
# Creating one long-format dataframe from the raw wikitext AND nytimes data:
# each non-missing cell of a raw table becomes one row holding
# (corpus, model, topic, rater, intrusion, confidence).
long_columns = ["corpus", "model", "topic", "rater", "intrusion", "confidence"]

# Collect the per-column pieces and concatenate once at the end; growing `data`
# with pd.concat inside the loop copies the whole frame on every iteration.
pieces = []

# The loop variable is deliberately NOT called `corpus`, so the `corpus` frame
# selected in the loading cell is not silently overwritten by this cell.
for raw_corpus in [wikitext, nytimes]:
    for c_name in raw_corpus.columns:
        ratings = raw_corpus[c_name].dropna()
        if ratings.empty:
            # A topic nobody rated contributes no rows.
            continue

        # Column names follow "<corpus>_<model>_<topic>".
        corpus_name, model_name, topic_id = c_name.split("_")

        # Each cell is "rating,confidence"; split it into two string columns.
        piece = ratings.str.split(",", expand=True)
        piece.columns = ["intrusion", "confidence"]
        pieces.append(
            piece.assign(
                corpus=corpus_name,
                model=model_name,
                topic=topic_id,
                rater=ratings.index,  # the raw row index identifies the rater
            )
        )

if pieces:
    data = pd.concat(pieces, axis="rows", ignore_index=True)[long_columns]
else:
    data = pd.DataFrame(columns=long_columns)

# Binarise: 1 when the rating is "6" / the confidence is "7", otherwise 0.
data["intrusion"] = (data["intrusion"] == "6").astype(int)
data["confidence"] = (data["confidence"] == "7").astype(int)
data["topic"] = data["topic"].astype(int)
data["rater"] = data["rater"].astype(int)

In [7]:
# The long table should contain exactly one row per non-missing cell of the two
# raw tables; print both numbers so they can be compared.
n_raw_ratings = wikitext.count().sum() + nytimes.count().sum()
print(data.shape)
print(n_raw_ratings)

(12198, 6)
12198


In [28]:
# Checking for discrepancies between the processed long table and the raw
# tables, for every corpus/model/topic combination. Three numbers must agree:
# the number of raters, the number of correct intrusion answers, and the number
# of confidence==7 answers. Raises on the first combination that disagrees.

# Explicit name -> frame lookup (instead of eval on the corpus name).
raw_tables = {"nytimes": nytimes, "wikitext": wikitext}

for corpus, corpus_raw in raw_tables.items():
    for model in ["dvae", "etm", "mallet"]:
        for topic in range(50):
            # Total raters
            processed = data[(data["corpus"]==corpus) & (data["model"]==model) & (data["topic"]==topic)]
            raw = corpus_raw[f"{corpus}_{model}_{topic}"].dropna()

            # No. of successful intrusion tasks
            # (the raw side is read from the first character of each entry, i.e.
            # independently of the comma-splitting used to build `data`)
            processed_intrusion_1s = processed[processed["intrusion"]==1]["rater"].shape[0]
            raw_intrusion_1s = raw[raw.str[0]=="6"].shape[0]

            # No. of successful confidence
            # (the raw side is read from the last character of each entry)
            processed_confidence_1s = processed[processed["confidence"]==1]["rater"].shape[0]
            raw_confidence_1 = raw[raw.str[-1]=="7"].shape[0]

            if (processed.shape[0] != raw.shape[0] or
                processed_intrusion_1s != raw_intrusion_1s or
                processed_confidence_1s != raw_confidence_1
            ):
                raise Exception(f"{corpus}, {model}, {topic}")

print("all good! :)")

all good! :)


In [31]:
# Spot checks: for one corpus/model/topic, print the raters flagged 1 in the
# processed table followed by the matching raw entries, to compare by eye.
corpus = "nytimes"
# Explicit name -> frame lookup (instead of eval): an unknown corpus name fails
# with a KeyError rather than evaluating arbitrary text.
corpus_raw = {"wikitext": wikitext, "nytimes": nytimes}[corpus]
model = "etm"
topic = 19
compare = "confidence"  # either "intrusion" or "confidence"

processed = data[(data["corpus"]==corpus) & (data["model"]==model) & (data["topic"]==topic)]
raw = corpus_raw[f"{corpus}_{model}_{topic}"].dropna()

if compare == "intrusion":
    # Raw side: entries whose first character is "6".
    print(processed[processed["intrusion"]==1]["rater"])
    print(raw[raw.str[0]=="6"])
elif compare == "confidence":
    # Raw side: entries whose last character is "7".
    print(processed[processed["confidence"]==1]["rater"])
    print(raw[raw.str[-1]=="7"])
else:
    # A mistyped mode used to print nothing at all; make it loud instead.
    raise ValueError(f"compare must be 'intrusion' or 'confidence', got {compare!r}")

11946      0
11947      1
11948      9
11949     10
11950     15
11951     22
11952     25
11953     27
11954     31
11955     34
11956     35
11957     43
11958     47
11959     49
11960     52
11962     58
11963     62
11964     74
11965     76
11966     84
11967     91
11968     94
11969    103
11970    104
11971    107
11972    111
11973    112
11974    114
11976    125
11977    127
11978    131
11979    134
11980    145
11981    148
11982    150
11983    152
Name: rater, dtype: int64
0      6,7
1      6,7
9      1,7
10     6,7
15     6,7
22     6,7
25     5,7
27     6,7
31     2,7
34     2,7
35     6,7
43     6,7
47     6,7
49     6,7
52     5,7
58     6,7
62     6,7
74     6,7
76     6,7
84     6,7
91     6,7
94     6,7
103    6,7
104    1,7
107    6,7
111    5,7
112    2,7
114    6,7
125    6,7
127    5,7
131    6,7
134    6,7
145    6,7
148    5,7
150    5,7
152    1,7
Name: nytimes_etm_19, dtype: object


In [32]:
# Persist the long-format ratings table; the row index is written as the first
# (unnamed) CSV column, which is pandas' default behaviour made explicit here.
data.to_csv("data/unit_level_ratings.csv", index=True)