In [19]:
from getopt import getopt
import cloudpickle
import pickle
import sys
import os
import numpy as np
import pandas as pd
# import seaborn as sns
from scipy.special import logit, expit
from scipy.stats import uniform, norm, bernoulli, betabinom
from statsmodels.stats.proportion import proportions_ztest
# from matplotlib import pyplot as plt
import pymc as pm
import arviz as az
from modeltools import mcmc_diagnostics, create_summary_stat
from downcast import downcast_df
import jax
from pymc.sampling_jax import sample_numpyro_nuts
from time import time, sleep
from datetime import timedelta

In [38]:
def resample(all_ids, param, size, bound=0.1, summary_stat=None):
    """Draw ``size`` ids with replacement until their mean effects roughly cancel.

    Candidate samples are drawn with ``np.random.choice`` (global NumPy RNG)
    and rejected until the sum of the ``"mean"`` values of the drawn ids lies
    within ``[-bound, bound]``.

    Parameters
    ----------
    all_ids : sequence of int
        Pool of ids to draw from.
    param : str
        Value of the ``"param"`` column selecting the rows to use. For
        ``"za"`` the ``"param_num"`` strings of the form ``"a, b"`` are
        re-keyed to ``str(a*50 + b)`` before lookup.
    size : int
        Number of ids to draw.
    bound : float, default 0.1
        Largest accepted absolute value of the summed means.
    summary_stat : pandas.DataFrame, optional
        Table with ``"param"``, ``"param_num"`` (strings) and ``"mean"``
        columns. Defaults to the notebook-level ``model["summary_stat"]``.

    Returns
    -------
    numpy.ndarray
        The accepted sample of ids.

    Raises
    ------
    ValueError
        If an id key appears more than once for ``param``.
    KeyError
        If a drawn id has no row for ``param``.

    Notes
    -----
    This is rejection sampling with no retry limit: a ``bound`` that the
    available effects cannot satisfy makes the call loop forever.
    """
    if summary_stat is None:
        summary_stat = model["summary_stat"]

    s = summary_stat[summary_stat["param"] == param].copy(deep=True)

    if param == "za":
        s[["a", "b"]] = s["param_num"].str.split(", ", expand=True)
        s["param_num"] = (s["a"].astype(int)*50 + s["b"].astype(int)).astype(str)

    # Build the id -> mean table once instead of filtering the frame for every
    # drawn id on every retry.
    mean_by_id = s.set_index("param_num")["mean"]
    if not mean_by_id.index.is_unique:
        raise ValueError(f"duplicate param_num entries for param {param!r}")
    mean_lookup = mean_by_id.to_dict()

    # Draw first, test afterwards, so a sample is always produced no matter
    # how large `bound` is.
    while True:
        ids = np.random.choice(all_ids, size=size, replace=True)
        mean_sum = sum(mean_lookup[str(i)] for i in ids)
        if not (mean_sum < -bound or mean_sum > bound):
            return ids

def postrr_var(n_success, total):
    """Variance of a Beta(n_success + 1, total - n_success + 1) distribution.

    This is the posterior of a success probability after ``n_success``
    successes in ``total`` trials under a uniform Beta(1, 1) prior. Works
    element-wise on scalars, NumPy arrays and pandas Series.
    """
    alpha = n_success + 1
    beta = total - n_success + 1
    concentration = alpha + beta
    return alpha * beta / ((concentration + 1) * concentration**2)

def postrr_p(n_success, total):
    """Probability that the next single trial is a success.

    Evaluates a beta-binomial with one trial (``n=1``) at ``k=1`` using the
    shape parameters ``n_success + 1`` and ``total - n_success + 1``.
    """
    alpha = n_success + 1
    beta = total - n_success + 1
    return betabinom.pmf(k=1, n=1, a=alpha, b=beta)

In [10]:
# Main
# Run-level options.
process_n = None
n_runs = 30
# NOTE(review): an error output further down mentions the path 'data/None';
# presumably sim_name is formatted into an output path -- confirm and set it
# before saving anything.
sim_name = None
chain_method = "vectorized"

SAMPLE_JAX = True
N_PROCESSES = 6

# Wall-clock reference taken when this cell runs.
start_time = time()

# simulate_scores arguments (each setting is assigned exactly once).
sim_id = 0
p_diff = 0.08
n_raters = 40
scores_per_r = 40
total_scores = None
trials_per_sim = 1
seed = 42
optimal_allocation = False

In [5]:
# Reading in data
raw_data = pd.read_csv("data/unit_level_ratings.csv",index_col = 0)
raw_data = raw_data.sort_values(by=["corpus", "model", "topic"])


def _unique_key_table(frame, keys):
    """Return one row per combination of ``keys`` observed in ``frame``.

    The result has only the key columns and a fresh 0..n-1 index (groupby
    with ``as_index=False``), so the index can be used directly as an integer
    identifier. ``frame`` must contain an ``"intrusion"`` column, which is
    only used as something to count.
    """
    return (frame.groupby(keys, as_index=False)
        .agg({"intrusion":"count"})
        .drop(columns="intrusion"))


# Creating identifiers for each corpus, model, corpus/model pair ("cordel"),
# topic and rater.
corpus_ids = _unique_key_table(raw_data, ["corpus"])
corpus_ids["corpus_id"] = corpus_ids.index

model_ids = _unique_key_table(raw_data, ["model"])
model_ids["model_id"] = model_ids.index

cordel_ids = _unique_key_table(raw_data, ["corpus", "model"])
cordel_ids["cordel_id"] = cordel_ids.index

# topic_id reuses the raw topic label rather than the row index.
# NOTE(review): presumably it is only unique within a (corpus, model) pair,
# which is why topic_array below pairs it with cordel_id -- confirm.
topic_ids = _unique_key_table(raw_data, ["corpus", "model", "topic"])
topic_ids["topic_id"] = topic_ids["topic"].astype(np.int16)

# Raters are keyed per corpus, so the same rater label in two corpora gets
# two different rater_id values.
rater_ids = _unique_key_table(raw_data, ["corpus", "rater"])
rater_ids["rater_id"] = rater_ids.index

# Attach every identifier to the unit-level ratings.
d1 = pd.merge(raw_data, corpus_ids, on=["corpus"], how="left")
d2 = pd.merge(d1, model_ids, on=["model"], how="left")
d3 = pd.merge(d2, cordel_ids, on=["corpus","model"], how="left")
d4 = pd.merge(d3, rater_ids, on=["corpus", "rater"], how="left")
data = pd.merge(d4, topic_ids, on=["corpus", "model", "topic"], how="left")
data = data[["corpus_id", "model_id", "cordel_id", "topic_id", "rater_id", "intrusion", "confidence"]]
# downcast_df is a project helper; its second return value is not used here.
data, na_s = downcast_df(data)

# Setting up numpy arrays for pymc
corpus_array = np.array(data["corpus_id"])
n_corpora = data["corpus_id"].nunique()

model_array = np.array(data["model_id"])
n_models = data["model_id"].nunique()

cordel_array = np.array(data["cordel_id"])
n_cordels = data["cordel_id"].nunique()

# Two rows: cordel id and topic id of every rating.
topic_array = np.array([data["cordel_id"], data["topic_id"]])
n_topics = data["topic_id"].nunique()

rater_array = np.array(data["rater_id"])
obs_n_raters = data["rater_id"].nunique()

score_array = np.array(data["intrusion"])

# Adding cordel id to topic_ids dataframe
topic_cordel_ids = pd.merge(topic_ids, cordel_ids, on=["corpus", "model"], how="left")

# Reading model
# SECURITY: unpickling executes arbitrary code embedded in the file. Only load
# a pickle produced by a trusted run of this project.
with open("bayesian_model/glmm.pickle", "rb") as f:
    model = cloudpickle.load(f)

In [26]:
# Setting numpy seed
np.random.seed(seed)

# Column layout of the simulated rater x topic design table.
ps_columns = ["trial_id", "sim_cordel_id", "sim_topic_id", "sim_rater_id",
              "cordel_id", "topic_id", "rater_id"]
# Each of the two simulated cordels gets this many topics.
topics_per_sim_cordel = 50

# Frames are collected in lists and concatenated once per level; growing a
# DataFrame with pd.concat inside the loops copies all earlier rows each time.
trial_frames = []
for trial_id in range(trials_per_sim):

    # Raters in this simulation
    raters = resample(data["rater_id"].unique(), param="zr", size=n_raters, bound=1)

    # Topics in this simulation (topic_cordel_ids index values): one draw per
    # simulated cordel, stacked so positions 0-49 belong to simulated cordel 0
    # and positions 50-99 to simulated cordel 1.
    sim_topics_0 = resample(range(len(topic_cordel_ids)), param="za",
                            size=topics_per_sim_cordel, bound=1)
    sim_topics_1 = resample(range(len(topic_cordel_ids)), param="za",
                            size=topics_per_sim_cordel, bound=1)
    sim_topics = np.concatenate((sim_topics_0, sim_topics_1))

    # Every rater rates every simulated topic, so the topic rows are the same
    # for all raters and are looked up once.
    rated_topics = np.arange(len(sim_topics))
    topic_rows = topic_cordel_ids.loc[sim_topics, ["topic_id", "cordel_id"]]

    # Cross product between raters and topics.
    rater_frames = [
        topic_rows.assign(sim_rater_id=sim_rater_id,
                          sim_topic_id=rated_topics,
                          rater_id=rater)
        for sim_rater_id, rater in enumerate(raters)
    ]
    sim_data = pd.concat(rater_frames, axis="rows", ignore_index=True)

    sim_data["trial_id"] = trial_id
    # Position in sim_topics decides the simulated cordel (0 or 1).
    sim_data["sim_cordel_id"] = sim_data["sim_topic_id"] // topics_per_sim_cordel
    sim_data = sim_data[ps_columns].astype(np.int16)

    trial_frames.append(sim_data)

if trial_frames:
    ps_data = pd.concat(trial_frames, ignore_index=True)
else:
    # No trials requested: keep the empty, correctly typed schema.
    ps_data = pd.DataFrame(columns=ps_columns, dtype=np.int16)

# Simulating scores: fitted model and a copy of its trace to edit.
pymc_model = model["model"]
trace = model["trace"].copy()

# Proposed log-odds means for the two simulated cordels.
# Find x (model 1) and y (model 0) with x + y = c and
# expit(x) - expit(y) = p_diff, i.e. model1 = model0 + p on the probability
# scale while the average log-odds stays at the fitted mean of "mu".
# With u = exp(-x) and C = exp(-c), u is the positive root of
#   (1 + p) u^2 + p (1 + C) u + C (p - 1) = 0.
# https://www.wolframalpha.com/input?i=solve+for+x+and+y%2C+x%2By%3Dc%2C+1%2F%281%2Be%5E-x%29-1%2F%281%2Be%5E-y%29%3Dp
mu_summary = model["summary_stat"][model["summary_stat"]["param"] == "mu"]
mean_model_logodds = mu_summary["mean"].mean()
c = 2*mean_model_logodds
C = np.exp(-c)
det = p_diff**2 - 2*C*(p_diff**2 - 2) + (C**2)*(p_diff**2)
quad = (-p_diff*(C + 1) + det**0.5) / (2*(p_diff + 1))
proposed_model1_mean = -np.log(quad)
proposed_model0_mean = c - proposed_model1_mean

# Overwrite "mu" for cordel 0 and cordel 1 in the copied trace.
for cordel_idx, proposed_mean in enumerate((proposed_model0_mean, proposed_model1_mean)):
    trace.posterior["mu"].loc[{"mu_dim_0": cordel_idx}] = proposed_mean

# Empty, typed container for simulated scores.
sim_score_columns = ["trial_id", "sim_cordel_id", "sim_topic_id", "sim_rater_id",
                     "cordel_id", "topic_id", "rater_id", "intrusion"]
sim_scores = pd.DataFrame(columns=sim_score_columns, dtype=np.int16)

# TODO: add chain options
# The per-trial loop is disabled; only trial 0 is simulated.
# for trial_id in range(trials_per_sim):
trial_id=0
# Design rows (rater/topic interactions) for this trial.
sim_data = ps_data[ps_data["trial_id"]==trial_id]
sim_rater_array = np.array(sim_data["rater_id"], dtype=int)
# NOTE: topic_array and cordel_array rebind names created in the data-loading
# cell. From here on they describe the simulated design, and cordel_array
# holds sim_cordel_id (0/1) rather than the observed cordel_id.
topic_array = np.array([sim_data["cordel_id"], sim_data["topic_id"]], dtype=int)
cordel_array = np.array(sim_data["sim_cordel_id"], dtype=int)

# Running simulation: swap the simulated design into the model and sample
# predictions from one posterior draw of chain 0.
with pymc_model:
    pm.set_data({
        "raters":sim_rater_array, 
        "topics":topic_array, 
        "cordels":cordel_array})
    # With trials_per_sim == 1, np.random.randint(1) can only return 0, so draw
    # 0 is always used, although a value is still consumed from the global RNG.
    # Otherwise the draw index is trial_id.
    # NOTE(review): possibly meant to pick a random posterior draw -- confirm.
    postrr_sim=pm.sample_posterior_predictive(trace.posterior.sel(
        {"chain":[0], "draw":[np.random.randint(trials_per_sim) if trials_per_sim==1 else trial_id]})
        ,predictions=True, progressbar=False, random_seed=np.random.randint(2**20))

# Attach the sampled "s" values to the design rows as "intrusion". The result
# goes into trial_sim_scores; sim_scores is not updated here.
s = (postrr_sim.predictions.to_dataframe().reset_index()
      .rename(columns={"s":"intrusion"}))
# Column-wise concat matches rows by their 0..n-1 index.
# NOTE(review): assumes predictions come back in the same row order as
# sim_data -- confirm.
trial_sim_scores = pd.concat([sim_data.reset_index(drop=True)
                             ,s["intrusion"]], axis="columns").astype(np.int16)

Sampling: [s]
INFO:pymc:Sampling: [s]


In [11]:
# Variance-driven topic allocation.
# NOTE: the `optimal_allocation` flag is not consulted; this cell always runs.
n_sim_topics = 100
scores = trial_sim_scores[:0]

# Spend a local budget so the `total_scores` setting is left untouched and the
# cell gives the same allocation every time it is re-run.
if total_scores is None:
    remaining_scores = n_raters*scores_per_r
else:
    remaining_scores = total_scores

# Allocate topics for each rater
for sim_rater_id in range(n_raters):
    # Checking if all scores have been allocated
    if remaining_scores <= 0:
        break

    # Successes ("sum") and number of ratings ("count") per topic so far.
    topic_var = (scores.groupby("sim_topic_id")
                 .agg(sum=("intrusion", "sum"), count=("intrusion", "count"))
                 .reset_index())

    # Topics without any rating yet enter with zero successes and zero ratings.
    if len(topic_var) < n_sim_topics:
        seen_topic_ids = set(topic_var["sim_topic_id"])
        missing_topic_ids = [i for i in range(n_sim_topics) if i not in seen_topic_ids]
        missings = pd.DataFrame({"sim_topic_id":missing_topic_ids
                                  ,"sum":[0]*len(missing_topic_ids)
                                  ,"count":[0]*len(missing_topic_ids)})
        topic_var = pd.concat([topic_var, missings])

    # Calculating posterior variances
    topic_var["variance"] = postrr_var(topic_var["sum"], topic_var["count"])

    # Only topics whose variance is at least max_variance / sqrt(3) are eligible.
    cutoff = topic_var["variance"].max()/3**0.5

    # Highest-variance eligible topics first; never hand out more than the
    # remaining budget.
    n_to_allocate = int(min(scores_per_r, remaining_scores))
    allocated_topics = (topic_var[topic_var["variance"]>=cutoff]
        .sort_values("variance", ascending=False))[:n_to_allocate]["sim_topic_id"]
    remaining_scores -= len(allocated_topics)

    selected_scores = trial_sim_scores[(trial_sim_scores["sim_rater_id"]==sim_rater_id)&
                        (trial_sim_scores["sim_topic_id"].isin(allocated_topics))]
    # `scores` feeds the next iteration's tallies, so it is extended here.
    scores = pd.concat([scores, selected_scores])

OSError: Cannot save file into a non-existent directory: 'data/None'

In [27]:
# Balanced random allocation: each rater gets `scores_per_r` distinct topics,
# with topics that have been picked less often strongly favoured.
n_sim_topics = 100
# Exponent applied to the rebased counts; larger values favour the
# least-picked topics more sharply.
balance_power = 20

# Running total of scores per topic
counts = np.zeros(n_sim_topics)
# Start from the empty slice so the result keeps the schema even with no raters.
selected_frames = [trial_sim_scores[:0]]
for sim_rater_id, rater in enumerate(raters):
    # Rebase so the least-picked topics have count 1, then weight by 1/count**power.
    counts = counts-counts.min()+1
    p = 1/counts**balance_power
    p = p/p.sum()

    # Sample according to probability
    allocated_topics = np.random.choice(range(n_sim_topics), size=scores_per_r, replace=False, p=p)
    counts[allocated_topics] += 1

    selected_frames.append(
        trial_sim_scores[(trial_sim_scores["sim_rater_id"]==sim_rater_id)&
                         (trial_sim_scores["sim_topic_id"].isin(allocated_topics))])

# Single concatenation instead of re-copying the accumulated frame per rater.
scores = pd.concat(selected_frames)

In [15]:
# Inspect the per-topic table.
# NOTE(review): the output recorded under this cell already shows the columns
# p, var0, var1 and expected_var, which are only created by a cell further
# down, so the cells were executed out of order.
topic_var

Unnamed: 0,sim_topic_id,sum,count,variance,p,var0,var1,expected_var
0,0,10,11,0.009298,0.909091,0.011224,0.008163,0.008442
1,1,12,14,0.008961,0.857143,0.009996,0.008074,0.008348
2,2,4,14,0.012638,0.285714,0.011534,0.012687,0.011864
3,3,9,22,0.009722,0.409091,0.009231,0.009477,0.009331
4,4,7,7,0.009877,1.000000,0.014545,0.008182,0.008182
...,...,...,...,...,...,...,...,...
95,95,10,11,0.009298,0.909091,0.011224,0.008163,0.008442
96,96,11,23,0.009600,0.478261,0.009204,0.009259,0.009231
97,97,10,11,0.009298,0.909091,0.011224,0.008163,0.008442
98,98,7,7,0.009877,1.000000,0.014545,0.008182,0.008182


In [39]:
# One-step look-ahead on each topic's variance.
n_success, n_total = topic_var["sum"], topic_var["count"]

# Current variance and probability that the next rating is a success.
topic_var["variance"] = postrr_var(n_success, n_total)
topic_var["p"] = postrr_p(n_success, n_total)

# Variance after one more rating: var0 if it is a failure, var1 if a success.
topic_var["var0"] = postrr_var(n_success, n_total + 1)
topic_var["var1"] = postrr_var(n_success + 1, n_total + 1)

# Expected variance after the next rating, and its change versus now
# (negative values mean the variance is expected to shrink).
topic_var["expected_var"] = (
    topic_var["p"] * topic_var["var1"] + (1 - topic_var["p"]) * topic_var["var0"]
)
topic_var["var_reduction"] = topic_var["expected_var"] - topic_var["variance"]

In [45]:
# Sort by the expected change in variance, largest expected drop first.
# The column is created as "var_reduction"; the old "var_diff" name seen in
# the recorded output no longer exists and raised KeyError on a fresh run.
topic_var.sort_values("var_reduction")

Unnamed: 0,sim_topic_id,sum,count,p,var0,var1,expected_var,variance,var_diff
27,27,8,16,0.500000,0.012465,0.012465,0.012465,0.013158,-0.000693
89,89,8,16,0.500000,0.012465,0.012465,0.012465,0.013158,-0.000693
74,74,8,16,0.500000,0.012465,0.012465,0.012465,0.013158,-0.000693
96,96,8,16,0.500000,0.012465,0.012465,0.012465,0.013158,-0.000693
15,15,8,16,0.500000,0.012465,0.012465,0.012465,0.013158,-0.000693
...,...,...,...,...,...,...,...,...,...
51,51,16,16,0.944444,0.004709,0.002493,0.002616,0.002762,-0.000145
39,39,16,16,0.944444,0.004709,0.002493,0.002616,0.002762,-0.000145
80,80,16,16,0.944444,0.004709,0.002493,0.002616,0.002762,-0.000145
36,36,16,16,0.944444,0.004709,0.002493,0.002616,0.002762,-0.000145


In [21]:
# Scratch check: probability of 0 successes in a single trial with a=8, b=1,
# which is b / (a + b) = 1/9.
# NOTE(review): the output recorded under this cell (8/9) is the k=1 value,
# so it presumably comes from an earlier version of this cell -- re-run.
betabinom.pmf(k=0, n=1, a=8, b=1)

0.8888888888888886

In [28]:
# Inspect the allocated scores: the rows of trial_sim_scores kept for each
# rater by whichever allocation cell ran last.
scores

Unnamed: 0,trial_id,sim_cordel_id,sim_topic_id,sim_rater_id,cordel_id,topic_id,rater_id,intrusion
0,0,0,0,0,5,27,252,1
1,0,0,1,0,4,46,252,1
5,0,0,5,0,1,35,252,1
8,0,0,8,0,4,19,252,1
19,0,0,19,0,4,21,252,0
...,...,...,...,...,...,...,...,...
3991,0,1,91,39,4,27,194,0
3992,0,1,92,39,2,10,194,1
3993,0,1,93,39,0,33,194,1
3996,0,1,96,39,4,35,194,0


In [29]:
# Per-topic tallies of the allocated scores: number of successes ("sum") and
# number of ratings ("count"). Same computation as inside the variance-driven
# allocation loop.
intrusion_by_topic = scores.groupby("sim_topic_id")["intrusion"]
s = intrusion_by_topic.sum().rename("sum").reset_index()
c = intrusion_by_topic.count().rename("count").reset_index()
topic_var = s.merge(c, on="sim_topic_id")

In [34]:
# Inspect the freshly rebuilt per-topic tallies (sim_topic_id, sum, count).
topic_var

Unnamed: 0,sim_topic_id,sum,count
0,0,14,16
1,1,13,16
2,2,3,16
3,3,5,16
4,4,14,16
...,...,...,...
95,95,14,16
96,96,8,16
97,97,14,16
98,98,11,16
