In [1]:
from getopt import getopt
import cloudpickle
import pickle
import sys
import os
import numpy as np
import pandas as pd
# import seaborn as sns
from scipy.special import logit, expit
from scipy.stats import uniform, norm, bernoulli, betabinom
from statsmodels.stats.proportion import proportions_ztest
# from matplotlib import pyplot as plt
import pymc as pm
import arviz as az
from modeltools import mcmc_diagnostics, create_summary_stat
from downcast import downcast_df
import jax
from pymc.sampling_jax import sample_numpyro_nuts
from time import time, sleep
from datetime import timedelta

2022-12-24 09:28:33.606262: W external/org_tensorflow/tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-24 09:28:33.924693: W external/org_tensorflow/tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-24 09:28:33.945890: W external/org_tensorflow/tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory


In [2]:
# Wall-clock reference taken when this cell runs
start_time = time()

# Run settings (originally labelled "GPU setting").
# NOTE(review): presumably SAMPLE_JAX switches to the JAX/numpyro sampler and
# N_PROCESSES sets the worker count; neither is used in the visible cells.
N_PROCESSES = 6
SAMPLE_JAX = True

# Load the unit-level ratings (first CSV column becomes the index) and
# order the rows by corpus, then model, then topic
raw_data = (
    pd.read_csv("data/unit_level_ratings.csv", index_col=0)
    .sort_values(by=["corpus", "model", "topic"])
)

def _distinct_keys(frame, keys):
    """Return one row per observed combination of `keys`.

    The rows are grouped on `keys`, the "intrusion" column is counted, and the
    count is dropped again, so only the key columns remain.
    """
    counted = frame.groupby(keys, as_index=False).agg({"intrusion": "count"})
    return counted.drop(columns="intrusion")


# Lookup tables that give each corpus, model, corpus/model pair ("cordel")
# and corpus/rater pair an integer id equal to its row position.
corpus_ids = _distinct_keys(raw_data, ["corpus"])
corpus_ids["corpus_id"] = corpus_ids.index

model_ids = _distinct_keys(raw_data, ["model"])
model_ids["model_id"] = model_ids.index

cordel_ids = _distinct_keys(raw_data, ["corpus", "model"])
cordel_ids["cordel_id"] = cordel_ids.index

# The topic id is not a row position: it is the "topic" value itself cast to int16
topic_ids = _distinct_keys(raw_data, ["corpus", "model", "topic"])
topic_ids["topic_id"] = topic_ids["topic"].astype(np.int16)

rater_ids = _distinct_keys(raw_data, ["corpus", "rater"])
rater_ids["rater_id"] = rater_ids.index

# Attach each identifier table to the ratings; left joins keep every rating row
d1 = raw_data.merge(corpus_ids, how="left", on=["corpus"])
d2 = d1.merge(model_ids, how="left", on=["model"])
d3 = d2.merge(cordel_ids, how="left", on=["corpus", "model"])
d4 = d3.merge(rater_ids, how="left", on=["corpus", "rater"])
data = d4.merge(topic_ids, how="left", on=["corpus", "model", "topic"])

# Keep only the id columns and the two response columns, then downcast.
# NOTE(review): downcast_df is a project helper; from its name and the
# unpacking below it presumably returns the downcast frame plus NA info — confirm.
data = data[
    ["corpus_id", "model_id", "cordel_id", "topic_id", "rater_id", "intrusion", "confidence"]
]
data, na_s = downcast_df(data)

# Numpy arrays for pymc: one id per rating row
corpus_array = np.array(data["corpus_id"])
model_array = np.array(data["model_id"])
cordel_array = np.array(data["cordel_id"])
rater_array = np.array(data["rater_id"])

# Topics are addressed by a pair: first row is the cordel id, second the topic id
topic_array = np.array([data["cordel_id"], data["topic_id"]])

# Observed intrusion outcome per rating row
score_array = np.array(data["intrusion"])

# Number of distinct values of each identifier
n_corpora = data["corpus_id"].nunique()
n_models = data["model_id"].nunique()
n_cordels = data["cordel_id"].nunique()
n_topics = data["topic_id"].nunique()
obs_n_raters = data["rater_id"].nunique()

# Topic lookup table with the cordel_id of its (corpus, model) pair attached
topic_cordel_ids = topic_ids.merge(cordel_ids, how="left", on=["corpus", "model"])

# Read the pickled model object from disk.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
with open("bayesian_model/glmm.pickle", "rb") as glmm_file:
    inferred_glmm = cloudpickle.load(glmm_file)

In [9]:
# Find point estimates for the topic effects: take the "za" rows of the
# summary statistics and split their "cordel, topic" label into two integer columns.
ss = inferred_glmm["summary_stat"]
# .copy() gives za its own data, so the column assignments below do not raise
# SettingWithCopyWarning and do not depend on whether the filter returned a view of ss.
za = ss.loc[ss["param"] == "za"].copy()
za[["sim_cordel_id", "sim_topic_id"]] = za["param_num"].str.split(",", expand=True)
# The split leaves strings (with a leading space after the comma); strip and cast to int
za["sim_cordel_id"] = za["sim_cordel_id"].str.strip().astype(int)
za["sim_topic_id"] = za["sim_topic_id"].str.strip().astype(int)

In [10]:
# Display the summary-statistic rows whose parameter is "za"
ss.loc[ss["param"] == "za"]

Unnamed: 0,param,param_num,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
321,za,"0, 0",-1.1357,0.2887,-1.6494,-0.5442,0.0212,0.0150,185.8798,555.2656,
322,za,"0, 1",-0.1367,0.3716,-0.8340,0.5077,0.0225,0.0159,272.9664,516.4476,
323,za,"0, 2",0.2953,0.3248,-0.2933,0.9033,0.0192,0.0137,296.8024,495.7466,
324,za,"0, 3",1.1639,0.4572,0.4052,2.0516,0.0199,0.0141,529.0294,502.3017,
325,za,"0, 4",-0.1818,0.3036,-0.7903,0.3374,0.0193,0.0137,247.5332,609.4406,
...,...,...,...,...,...,...,...,...,...,...,...
616,za,"5, 45",-0.8146,0.2770,-1.3137,-0.2959,0.0155,0.0111,326.1145,538.7747,
617,za,"5, 46",-0.6670,0.3074,-1.2094,-0.0587,0.0172,0.0121,322.8708,410.4931,
618,za,"5, 47",-0.7286,0.2738,-1.2181,-0.1968,0.0164,0.0116,276.0288,471.7395,
619,za,"5, 48",0.2532,0.3406,-0.4513,0.8043,0.0179,0.0135,378.7252,467.7994,


In [11]:
# TODO: aggregate raw data to find model precision (not implemented in this cell yet).
# For now the cell only displays the `data` frame so its columns can be inspected.
data

Unnamed: 0,corpus_id,model_id,cordel_id,topic_id,rater_id,intrusion,confidence
0,0,0,0,0,0,1,1
1,0,0,0,0,1,1,1
2,0,0,0,0,6,0,1
3,0,0,0,0,8,1,1
4,0,0,0,0,19,1,1
...,...,...,...,...,...,...,...
12193,1,2,5,49,289,1,1
12194,1,2,5,49,302,1,1
12195,1,2,5,49,303,1,1
12196,1,2,5,49,312,1,1


In [None]:
# TODO: join the topic effects to model precision and scatterplot one against the other