In [None]:
import numpy as np
import pandas as pd
from scipy.special import logit, expit
from scipy.stats import bernoulli
from matplotlib import pyplot as plt
import optuna
import pymc as pm
import arviz as az
from modeltools import mcmc_diagnostics
from downcast import downcast_df

In [None]:
# Load the unit-level ratings (first CSV column becomes the index) and order
# the rows by corpus, then model, then topic.
raw_data = (
    pd.read_csv("data/unit_level_ratings.csv", index_col=0)
    .sort_values(by=["corpus", "model", "topic"])
)

In [None]:
# Build zero-based integer identifiers for each grouping level used below:
# corpus, model, corpus x model ("cordal"), topic, and rater.
# Topic ids are assigned per (corpus, model, topic) combination and rater ids
# per (corpus, rater) combination, so each id identifies exactly one
# combination of its keys.
def _make_ids(frame, keys, id_name):
    """Return one row per observed combination of ``keys`` plus an id column.

    Parameters
    ----------
    frame : pd.DataFrame
        Source frame; must contain ``keys`` and an ``"intrusion"`` column
        (used only as the column to aggregate, then discarded).
    keys : list of str
        Columns whose distinct combinations receive an identifier.
    id_name : str
        Name of the identifier column to add.

    Returns
    -------
    pd.DataFrame
        Columns ``keys + [id_name]``; ``id_name`` is the row position of the
        combination in the grouped result, i.e. 0..n-1.
    """
    ids = (frame.groupby(keys, as_index=False)
        .agg({"intrusion":"count"})
        .drop(columns="intrusion"))
    # as_index=False leaves a default 0..n-1 index, which doubles as the id.
    ids[id_name] = ids.index
    return ids


corpus_ids = _make_ids(raw_data, ["corpus"], "corpus_id")
model_ids = _make_ids(raw_data, ["model"], "model_id")
cordal_ids = _make_ids(raw_data, ["corpus", "model"], "cordal_id")
topic_ids = _make_ids(raw_data, ["corpus", "model", "topic"], "topic_id")
rater_ids = _make_ids(raw_data, ["corpus", "rater"], "rater_id")


# Attach every identifier to the rating rows; left joins keep all rating rows.
d1 = pd.merge(raw_data, corpus_ids, on=["corpus"], how="left")
d2 = pd.merge(d1, model_ids, on=["model"], how="left")
d3 = pd.merge(d2, cordal_ids, on=["corpus","model"], how="left")
d4 = pd.merge(d3, rater_ids, on=["corpus", "rater"], how="left")
data = pd.merge(d4, topic_ids, on=["corpus", "model", "topic"], how="left")

# Keep only the identifiers and the two response columns.
data = data[["corpus_id", "model_id", "cordal_id", "topic_id", "rater_id", "intrusion", "confidence"]]
# NOTE(review): downcast_df is a project helper; presumably it shrinks numeric
# dtypes and reports columns with missing values as the second item - confirm.
data, na_s = downcast_df(data)

In [None]:
# Per-observation index arrays and level counts for pymc.
# For each identifier column: a numpy array with one id per rating row, and
# the number of distinct ids in that column.
_id_cols = ("corpus_id", "model_id", "cordal_id", "topic_id", "rater_id")

corpora_array, models_array, cordals_array, topics_array, raters_array = (
    np.array(data[col]) for col in _id_cols
)
n_corpora, n_models, n_cordals, n_topics, n_raters = (
    data[col].nunique() for col in _id_cols
)

# Observed outcome for each rating row.
scores_array = np.array(data["intrusion"])

## Finding optimal prior distributions

In [None]:
# Mutable container for the PyMC model; starts out holding an empty model.
glm_topic_corpora_model = dict(model=pm.Model())

# Prior mean expressed on the logit scale (probability 0.75).
prior_mean = logit(0.75)

In [None]:
# Start each log file as a copy of its schema file (read with the first
# column as index, then written back under the log's name).
for _schema_path, _log_path in (
    ("divergence_schema.csv", "divergence_log.csv"),
    ("summary_stat_schema.csv", "summary_stat_log.csv"),
):
    pd.read_csv(_schema_path, index_col=0).to_csv(_log_path)

In [None]:
# Topic and cordal model

def _append_rows_to_log(log_path, rows):
    """Append ``rows`` to the CSV log at ``log_path`` and write the log back."""
    log = pd.read_csv(log_path, index_col=0)
    pd.concat([log, rows], axis="rows").to_csv(log_path)


def _divergence_row(trace):
    """Return a one-row frame of divergence counts summed along each row of ``diverging``.

    Assumes ``sample_stats["diverging"]`` is laid out as (chain, draw), so the
    row sums are per-chain counts - TODO confirm against the ArviZ schema.
    """
    per_chain = pd.DataFrame(trace.sample_stats["diverging"]).sum(axis="columns")
    # Stringify the positional labels ("0", "1", ...) so they match the column
    # names pandas reads back from the CSV log, however many rows there are.
    return pd.DataFrame(per_chain).T.rename(columns=str)


def _summary_rows(trace):
    """Return the ArviZ summary with the variable label split into name and index.

    A label such as ``"za[12]"`` becomes ``param="za"``, ``param_num="12"``;
    labels without ``"["`` get a missing ``param_num``.
    """
    summary_stat = az.summary(trace, round_to=4).reset_index()
    # Everything after the "index" column is a statistic reported by ArviZ.
    stat_columns = list(summary_stat.columns[1:])
    label_parts = summary_stat["index"].str.split("[")
    summary_stat["param"] = label_parts.str[0].astype("category")
    # Drop the trailing "]" from the bracketed index.
    summary_stat["param_num"] = label_parts.str[1].str[:-1].astype("category")
    return summary_stat[["param", "param_num"] + stat_columns]


def objective(trial, random_seed=None):
    """Optuna objective: fit the intrusion model and return its divergence count.

    Two prior scales are suggested log-uniformly on [1e-2, 1], the model is
    rebuilt and sampled, and both the divergence counts and the posterior
    summary are appended to ``divergence_log.csv`` / ``summary_stat_log.csv``
    together with the scales used.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to suggest ``cm_scale_param`` and ``a_scale_param``.
    random_seed : optional
        Forwarded to ``pm.sample``. The default ``None`` keeps the sampler
        unseeded, as before; pass a value to make a trial reproducible.

    Returns
    -------
    int
        Total number of divergent transitions in the trace.

    Side effects
    ------------
    Overwrites ``glm_topic_corpora_model["model"]`` and ``["trace"]`` and
    rewrites both CSV logs.
    """
    cm_scale_param = trial.suggest_float("cm_scale_param", 1e-2, 1, log=True)
    a_scale_param = trial.suggest_float("a_scale_param", 1e-2, 1, log=True)

    glm_topic_corpora_model["model"] = pm.Model()
    with glm_topic_corpora_model["model"]:
        # Hyperparameter priors: each effect is a z-vector scaled by its own
        # exponential scale parameter in the linear predictor below.
        sigma_c = pm.Exponential("sigma_c", lam=2)
        zc = pm.Normal("zc",mu=prior_mean, sigma=cm_scale_param, shape=n_corpora)
        sigma_m = pm.Exponential("sigma_m", lam=2)
        zm = pm.Normal("zm",mu=0, sigma=cm_scale_param, shape=n_models)
        sigma_a = pm.Exponential("sigma_a", lam=2)
        za = pm.Normal("za",mu=0, sigma=a_scale_param, shape=n_topics)

        # Additive topic + corpus + model effects on the logit scale.
        p = pm.math.invlogit(
            za[topics_array]*sigma_a+
            zc[corpora_array]*sigma_c+
            zm[models_array]*sigma_m)
        # Registered with the model by name; the Python handle is not needed.
        pm.Bernoulli("s", p=p, observed=scores_array)

        glm_topic_corpora_model["trace"]=pm.sample(cores=2, random_seed=random_seed)

    trace = glm_topic_corpora_model["trace"]

    # Recording divergences
    diverge = _divergence_row(trace)
    # Plain int so the study receives a native Python number.
    total_divergences = int(diverge.sum(axis="columns").iloc[0])
    diverge = diverge.assign(c_sigma=cm_scale_param, a_sigma=a_scale_param)
    _append_rows_to_log("divergence_log.csv", diverge)

    # Recording summary_stats
    summary_stat = _summary_rows(trace).assign(
        c_sigma=cm_scale_param, a_sigma=a_scale_param)
    _append_rows_to_log("summary_stat_log.csv", summary_stat)

    return total_divergences

In [None]:
# Search the prior scales for 25 trials, minimising the value returned by
# `objective` ("minimize" is also optuna's default direction).
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=25)