In [None]:
from getopt import getopt
import cloudpickle
import pickle
import sys
import os
import numpy as np
import pandas as pd
# import seaborn as sns
from scipy.special import logit, expit
from scipy.stats import uniform, norm, bernoulli
from statsmodels.stats.proportion import proportions_ztest
# from matplotlib import pyplot as plt
import pymc as pm
import arviz as az
from modeltools import mcmc_diagnostics, create_summary_stat
from downcast import downcast_df
import jax
from pymc.sampling_jax import sample_numpyro_nuts
from time import time, sleep
from datetime import timedelta

In [None]:
def _split_logodds(mean_logodds, p_diff):
    """Return log-odds ``(low, high)`` centred on *mean_logodds* whose probabilities differ by *p_diff*.

    Solves ``x + y = c`` and ``expit(x) - expit(y) = p_diff`` with
    ``c = 2 * mean_logodds``.  Substituting ``a = exp(-x)`` and
    ``C = exp(-c)`` gives the quadratic
    ``(1 + p) a**2 + p (1 + C) a - C (1 - p) = 0``; its positive root is
    ``a``, so ``x = -log(a)`` is the higher log-odds and ``y = c - x``.
    https://www.wolframalpha.com/input?i=solve+for+x+and+y%2C+x%2By%3Dc%2C+1%2F%281%2Be%5E-x%29-1%2F%281%2Be%5E-y%29%3Dp
    """
    c = 2*mean_logodds
    C = np.exp(-c)
    det = p_diff**2-2*C*(p_diff**2-2)+(C**2)*(p_diff**2)
    quad = (-p_diff*(C+1)+det**0.5)/(2*(p_diff+1))
    high = -np.log(quad)
    low = c-high
    return low, high


def _simulate_trial_design(trial_id, n_raters, topics_per_cordel, columns):
    """Build the rater x topic interaction table for one trial, with *columns* in the given order."""
    # Raters in this simulation.
    # NOTE: `resample`, `data` and `topic_cordel_ids` are module-level names
    # defined elsewhere in the notebook.
    raters = resample(data["rater_id"].unique(), param="zr", size=n_raters, bound=1)

    # Topics in this simulation (topic_cordel_ids index values), one draw per
    # simulated cordel. The two calls are kept separate and in this order so
    # the random stream is consumed exactly as before.
    sim_topics_0 = resample(range(len(topic_cordel_ids)), param="za", size=topics_per_cordel, bound=1)
    sim_topics_1 = resample(range(len(topic_cordel_ids)), param="za", size=topics_per_cordel, bound=1)
    sim_topics = np.concatenate((sim_topics_0, sim_topics_1))

    # Every rater scores every simulated topic, so the topic lookup is the
    # same for all raters and only needs to be done once per trial.
    sim_topic_ids = np.arange(2 * topics_per_cordel)
    topic_rows = topic_cordel_ids.loc[sim_topics[sim_topic_ids], ["topic_id", "cordel_id"]]

    # Collect the per-rater frames and concatenate once instead of growing a
    # DataFrame inside the loop.
    design = pd.concat(
        [
            topic_rows.assign(sim_rater_id=sim_rater_id, sim_topic_id=sim_topic_ids, rater_id=rater)
            for sim_rater_id, rater in enumerate(raters)
        ],
        axis="rows",
        ignore_index=True,
    )

    design["trial_id"] = trial_id
    # The first `topics_per_cordel` simulated topics belong to simulated
    # cordel 0, the remaining ones to simulated cordel 1.
    design["sim_cordel_id"] = (design["sim_topic_id"] >= topics_per_cordel).astype(int)

    return design.astype(np.int16)[columns]


def simulate_scores(model, p_diff=0.08, n_raters=40, scores_per_r=40, trials_per_sim=1_000, seed=42,
                    topics_per_cordel=50):
    """Simulate intrusion scores for two hypothetical cordels whose mean probabilities differ by *p_diff*.

    For each trial a set of raters and ``2 * topics_per_cordel`` topics is
    resampled, every rater is paired with every topic, and scores are drawn
    with ``pm.sample_posterior_predictive`` from a copy of the fitted trace
    in which ``mu`` for cordel 0 and cordel 1 has been overwritten.

    Parameters
    ----------
    model : mapping
        Must provide ``"model"`` (used as the PyMC model context),
        ``"trace"`` (an object with ``.copy()`` and ``.posterior``) and
        ``"summary_stat"`` (a table with ``"param"`` and ``"mean"`` columns).
    p_diff : float
        Target difference in probability between simulated cordel 1 and
        simulated cordel 0.
    n_raters : int
        Number of raters resampled per trial.
    scores_per_r : int
        Currently unused: every rater scores every simulated topic.
    trials_per_sim : int
        Number of trials to simulate. The trial index is also used as the
        posterior ``draw`` label, so the trace must contain those draws.
    seed : int
        Seed passed to ``np.random.seed`` (global NumPy RNG state).
    topics_per_cordel : int
        Number of topics resampled for each of the two simulated cordels.

    Returns
    -------
    pandas.DataFrame
        One row per trial/rater/topic with int16 columns ``trial_id``,
        ``sim_cordel_id``, ``sim_topic_id``, ``sim_rater_id``, ``cordel_id``,
        ``topic_id``, ``rater_id`` and ``intrusion``.

    Raises
    ------
    ValueError
        If *n_raters* or *topics_per_cordel* is smaller than 1.
    """
    if n_raters < 1:
        raise ValueError(f"n_raters must be at least 1, got {n_raters}")
    if topics_per_cordel < 1:
        raise ValueError(f"topics_per_cordel must be at least 1, got {topics_per_cordel}")

    # Setting numpy seed
    np.random.seed(seed)

    design_columns = ["trial_id", "sim_cordel_id", "sim_topic_id", "sim_rater_id",
                      "cordel_id", "topic_id", "rater_id"]

    # Phase 1: simulate all topic/rater interactions first. This stays a
    # separate pass so the global RNG is consumed in the same order as before
    # (all resampling, then all predictive seeds).
    trial_designs = [
        _simulate_trial_design(trial_id, n_raters, topics_per_cordel, design_columns)
        for trial_id in range(trials_per_sim)
    ]

    # Phase 2: simulating scores
    pymc_model = model["model"]
    # Work on a copy so the caller's trace is not modified.
    trace = model["trace"].copy()

    # Calculating proposed logodds means: model1 = model0 + p_diff (in
    # probability), centred on the average "mu" of the fitted model.
    summary_stat = model["summary_stat"]
    mean_model_logodds = summary_stat[summary_stat["param"]=="mu"]["mean"].mean()
    proposed_model0_mean, proposed_model1_mean = _split_logodds(mean_model_logodds, p_diff)

    # Setting trace of cordel 0 and cordel 1 to proposed values
    trace.posterior["mu"].loc[dict(mu_dim_0=0)] = proposed_model0_mean
    trace.posterior["mu"].loc[dict(mu_dim_0=1)] = proposed_model1_mean

    # TODO: add chain options
    trial_scores = []
    for trial_id, sim_data in enumerate(trial_designs):
        # Setting data containing rater/topic interaction
        sim_rater_array = np.array(sim_data["rater_id"], dtype=int)
        topic_array = np.array([sim_data["cordel_id"], sim_data["topic_id"]], dtype=int)
        cordel_array = np.array(sim_data["sim_cordel_id"], dtype=int)

        # NOTE(review): with trials_per_sim == 1, np.random.randint(1) is
        # always 0, so this never picks a random draw; if a random posterior
        # draw was intended the bound should be the number of draws. The
        # expression is kept as-is to preserve the RNG stream - confirm intent.
        draw = np.random.randint(trials_per_sim) if trials_per_sim==1 else trial_id

        # Running simulation
        with pymc_model:
            pm.set_data({
                "raters":sim_rater_array,
                "topics":topic_array,
                "cordels":cordel_array})
            postrr_sim=pm.sample_posterior_predictive(
                trace.posterior.sel({"chain":[0], "draw":[draw]}),
                predictions=True, progressbar=False, random_seed=np.random.randint(2**20))

        # Adding results: the predicted variable "s" becomes "intrusion".
        s = (postrr_sim.predictions.to_dataframe().reset_index()
              .rename(columns={"s":"intrusion"}))
        trial_scores.append(
            pd.concat([sim_data.reset_index(drop=True), s["intrusion"]], axis="columns")
            .astype(np.int16)
        )

#         Add algorithm here

    if not trial_scores:
        # No trials requested: return an empty frame with the output schema.
        return pd.DataFrame(columns=design_columns + ["intrusion"], dtype=np.int16)

    return pd.concat(trial_scores, axis="index", ignore_index=True)