In [1]:
import numpy as np
import pandas as pd
from scipy.stats import bernoulli, uniform
from scipy.special import logit, expit
from sklearn.linear_model import LogisticRegression

## Tong Chen's Model
https://github.com/T0ngChen/multiwave/blob/master/sim.r


In [62]:
# Directory prefix for the CSV files read below (string-concatenated with the file name)
r_dir = "data/chen_optimal_2020/"

In [63]:
# Reading in data from R
x_vars = ["x", "z1", "z2"]
data1 = pd.read_csv(r_dir+"data1.csv", index_col=0)

# Design matrix: the covariates plus a leading intercept column.
# Copy first so the insert never touches a view of data1, and broadcast a
# scalar so the cell works for any number of rows (it used to assume 1000).
dm = data1[x_vars].copy()
dm.insert(0, "intercept", 1)

In [64]:
# Logistic regression of y on the covariates, then per-row fitted
# probabilities of class 1 and raw residuals (y - p_hat).
# NOTE(review): LogisticRegression() applies an L2 penalty by default, so this
# is not a plain maximum-likelihood fit - confirm that is intended.
model = LogisticRegression().fit(data1[x_vars], data1["y"])
class_probabilities = model.predict_proba(data1[x_vars])
fitted_values = class_probabilities[:, 1]
resid = data1["y"] - fitted_values

In [66]:
# Estimating influence
# Ihat: design matrix weighted by p*(1-p), averaged over rows.
weighted_design_t = dm.T*fitted_values*(1-fitted_values)
Ihat = weighted_design_t@dm/len(dm)

# Per-row score (design row times residual) mapped through the inverse of Ihat.
row_scores = (dm.T*resid).T
infl = row_scores@np.linalg.inv(Ihat)
infl.columns = ["infl_" + name for name in ["intercept"] + x_vars]

In [67]:
# Calculating optimal allocation for strata:
# per stratum, (row count) x (standard deviation of the influence of x).
aa = pd.concat([data1, infl], axis="columns")
infl_x_by_stratum = aa.groupby("stra")["infl_x"]
std = infl_x_by_stratum.std().rename("std").reset_index()
n = infl_x_by_stratum.count().rename("count").reset_index()
oa = std.merge(n, on="stra")
NS = oa["std"]*oa["count"]
NS

0     693.098742
1    1166.974278
2     558.806858
3     846.160103
4     441.229744
5     635.172133
6     311.252347
7     583.062125
dtype: float64

## Intrusion Model

In [2]:
from getopt import getopt
import cloudpickle
import pickle
import sys
import os
import numpy as np
import pandas as pd
# import seaborn as sns
from scipy.special import logit, expit
from scipy.stats import uniform, norm, bernoulli, betabinom
from statsmodels.stats.proportion import proportions_ztest
# from matplotlib import pyplot as plt
import pymc as pm
import arviz as az
from modeltools import mcmc_diagnostics, create_summary_stat
from downcast import downcast_df
import jax
from pymc.sampling_jax import sample_numpyro_nuts
from time import time, sleep
from datetime import timedelta



In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [4]:
# Main
process_n = None
n_runs = 30
sim_name = "7_topic_allocation"
chain_method = "vectorized"

SAMPLE_JAX = True
N_PROCESSES = 6

# simulate_scores
sim_id = 0
p_diff = 0.08
n_raters = 40
scores_per_r = 40
total_scores = None

# Used by both groups above; each is assigned once here
trials_per_sim = 1
seed = 42
optimal_allocation = False

# Wall-clock reference taken when this cell runs
start_time = time()

In [5]:
def resample(all_ids, param, size, bound=0.1, summary_stat=None):
    """Sample ids with replacement until their summed mean effect is within +/- bound.

    Parameters
    ----------
    all_ids : sequence
        Candidate ids to draw from (passed to ``np.random.choice``).
    param : str
        Value of the ``param`` column selecting which rows of the summary
        table supply the means. For ``"za"`` the ``param_num`` strings of the
        form ``"a, b"`` are flattened to ``a*50 + b`` before matching.
    size : int
        Number of ids to draw per attempt.
    bound : float, default 0.1
        A draw is accepted when ``-bound <= sum(means) <= bound``.
    summary_stat : pandas.DataFrame, optional
        Table with ``param``, ``param_num`` and ``mean`` columns. Defaults to
        ``model["summary_stat"]`` from the notebook namespace.

    Returns
    -------
    numpy.ndarray
        The accepted sample of ids.

    Raises
    ------
    ValueError
        If a sampled id does not match exactly one row of the summary table.

    Notes
    -----
    Uses the global NumPy random state. The loop has no retry limit, so it
    does not terminate if no sample can satisfy ``bound``.
    """
    if summary_stat is None:
        # Fall back to the fitted-model bundle held in the notebook namespace.
        summary_stat = model["summary_stat"]

    s = summary_stat[summary_stat["param"]==param].copy(deep=True)

    if param == "za":
        # "a, b" -> a*50 + b, so ids can be matched as single integers
        s[["a", "b"]] = s["param_num"].str.split(", ", expand=True)
        s["param_num"] = (s["a"].astype(int)*50 + s["b"].astype(int)).astype(str)

    # Build the id -> mean(s) lookup once instead of filtering the frame for
    # every sampled id on every retry.
    means_by_id = {}
    for key, value in zip(s["param_num"], s["mean"]):
        means_by_id.setdefault(key, []).append(value)

    def _mean_for(i):
        matches = means_by_id.get(str(i), ())
        if len(matches) != 1:
            raise ValueError(
                f"expected exactly one '{param}' summary row for id {i}, found {len(matches)}")
        return matches[0]

    # Always draw at least once (the old sentinel-based loop returned an
    # unbound variable when bound was very large).
    while True:
        ids = np.random.choice(all_ids, size=size, replace=True)
        mean_sum = sum([_mean_for(i) for i in ids])
        if not (mean_sum < -bound or mean_sum > bound):
            return ids

In [6]:
# Load the unit-level ratings and order them by corpus, model, then topic
raw_data = (
    pd.read_csv("data/unit_level_ratings.csv", index_col=0)
      .sort_values(by=["corpus", "model", "topic"])
)

# Lookup tables giving an integer id to each corpus, model, corpus/model pair
# ("cordel"), topic and corpus/rater pair.
def _distinct_key_rows(frame, keys):
    """Return one row per observed combination of `keys` in `frame`."""
    return (frame.groupby(keys, as_index=False)
                 .agg({"intrusion":"count"})
                 .drop(columns="intrusion"))


corpus_ids = _distinct_key_rows(raw_data, ["corpus"])
corpus_ids["corpus_id"] = corpus_ids.index

model_ids = _distinct_key_rows(raw_data, ["model"])
model_ids["model_id"] = model_ids.index

cordel_ids = _distinct_key_rows(raw_data, ["corpus", "model"])
cordel_ids["cordel_id"] = cordel_ids.index

# topic_id is the topic value itself cast to int16, not a running row index
topic_ids = _distinct_key_rows(raw_data, ["corpus", "model", "topic"])
topic_ids["topic_id"] = topic_ids["topic"].astype(np.int16)

rater_ids = _distinct_key_rows(raw_data, ["corpus", "rater"])
rater_ids["rater_id"] = rater_ids.index

# Attach every id column to the raw ratings with successive left joins
data = raw_data
for id_frame, join_keys in (
    (corpus_ids, ["corpus"]),
    (model_ids, ["model"]),
    (cordel_ids, ["corpus", "model"]),
    (rater_ids, ["corpus", "rater"]),
    (topic_ids, ["corpus", "model", "topic"]),
):
    data = data.merge(id_frame, on=join_keys, how="left")

# Keep only the id columns and the two response columns, then downcast
kept_columns = ["corpus_id", "model_id", "cordel_id", "topic_id", "rater_id", "intrusion", "confidence"]
data = data[kept_columns]
data, na_s = downcast_df(data)

# Setting up numpy arrays for pymc: one 1-D array per id/response column
corpus_array, model_array, cordel_array, rater_array, score_array = (
    np.array(data[column])
    for column in ("corpus_id", "model_id", "cordel_id", "rater_id", "intrusion")
)

# Topics are addressed by a (cordel_id, topic_id) pair, hence the 2-row array
topic_array = np.array([data["cordel_id"], data["topic_id"]])

# Number of distinct values observed for each id
n_corpora, n_models, n_cordels, n_topics, obs_n_raters = (
    data[column].nunique()
    for column in ("corpus_id", "model_id", "cordel_id", "topic_id", "rater_id")
)

# Adding cordel id to topic_ids dataframe
topic_cordel_ids = pd.merge(topic_ids, cordel_ids, on=["corpus", "model"], how="left")

# Reading model
# The loaded object is indexed elsewhere in this notebook with the keys
# "summary_stat", "model" and "trace". It also rebinds the name `model`, which
# an earlier cell used for a scikit-learn estimator.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# when it comes from a trusted source.
with open("bayesian_model/glmm.pickle", "rb") as f:
    model = cloudpickle.load(f)

In [7]:
# Build ps_data: for every trial, the full rater x topic design that scores
# are later simulated for.

# Setting numpy seed (resample below draws from the global NumPy random state)
np.random.seed(seed)

# Creating df schema
ps_data = pd.DataFrame(columns=["trial_id", "sim_cordel_id", "sim_topic_id", "sim_rater_id", 
                                "cordel_id", "topic_id", "rater_id"], dtype=np.int16)

for trial_id in range(trials_per_sim):

    # data template
    sim_data = pd.DataFrame(columns=["trial_id", "cordel_id", "topic_id", "rater_id"])  

    # Raters in this simulation
    raters = resample(data["rater_id"].unique(), param="zr", size=n_raters, bound=1)

    # Topics in this simulation (topic_cordel_ids index values)
    # Two independent draws of 50; positions 0-49 and 50-99 of sim_topics are
    # labelled sim_cordel_id 0 and 1 further down.
    sim_topics_0 = resample(range(len(topic_cordel_ids)), param="za", size=50, bound=1)
    sim_topics_1 = resample(range(len(topic_cordel_ids)), param="za", size=50, bound=1)
    sim_topics = np.concatenate((sim_topics_0, sim_topics_1))

    # Loop - used to contain uniform sampling algorithm could use cleanup
    # Produces df containing cross product between raters and topics
    for sim_rater_id, rater in enumerate(raters):
        # Every rater is paired with all 100 simulated topic slots
        rated_topics = np.array(range(100))

        # Map simulated topic slots to row labels of topic_cordel_ids
        rated_topics_idx = sim_topics[rated_topics]

    #     Append topics to simulation
        d=topic_cordel_ids.loc[rated_topics_idx, ["topic_id", "cordel_id"]]
        d["sim_rater_id"]=sim_rater_id
        d["sim_topic_id"]=rated_topics
        d["rater_id"]=rater

        sim_data = pd.concat([sim_data, d], axis="rows", ignore_index=True)

#     Adding one topic/rater interaction into df
    sim_data["trial_id"] = trial_id
    # Simulated cordel label follows from the topic slot: 0-49 -> 0, 50-99 -> 1
    sim_data.loc[sim_data["sim_topic_id"].isin(range(0,50)),["sim_cordel_id"]] = 0
    sim_data.loc[sim_data["sim_topic_id"].isin(range(50,100)),["sim_cordel_id"]] = 1
#     sim_data = pd.merge(sim_data, topic_counts[["cordel_id", "topic_id", "sim_cordel_id"]]
#                         ,on=["cordel_id", "topic_id"], how="left")
    sim_data=sim_data.astype(np.int16)

#     Appending interaction to ds.
    ps_data = pd.concat([ps_data, sim_data], ignore_index=True)

#         print(f"Completed simulating topic/rater interactions in {time() - startt:.2f}s")

#     Simulating Scores
pymc_model = model["model"]
# Work on a copy so the loaded trace is not modified by the overrides below
trace = model["trace"].copy()

# Calculating proposed logodds means
# model1 = model0 + p
# https://www.wolframalpha.com/input?i=solve+for+x+and+y%2C+x%2By%3Dc%2C+1%2F%281%2Be%5E-x%29-1%2F%281%2Be%5E-y%29%3Dp
# Solve for log-odds x (model 1) and y (model 0) such that
#   x + y = c                      (their average stays at the mean "mu" log-odds)
#   expit(x) - expit(y) = p_diff   (probabilities differ by p_diff)
# With a = exp(-x) and C = exp(-c) this reduces to the quadratic
#   (1+p)*a^2 + p*(1+C)*a + C*(p-1) = 0,
# whose discriminant is `det`; `quad` is the root taken with the + sign.
mean_model_logodds = model["summary_stat"][model["summary_stat"]["param"]=="mu"]["mean"].mean()
c = 2*mean_model_logodds
C = np.exp(-c)
det = p_diff**2-2*C*(p_diff**2-2)+(C**2)*(p_diff**2)
quad = (-p_diff*(C+1)+det**0.5)/(2*(p_diff+1))
proposed_model1_mean = -np.log(quad)
proposed_model0_mean = c-proposed_model1_mean

# Setting trace of cordel 0 and cordel 1 to proposed values
trace.posterior["mu"].loc[dict(mu_dim_0=0)] = proposed_model0_mean
trace.posterior["mu"].loc[dict(mu_dim_0=1)] = proposed_model1_mean

# Empty int16 frame that simulated scores are appended to in a later cell
sim_scores = pd.DataFrame(columns=["trial_id", "sim_cordel_id", "sim_topic_id", "sim_rater_id", "cordel_id", "topic_id", "rater_id", "intrusion", ]
                   ,dtype=np.int16)

# Simulate intrusion outcomes for the rater/topic design with the pymc model.
# Only trial 0 is processed: the loop bound is hard-coded to 1.
# TODO: add chain options
for trial_id in range(1):#trials_per_sim):
    # Setting data containing rater/topic interaction
    sim_data = ps_data[ps_data["trial_id"]==trial_id]
    sim_rater_array = np.array(sim_data["rater_id"], dtype=int)
    # NOTE: rebinds topic_array / cordel_array from the data-loading cell;
    # cordel_array now holds the simulated cordel labels.
    topic_array = np.array([sim_data["cordel_id"], sim_data["topic_id"]], dtype=int)
    cordel_array = np.array(sim_data["sim_cordel_id"], dtype=int)

    # Running simulation
    with pymc_model:
        pm.set_data({
            "raters":sim_rater_array, 
            "topics":topic_array, 
            "cordels":cordel_array})
        # One posterior draw from chain 0; with trials_per_sim == 1 the draw
        # index is np.random.randint(1), which is always 0.
        postrr_sim=pm.sample_posterior_predictive(trace.posterior.sel(
            {"chain":[0], "draw":[np.random.randint(trials_per_sim) if trials_per_sim==1 else trial_id]})
            ,predictions=True, progressbar=False, random_seed=np.random.randint(2**20))

    # Attach the simulated "intrusion" outcomes to the design rows by position.
    # (sim_scores itself is not updated here.)
    s = (postrr_sim.predictions.to_dataframe().reset_index()
          .rename(columns={"s":"intrusion"}))
    trial_sim_scores = pd.concat([sim_data.reset_index(drop=True)
                                 ,s["intrusion"]], axis="columns").astype(np.int16)



Sampling: [s]
INFO:pymc:Sampling: [s]


In [8]:
# One-hot encoder and the three simulated id columns it is applied to
enc = OneHotEncoder()
enc_vars = ["sim_cordel_id", "sim_topic_id", "sim_rater_id"]

In [9]:
# Fit the encoder on the simulated id columns
enc.fit(trial_sim_scores[enc_vars])

In [10]:
# Dense one-hot design matrix. NOTE: rebinds `dm`, which an earlier cell used
# for a DataFrame design matrix with an intercept column.
dm=enc.transform(trial_sim_scores[enc_vars]).toarray()

In [11]:
# Logistic regression of the simulated intrusion outcome on the one-hot design,
# then fitted class-1 probabilities and residuals as a plain ndarray.
intrusion_outcome = trial_sim_scores["intrusion"]
reg = LogisticRegression().fit(dm, intrusion_outcome)
fitted_values = reg.predict_proba(dm)[:, 1]
resid = np.array(intrusion_outcome - fitted_values)

In [12]:
# Estimating influence
# NOTE(review): dm one-hot encodes every level of all three variables, so its
# columns are linearly dependent and Ihat looks rank-deficient; the plain
# inverse below may be numerically unreliable - confirm.
weighted_design_t = dm.T*fitted_values*(1-fitted_values)
Ihat = weighted_design_t@dm/len(dm)
infl = (dm.T*resid).T@np.linalg.inv(Ihat)

# Label each influence column with (feature, category), following the
# encoder's category order.
var_len = [len(categories) for categories in enc.categories_]
col_index = [category for categories in enc.categories_ for category in categories]
feature_names = ["ohe_cordel_id", "ohe_topic_id", "ohe_rater_id"]
feature_index = [name
                 for name, n_categories in zip(feature_names, var_len)
                 for _ in range(n_categories)]

infl = pd.DataFrame(
    infl,
    columns=pd.MultiIndex.from_tuples(zip(feature_index, col_index),
                                      names=["feature", "category"]))

In [53]:
# Influence columns for each cordel level, as (n_rows, 1) column vectors
infl_c0, infl_c1 = (
    infl[("ohe_cordel_id", level)].to_numpy().reshape(-1, 1) for level in (0, 1)
)

# Influence columns for topic levels 0-49 and 50-99, as (n_rows, 50) arrays
first_half_topics = [("ohe_topic_id", i) for i in range(0, 50)]
second_half_topics = [("ohe_topic_id", i) for i in range(50, 100)]
infl_t0 = infl[first_half_topics].to_numpy()
infl_t1 = infl[second_half_topics].to_numpy()


In [61]:
# Per row: cordel-0 influence added to each of the 50 topic influences, then averaged
(infl_c0 + infl_t0).mean(axis=1)

array([ 11.86220718,  25.43954961, -45.73964839, ...,  -4.49847509,
        40.6213066 , -25.08654023])

In [19]:
# Put the simulated scores under a "label_encoding" top-level column so they
# can sit next to the two-level influence columns.
temp0 = trial_sim_scores.copy(deep=True)
labelled_columns = [("label_encoding", column) for column in temp0.columns]
temp0.columns = pd.MultiIndex.from_tuples(labelled_columns)
temp = pd.concat([temp0, infl], axis="columns")

In [36]:
# Shape check: topic influences (transposed) plus the cordel-0 influence vector
(temp["ohe_topic_id"].to_numpy().T+temp[("ohe_cordel_id", 0)].to_numpy()).shape



# Dead end: we can't transform the influence of a log-odds parameter into the influence of a probability parameter.




(100, 4000)

In [20]:
# FIXME: unfinished experiment. The original statement
#     temp[("estimator", "cordel0")] = temp[]
# has an empty subscript, which is a SyntaxError, and the intended right-hand
# side is unknown. It is kept here as a comment so the cell runs; the cell now
# only displays `temp`, which is what its recorded output shows.
temp

Unnamed: 0_level_0,label_encoding,label_encoding,label_encoding,label_encoding,label_encoding,label_encoding,label_encoding,label_encoding,ohe_cordel_id,ohe_cordel_id,...,ohe_rater_id,ohe_rater_id,ohe_rater_id,ohe_rater_id,ohe_rater_id,ohe_rater_id,ohe_rater_id,ohe_rater_id,ohe_rater_id,ohe_rater_id
Unnamed: 0_level_1,trial_id,sim_cordel_id,sim_topic_id,sim_rater_id,cordel_id,topic_id,rater_id,intrusion,0,1,...,30,31,32,33,34,35,36,37,38,39
0,0,0,0,0,5,27,252,1,7.645094,-1.029301,...,2.080327,3.836373,-13.883396,-0.907581,-5.279558,-1.675720,2.300096,-5.163627,-4.907581,-4.547696
1,0,0,1,0,4,46,252,1,-2.357252,-0.312478,...,-4.336267,-1.673066,-20.357062,-8.009866,-9.178265,-5.999468,7.347729,-8.673066,-8.009866,-8.167868
2,0,0,2,0,1,25,252,0,-2.630155,-0.462208,...,-5.170353,-3.277249,11.188266,-6.384146,-6.437595,-7.063456,-20.635868,0.722751,-13.384146,11.883096
3,0,0,3,0,3,3,252,0,-2.930544,0.099016,...,6.735246,-1.534959,14.114017,-8.805164,1.059734,9.005451,-28.913730,12.465041,-0.805164,10.870348
4,0,0,4,0,2,43,252,1,13.091925,-0.784013,...,23.440650,33.310501,4.659758,31.180352,17.115278,21.570798,36.091392,17.310501,17.180352,-7.494276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0,1,95,39,5,13,194,1,-2.858385,-4.009840,...,-1.767661,-2.295722,-0.296999,-2.823784,-1.462814,-1.128630,-0.377353,-2.045722,-1.212814,23.695341
3996,0,1,96,39,4,35,194,0,8.705613,34.355758,...,9.287716,10.970866,-2.454957,10.654017,4.495592,7.446141,0.871965,10.970866,2.495592,-139.009900
3997,0,1,97,39,1,32,194,1,2.136118,-1.500713,...,-0.134833,-1.038499,0.895007,-1.192164,-0.393997,-0.433000,0.633494,-0.538499,-0.143997,19.746040
3998,0,1,98,39,0,41,194,0,13.138301,3.470259,...,10.538312,16.444860,-6.975673,18.351408,11.304682,9.585037,-2.994430,16.444860,7.304682,-199.498869


In [41]:

# Per-topic (count x standard deviation) of an influence column.
# FIXME: this cell raises KeyError as written: `temp` has no "infl_x" column
# (that name belongs to the earlier data1-based analysis). `temp` also has
# two-level columns, so "sim_topic_id" presumably needs to be addressed as
# ("label_encoding", "sim_topic_id") - confirm which influence column is
# intended before fixing.
sd = (temp.groupby("sim_topic_id")
         .agg({"infl_x":"std"})
         .rename(columns={"infl_x":"sd"})
         .reset_index())
n = (temp.groupby("sim_topic_id")
       .agg({"infl_x":"count"})
       .rename(columns={"infl_x":"count"})
       .reset_index())
oa = pd.merge(sd, n, on="sim_topic_id")
NS = oa["sd"]*oa["count"]

KeyError: "Column(s) ['infl_x'] do not exist"

In [36]:
# Category levels the encoder learned, one array per column in enc_vars
enc.categories_

[array([0, 1], dtype=int16),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
        85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
       dtype=int16),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39], dtype=int16)]

In [None]:
# Choose which simulated scores each rater contributes, then save them.
if optimal_allocation:
    # Optimal topic allocation scores
    scores = trial_sim_scores[:0]

    # Track the budget in a local variable so the `total_scores` setting is
    # not consumed: re-running this cell (or a later one that reads
    # `total_scores`) then starts from the same budget.
    remaining_scores = n_raters*scores_per_r if total_scores is None else total_scores

    # Allocate topics for each rater
    for sim_rater_id in range(n_raters):
        # Checking if all scores have been allocated
        if remaining_scores <= 0:
            break

        # Per-topic sum and count of the intrusion scores collected so far
        s = (scores.groupby("sim_topic_id").agg({"intrusion":"sum"})
             .rename(columns={"intrusion":"sum"}).reset_index())
        c = (scores.groupby("sim_topic_id").agg({"intrusion":"count"})
             .rename(columns={"intrusion":"count"}).reset_index())
        topic_var = pd.merge(s, c, on="sim_topic_id")

        # Create df with zeros if no data exists
        if len(topic_var) < 100:
            seen_topic_ids = set(topic_var["sim_topic_id"])
            missing_topic_ids = [i for i in range(100) if i not in seen_topic_ids]
            missings = pd.DataFrame({"sim_topic_id":missing_topic_ids
                                      ,"sum":[0]*len(missing_topic_ids)
                                      ,"count":[0]*len(missing_topic_ids)})
            topic_var = pd.concat([topic_var, missings])

        # Calculating reduction in variance
        # NOTE(review): postrr_var / postrr_p are not defined or imported in
        # the visible notebook - confirm where they come from.
        topic_var["variance"] = postrr_var(topic_var["sum"], topic_var["count"])
        topic_var["p"] = postrr_p(topic_var["sum"], topic_var["count"])
        topic_var["var0"] = postrr_var(topic_var["sum"], topic_var["count"]+1)
        topic_var["var1"] = postrr_var(topic_var["sum"]+1, topic_var["count"]+1)
        topic_var["expected_var"] = topic_var["p"]*topic_var["var1"]+(1-topic_var["p"])*topic_var["var0"]
        topic_var["var_diff"] = topic_var["expected_var"]-topic_var["variance"]

        # Allocating topics: the scores_per_r topics with the smallest var_diff
        allocated_topics = topic_var.sort_values("var_diff", ascending=True)[:scores_per_r]["sim_topic_id"]
        remaining_scores -= scores_per_r

        selected_scores = trial_sim_scores[(trial_sim_scores["sim_rater_id"]==sim_rater_id)&
                            (trial_sim_scores["sim_topic_id"].isin(allocated_topics))]
        scores = pd.concat([scores, selected_scores])
else:
    # Running total of scores
    counts = np.zeros(100)
    # Collect each rater's rows and concatenate once after the loop
    selected_frames = [trial_sim_scores[:0]]
    for sim_rater_id, rater in enumerate(raters):
        # Set the probability. Topics with fewer samples have higher probability
        counts = counts-counts.min()+1
        p = 1/counts**20
        p = p/p.sum()

        # Sample according to probability
        allocated_topics = np.random.choice(range(100), size=scores_per_r, replace=False, p=p)
        counts[allocated_topics] += 1

        selected_scores = trial_sim_scores[(trial_sim_scores["sim_rater_id"]==sim_rater_id)&
                        (trial_sim_scores["sim_topic_id"].isin(allocated_topics))]
        selected_frames.append(selected_scores)
    scores = pd.concat(selected_frames)

sim_scores = pd.concat([sim_scores, scores], axis="index", ignore_index=True)
sim_scores.to_csv(f"data/{sim_name}/score_{sim_id}.csv", index=False)

In [None]:
# Optimal topic allocation scores
# NOTE(review): this cell repeats the optimal-allocation logic unconditionally,
# appends to sim_scores and rewrites the same CSV path - confirm that running
# it after the previous allocation cell is intended.
scores = trial_sim_scores[:0]

# Track the budget locally so the `total_scores` setting is left untouched and
# the cell gives the same result when re-run.
remaining_scores = n_raters*scores_per_r if total_scores is None else total_scores

# Allocate topics for each rater
for sim_rater_id in range(n_raters):
    # Checking if all scores have been allocated
    if remaining_scores <= 0:
        break

    # Per-topic sum and count of the intrusion scores collected so far
    s = (scores.groupby("sim_topic_id").agg({"intrusion":"sum"})
         .rename(columns={"intrusion":"sum"}).reset_index())
    c = (scores.groupby("sim_topic_id").agg({"intrusion":"count"})
         .rename(columns={"intrusion":"count"}).reset_index())
    topic_var = pd.merge(s, c, on="sim_topic_id")

    # Create df with zeros if no data exists
    if len(topic_var) < 100:
        seen_topic_ids = set(topic_var["sim_topic_id"])
        missing_topic_ids = [i for i in range(100) if i not in seen_topic_ids]
        missings = pd.DataFrame({"sim_topic_id":missing_topic_ids
                                  ,"sum":[0]*len(missing_topic_ids)
                                  ,"count":[0]*len(missing_topic_ids)})
        topic_var = pd.concat([topic_var, missings])

    # Calculating reduction in variance
    # NOTE(review): postrr_var / postrr_p are not defined or imported in the
    # visible notebook - confirm where they come from.
    topic_var["variance"] = postrr_var(topic_var["sum"], topic_var["count"])
    topic_var["p"] = postrr_p(topic_var["sum"], topic_var["count"])
    topic_var["var0"] = postrr_var(topic_var["sum"], topic_var["count"]+1)
    topic_var["var1"] = postrr_var(topic_var["sum"]+1, topic_var["count"]+1)
    topic_var["expected_var"] = topic_var["p"]*topic_var["var1"]+(1-topic_var["p"])*topic_var["var0"]
    topic_var["var_diff"] = topic_var["expected_var"]-topic_var["variance"]

    # Allocating topics: the scores_per_r topics with the smallest var_diff
    allocated_topics = topic_var.sort_values("var_diff", ascending=True)[:scores_per_r]["sim_topic_id"]
    remaining_scores -= scores_per_r

    selected_scores = trial_sim_scores[(trial_sim_scores["sim_rater_id"]==sim_rater_id)&
                        (trial_sim_scores["sim_topic_id"].isin(allocated_topics))]
    scores = pd.concat([scores, selected_scores])

sim_scores = pd.concat([sim_scores, scores], axis="index", ignore_index=True)
sim_scores.to_csv(f"data/{sim_name}/score_{sim_id}.csv", index=False)