# FluVaccine dataset generation (v0)

Generated for Hugging Face: https://huggingface.co/datasets/jmaasch/compositional_causal_reasoning/

Version 0.

Code by Jacqueline Maasch | April 2025

In [1]:
# General importations.
import pandas as pd
import numpy as np

from utils import Utils
from flu_vaccine import FluVaccine
from dataset_generator import DataSetGenerator

In [2]:
u = Utils()
dg = DataSetGenerator()

In [3]:
path = "static_datasets/flu_vaccine_v0/"

## Step 1: Get raw dataset.

In [4]:
# x levels of graphical complexity (captured by BCC size).
# y tasks per graphical complexity level.
# z samples per task.
# w replicates per sample.
# = x*y*z*w subtasks.
graph_sizes = [[6,4,6],[7,5,7],[8,6,8]]
n_tasks_per_size = 1
n_samples_per_task = 2000
reps_per_sample = 5
bcc_type = "wheel"
causal_functions = "random"

df = dg.get_dataset(task_generator = FluVaccine,
                    graph_sizes = graph_sizes,
                    n_tasks_per_size = n_tasks_per_size,
                    n_samples_per_task = n_samples_per_task, 
                    reps_per_sample = reps_per_sample, 
                    causal_functions = causal_functions, 
                    bcc_type = bcc_type)

print(df.info())
display(df.head(5))
display(df.tail(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 19 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   Task ID                                 30000 non-null  object
 1   Context ID                              30000 non-null  object
 2   Sample ID                               30000 non-null  object
 3   Nodes per BCC                           30000 non-null  object
 4   Replicate ID                            30000 non-null  int64 
 5   DAG adjacency matrix                    30000 non-null  object
 6   DAG nodes                               30000 non-null  object
 7   CCT adjacency matrix                    30000 non-null  object
 8   CCT nodes                               30000 non-null  object
 9   Exogenous variables                     30000 non-null  object
 10  Bernoulli parameters                    30000 non-null  object
 11  Gl

Unnamed: 0,Task ID,Context ID,Sample ID,Nodes per BCC,Replicate ID,DAG adjacency matrix,DAG nodes,CCT adjacency matrix,CCT nodes,Exogenous variables,Bernoulli parameters,Global quantity,Local quantities,Compositions,Causal context,Sample context,Factual queries,Interventional queries (cause = True),Interventional queries (cause = False)
0,0.0.0,0,0,"[6, 4, 6]",0,"[[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [...","[Melany, Lauri, Kaci, Elda, Karly, Jonnie, Vir...","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[Melany, Jonnie, Lilie, Dahlia]","[RAZQ, 3PY4, I4KS, QT1I, FRNR, MMFX, IYAC, HXM...","[0.7, 0.7, 0.5, 0.7, 0.8, 0.7, 0.7, 0.5, 0.8, ...","(Melany, Dahlia)","[(Melany, Jonnie), (Melany, Lilie), (Jonnie, L...","[[(Melany, Jonnie), (Jonnie, Lilie), (Lilie, D...",A group of friends is considering whether or n...,"During the previous flu season, Melany had a ...",{'Dahlia': {'Prompt': 'Did Dahlia get vaccinat...,"{('Melany', 'Dahlia'): {'Prompt': 'Now, suppos...","{('Melany', 'Dahlia'): {'Prompt': 'Now, suppos..."
1,0.0.1,0,0,"[6, 4, 6]",1,"[[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [...","[Melany, Lauri, Kaci, Elda, Karly, Jonnie, Vir...","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[Melany, Jonnie, Lilie, Dahlia]","[RAZQ, 3PY4, I4KS, QT1I, FRNR, MMFX, IYAC, HXM...","[0.7, 0.7, 0.5, 0.7, 0.8, 0.7, 0.7, 0.5, 0.8, ...","(Melany, Dahlia)","[(Melany, Jonnie), (Melany, Lilie), (Jonnie, L...","[[(Melany, Jonnie), (Jonnie, Lilie), (Lilie, D...",A group of friends is considering whether or n...,"During the previous flu season, Melany had a ...",{'Dahlia': {'Prompt': 'Did Dahlia get vaccinat...,"{('Melany', 'Dahlia'): {'Prompt': 'Now, suppos...","{('Melany', 'Dahlia'): {'Prompt': 'Now, suppos..."
2,0.0.2,0,0,"[6, 4, 6]",2,"[[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [...","[Melany, Lauri, Kaci, Elda, Karly, Jonnie, Vir...","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[Melany, Jonnie, Lilie, Dahlia]","[RAZQ, 3PY4, I4KS, QT1I, FRNR, MMFX, IYAC, HXM...","[0.7, 0.7, 0.5, 0.7, 0.8, 0.7, 0.7, 0.5, 0.8, ...","(Melany, Dahlia)","[(Melany, Jonnie), (Melany, Lilie), (Jonnie, L...","[[(Melany, Jonnie), (Jonnie, Lilie), (Lilie, D...",A group of friends is considering whether or n...,"During the previous flu season, Melany had a ...",{'Dahlia': {'Prompt': 'Did Dahlia get vaccinat...,"{('Melany', 'Dahlia'): {'Prompt': 'Now, suppos...","{('Melany', 'Dahlia'): {'Prompt': 'Now, suppos..."
3,0.0.3,0,0,"[6, 4, 6]",3,"[[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [...","[Melany, Lauri, Kaci, Elda, Karly, Jonnie, Vir...","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[Melany, Jonnie, Lilie, Dahlia]","[RAZQ, 3PY4, I4KS, QT1I, FRNR, MMFX, IYAC, HXM...","[0.7, 0.7, 0.5, 0.7, 0.8, 0.7, 0.7, 0.5, 0.8, ...","(Melany, Dahlia)","[(Melany, Jonnie), (Melany, Lilie), (Jonnie, L...","[[(Melany, Jonnie), (Jonnie, Lilie), (Lilie, D...",A group of friends is considering whether or n...,"During the previous flu season, Melany had a ...",{'Dahlia': {'Prompt': 'Did Dahlia get vaccinat...,"{('Melany', 'Dahlia'): {'Prompt': 'Now, suppos...","{('Melany', 'Dahlia'): {'Prompt': 'Now, suppos..."
4,0.0.4,0,0,"[6, 4, 6]",4,"[[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [...","[Melany, Lauri, Kaci, Elda, Karly, Jonnie, Vir...","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[Melany, Jonnie, Lilie, Dahlia]","[RAZQ, 3PY4, I4KS, QT1I, FRNR, MMFX, IYAC, HXM...","[0.7, 0.7, 0.5, 0.7, 0.8, 0.7, 0.7, 0.5, 0.8, ...","(Melany, Dahlia)","[(Melany, Jonnie), (Melany, Lilie), (Jonnie, L...","[[(Melany, Jonnie), (Jonnie, Lilie), (Lilie, D...",A group of friends is considering whether or n...,"During the previous flu season, Melany had a ...",{'Dahlia': {'Prompt': 'Did Dahlia get vaccinat...,"{('Melany', 'Dahlia'): {'Prompt': 'Now, suppos...","{('Melany', 'Dahlia'): {'Prompt': 'Now, suppos..."


Unnamed: 0,Task ID,Context ID,Sample ID,Nodes per BCC,Replicate ID,DAG adjacency matrix,DAG nodes,CCT adjacency matrix,CCT nodes,Exogenous variables,Bernoulli parameters,Global quantity,Local quantities,Compositions,Causal context,Sample context,Factual queries,Interventional queries (cause = True),Interventional queries (cause = False)
29995,2.1999.0,2,1999,"[8, 6, 8]",0,"[[0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...","[Lady, Candi, Khalilah, Tatia, Delma, Marleen,...","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[Lady, Riya, Scarlet, Bella]","[CDUC, 3J0G, PDF8, XKKR, 0P28, 3AUI, LAOK, DYH...","[0.6, 0.6, 0.8, 0.8, 0.6, 0.8, 0.5, 0.5, 0.8, ...","(Lady, Bella)","[(Lady, Riya), (Lady, Scarlet), (Riya, Scarlet...","[[(Lady, Riya), (Riya, Scarlet), (Scarlet, Bel...",A group of friends is considering whether or n...,"During the previous flu season, Lady had a lo...",{'Bella': {'Prompt': 'Did Bella get vaccinated...,"{('Lady', 'Bella'): {'Prompt': 'Now, suppose t...","{('Lady', 'Bella'): {'Prompt': 'Now, suppose t..."
29996,2.1999.1,2,1999,"[8, 6, 8]",1,"[[0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...","[Lady, Candi, Khalilah, Tatia, Delma, Marleen,...","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[Lady, Riya, Scarlet, Bella]","[CDUC, 3J0G, PDF8, XKKR, 0P28, 3AUI, LAOK, DYH...","[0.6, 0.6, 0.8, 0.8, 0.6, 0.8, 0.5, 0.5, 0.8, ...","(Lady, Bella)","[(Lady, Riya), (Lady, Scarlet), (Riya, Scarlet...","[[(Lady, Riya), (Riya, Scarlet), (Scarlet, Bel...",A group of friends is considering whether or n...,"During the previous flu season, Lady had a lo...",{'Bella': {'Prompt': 'Did Bella get vaccinated...,"{('Lady', 'Bella'): {'Prompt': 'Now, suppose t...","{('Lady', 'Bella'): {'Prompt': 'Now, suppose t..."
29997,2.1999.2,2,1999,"[8, 6, 8]",2,"[[0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...","[Lady, Candi, Khalilah, Tatia, Delma, Marleen,...","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[Lady, Riya, Scarlet, Bella]","[CDUC, 3J0G, PDF8, XKKR, 0P28, 3AUI, LAOK, DYH...","[0.6, 0.6, 0.8, 0.8, 0.6, 0.8, 0.5, 0.5, 0.8, ...","(Lady, Bella)","[(Lady, Riya), (Lady, Scarlet), (Riya, Scarlet...","[[(Lady, Riya), (Riya, Scarlet), (Scarlet, Bel...",A group of friends is considering whether or n...,"During the previous flu season, Lady had a lo...",{'Bella': {'Prompt': 'Did Bella get vaccinated...,"{('Lady', 'Bella'): {'Prompt': 'Now, suppose t...","{('Lady', 'Bella'): {'Prompt': 'Now, suppose t..."
29998,2.1999.3,2,1999,"[8, 6, 8]",3,"[[0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...","[Lady, Candi, Khalilah, Tatia, Delma, Marleen,...","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[Lady, Riya, Scarlet, Bella]","[CDUC, 3J0G, PDF8, XKKR, 0P28, 3AUI, LAOK, DYH...","[0.6, 0.6, 0.8, 0.8, 0.6, 0.8, 0.5, 0.5, 0.8, ...","(Lady, Bella)","[(Lady, Riya), (Lady, Scarlet), (Riya, Scarlet...","[[(Lady, Riya), (Riya, Scarlet), (Scarlet, Bel...",A group of friends is considering whether or n...,"During the previous flu season, Lady had a lo...",{'Bella': {'Prompt': 'Did Bella get vaccinated...,"{('Lady', 'Bella'): {'Prompt': 'Now, suppose t...","{('Lady', 'Bella'): {'Prompt': 'Now, suppose t..."
29999,2.1999.4,2,1999,"[8, 6, 8]",4,"[[0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...","[Lady, Candi, Khalilah, Tatia, Delma, Marleen,...","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[Lady, Riya, Scarlet, Bella]","[CDUC, 3J0G, PDF8, XKKR, 0P28, 3AUI, LAOK, DYH...","[0.6, 0.6, 0.8, 0.8, 0.6, 0.8, 0.5, 0.5, 0.8, ...","(Lady, Bella)","[(Lady, Riya), (Lady, Scarlet), (Riya, Scarlet...","[[(Lady, Riya), (Riya, Scarlet), (Scarlet, Bel...",A group of friends is considering whether or n...,"During the previous flu season, Lady had a lo...",{'Bella': {'Prompt': 'Did Bella get vaccinated...,"{('Lady', 'Bella'): {'Prompt': 'Now, suppose t...","{('Lady', 'Bella'): {'Prompt': 'Now, suppose t..."


In [5]:
df.to_csv(path+"flu_vaccine_v0.csv", index = False)  

## Step 2: Process factual and counterfactual prompts.

In [6]:
# Process prompts.
df_factual, df_cf = dg.process_prompts()

In [7]:
print(df_factual.info())
display(df_factual.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Task ID        90000 non-null  object
 1   Context ID     90000 non-null  int64 
 2   Sample ID      90000 non-null  int64 
 3   Replicate ID   90000 non-null  int64 
 4   Nodes per BCC  90000 non-null  object
 5   Effect         90000 non-null  object
 6   Context        90000 non-null  object
 7   Question       90000 non-null  object
 8   True           90000 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 6.2+ MB
None


Unnamed: 0,Task ID,Context ID,Sample ID,Replicate ID,Nodes per BCC,Effect,Context,Question,True
0,0.0.0,0,0,0,"[6, 4, 6]",Dahlia,A group of friends is considering whether or n...,Did Dahlia get vaccinated? Begin your response...,0
1,0.0.0,0,0,0,"[6, 4, 6]",Jonnie,A group of friends is considering whether or n...,Did Jonnie get vaccinated? Begin your response...,1
2,0.0.0,0,0,0,"[6, 4, 6]",Lilie,A group of friends is considering whether or n...,Did Lilie get vaccinated? Begin your response ...,1
3,0.0.1,0,0,1,"[6, 4, 6]",Dahlia,A group of friends is considering whether or n...,Did Dahlia get vaccinated? Begin your response...,0
4,0.0.1,0,0,1,"[6, 4, 6]",Jonnie,A group of friends is considering whether or n...,Did Jonnie get vaccinated? Begin your response...,1


In [8]:
print(df_cf.info())
display(df_cf.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Data columns (total 13 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   Task ID                   180000 non-null  object
 1   Context ID                180000 non-null  int64 
 2   Sample ID                 180000 non-null  int64 
 3   Replicate ID              180000 non-null  int64 
 4   Nodes per BCC             180000 non-null  object
 5   Cause-effect pair         180000 non-null  object
 6   Cause                     180000 non-null  object
 7   Effect                    180000 non-null  object
 8   Context                   180000 non-null  object
 9   Question (cause = True)   180000 non-null  object
 10  True (cause = True)       180000 non-null  int64 
 11  Question (cause = False)  180000 non-null  object
 12  True (cause = False)      180000 non-null  int64 
dtypes: int64(5), object(8)
memory usage: 17.9+ MB
None


Unnamed: 0,Task ID,Context ID,Sample ID,Replicate ID,Nodes per BCC,Cause-effect pair,Cause,Effect,Context,Question (cause = True),True (cause = True),Question (cause = False),True (cause = False)
0,0.0.0,0,0,0,"[6, 4, 6]","(Melany, Dahlia)",Melany,Dahlia,A group of friends is considering whether or n...,"Now, suppose that Melany got vaccinated regard...",0,"Now, suppose that Melany did not get vaccinate...",0
1,0.0.0,0,0,0,"[6, 4, 6]","(Melany, Jonnie)",Melany,Jonnie,A group of friends is considering whether or n...,"Now, suppose that Melany got vaccinated regard...",1,"Now, suppose that Melany did not get vaccinate...",0
2,0.0.0,0,0,0,"[6, 4, 6]","(Melany, Lilie)",Melany,Lilie,A group of friends is considering whether or n...,"Now, suppose that Melany got vaccinated regard...",1,"Now, suppose that Melany did not get vaccinate...",1
3,0.0.0,0,0,0,"[6, 4, 6]","(Jonnie, Lilie)",Jonnie,Lilie,A group of friends is considering whether or n...,"Now, suppose that Jonnie got vaccinated regard...",1,"Now, suppose that Jonnie did not get vaccinate...",1
4,0.0.0,0,0,0,"[6, 4, 6]","(Jonnie, Dahlia)",Jonnie,Dahlia,A group of friends is considering whether or n...,"Now, suppose that Jonnie got vaccinated regard...",0,"Now, suppose that Jonnie did not get vaccinate...",0


In [9]:
df_cf.head(30)

Unnamed: 0,Task ID,Context ID,Sample ID,Replicate ID,Nodes per BCC,Cause-effect pair,Cause,Effect,Context,Question (cause = True),True (cause = True),Question (cause = False),True (cause = False)
0,0.0.0,0,0,0,"[6, 4, 6]","(Melany, Dahlia)",Melany,Dahlia,A group of friends is considering whether or n...,"Now, suppose that Melany got vaccinated regard...",0,"Now, suppose that Melany did not get vaccinate...",0
1,0.0.0,0,0,0,"[6, 4, 6]","(Melany, Jonnie)",Melany,Jonnie,A group of friends is considering whether or n...,"Now, suppose that Melany got vaccinated regard...",1,"Now, suppose that Melany did not get vaccinate...",0
2,0.0.0,0,0,0,"[6, 4, 6]","(Melany, Lilie)",Melany,Lilie,A group of friends is considering whether or n...,"Now, suppose that Melany got vaccinated regard...",1,"Now, suppose that Melany did not get vaccinate...",1
3,0.0.0,0,0,0,"[6, 4, 6]","(Jonnie, Lilie)",Jonnie,Lilie,A group of friends is considering whether or n...,"Now, suppose that Jonnie got vaccinated regard...",1,"Now, suppose that Jonnie did not get vaccinate...",1
4,0.0.0,0,0,0,"[6, 4, 6]","(Jonnie, Dahlia)",Jonnie,Dahlia,A group of friends is considering whether or n...,"Now, suppose that Jonnie got vaccinated regard...",0,"Now, suppose that Jonnie did not get vaccinate...",0
5,0.0.0,0,0,0,"[6, 4, 6]","(Lilie, Dahlia)",Lilie,Dahlia,A group of friends is considering whether or n...,"Now, suppose that Lilie got vaccinated regardl...",0,"Now, suppose that Lilie did not get vaccinated...",0
6,0.0.1,0,0,1,"[6, 4, 6]","(Melany, Dahlia)",Melany,Dahlia,A group of friends is considering whether or n...,"Now, suppose that Melany got vaccinated regard...",0,"Now, suppose that Melany did not get vaccinate...",0
7,0.0.1,0,0,1,"[6, 4, 6]","(Melany, Jonnie)",Melany,Jonnie,A group of friends is considering whether or n...,"Now, suppose that Melany got vaccinated regard...",1,"Now, suppose that Melany did not get vaccinate...",0
8,0.0.1,0,0,1,"[6, 4, 6]","(Melany, Lilie)",Melany,Lilie,A group of friends is considering whether or n...,"Now, suppose that Melany got vaccinated regard...",1,"Now, suppose that Melany did not get vaccinate...",1
9,0.0.1,0,0,1,"[6, 4, 6]","(Jonnie, Lilie)",Jonnie,Lilie,A group of friends is considering whether or n...,"Now, suppose that Jonnie got vaccinated regard...",1,"Now, suppose that Jonnie did not get vaccinate...",1


In [10]:
df_factual.to_csv(path+"flu_vaccine_factual_v0.csv", index = False)  

In [11]:
df_cf.to_csv(path+"flu_vaccine_counterfactual_v0.csv", index = False)  

In [12]:
l = len(df_factual[(df_factual["Context ID"] == 0) & (df_factual["Effect"] == "surgery")])
print("\nTotal factual q's per quantity per task:", l)


Total factual q's per quantity per task: 0


In [13]:
l = len(df_cf[(df_cf["Context ID"] == 0) & (df_cf["Cause-effect pair"] == ("pain", "surgery"))])
print("\nTotal counterfactual q's per quantity per task:", l)


Total counterfactual q's per quantity per task: 0


## Step 3: Get ground truth PNS values.

Get dictionary mapping cause-effect pairs to their PNS value.

Keys are the Context ID. Values are dictionaries whose keys are the cause-effect pair and whose values are the finite sample PNS computed using ground truth response vectors.

In [14]:
pns_dict = dg.get_pns_dict(verbose = False)
display(pns_dict)

{0: {"('Melany', 'Dahlia')": 0.0065,
  "('Melany', 'Jonnie')": 0.1575,
  "('Melany', 'Lilie')": 0.023,
  "('Jonnie', 'Lilie')": 0.132,
  "('Jonnie', 'Dahlia')": 0.0445,
  "('Lilie', 'Dahlia')": 0.3055,
  "[('Melany', 'Jonnie'), ('Jonnie', 'Lilie'), ('Lilie', 'Dahlia')]": 0.0063513449999999996,
  "[('Melany', 'Jonnie'), ('Jonnie', 'Dahlia')]": 0.007008749999999999,
  "[('Melany', 'Lilie'), ('Lilie', 'Dahlia')]": 0.0070265},
 1: {"('Armida', 'Keesha')": 0.0645,
  "('Armida', 'Oney')": 0.355,
  "('Armida', 'Alvera')": 0.2195,
  "('Oney', 'Alvera')": 0.612,
  "('Oney', 'Keesha')": 0.1675,
  "('Alvera', 'Keesha')": 0.2865,
  "[('Armida', 'Oney'), ('Oney', 'Alvera'), ('Alvera', 'Keesha')]": 0.062244989999999986,
  "[('Armida', 'Oney'), ('Oney', 'Keesha')]": 0.0594625,
  "[('Armida', 'Alvera'), ('Alvera', 'Keesha')]": 0.06288674999999999},
 2: {"('Lady', 'Bella')": 0.076,
  "('Lady', 'Riya')": 0.275,
  "('Lady', 'Scarlet')": 0.16,
  "('Riya', 'Scarlet')": 0.565,
  "('Riya', 'Bella')": 0.275,


In [15]:
# Save with numpy.
np.save(path+"flu_vaccine_pns_dict_v0.npy", pns_dict) 

# Test loading.
#pns_dict_loaded = np.load("flu_vaccine_pns_dict.npy",
#                          allow_pickle = "TRUE").item()
#display(pns_dict_loaded)

## Step 4: Compute internal consistency thresholds.

Return a dictionary that maps compositions to their correctness threshold
for internal compositional consistency evaluation. Thresholds are the RAE
for each composition relative to the global quantity of interest, times a
multiplier of the user's choice. 

* RAE = (abs(global PNS - composition PNS) / global PNS)
* Threhold = RAE*multiplier
        
This method of obtaining the threshold accounts for the innate error owed
to PNS estimation on finite samples, while the multiplier represents the
user's tolerance level for errors larger than the finite sample error.

Keys are the Context ID. Values are dictionaries whose keys are the causal composition (denoted by a list of cause-effect pairs whose PNS values are multiplied) and whose values are the internal consistency threshold.

For public use, we export threholds with multiplier 1.0 so that the end user can select 
their own multiplier downstream.

In [16]:
# Not for export.
threshold_dict = dg.get_internal_consistency_thresholds(multiplier = 1.25)
display(threshold_dict)

{0: {"[('Melany', 'Jonnie'), ('Jonnie', 'Lilie'), ('Lilie', 'Dahlia')]": 0.02858750000000003,
  "[('Melany', 'Jonnie'), ('Jonnie', 'Dahlia')]": 0.09783653846153839,
  "[('Melany', 'Lilie'), ('Lilie', 'Dahlia')]": 0.10125000000000002},
 1: {"[('Armida', 'Oney'), ('Oney', 'Alvera'), ('Alvera', 'Keesha')]": 0.04370174418604682,
  "[('Armida', 'Oney'), ('Oney', 'Keesha')]": 0.09762596899224806,
  "[('Armida', 'Alvera'), ('Alvera', 'Keesha')]": 0.031264534883721135},
 2: {"[('Lady', 'Riya'), ('Riya', 'Scarlet'), ('Scarlet', 'Bella')]": 0.03357730263157897,
  "[('Lady', 'Riya'), ('Riya', 'Bella')]": 0.006167763157894514,
  "[('Lady', 'Scarlet'), ('Scarlet', 'Bella')]": 0.002631578947368314}}

In [17]:
# For export.
threshold_dict = dg.get_internal_consistency_thresholds(multiplier = 1.0)
display(threshold_dict)

{0: {"[('Melany', 'Jonnie'), ('Jonnie', 'Lilie'), ('Lilie', 'Dahlia')]": 0.022870000000000022,
  "[('Melany', 'Jonnie'), ('Jonnie', 'Dahlia')]": 0.07826923076923072,
  "[('Melany', 'Lilie'), ('Lilie', 'Dahlia')]": 0.08100000000000002},
 1: {"[('Armida', 'Oney'), ('Oney', 'Alvera'), ('Alvera', 'Keesha')]": 0.03496139534883745,
  "[('Armida', 'Oney'), ('Oney', 'Keesha')]": 0.07810077519379845,
  "[('Armida', 'Alvera'), ('Alvera', 'Keesha')]": 0.025011627906976905},
 2: {"[('Lady', 'Riya'), ('Riya', 'Scarlet'), ('Scarlet', 'Bella')]": 0.02686184210526318,
  "[('Lady', 'Riya'), ('Riya', 'Bella')]": 0.004934210526315611,
  "[('Lady', 'Scarlet'), ('Scarlet', 'Bella')]": 0.002105263157894651}}

In [18]:
# Save with numpy.
np.save(path+"flu_vaccine_threshold_dict_v0.npy", threshold_dict) 

# Test loading.
#threshold_dict_loaded = np.load("flu_vaccine_threshold_dict.npy",
#                                allow_pickle = "TRUE").item()
#display(threshold_dict_loaded)