# ClinicalNotes dataset generation (v0)

Generated for Hugging Face: https://huggingface.co/datasets/jmaasch/compositional_causal_reasoning/

Version 0.

Code by Jacqueline Maasch | April 2025

In [1]:
# General importations.
import os

import numpy as np
import pandas as pd

from utils import Utils
from clinical_notes import ClinicalNotes
from dataset_generator import DataSetGenerator

In [2]:
# Instantiate the project helpers.
# NOTE(review): `u` is not referenced by any cell visible in this notebook;
# confirm whether it is still needed.
u = Utils()
# `dg` generates the dataset and is reused by every later step
# (prompt processing, PNS values, consistency thresholds).
dg = DataSetGenerator()

In [3]:
# Output directory for every v0 artifact written by this notebook.
path = "static_datasets/clinical_notes_v0/"
# Create it up front: the to_csv / np.save calls further down do not create
# missing directories and would fail on a fresh checkout.
os.makedirs(path, exist_ok = True)

## Step 1: Get raw dataset.

In [4]:
# Dataset dimensions:
#   x levels of graphical complexity (captured by BCC size),
#   y tasks per graphical complexity level,
#   z samples per task,
#   w replicates per sample
#   => x*y*z*w subtasks (rows of the raw dataset).
#
# These values match the outputs recorded in this notebook: "Nodes per BCC" is
# [3, 3, 3] / [4, 4, 4], Sample IDs run from 0 to 4999, and the raw frame has
# 2*1*5000*5 = 50000 rows.
graph_sizes = [[3,3,3],[4,4,4]]
n_tasks_per_size = 1
n_samples_per_task = 5000
reps_per_sample = 5
bcc_type = "wheel"
n_extra_vars = 4

df = dg.get_dataset(task_generator = ClinicalNotes,
                    graph_sizes = graph_sizes,
                    n_tasks_per_size = n_tasks_per_size,
                    n_samples_per_task = n_samples_per_task,
                    reps_per_sample = reps_per_sample,
                    n_extra_vars = n_extra_vars,
                    bcc_type = bcc_type)

# Guard against the configuration and the generated data drifting apart.
n_expected_rows = (len(graph_sizes) * n_tasks_per_size
                   * n_samples_per_task * reps_per_sample)
assert len(df) == n_expected_rows, (
    f"expected {n_expected_rows} rows (x*y*z*w), got {len(df)}"
)

# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()
display(df.head(5))
display(df.tail(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 19 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   Task ID                                 50000 non-null  object
 1   Context ID                              50000 non-null  object
 2   Sample ID                               50000 non-null  object
 3   Nodes per BCC                           50000 non-null  object
 4   Replicate ID                            50000 non-null  int64 
 5   DAG adjacency matrix                    50000 non-null  object
 6   DAG nodes                               50000 non-null  object
 7   CCT adjacency matrix                    50000 non-null  object
 8   CCT nodes                               50000 non-null  object
 9   Exogenous variables                     50000 non-null  object
 10  Bernoulli parameters                    50000 non-null  object
 11  Gl

Unnamed: 0,Task ID,Context ID,Sample ID,Nodes per BCC,Replicate ID,DAG adjacency matrix,DAG nodes,CCT adjacency matrix,CCT nodes,Exogenous variables,Bernoulli parameters,Global quantity,Local quantities,Compositions,Causal context,Sample context,Factual queries,Interventional queries (cause = True),Interventional queries (cause = False)
0,0.0.0,0,0,"[3, 3, 3]",0,"[[0, 1, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0],...","[pain, RJS8, TTW1, OT3C, OVYI, PF2V, surgery]","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[pain, TTW1, OVYI, surgery]","[GT81, KG95, MELA, M348, 4Y2Y, DLY7, PQK1]","[0.4, 0.6, 0.4, 0.7, 0.6, 0.7, 0.8]","(pain, surgery)","[(pain, TTW1), (pain, OVYI), (TTW1, OVYI), (TT...","[[(pain, TTW1), (TTW1, OVYI), (OVYI, surgery)]...",Chronic disease N9KAOX sometimes requires surg...,"Now, we will review the history and physical n...",{'surgery': {'Prompt': 'Given these history an...,"{('pain', 'surgery'): {'Prompt': 'Now suppose ...","{('pain', 'surgery'): {'Prompt': 'Now suppose ..."
1,0.0.1,0,0,"[3, 3, 3]",1,"[[0, 1, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0],...","[pain, RJS8, TTW1, OT3C, OVYI, PF2V, surgery]","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[pain, TTW1, OVYI, surgery]","[GT81, KG95, MELA, M348, 4Y2Y, DLY7, PQK1]","[0.4, 0.6, 0.4, 0.7, 0.6, 0.7, 0.8]","(pain, surgery)","[(pain, TTW1), (pain, OVYI), (TTW1, OVYI), (TT...","[[(pain, TTW1), (TTW1, OVYI), (OVYI, surgery)]...",Chronic disease N9KAOX sometimes requires surg...,"Now, we will review the history and physical n...",{'surgery': {'Prompt': 'Given these history an...,"{('pain', 'surgery'): {'Prompt': 'Now suppose ...","{('pain', 'surgery'): {'Prompt': 'Now suppose ..."
2,0.0.2,0,0,"[3, 3, 3]",2,"[[0, 1, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0],...","[pain, RJS8, TTW1, OT3C, OVYI, PF2V, surgery]","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[pain, TTW1, OVYI, surgery]","[GT81, KG95, MELA, M348, 4Y2Y, DLY7, PQK1]","[0.4, 0.6, 0.4, 0.7, 0.6, 0.7, 0.8]","(pain, surgery)","[(pain, TTW1), (pain, OVYI), (TTW1, OVYI), (TT...","[[(pain, TTW1), (TTW1, OVYI), (OVYI, surgery)]...",Chronic disease N9KAOX sometimes requires surg...,"Now, we will review the history and physical n...",{'surgery': {'Prompt': 'Given these history an...,"{('pain', 'surgery'): {'Prompt': 'Now suppose ...","{('pain', 'surgery'): {'Prompt': 'Now suppose ..."
3,0.0.3,0,0,"[3, 3, 3]",3,"[[0, 1, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0],...","[pain, RJS8, TTW1, OT3C, OVYI, PF2V, surgery]","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[pain, TTW1, OVYI, surgery]","[GT81, KG95, MELA, M348, 4Y2Y, DLY7, PQK1]","[0.4, 0.6, 0.4, 0.7, 0.6, 0.7, 0.8]","(pain, surgery)","[(pain, TTW1), (pain, OVYI), (TTW1, OVYI), (TT...","[[(pain, TTW1), (TTW1, OVYI), (OVYI, surgery)]...",Chronic disease N9KAOX sometimes requires surg...,"Now, we will review the history and physical n...",{'surgery': {'Prompt': 'Given these history an...,"{('pain', 'surgery'): {'Prompt': 'Now suppose ...","{('pain', 'surgery'): {'Prompt': 'Now suppose ..."
4,0.0.4,0,0,"[3, 3, 3]",4,"[[0, 1, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0],...","[pain, RJS8, TTW1, OT3C, OVYI, PF2V, surgery]","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[pain, TTW1, OVYI, surgery]","[GT81, KG95, MELA, M348, 4Y2Y, DLY7, PQK1]","[0.4, 0.6, 0.4, 0.7, 0.6, 0.7, 0.8]","(pain, surgery)","[(pain, TTW1), (pain, OVYI), (TTW1, OVYI), (TT...","[[(pain, TTW1), (TTW1, OVYI), (OVYI, surgery)]...",Chronic disease N9KAOX sometimes requires surg...,"Now, we will review the history and physical n...",{'surgery': {'Prompt': 'Given these history an...,"{('pain', 'surgery'): {'Prompt': 'Now suppose ...","{('pain', 'surgery'): {'Prompt': 'Now suppose ..."


Unnamed: 0,Task ID,Context ID,Sample ID,Nodes per BCC,Replicate ID,DAG adjacency matrix,DAG nodes,CCT adjacency matrix,CCT nodes,Exogenous variables,Bernoulli parameters,Global quantity,Local quantities,Compositions,Causal context,Sample context,Factual queries,Interventional queries (cause = True),Interventional queries (cause = False)
49995,1.4999.0,1,4999,"[4, 4, 4]",0,"[[0, 1, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, ...","[pain, PGHO, UOB7, V4SB, Q9PR, BR85, UVXF, UJ5...","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[pain, V4SB, UVXF, surgery]","[PLD7, OYZC, AVY0, LBQT, EM8D, 5CDN, N937, NH8...","[0.7, 0.7, 0.4, 0.7, 0.5, 0.4, 0.5, 0.6, 0.6, ...","(pain, surgery)","[(pain, V4SB), (pain, UVXF), (V4SB, UVXF), (V4...","[[(pain, V4SB), (V4SB, UVXF), (UVXF, surgery)]...",Chronic disease XNXDTV sometimes requires surg...,"Now, we will review the history and physical n...",{'surgery': {'Prompt': 'Given these history an...,"{('pain', 'surgery'): {'Prompt': 'Now suppose ...","{('pain', 'surgery'): {'Prompt': 'Now suppose ..."
49996,1.4999.1,1,4999,"[4, 4, 4]",1,"[[0, 1, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, ...","[pain, PGHO, UOB7, V4SB, Q9PR, BR85, UVXF, UJ5...","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[pain, V4SB, UVXF, surgery]","[PLD7, OYZC, AVY0, LBQT, EM8D, 5CDN, N937, NH8...","[0.7, 0.7, 0.4, 0.7, 0.5, 0.4, 0.5, 0.6, 0.6, ...","(pain, surgery)","[(pain, V4SB), (pain, UVXF), (V4SB, UVXF), (V4...","[[(pain, V4SB), (V4SB, UVXF), (UVXF, surgery)]...",Chronic disease XNXDTV sometimes requires surg...,"Now, we will review the history and physical n...",{'surgery': {'Prompt': 'Given these history an...,"{('pain', 'surgery'): {'Prompt': 'Now suppose ...","{('pain', 'surgery'): {'Prompt': 'Now suppose ..."
49997,1.4999.2,1,4999,"[4, 4, 4]",2,"[[0, 1, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, ...","[pain, PGHO, UOB7, V4SB, Q9PR, BR85, UVXF, UJ5...","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[pain, V4SB, UVXF, surgery]","[PLD7, OYZC, AVY0, LBQT, EM8D, 5CDN, N937, NH8...","[0.7, 0.7, 0.4, 0.7, 0.5, 0.4, 0.5, 0.6, 0.6, ...","(pain, surgery)","[(pain, V4SB), (pain, UVXF), (V4SB, UVXF), (V4...","[[(pain, V4SB), (V4SB, UVXF), (UVXF, surgery)]...",Chronic disease XNXDTV sometimes requires surg...,"Now, we will review the history and physical n...",{'surgery': {'Prompt': 'Given these history an...,"{('pain', 'surgery'): {'Prompt': 'Now suppose ...","{('pain', 'surgery'): {'Prompt': 'Now suppose ..."
49998,1.4999.3,1,4999,"[4, 4, 4]",3,"[[0, 1, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, ...","[pain, PGHO, UOB7, V4SB, Q9PR, BR85, UVXF, UJ5...","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[pain, V4SB, UVXF, surgery]","[PLD7, OYZC, AVY0, LBQT, EM8D, 5CDN, N937, NH8...","[0.7, 0.7, 0.4, 0.7, 0.5, 0.4, 0.5, 0.6, 0.6, ...","(pain, surgery)","[(pain, V4SB), (pain, UVXF), (V4SB, UVXF), (V4...","[[(pain, V4SB), (V4SB, UVXF), (UVXF, surgery)]...",Chronic disease XNXDTV sometimes requires surg...,"Now, we will review the history and physical n...",{'surgery': {'Prompt': 'Given these history an...,"{('pain', 'surgery'): {'Prompt': 'Now suppose ...","{('pain', 'surgery'): {'Prompt': 'Now suppose ..."
49999,1.4999.4,1,4999,"[4, 4, 4]",4,"[[0, 1, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, ...","[pain, PGHO, UOB7, V4SB, Q9PR, BR85, UVXF, UJ5...","[[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0,...","[pain, V4SB, UVXF, surgery]","[PLD7, OYZC, AVY0, LBQT, EM8D, 5CDN, N937, NH8...","[0.7, 0.7, 0.4, 0.7, 0.5, 0.4, 0.5, 0.6, 0.6, ...","(pain, surgery)","[(pain, V4SB), (pain, UVXF), (V4SB, UVXF), (V4...","[[(pain, V4SB), (V4SB, UVXF), (UVXF, surgery)]...",Chronic disease XNXDTV sometimes requires surg...,"Now, we will review the history and physical n...",{'surgery': {'Prompt': 'Given these history an...,"{('pain', 'surgery'): {'Prompt': 'Now suppose ...","{('pain', 'surgery'): {'Prompt': 'Now suppose ..."


In [5]:
# Persist the raw dataset as CSV, leaving the row index out of the file.
raw_csv_file = path + "clinical_notes_v0.csv"
df.to_csv(raw_csv_file, index=False)

## Step 2: Process factual and counterfactual prompts.

In [6]:
# Process prompts.
# Unpacks two frames: factual prompts (df_factual) and counterfactual prompts (df_cf).
# NOTE(review): called with no arguments, so it presumably operates on the
# dataset that dg.get_dataset() produced earlier; run that cell first (TODO confirm).
df_factual, df_cf = dg.process_prompts()

In [7]:
# DataFrame.info() writes the summary to stdout and returns None; calling it
# directly avoids the stray "None" line that print(...) appends.
df_factual.info()
display(df_factual.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Task ID        150000 non-null  object
 1   Context ID     150000 non-null  int64 
 2   Sample ID      150000 non-null  int64 
 3   Replicate ID   150000 non-null  int64 
 4   Nodes per BCC  150000 non-null  object
 5   Effect         150000 non-null  object
 6   Context        150000 non-null  object
 7   Question       150000 non-null  object
 8   True           150000 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 10.3+ MB
None


Unnamed: 0,Task ID,Context ID,Sample ID,Replicate ID,Nodes per BCC,Effect,Context,Question,True
0,0.0.0,0,0,0,"[3, 3, 3]",surgery,Chronic disease N9KAOX sometimes requires surg...,"Given these history and physical notes, will t...",1
1,0.0.0,0,0,0,"[3, 3, 3]",TTW1,Chronic disease N9KAOX sometimes requires surg...,"Given these history and physical notes, will l...",1
2,0.0.0,0,0,0,"[3, 3, 3]",OVYI,Chronic disease N9KAOX sometimes requires surg...,"Given these history and physical notes, will l...",1
3,0.0.1,0,0,1,"[3, 3, 3]",surgery,Chronic disease N9KAOX sometimes requires surg...,"Given these history and physical notes, will t...",1
4,0.0.1,0,0,1,"[3, 3, 3]",TTW1,Chronic disease N9KAOX sometimes requires surg...,"Given these history and physical notes, will l...",1


In [8]:
# DataFrame.info() writes the summary to stdout and returns None; calling it
# directly avoids the stray "None" line that print(...) appends.
df_cf.info()
display(df_cf.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 13 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   Task ID                   300000 non-null  object
 1   Context ID                300000 non-null  int64 
 2   Sample ID                 300000 non-null  int64 
 3   Replicate ID              300000 non-null  int64 
 4   Nodes per BCC             300000 non-null  object
 5   Cause-effect pair         300000 non-null  object
 6   Cause                     300000 non-null  object
 7   Effect                    300000 non-null  object
 8   Context                   300000 non-null  object
 9   Question (cause = True)   300000 non-null  object
 10  True (cause = True)       300000 non-null  int64 
 11  Question (cause = False)  300000 non-null  object
 12  True (cause = False)      300000 non-null  int64 
dtypes: int64(5), object(8)
memory usage: 29.8+ MB
None


Unnamed: 0,Task ID,Context ID,Sample ID,Replicate ID,Nodes per BCC,Cause-effect pair,Cause,Effect,Context,Question (cause = True),True (cause = True),Question (cause = False),True (cause = False)
0,0.0.0,0,0,0,"[3, 3, 3]","(pain, surgery)",pain,surgery,Chronic disease N9KAOX sometimes requires surg...,Now suppose that the patient will be in signif...,1,Now suppose that the patient will not be in pa...,1
1,0.0.0,0,0,0,"[3, 3, 3]","(pain, TTW1)",pain,TTW1,Chronic disease N9KAOX sometimes requires surg...,Now suppose that the patient will be in signif...,1,Now suppose that the patient will not be in pa...,1
2,0.0.0,0,0,0,"[3, 3, 3]","(pain, OVYI)",pain,OVYI,Chronic disease N9KAOX sometimes requires surg...,Now suppose that the patient will be in signif...,1,Now suppose that the patient will not be in pa...,1
3,0.0.0,0,0,0,"[3, 3, 3]","(TTW1, OVYI)",TTW1,OVYI,Chronic disease N9KAOX sometimes requires surg...,Now suppose that lab TTW1 will be elevated reg...,1,Now suppose that lab TTW1 will not be elevated...,1
4,0.0.0,0,0,0,"[3, 3, 3]","(TTW1, surgery)",TTW1,surgery,Chronic disease N9KAOX sometimes requires surg...,Now suppose that lab TTW1 will be elevated reg...,1,Now suppose that lab TTW1 will not be elevated...,1


In [9]:
# Persist the factual prompts as CSV, leaving the row index out of the file.
factual_csv_file = path + "clinical_notes_factual_v0.csv"
df_factual.to_csv(factual_csv_file, index=False)

In [10]:
# Persist the counterfactual prompts as CSV, leaving the row index out of the file.
counterfactual_csv_file = path + "clinical_notes_counterfactual_v0.csv"
df_cf.to_csv(counterfactual_csv_file, index=False)

In [11]:
# Sanity check: number of factual questions about the "surgery" effect in
# context 0 (rows matching both conditions).
factual_in_context_0 = df_factual["Context ID"] == 0
factual_about_surgery = df_factual["Effect"] == "surgery"
l = int((factual_in_context_0 & factual_about_surgery).sum())
print("\nTotal factual q's per quantity per task:", l)


Total factual q's per quantity per task: 25000


In [12]:
# Sanity check: number of counterfactual questions for the (pain, surgery)
# pair in context 0. The right-hand side is a tuple, so only cells equal to
# that exact tuple are counted.
cf_in_context_0 = df_cf["Context ID"] == 0
cf_pain_to_surgery = df_cf["Cause-effect pair"] == ("pain", "surgery")
l = int((cf_in_context_0 & cf_pain_to_surgery).sum())
print("\nTotal counterfactual q's per quantity per task:", l)


Total counterfactual q's per quantity per task: 25000


## Step 3: Get ground truth PNS values.

Get dictionary mapping cause-effect pairs to their PNS value.

Keys are the Context ID. Values are dictionaries whose keys are the cause-effect pair and whose values are the finite sample PNS computed using ground truth response vectors.

In [13]:
# Ground-truth PNS lookup: {Context ID: {cause-effect pair or composition: PNS}}.
# NOTE(review): called on `dg` with no data argument, so it presumably relies
# on the dataset generated earlier in the notebook; confirm.
pns_dict = dg.get_pns_dict(verbose = False)
display(pns_dict)

{0: {"('pain', 'surgery')": 0.0252,
  "('pain', 'TTW1')": 0.2332,
  "('pain', 'OVYI')": 0.0298,
  "('TTW1', 'OVYI')": 0.1236,
  "('TTW1', 'surgery')": 0.0982,
  "('OVYI', 'surgery')": 0.7946,
  "[('pain', 'TTW1'), ('TTW1', 'OVYI'), ('OVYI', 'surgery')]": 0.022903168991999998,
  "[('pain', 'TTW1'), ('TTW1', 'surgery')]": 0.02290024,
  "[('pain', 'OVYI'), ('OVYI', 'surgery')]": 0.023679079999999998},
 1: {"('pain', 'surgery')": 0.0038,
  "('pain', 'V4SB')": 0.0546,
  "('pain', 'UVXF')": 0.008,
  "('V4SB', 'UVXF')": 0.1494,
  "('V4SB', 'surgery')": 0.0684,
  "('UVXF', 'surgery')": 0.496,
  "[('pain', 'V4SB'), ('V4SB', 'UVXF'), ('UVXF', 'surgery')]": 0.00404599104,
  "[('pain', 'V4SB'), ('V4SB', 'surgery')]": 0.0037346400000000004,
  "[('pain', 'UVXF'), ('UVXF', 'surgery')]": 0.003968}}

In [14]:
# Save with numpy. A dict is stored as a pickled 0-d object array, so it has
# to be loaded with allow_pickle = True and unwrapped with .item().
pns_dict_file = path+"clinical_notes_pns_dict_v0.npy"
np.save(pns_dict_file, pns_dict)

# Verify the round trip against the file that was just written.
# np.testing.assert_equal walks nested dicts and treats NaNs as equal.
pns_dict_loaded = np.load(pns_dict_file, allow_pickle = True).item()
np.testing.assert_equal(pns_dict_loaded, pns_dict)

## Step 4: Compute internal consistency thresholds.

Return a dictionary that maps compositions to their correctness threshold
for internal compositional consistency evaluation. Thresholds are the RAE
for each composition relative to the global quantity of interest, times a
multiplier of the user's choice. 

* RAE = (abs(global PNS - composition PNS) / global PNS)
* Threshold = RAE * multiplier
        
This method of obtaining the threshold accounts for the innate error owed
to PNS estimation on finite samples, while the multiplier represents the
user's tolerance level for errors larger than the finite sample error.

Keys are the Context ID. Values are dictionaries whose keys are the causal composition (denoted by a list of cause-effect pairs whose PNS values are multiplied) and whose values are the internal consistency threshold.

For public use, we export thresholds with multiplier 1.0 so that the end user can select
their own multiplier downstream.

In [15]:
# Not for export: preview of the thresholds with an example multiplier of 1.25.
# Kept under its own name so that running this cell can never overwrite the
# `threshold_dict` that gets saved for export.
threshold_dict_preview = dg.get_internal_consistency_thresholds(multiplier = 1.25)
display(threshold_dict_preview)

{0: {"[('pain', 'TTW1'), ('TTW1', 'OVYI'), ('OVYI', 'surgery')]": 0.11393010952380966,
  "[('pain', 'TTW1'), ('TTW1', 'surgery')]": 0.11407539682539689,
  "[('pain', 'OVYI'), ('OVYI', 'surgery')]": 0.07544246031746042},
 1: {"[('pain', 'V4SB'), ('V4SB', 'UVXF'), ('UVXF', 'surgery')]": 0.08091810526315801,
  "[('pain', 'V4SB'), ('V4SB', 'surgery')]": 0.021499999999999853,
  "[('pain', 'UVXF'), ('UVXF', 'surgery')]": 0.0552631578947369}}

In [16]:
# For export.
# With multiplier = 1.0 each threshold is the plain RAE of the composition
# relative to the global quantity, so end users can apply their own multiplier.
threshold_dict = dg.get_internal_consistency_thresholds(multiplier = 1.0)
display(threshold_dict)

{0: {"[('pain', 'TTW1'), ('TTW1', 'OVYI'), ('OVYI', 'surgery')]": 0.09114408761904773,
  "[('pain', 'TTW1'), ('TTW1', 'surgery')]": 0.09126031746031751,
  "[('pain', 'OVYI'), ('OVYI', 'surgery')]": 0.06035396825396833},
 1: {"[('pain', 'V4SB'), ('V4SB', 'UVXF'), ('UVXF', 'surgery')]": 0.06473448421052641,
  "[('pain', 'V4SB'), ('V4SB', 'surgery')]": 0.017199999999999882,
  "[('pain', 'UVXF'), ('UVXF', 'surgery')]": 0.04421052631578952}}

In [17]:
# Save with numpy. A dict is stored as a pickled 0-d object array, so it has
# to be loaded with allow_pickle = True and unwrapped with .item().
threshold_dict_file = path+"clinical_notes_threshold_dict_v0.npy"
np.save(threshold_dict_file, threshold_dict)

# Verify the round trip against the file that was just written.
# np.testing.assert_equal walks nested dicts and treats NaNs as equal.
threshold_dict_loaded = np.load(threshold_dict_file, allow_pickle = True).item()
np.testing.assert_equal(threshold_dict_loaded, threshold_dict)