In [28]:
import re
from collections import defaultdict

import pandas as pd
from tqdm.auto import tqdm 
    
from datasets import load_dataset

import matplotlib.pyplot as plt

# Load the dataset and access the train/validation/test splits

In [29]:
# Download the CORR2CAUSE dataset from the Hugging Face Hub
dataset_name = "causal-nlp/corr2cause"
try:
    dataset = load_dataset(dataset_name)
except Exception as e:
    # Report the failure, then re-raise: every later cell reads `dataset`, so
    # swallowing the error here would only resurface as a NameError further down.
    print(f"Error loading the dataset: {e}")
    raise
else:
    print("Dataset successfully loaded.")

Dataset successfully loaded.


In [30]:
# Pull the three splits out of the loaded dataset
train_dataset, test_dataset, validation_dataset = (
    dataset[split_name] for split_name in ('train', 'test', 'validation')
)

# Materialise each split as a pandas DataFrame
train_df, test_df, validation_df = (
    split.to_pandas()
    for split in (train_dataset, test_dataset, validation_dataset)
)

# Report how many rows each split contains
print(f"Train split length: {len(train_df)}")
print(f"Test split length: {len(test_df)}")
print(f"Validation split length: {len(validation_df)}")

Train split length: 205734
Test split length: 1162
Validation split length: 1076


In [4]:
# Keep only the training problems whose `num_variables` column equals the target size
num_variables = 6
train_df = train_df.loc[train_df['num_variables'].eq(num_variables)]
print(f"Train split length only with {num_variables} variables: {len(train_df)}")


Train split length only with 6 variables: 197634


# Parse the input to the adjacency graph format

In [19]:
def _split_variable_list(list_text):
    """Split an enumeration such as ``"A, B and C"`` into ``['A', 'B', 'C']``."""
    # A comma may be followed by "and" (Oxford comma: "A, B, and C"). Treating the
    # whole ", and " as one separator keeps items from carrying an "and " prefix.
    pieces = re.split(r'\s*,\s*(?:and\s+)?|\s+and\s+', list_text)
    cleaned = (piece.strip() for piece in pieces)
    return [piece for piece in cleaned if piece and piece.lower() != 'and']


def parse_input(text):
    """
    Parse a natural-language premise into its statistical statements.

    Recognised fragments:
      * the variable declaration: the text between the first "variable(s)" and
        the next '.' or newline, split on commas and "and";
      * "X correlates with Y." sentences found between "All the statistical
        relations ... are as follows:" and "However" (or the end of the text);
      * after "However,": "X is independent of Y." (marginal) and
        "X and Y are independent given Z1, Z2 and Z3." (conditional).

    Sentences that match none of these patterns are ignored.

    :param text: Premise text (optionally followed by other sentences).
    :return: Dict with keys
        'variables' (sorted list of every variable name seen),
        'correlations' (list of (var1, var2) tuples),
        'marginal_independencies' (list of (var1, var2) tuples),
        'conditional_independencies' (list of dicts with 'vars' -> (var1, var2)
        and 'given' -> list of conditioning variable names).
    """
    variables = set()
    correlations = []
    marginal_independencies = []
    conditional_independencies = []

    # Variable declaration, e.g. "... of 4 variables, A, B, C and D."
    var_match = re.search(r'variables?(.*?)[\.\n]', text)
    if var_match:
        variables.update(_split_variable_list(var_match.group(1)))

    # Correlations live between the "are as follows:" marker and "However"
    correlations_text_match = re.search(
        r'All the statistical relations.*?are as follows:(.*?)(However|$)', text, re.DOTALL
    )
    if correlations_text_match:
        correlation_matches = re.findall(
            r'([A-Za-z]+) correlates with ([A-Za-z]+)\.', correlations_text_match.group(1)
        )
        correlations.extend(correlation_matches)
        variables.update(var for pair in correlation_matches for var in pair)

    # Independencies are everything after "However,"
    independencies_text_match = re.search(r'However,(.*)', text, re.DOTALL)
    if independencies_text_match:
        # Cut the remainder into '.'-terminated sentences and classify each one
        for sentence in re.findall(r'([^.]*?\.)', independencies_text_match.group(1)):
            sentence = sentence.strip()

            marg_match = re.match(r'([A-Za-z]+) is independent of ([A-Za-z]+)\.', sentence)
            if marg_match:
                var1, var2 = marg_match.group(1), marg_match.group(2)
                marginal_independencies.append((var1, var2))
                variables.update([var1, var2])
                continue

            cond_match = re.match(
                r'([A-Za-z]+) and ([A-Za-z]+) are independent given (.*?)[\.\n]', sentence
            )
            if cond_match:
                var1, var2 = cond_match.group(1), cond_match.group(2)
                given_vars = _split_variable_list(cond_match.group(3))
                conditional_independencies.append({
                    'vars': (var1, var2),
                    'given': given_vars
                })
                variables.update([var1, var2] + given_vars)

    return {
        'variables': sorted(variables),
        'correlations': correlations,
        'marginal_independencies': marginal_independencies,
        'conditional_independencies': conditional_independencies
    }

def construct_causal_skeleton_with_steps(parsed_data):
    """
    Derive the undirected causal skeleton and describe the derivation as text.

    Starts with one edge per correlated pair, then removes the edge between any
    pair that is declared marginally or conditionally independent.

    Edges are stored as alphabetically ordered tuples and the final edge list is
    sorted, so the returned text is identical from run to run (iterating a set of
    frozensets would make both the edge order and each pair's orientation depend
    on string hashing).

    :param parsed_data: Dict with keys 'variables', 'correlations',
        'marginal_independencies' and 'conditional_independencies', in the
        structure returned by ``parse_input``.
    :return: Multi-line string with the five reasoning steps; the last line
        lists the remaining edges.
    """
    variables = parsed_data['variables']
    correlations = parsed_data['correlations']
    marginal_independencies = parsed_data['marginal_independencies']
    conditional_independencies = parsed_data['conditional_independencies']

    def format_pairs(pairs):
        # "(A, B), (A, C)" rendering shared by several steps
        return ', '.join(f"({var1}, {var2})" for var1, var2 in pairs)

    def canonical(var1, var2):
        # Undirected edge key: the same tuple whichever way the pair was stated
        return tuple(sorted((var1, var2)))

    reasoning_steps = []

    # Step 1: Read the Data
    reasoning_steps.append("Step 1: Read the Data")
    reasoning_steps.append(f"- Extracted variables: {', '.join(variables)}")
    reasoning_steps.append(f"- Correlations: {format_pairs(correlations)}")
    reasoning_steps.append(f"- Marginal Independencies: {format_pairs(marginal_independencies)}")
    cond_indep_str = '\n  - '.join([
        f"({indep['vars'][0]}, {indep['vars'][1]}) are independent given {', '.join(indep['given'])}"
        for indep in conditional_independencies
    ])
    reasoning_steps.append(f"- Conditional Independencies:\n  - {cond_indep_str}")

    # Step 2: Initialize the Graph
    reasoning_steps.append("Step 2: Initialize the Graph")
    edges = {canonical(var1, var2) for var1, var2 in correlations}
    reasoning_steps.append("- Created edges between all correlated variable pairs.")
    # The initial listing echoes the correlations in the order they were stated
    reasoning_steps.append(f"- Initial edges: {{{format_pairs(correlations)}}}")

    # Step 3: Apply Marginal Independencies
    reasoning_steps.append("Step 3: Apply Marginal Independencies")
    removed_any = False
    for var1, var2 in marginal_independencies:
        edge = canonical(var1, var2)
        if edge in edges:
            edges.remove(edge)
            removed_any = True
            reasoning_steps.append(f"- **Because {var1} is independent of {var2}, there is no edge between {var1} and {var2}.**")
    if not removed_any:
        reasoning_steps.append("- No edges removed in this step.")

    # Step 4: Apply Conditional Independencies
    reasoning_steps.append("Step 4: Apply Conditional Independencies")
    removed_any = False
    for indep in conditional_independencies:
        var1, var2 = indep['vars']
        edge = canonical(var1, var2)
        # Only the first statement about a pair removes (and reports) the edge
        if edge in edges:
            edges.remove(edge)
            removed_any = True
            given_vars_str = ', '.join(indep['given'])
            reasoning_steps.append(f"- Because {var1} and {var2} are independent given {given_vars_str}, there is no edge between {var1} and {var2}.")
    if not removed_any:
        reasoning_steps.append("- No edges removed in this step.")

    # Step 5: Compile the Remaining Edges (sorted for a reproducible listing)
    reasoning_steps.append("Step 5: Compile the Causal Undirected Skeleton")
    reasoning_steps.append(f"  - Edges: {{{format_pairs(sorted(edges))}}}")

    return '\n'.join(reasoning_steps)

def generate_causal_skeleton_reasoning(text):
    """
    Parse a premise and return the edge-list style reasoning trace for it.

    :param text: Raw premise text, passed to ``parse_input``.
    :return: The string built by ``construct_causal_skeleton_with_steps``.
    """
    return construct_causal_skeleton_with_steps(parse_input(text))

# Parse the input data to the incident graph format

In [20]:
def build_correlation_adjacency(correlations):
    """
    Turn a list of correlated pairs into a symmetric neighbour lookup.

    :param correlations: Iterable of (var1, var2) pairs.
    :return: ``defaultdict(set)`` mapping every node that appears in a pair to
        the set of nodes it is paired with.
    """
    neighbours = defaultdict(set)
    for left, right in correlations:
        # Register the relation in both directions
        for source, target in ((left, right), (right, left)):
            neighbours[source].add(target)
    return neighbours


def format_correlation_adjacency(adjacency):
    """
    Render a correlation lookup as one sentence per node.

    Nodes and their neighbours are listed alphabetically. The sentences are
    joined with a newline followed by "  - ", so the caller supplies the bullet
    for the first sentence.

    :param adjacency: Mapping from node name to the set of correlated nodes.
    :return: The joined sentences (empty string for an empty mapping).
    """
    def describe(node):
        peers = sorted(adjacency[node])
        if not peers:
            return f"Node {node} has no correlations."
        # Singular vs. plural wording depends on how many neighbours there are
        noun = "nodes" if len(peers) != 1 else "node"
        return f"Node {node} is correlated with {noun} {', '.join(peers)}."

    return '\n  - '.join(describe(node) for node in sorted(adjacency.keys()))


def format_adjacency(adjacency):
    """
    Render a graph adjacency mapping as one "is connected to" sentence per node.

    Nodes and their neighbours are listed alphabetically. The sentences are
    joined with a newline followed by "  - ", so the caller supplies the bullet
    for the first sentence.

    :param adjacency: Mapping from node name to the set of connected nodes.
    :return: The joined sentences (empty string for an empty mapping).
    """
    sentences = []
    for var in sorted(adjacency.keys()):
        linked = sorted(adjacency[var])
        if not linked:
            sentences.append(f"Node {var} has no connections.")
            continue
        # Singular vs. plural wording depends on how many neighbours there are
        noun = "nodes" if len(linked) != 1 else "node"
        sentences.append(f"Node {var} is connected to {noun} {', '.join(linked)}.")
    return '\n  - '.join(sentences)


def construct_incident_causal_skeleton_with_steps(parsed_data):
    """
    Derive the undirected causal skeleton and narrate each step in per-node form.

    Same procedure as the edge-list variant (start from the correlated pairs,
    drop pairs declared marginally or conditionally independent), but the graph
    is described node by node ("Node A is connected to nodes B, C.").

    :param parsed_data: Dict with keys 'variables', 'correlations',
        'marginal_independencies' and 'conditional_independencies', in the
        structure returned by ``parse_input``.
    :return: Multi-line string covering Steps 1-5; steps 2-5 are preceded by a
        blank line.
    """
    nodes = parsed_data['variables']
    correlated_pairs = parsed_data['correlations']
    marginal_pairs = parsed_data['marginal_independencies']
    conditional_facts = parsed_data['conditional_independencies']

    def neighbours_of(edge_set):
        # Node -> neighbours table covering every declared node for the given edges
        table = {node: set() for node in nodes}
        for edge in edge_set:
            first, second = sorted(edge)
            table[first].add(second)
            table[second].add(first)
        return table

    def noun_for(items):
        # Singular wording for exactly one item, plural otherwise
        return "node" if len(items) == 1 else "nodes"

    # Step 1: Read the Data
    correlation_summary = format_correlation_adjacency(
        build_correlation_adjacency(correlated_pairs)
    )
    report = [
        "Step 1: Read the Data",
        f"- Extracted nodes: {', '.join(nodes)}",
        "- Correlations:",
        f"  - {correlation_summary}",
    ]

    if marginal_pairs:
        report.append("- Marginal Independencies:")
        report.extend(
            f"  - Node {left} is independent of node {right}."
            for left, right in marginal_pairs
        )
    else:
        report.append("- Marginal Independencies: None")

    if conditional_facts:
        described = [
            f"Nodes {fact['vars'][0]} and {fact['vars'][1]} are independent given "
            f"{noun_for(fact['given'])} {', '.join(fact['given'])}"
            for fact in conditional_facts
        ]
        report.append("- Conditional Independencies:\n  - " + '\n  - '.join(described))
    else:
        report.append("- Conditional Independencies: None")

    # Step 2: Initialize the Graph with one undirected edge per correlated pair
    report.append("\nStep 2: Initialize the Graph")
    edges = {frozenset([left, right]) for left, right in correlated_pairs}
    report.append("Created edges between all correlated variable pairs. In this graph:")
    report.append(f"  - {format_adjacency(neighbours_of(edges))}")

    # Step 3: Apply Marginal Independencies
    report.append("\nStep 3: Apply Marginal Independencies")
    dropped = 0
    for left, right in marginal_pairs:
        candidate = frozenset([left, right])
        if candidate not in edges:
            continue
        edges.remove(candidate)
        dropped += 1
        report.append(f"- **Because {left} is independent of {right}, there is no edge between {left} and {right}.**")
    if dropped == 0:
        report.append("- No edges removed in this step.")

    # Step 4: Apply Conditional Independencies
    report.append("\nStep 4: Apply Conditional Independencies")
    dropped = 0
    for fact in conditional_facts:
        left, right = fact['vars']
        candidate = frozenset([left, right])
        # Only the first statement about a pair removes (and reports) the edge
        if candidate not in edges:
            continue
        edges.remove(candidate)
        dropped += 1
        conditioning = fact['given']
        report.append(f"- Because nodes {left} and {right} are independent given {noun_for(conditioning)} {', '.join(conditioning)}, there is no edge between nodes {left} and {right}.")
    if dropped == 0:
        report.append("- No edges removed in this step.")

    # Step 5: Compile the Causal Undirected Skeleton in Incident Form
    report.append("\nStep 5: Compile the Causal Undirected Skeleton")
    skeleton = neighbours_of(edges)
    report.append("In this graph:")
    for node in sorted(skeleton):
        linked = sorted(skeleton[node])
        if linked:
            report.append(f"  - Node {node} is connected to {noun_for(linked)} {', '.join(linked)}.")
        else:
            report.append(f"  - Node {node} has no connections.")

    return '\n'.join(report)

In [21]:
def generate_incident_causal_skeleton_reasoning(text):
    """
    Parse a premise and return the per-node reasoning trace for it.

    :param text: Raw premise text, passed to ``parse_input``.
    :return: The string built by ``construct_incident_causal_skeleton_with_steps``.
    """
    return construct_incident_causal_skeleton_with_steps(parse_input(text))

In [22]:
def construct_incident_causal_skeleton_only_answer(parsed_data):
    """
    Compute the undirected causal skeleton and return only the final answer.

    This is the "answer only" counterpart of the step-by-step variant: the
    expected answer contains the skeleton in per-node form and no reasoning
    steps. The skeleton is obtained the same way: one edge per correlated pair,
    minus every pair declared marginally or conditionally independent.

    An endpoint of a remaining edge that is missing from 'variables' is still
    listed instead of raising ``KeyError``.

    :param parsed_data: Dict with keys 'variables', 'correlations',
        'marginal_independencies' and 'conditional_independencies', in the
        structure returned by ``parse_input``.
    :return: Multi-line string: the header "Computed Causal Undirected
        Skeleton:" followed by one "  - Node ..." line per node, alphabetically.
    """
    variables = parsed_data['variables']
    correlations = parsed_data['correlations']
    marginal_independencies = parsed_data['marginal_independencies']
    conditional_independencies = parsed_data['conditional_independencies']

    # Start from every correlated pair, then drop the independent pairs.
    # frozenset makes the edge undirected: (A, B) and (B, A) are the same key.
    edges = {frozenset(pair) for pair in correlations}
    edges.difference_update(frozenset(pair) for pair in marginal_independencies)
    edges.difference_update(frozenset(indep['vars']) for indep in conditional_independencies)

    # Every declared variable gets an entry so isolated nodes are reported too
    adjacency = {var: set() for var in variables}
    for edge in edges:
        for node in edge:
            adjacency.setdefault(node, set()).update(edge - {node})

    answer_lines = ["Computed Causal Undirected Skeleton:"]
    for var in sorted(adjacency):
        connected_nodes = sorted(adjacency[var])
        if connected_nodes:
            # Singular vs. plural wording depends on the neighbour count
            node_word = "node" if len(connected_nodes) == 1 else "nodes"
            connected_str = ', '.join(connected_nodes)
            answer_lines.append(f"  - Node {var} is connected to {node_word} {connected_str}.")
        else:
            answer_lines.append(f"  - Node {var} has no connections.")

    return '\n'.join(answer_lines)

In [23]:
def generate_incident_causal_skeleton_only_answer(text):
    """
    Parse a premise and return only the final per-node skeleton description.

    :param text: Raw premise text, passed to ``parse_input``.
    :return: The string built by ``construct_incident_causal_skeleton_only_answer``.
    """
    return construct_incident_causal_skeleton_only_answer(parse_input(text))

# Test the processing function and re-format the whole datasets

In [24]:
def generate_causal_answers(text):
    """
    Build the expected answer for one premise in two renderings.

    The answer comes from ``generate_incident_causal_skeleton_only_answer``
    (final skeleton only, no reasoning steps).

    Parameters:
        text (str): The input text containing the premise.

    Returns:
        dict: 'expected_answer' holds the multi-line answer;
              'expected_answer_single_line' holds the same text with every
              newline character replaced by a backslash followed by 'n'.
    """
    expected = generate_incident_causal_skeleton_only_answer(text)
    return {
        'expected_answer': expected,
        # Escape real newlines so the answer fits on one physical line
        'expected_answer_single_line': expected.replace('\n', '\\n'),
    }

In [9]:
# Peek at the raw input text of the first training row
train_df['input'].iloc[0]

'Premise: Suppose there is a closed system of 4 variables, A, B, C and D. All the statistical relations among these 4 variables are as follows: A correlates with B. A correlates with C. A correlates with D. B correlates with C. B correlates with D. C correlates with D. However, B and D are independent given A. B and D are independent given A and C. C and D are independent given A. C and D are independent given A and B.\nHypothesis: There exists at least one collider (i.e., common effect) of A and B.'

In [10]:
# Edge-list reasoning trace for the first training row
result = generate_causal_skeleton_reasoning(train_df['input'].iloc[0])
print(result)

Step 1: Read the Data
- Extracted variables: A, B, C, D
- Correlations: (A, B), (A, C), (A, D), (B, C), (B, D), (C, D)
- Marginal Independencies: 
- Conditional Independencies:
  - (B, D) are independent given A
  - (B, D) are independent given A, C
  - (C, D) are independent given A
  - (C, D) are independent given A, B
Step 2: Initialize the Graph
- Created edges between all correlated variable pairs.
- Initial edges: {(A, B), (A, C), (A, D), (B, C), (B, D), (C, D)}
Step 3: Apply Marginal Independencies
- No edges removed in this step.
Step 4: Apply Conditional Independencies
- Because B and D are independent given A, there is no edge between B and D.
- Because C and D are independent given A, there is no edge between C and D.
Step 5: Compile the Causal Undirected Skeleton
  - Edges: {(C, B), (A, C), (A, D), (A, B)}


In [11]:
# Per-node reasoning trace for the training row at position 10
result = generate_incident_causal_skeleton_reasoning(train_df['input'].iloc[10])
print(result)

Step 1: Read the Data
- Extracted nodes: A, B, C, D
- Correlations:
  - Node A is correlated with nodes C, D.
  - Node B is correlated with nodes C, D.
  - Node C is correlated with nodes A, B, D.
  - Node D is correlated with nodes A, B, C.
- Marginal Independencies:
  - Node A is independent of node B.
- Conditional Independencies:
  - Nodes A and D are independent given nodes B, C
  - Nodes A and D are independent given node C
  - Nodes B and D are independent given nodes A, C
  - Nodes B and D are independent given node C

Step 2: Initialize the Graph
Created edges between all correlated variable pairs. In this graph:
  - Node A is connected to nodes C, D.
  - Node B is connected to nodes C, D.
  - Node C is connected to nodes A, B, D.
  - Node D is connected to nodes A, B, C.

Step 3: Apply Marginal Independencies
- No edges removed in this step.

Step 4: Apply Conditional Independencies
- Because nodes A and D are independent given nodes B, C, there is no edge between nodes A and

In [46]:
# Answer-only output for the training row at position 10
result = generate_incident_causal_skeleton_only_answer(train_df['input'].iloc[10])
print(result)

Computed Causal Undirected Skeleton:
  - Node A is connected to node C.
  - Node B is connected to node C.
  - Node C is connected to nodes A, B, D.
  - Node D is connected to node C.


In [25]:
# Smoke-test the answer generation on one sample input
def test_single_row():
    """Print both answer renderings for the first row of the global ``train_df``."""
    answers = generate_causal_answers(train_df.iloc[0]['input'])
    sections = (
        ("=== Multi-Line Answer ===", 'expected_answer'),
        ("\n=== Single-Line Answer ===", 'expected_answer_single_line'),
    )
    for heading, key in sections:
        print(heading)
        print(answers[key])

# Run the test
test_single_row()

=== Multi-Line Answer ===
Computed Causal Undirected Skeleton:
  - Node A is connected to nodes B, C, D.
  - Node B is connected to nodes A, C.
  - Node C is connected to nodes A, B.
  - Node D is connected to node A.

=== Single-Line Answer ===
Computed Causal Undirected Skeleton:\n  - Node A is connected to nodes B, C, D.\n  - Node B is connected to nodes A, C.\n  - Node C is connected to nodes A, B.\n  - Node D is connected to node A.


In [26]:
def process_dataframe_with_progress(df, input_column, output_columns, processing_function, output_file):
    """
    Apply a function to one column of a DataFrame, store selected results as new
    columns, and save the result as CSV. A tqdm progress bar tracks the run.

    The input DataFrame is not modified; all work happens on a copy.

    Parameters:
        df (pd.DataFrame): The DataFrame to process.
        input_column (str): Name of the column whose values are passed, one at a
            time, to ``processing_function``.
        output_columns (list[str]): Names of the columns to add. Each name is
            looked up among the keys returned by ``processing_function``; a name
            that is not present is added as a column filled with ``None``.
        processing_function (callable): Called with each value of
            ``input_column``; expected to return a dict keyed by output column.
        output_file (str): Path of the CSV file to write. Missing parent
            directories are created.

    Returns:
        pd.DataFrame: A copy of ``df`` with the new columns added.
    """
    import os  # local import keeps this cell self-contained

    # Create the destination directory up front, so a missing folder fails (or
    # is fixed) before the potentially long processing run, not after it.
    if isinstance(output_file, (str, os.PathLike)):
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    # Make a copy of the original DataFrame
    df_copy = df.copy()

    # Enable the tqdm progress bar for pandas (adds Series.progress_apply)
    tqdm.pandas(desc=f"Processing {', '.join(output_columns)}")

    # Apply the processing function with progress tracking
    results = df_copy[input_column].progress_apply(processing_function)

    # Expand the per-row dicts into one column per key, aligned on the original index
    results_df = pd.DataFrame(results.tolist(), index=df_copy.index)

    # Copy over only the requested columns
    for col in output_columns:
        if col in results_df.columns:
            df_copy[col] = results_df[col]
        else:
            df_copy[col] = None  # requested column was not produced by the function

    # Save the modified DataFrame as CSV (without the index column)
    df_copy.to_csv(output_file, index=False)

    print(f"Processing complete and DataFrame saved to {output_file}.")
    return df_copy

In [48]:
# Generate the expected answers for the (filtered) training split and save them as CSV.
# Only 'expected_answer' is kept; the single-line variant is disabled below.
processed_train_df = process_dataframe_with_progress(
    df=train_df,
    input_column='input',
    # output_columns=['expected_answer', 'expected_answer_single_line'],
    output_columns=['expected_answer'],
    processing_function=generate_causal_answers,
    output_file='v0.0.8/train.csv'
)

Processing expected_answer:   0%|          | 0/576 [00:00<?, ?it/s]

Processing complete and DataFrame saved to v0.0.8/train.csv.


In [72]:
# Generate both answer renderings for the validation split and save them as CSV.
# NOTE(review): the recorded output of this cell reports 'data/v0.0.3/validation.csv'
# while the path below has no 'data/' prefix, and the train cell writes to 'v0.0.8/'.
# Confirm which output location/version is current.
processed_validation_df = process_dataframe_with_progress(
    df=validation_df,
    input_column='input',
    output_columns=['expected_answer', 'expected_answer_single_line'],
    processing_function=generate_causal_answers,
    output_file='v0.0.3/validation.csv'
)

Processing expected_answer, expected_answer_single_line:   0%|          | 0/1076 [00:00<?, ?it/s]

Processing complete and DataFrame saved to data/v0.0.3/validation.csv.


In [73]:
# Generate both answer renderings for the test split and save them as CSV.
# NOTE(review): the recorded output of this cell reports 'data/v0.0.3/test.csv'
# while the path below has no 'data/' prefix. `test_df` is also reassigned by
# later cells, so the result depends on execution order. Confirm both.
processed_test_df = process_dataframe_with_progress(
    df=test_df,
    input_column='input',
    output_columns=['expected_answer', 'expected_answer_single_line'],
    processing_function=generate_causal_answers,
    output_file='v0.0.3/test.csv'
)

Processing expected_answer, expected_answer_single_line:   0%|          | 0/1162 [00:00<?, ?it/s]

Processing complete and DataFrame saved to data/v0.0.3/test.csv.


# Prepare the balanced 6-variables test dataset

In [31]:
# test_df = test_dataset.to_pandas()

# Keep only the test problems whose `num_variables` column equals the target size
num_variables = 6
test_df = test_df.loc[test_df['num_variables'].eq(num_variables)]
print(f"Test split length only with {num_variables} variables: {len(test_df)}")

Test split length only with 6 variables: 522


In [32]:
# Absolute count and percentage share of each label value
label_counts, label_percentages = (
    test_df['label'].value_counts(),
    test_df['label'].value_counts(normalize=True) * 100,
)

# Report the distribution (a label that never occurs is shown as 0)
print(f"Label distribution in test dataset with {num_variables} variables:")
print(f"0 (False): {label_counts.get(0, 0)} ({label_percentages.get(0, 0):.1f}%)")
print(f"1 (True): {label_counts.get(1, 0)} ({label_percentages.get(1, 0):.1f}%)")


Label distribution in test dataset with 6 variables:
0 (False): 402 (77.0%)
1 (True): 120 (23.0%)


In [33]:
# Balance the two labels by downsampling the larger class to the smaller one.
# The per-class size is derived from the data instead of a hard-coded count.
positive_rows = test_df[test_df['label'] == 1]
negative_rows = test_df[test_df['label'] == 0]
n_per_class = min(len(positive_rows), len(negative_rows))

balanced_test_df = pd.concat([
    # Positives are kept as-is unless they are the larger class
    positive_rows if len(positive_rows) <= len(negative_rows)
    else positive_rows.sample(n=n_per_class, random_state=42),
    negative_rows.sample(n=n_per_class, random_state=42)
])

# Shuffle the dataset so the two classes are interleaved
test_df = balanced_test_df.sample(frac=1, random_state=42).reset_index(drop=True)

label_counts = test_df['label'].value_counts()
label_percentages = test_df['label'].value_counts(normalize=True) * 100
print(f"Label distribution in test dataset with {num_variables} variables:")
print(f"0 (False): {label_counts.get(0, 0)} ({label_percentages.get(0, 0):.1f}%)")
print(f"1 (True): {label_counts.get(1, 0)} ({label_percentages.get(1, 0):.1f}%)")

Label distribution in test dataset with 6 variables:
0 (False): 120 (50.0%)
1 (True): 120 (50.0%)


In [34]:
# Generate the expected answers for the balanced test set (`test_df` as reassigned
# by the balancing cell) and save them as CSV. Only 'expected_answer' is kept.
processed_balanced_test_df = process_dataframe_with_progress(
    df=test_df,
    input_column='input',
    # output_columns=['expected_answer', 'expected_answer_single_line'],
    output_columns=['expected_answer'],
    processing_function=generate_causal_answers,
    output_file='test/balanced_50_50_test.csv'
)

Processing expected_answer:   0%|          | 0/240 [00:00<?, ?it/s]

Processing complete and DataFrame saved to test/balanced_50_50_test.csv.
