In [11]:
import re

import pandas as pd
from tqdm.auto import tqdm 
    
from datasets import load_dataset

In [12]:
# Download the CORR2CAUSE dataset
dataset_name = "causal-nlp/corr2cause"
try:
    dataset = load_dataset(dataset_name)
    print("Dataset successfully loaded.")
except Exception as e:
    print(f"Error loading the dataset: {e}")

Dataset successfully loaded.


In [13]:
# Access the train, test, and validation splits
train_dataset = dataset['train']
test_dataset = dataset['test']
validation_dataset = dataset['validation']

# Convert to Pandas DataFrames
train_df = train_dataset.to_pandas()
test_df = test_dataset.to_pandas()
validation_df = validation_dataset.to_pandas()

# Display the length of each split
print(f"Train split length: {len(train_df)}")
print(f"Test split length: {len(test_df)}")
print(f"Validation split length: {len(validation_df)}")

Train split length: 205734
Test split length: 1162
Validation split length: 1076


In [14]:
# Function to extract only problem with given amount of num_variables [4-6]
def filter_by_num_variables(df, target_num):
    """
    Filters the DataFrame to include only rows where the 'num_variables' column
    equals the specified target number.

    Parameters:
    - df (pd.DataFrame): The input DataFrame containing a 'num_variables' column.
    - target_num (int): The number of variables to filter by.

    Returns:
    - pd.DataFrame: A filtered DataFrame with rows where 'num_variables' == target_num.
    """
    if 'num_variables' not in df.columns:
        raise ValueError("The DataFrame does not contain a 'num_variables' column.")
    
    # Ensure that 'num_variables' column contains numeric data
    if not pd.api.types.is_numeric_dtype(df['num_variables']):
        raise TypeError("'num_variables' column must be of a numeric type.")
    
    # Filter the DataFrame
    filtered_df = df[df['num_variables'] == target_num].copy()
    
    return filtered_df


train_df = filter_by_num_variables(train_df, 4)
print(f"Train split length: {len(train_df)}")

Train split length: 576


In [18]:
def parse_input(text):
    # Initialize containers
    variables = set()
    correlations = []
    marginal_independencies = []
    conditional_independencies = []

    # Extract variables
    var_match = re.search(r'variables?(.*?)[\.\n]', text)
    if var_match:
        vars_text = var_match.group(1)
        # Split vars_text by commas and 'and'
        vars_list = re.split(r',\s*|\s+and\s+|\s*,\s*', vars_text)
        vars_list = [var.strip() for var in vars_list if var.strip() and var.strip().lower() != 'and']
        variables.update(vars_list)
    else:
        # Handle case when variables not found
        pass

    # Extract correlations
    # The correlations are in the text after 'All the statistical relations among these variables are as follows:'
    # and before 'However,'
    correlations_text_match = re.search(r'All the statistical relations.*?are as follows:(.*?)(However|$)', text, re.DOTALL)
    if correlations_text_match:
        correlations_text = correlations_text_match.group(1)
        # Now we need to extract all 'X correlates with Y.'
        correlation_matches = re.findall(r'([A-Za-z]+) correlates with ([A-Za-z]+)\.', correlations_text)
        correlations.extend(correlation_matches)
        variables.update([var for pair in correlation_matches for var in pair])

    # Extract independencies
    # The independencies are in the text after 'However,'
    independencies_text_match = re.search(r'However,(.*)', text, re.DOTALL)
    if independencies_text_match:
        independencies_text = independencies_text_match.group(1)
        # Now split independencies into sentences
        sentences = re.findall(r'([^.]*?\.)', independencies_text)
        for sentence in sentences:
            sentence = sentence.strip()
            # Try to match marginal independencies
            marg_match = re.match(r'([A-Za-z]+) is independent of ([A-Za-z]+)\.', sentence)
            if marg_match:
                var1 = marg_match.group(1)
                var2 = marg_match.group(2)
                marginal_independencies.append((var1, var2))
                variables.update([var1, var2])
            else:
                # Try to match conditional independencies
                cond_match = re.match(r'([A-Za-z]+) and ([A-Za-z]+) are independent given (.*?)[\.\n]', sentence)
                if cond_match:
                    var1 = cond_match.group(1)
                    var2 = cond_match.group(2)
                    given_vars_text = cond_match.group(3)
                    # Split given_vars_text by commas and 'and', strip spaces
                    given_vars = re.split(r',\s*|\s+and\s+', given_vars_text)
                    given_vars = [var.strip() for var in given_vars if var.strip() and var.strip().lower() != 'and']
                    conditional_independencies.append({
                        'vars': (var1, var2),
                        'given': given_vars
                    })
                    variables.update([var1, var2] + given_vars)
    else:
        pass

    return {
        'variables': list(sorted(variables)),
        'correlations': correlations,
        'marginal_independencies': marginal_independencies,
        'conditional_independencies': conditional_independencies
    }

In [20]:
def construct_causal_skeleton_with_steps(parsed_data):
    variables = parsed_data['variables']
    correlations = parsed_data['correlations']
    marginal_independencies = parsed_data['marginal_independencies']
    conditional_independencies = parsed_data['conditional_independencies']

    reasoning_steps = []

    # Step 1: Read the Data
    reasoning_steps.append("Step 1: Read the Data\n")
    reasoning_steps.append(f"- Extracted variables: {', '.join(variables)}")
    correlations_str = ', '.join([f"({var1}, {var2})" for var1, var2 in correlations])
    reasoning_steps.append(f"- Correlations: {correlations_str}")
    marg_indep_str = ', '.join([f"({var1}, {var2})" for var1, var2 in marginal_independencies])
    reasoning_steps.append(f"- Marginal Independencies: {marg_indep_str}")
    cond_indep_str = '\n  - '.join([
        f"({indep['vars'][0]}, {indep['vars'][1]}) are independent given {', '.join(indep['given'])}"
        for indep in conditional_independencies
    ])
    reasoning_steps.append(f"- Conditional Independencies:\n  - {cond_indep_str}\n")

    # Step 2: Initialize the Graph
    reasoning_steps.append("Step 2: Initialize the Graph\n")
    edges = set()
    for var1, var2 in correlations:
        edges.add(frozenset([var1, var2]))
    initial_edges_str = ', '.join([f"({var1}, {var2})" for var1, var2 in correlations])
    reasoning_steps.append(f"- Created edges between all correlated variable pairs.")
    reasoning_steps.append(f"- Initial edges: {{{initial_edges_str}}}\n")

    # Step 3: Apply Marginal Independencies
    reasoning_steps.append("Step 3: Apply Marginal Independencies\n")
    removed_edges = set()
    for var1, var2 in marginal_independencies:
        edge = frozenset([var1, var2])
        if edge in edges:
            edges.remove(edge)
            removed_edges.add(edge)
            reasoning_steps.append(f"- **Because {var1} is independent of {var2}, there is no edge between {var1} and {var2}.**")
    if not removed_edges:
        reasoning_steps.append("- No edges removed in this step.\n")
    else:
        reasoning_steps.append("")

    # Step 4: Apply Conditional Independencies
    reasoning_steps.append("Step 4: Apply Conditional Independencies\n")
    removed_edges_cond = set()
    for indep in conditional_independencies:
        var1, var2 = indep['vars']
        edge = frozenset([var1, var2])
        if edge in edges:
            edges.remove(edge)
            removed_edges_cond.add(edge)
            given_vars_str = ', '.join(indep['given'])
            reasoning_steps.append(f"- Because {var1} and {var2} are independent given {given_vars_str}, there is no edge between {var1} and {var2}.")
    if not removed_edges_cond:
        reasoning_steps.append("- No edges removed in this step.\n")
    else:
        reasoning_steps.append("")
        
    # Step 5: Compile the Causal Undirected Skeleton
    reasoning_steps.append("Step 5: Compile the Causal Undirected Skeleton")
    final_edges = {}
    for edge in edges:
        var1, var2 = sorted(edge)
        if var1 not in final_edges:
            final_edges[var1] = []
        final_edges[var1].append(var2)

    reasoning_steps.append("In this graph:")
    for node in variables:
        if node in final_edges:
            reasoning_steps.append(f"  - Node {node} is connected to nodes {', '.join(final_edges[node])}.")
        else:
            reasoning_steps.append(f"  - Node {node} has no connections.")

    # Step 5: Compile the Remaining Edges
    # reasoning_steps.append("Step 5: Compile the Remaining Edges\n")
    # remaining_edges = [(list(edge)[0], list(edge)[1]) for edge in edges]
    # remaining_edges_str = ', '.join([f"({var1}, {var2})" for var1, var2 in remaining_edges])
    # reasoning_steps.append(f"- Remaining edges after applying independencies: {{{remaining_edges_str}}}\n")

    # Step 6: Prepare the Step-by-Step Answer
    # reasoning_steps.append("Step 6: Give the Final Answer with Computed Causal Undirected Skeleton\n")
    # reasoning_steps.append(f"  - Edges: {{{remaining_edges_str}}}")

    answer = '\n'.join(reasoning_steps)
    return answer

In [19]:
def generate_causal_skeleton_reasoning(text):
    parsed_data = parse_input(text)
    answer = construct_causal_skeleton_with_steps(parsed_data)
    return answer

In [21]:
result = generate_causal_skeleton_reasoning(train_df.iloc[0]['input'])
print(result)

Step 1: Read the Data

- Extracted variables: A, B, C, D
- Correlations: (A, B), (A, C), (A, D), (B, C), (B, D), (C, D)
- Marginal Independencies: 
- Conditional Independencies:
  - (B, D) are independent given A
  - (B, D) are independent given A, C
  - (C, D) are independent given A
  - (C, D) are independent given A, B

Step 2: Initialize the Graph

- Created edges between all correlated variable pairs.
- Initial edges: {(A, B), (A, C), (A, D), (B, C), (B, D), (C, D)}

Step 3: Apply Marginal Independencies

- No edges removed in this step.

Step 4: Apply Conditional Independencies

- Because B and D are independent given A, there is no edge between B and D.
- Because C and D are independent given A, there is no edge between C and D.

Step 5: Compile the Causal Undirected Skeleton
In this graph:
  - Node A is connected to nodes B, D, C.
  - Node B is connected to nodes C.
  - Node C has no connections.
  - Node D has no connections.


In [16]:
def process_dataframe_with_progress(df, input_column, output_column, processing_function, output_file):
    """
    Processes a DataFrame by applying a function to a specified input column and saving the result in a new column.
    Includes a progress bar to track the processing.

    Parameters:
        df (pd.DataFrame): The DataFrame to process.
        input_column (str): The name of the input column where the function will be applied.
        output_column (str): The name of the new column to store the results.
        processing_function (callable): The function to apply to each row's input column.
        output_file (str): The file path to save the processed DataFrame.

    Returns:
        pd.DataFrame: The modified DataFrame with the new column added.
    """
    # Make a copy of the original DataFrame
    df_copy = df.copy()

    # Enable the tqdm progress bar for pandas
    tqdm.pandas(desc=f"Processing {output_column}")

    # Apply the processing function with progress tracking
    df_copy[output_column] = df_copy[input_column].progress_apply(processing_function)

    # Save the modified DataFrame to a file, e.g., as a CSV
    df_copy.to_csv(output_file, index=False)

    print(f"Processing complete and DataFrame saved to {output_file}.")
    return df_copy

In [23]:
processed_train_df = process_dataframe_with_progress(
    df=train_df,
    input_column='input',
    output_column='expected_answer',
    processing_function=generate_causal_skeleton_reasoning,
    output_file='train.csv'
)

Processing expected_answer:   0%|          | 0/576 [00:00<?, ?it/s]

Processing complete and DataFrame saved to train.csv.


In [8]:
processed_validation_df = process_dataframe_with_progress(
    df=validation_df,
    input_column='input',
    output_column='expected_answer',
    processing_function=generate_causal_skeleton_reasoning,
    output_file='validation.csv'
)

Processing expected_answer:   0%|          | 0/1076 [00:00<?, ?it/s]

Processing complete and DataFrame saved to processed_validation_df.csv.
