In [23]:
import os
import random

import lingam
import networkx as nx
import numpy as np
import pandas as pd
from lingam.direct_lingam import DirectLiNGAM
from lingam.utils import make_prior_knowledge, make_dot
from sklearn.preprocessing import StandardScaler

from evaluation import evaluate_graph
from true_graph import create_true_graph_student, create_true_graph_student_small, create_true_graph_adult, create_true_graph_adult_small

In [24]:
def set_random_seed(seed):
    """Seed the random number generators for reproducibility.

    Seeds the standard-library ``random`` module and NumPy's legacy global
    generator, and exports ``PYTHONHASHSEED`` with the same value.

    Args:
        seed: Integer seed applied to every generator.

    Note:
        Hash randomization is fixed when the interpreter starts, so writing
        ``PYTHONHASHSEED`` here cannot change the running kernel; it is only
        inherited by subprocesses launched afterwards.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [25]:
def run_direct_lingam(data, labels, prior_knowledge=None, seed=None, max_iter=5000):
    """Fit DirectLiNGAM on standardized data and return its adjacency matrix.

    Args:
        data: Observations handed directly to ``StandardScaler.fit_transform``.
        labels: Variable names. Not used inside this function; kept so
            existing callers do not have to change.
        prior_knowledge: Optional prior knowledge matrix forwarded to
            ``DirectLiNGAM``. It is printed before fitting when provided.
        seed: Optional seed. When given, the global generators are seeded via
            ``set_random_seed`` and the same value is passed to the estimator
            as ``random_state``.
        max_iter: Not used by this function; accepted only for backward
            compatibility with existing call sites.

    Returns:
        The ``adjacency_matrix_`` attribute of the fitted model.
    """
    if seed is not None:
        set_random_seed(seed)

    # Standardize the data; the caller's array is not modified because
    # fit_transform's result is bound to a new local name.
    standardized = StandardScaler().fit_transform(data)

    # Print prior knowledge matrix for debugging
    if prior_knowledge is not None:
        print("Prior Knowledge Matrix:")
        print(prior_knowledge)

    # Pass the seed to the estimator as well, so reproducibility does not
    # rest on global generator state alone.
    model = DirectLiNGAM(random_state=seed, prior_knowledge=prior_knowledge)
    model.fit(standardized)

    # Get and print the adjacency matrix for debugging
    adjacency_matrix = model.adjacency_matrix_
    print("Adjacency Matrix After Fitting:")
    print(adjacency_matrix)

    return adjacency_matrix

In [26]:
# Per-dataset settings: the processed CSV to load and the column indices that
# are handed to make_prior_knowledge as exogenous / sink variables.
_DATASET_SETTINGS = {
    'adult': {
        'data_file': 'data/processed_adult.csv',
        'exogenous_vars': [0, 9],  # 'age' at index 0, 'native.country' at index 9
        'sink_vars': [10],  # 'income' at index 10
    },
    'adult_small': {
        'data_file': 'data/processed_adult_small.csv',
        'exogenous_vars': [2, 6],  # 'age' at index 2, 'native.country' at index 6
        'sink_vars': [0],  # 'income' at index 0
    },
    'student': {
        'data_file': 'data/processed_student.csv',
        'exogenous_vars': [2],  # 'Medu' at index 2
        'sink_vars': [7],  # 'G_avg' at index 7
    },
    'student_small': {
        'data_file': 'data/processed_student_small.csv',
        'exogenous_vars': [2],  # 'Medu' at index 2
        'sink_vars': [1],  # 'G_avg' at index 1
    },
}


def _build_true_graph(dataset):
    """Return the reference graph for ``dataset`` from the true_graph helpers."""
    factories = {
        'adult': create_true_graph_adult,
        'adult_small': create_true_graph_adult_small,
        'student': create_true_graph_student,
        'student_small': create_true_graph_student_small,
    }
    return factories[dataset]()


def _fit_render_evaluate(data, labels, true_graph, prior_knowledge, seed,
                         matrix_title, render_name, metrics_prefix):
    """Fit DirectLiNGAM once, then print, render and score the result."""
    adjacency_matrix = run_direct_lingam(data, labels, prior_knowledge=prior_knowledge, seed=seed)
    print(matrix_title)
    print(adjacency_matrix)

    # Visualize the estimated graph using make_dot
    dot = make_dot(adjacency_matrix, labels=labels)
    dot.render(render_name)  # Save the graph as a PDF/PNG file
    dot.view()  # Open the graph in the default viewer

    # NOTE(review): nx.DiGraph(matrix) adds an edge row -> column for every
    # non-zero entry, whereas LiNGAM's adjacency matrix is documented as
    # effect-row / cause-column. Unless evaluate_graph or the true graph
    # already account for that, the estimated edges are reversed here and
    # `adjacency_matrix.T` would be needed -- confirm against evaluation.py.
    estimated_graph = nx.DiGraph(adjacency_matrix)
    if true_graph is not None:
        shd, recall, precision = evaluate_graph(estimated_graph, true_graph)
        print(f"{metrics_prefix} - SHD: {shd}, Recall: {recall}, Precision: {precision}")


def main(dataset='adult_small', seed=42):
    """Run DirectLiNGAM with and without background knowledge on one dataset.

    Loads the processed CSV for ``dataset``, builds a prior knowledge matrix
    from that dataset's exogenous and sink column indices, then fits the model
    twice (first without, then with the prior knowledge). Each fit is printed,
    rendered with ``make_dot`` and, when a true graph is available, scored
    with ``evaluate_graph``.

    Args:
        dataset: One of 'adult', 'adult_small', 'student', 'student_small'.
        seed: Seed forwarded to ``run_direct_lingam`` for both fits.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names. The
            check happens before any file is read.
    """
    if dataset not in _DATASET_SETTINGS:
        raise ValueError("Invalid dataset specified. Choose 'adult', 'adult_small', 'student', or 'student_small'.")
    settings = _DATASET_SETTINGS[dataset]

    # Load the appropriate processed CSV file
    df_encoded = pd.read_csv(settings['data_file'])
    labels = df_encoded.columns.tolist()
    data = df_encoded.to_numpy()
    true_graph = _build_true_graph(dataset)

    print(f"Processing dataset: {dataset}")
    print("Data loaded and prepared.")
    print(f"Labels: {labels}")

    # Construct the prior knowledge matrix
    prior_knowledge = make_prior_knowledge(
        n_variables=len(labels),
        exogenous_variables=settings['exogenous_vars'],
        sink_variables=settings['sink_vars']
    )

    # Baseline: no background knowledge
    _fit_render_evaluate(
        data, labels, true_graph,
        prior_knowledge=None,
        seed=seed,
        matrix_title="Adjacency matrix with no background knowledge:",
        render_name="causal_graph_no_background_knowledge",
        metrics_prefix="Without Background Knowledge",
    )

    # Same pipeline, constrained by the prior knowledge matrix
    _fit_render_evaluate(
        data, labels, true_graph,
        prior_knowledge=prior_knowledge,
        seed=seed,
        matrix_title="Adjacency matrix with background knowledge:",
        render_name="causal_graph_with_background_knowledge",
        metrics_prefix="With Background Knowledge",
    )

# Entry point: run the pipeline with main()'s defaults when this cell or
# script is executed directly rather than imported.
if __name__ == "__main__":
    main()

Processing dataset: adult_small
Data loaded and prepared.
Labels: ['income', 'hours.per.week', 'age', 'education', 'workclass', 'occupation', 'native.country']
Adjacency Matrix After Fitting:
[[ 0.          0.          0.          0.          0.          0.
  -0.04020444]
 [ 0.22539219  0.          0.          0.04126435  0.04591983  0.
   0.        ]
 [ 0.23304742  0.04616827  0.         -0.02440765  0.07475075 -0.02064667
   0.        ]
 [ 0.07726351  0.          0.          0.          0.         -0.04302343
  -0.09807285]
 [ 0.          0.          0.          0.          0.          0.
   0.        ]
 [ 0.05157686  0.          0.          0.          0.          0.
   0.        ]
 [ 0.          0.          0.          0.          0.          0.
   0.        ]]
Adjacency matrix with no background knowledge:
[[ 0.          0.          0.          0.          0.          0.
  -0.04020444]
 [ 0.22539219  0.          0.          0.04126435  0.04591983  0.
   0.        ]
 [ 0.23304742  