# Causal Discovery & ML - From Assumptions to Applications

### Loading Libraries

In [1]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# IterTools
from itertools import combinations

# Statistical Models
from scipy import stats
import statsmodels.api as sm

# Networkx
import networkx as nx

# JavaScript Object Notation
import json

# Data Visualization
import graphviz
import seaborn as sns
import matplotlib.pyplot as plt

# Scikit-Learn
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV
from sklearn.metrics import mean_absolute_percentage_error

# Castle
import castle
from castle.common import GraphDAG
from castle.metrics import MetricsDAG
from castle.datasets import DAG, IIDSimulation 

from castle.algorithms import PC, GES
from castle.algorithms import ANMNonlinear, ICALiNGAM, DirectLiNGAM
from castle.algorithms import Notears, NotearsNonlinear, GOLEM

from castle.common.priori_knowledge import PrioriKnowledge

from castle.common.independence_tests import hsic_test

# PyTorch & Transformers
import torch

# DoWhy Causal Libraries
import dowhy
from dowhy import gcm
from dowhy import CausalModel
from dowhy.causal_model import CausalModel

# Notebook Iteration 
from tqdm import tqdm

# Operating Systems & Environments
import os
os.environ['CASTLE_BACKEND'] = 'pytorch'

# Copy
from copy import deepcopy

# Light Gradient Boosting Models
from lightgbm import LGBMRegressor, LGBMClassifier

2024-08-27 08:36:18,717 - /Users/isisromero/anaconda3/envs/CAUSINF/lib/python3.11/site-packages/castle/backend/__init__.py[line:36] - INFO: You can use `os.environ['CASTLE_BACKEND'] = backend` to set the backend(`pytorch` or `mindspore`).
2024-08-27 08:36:18,734 - /Users/isisromero/anaconda3/envs/CAUSINF/lib/python3.11/site-packages/castle/algorithms/__init__.py[line:36] - INFO: You are using ``pytorch`` as the backend.


#### Setting Viz Standards

In [2]:
# Shared plot palette; plots in this notebook pick colours by position
COLORS = [
    '#00B0F0',  # [0] graph nodes and raw-data scatter points
    '#FF0000',  # [1] fitted / causal-direction model
    '#B0F000'   # [2] anti-causal model
]

#### Setting Seeds

In [3]:
# Seed every RNG the notebook relies on, once and up front, so that a
# fresh-kernel "Run All" reproduces the same numbers.
SEED = 18

# NumPy's legacy global RNG backs the `np.random.*` sampling calls below
np.random.seed(SEED)

# PyTorch is the configured gCastle backend, so seed it as well.
# The trailing `;` keeps the returned Generator out of the cell output.
torch.manual_seed(SEED);

## Introduction to gCastle

### Synthetic Data in gCastle

In [9]:
# Generate a scale-free adjacency matrix
adj_matrix = DAG.scale_free(
    n_nodes=10,
    n_edges=17,
    seed=SEED
)

In [10]:
adj_matrix

In [13]:
# Visualize the adjacency matrix
g = nx.DiGraph(adj_matrix)

plt.figure(figsize=(12, 8))
nx.draw(
    G=g,
    node_color=COLORS[0],
    node_size=1200,
    arrowsize=17,
    with_labels=True,
    font_color='white',
    font_size=21,
    pos=nx.circular_layout(g)
)

In [14]:
dataset = IIDSimulation(
    W=adj_matrix, 
    n=10000, 
    method='linear', 
    sem_type='gauss'
)

In [15]:
# Access the generated data
dataset.X

### Fitting a First Causal Discovery Model

In [None]:
# Instantiate the model
pc = PC()

In [None]:
# Fit the model
pc.learn(dataset.X)

In [None]:
pred_dag = pc.causal_matrix
pred_dag

#### Visualizing The Model

In [None]:
# Build a directed graph from the adjacency matrix learned by PC
g_pred = nx.DiGraph(pred_dag)

plt.figure(figsize=(12, 8))
nx.draw(
    G=g_pred,
    node_color=COLORS[0],
    node_size=1200,
    arrowsize=17,
    with_labels=True,
    font_color='white',
    font_size=21,
    # Lay out the graph that is actually being drawn, so this cell no longer
    # depends on `g` surviving from an earlier cell
    pos=nx.circular_layout(g_pred)
)

In [None]:
GraphDAG(
    est_dag=pred_dag, 
    true_dag=adj_matrix)

plt.show()

#### Model Evaluation Metrics

In [None]:
metrics = MetricsDAG(
    B_est=pred_dag, 
    B_true=adj_matrix)

In [None]:
metrics.metrics['F1']

In [None]:
metrics.metrics

In [None]:
	
def get_n_undirected(g):
    """Count the undirected edges encoded in an adjacency matrix.

    An undirected edge between nodes ``i`` and ``j`` (``i != j``) is
    represented by a ``1`` in both ``g[i, j]`` and ``g[j, i]``.

    Parameters
    ----------
    g : array-like of shape (n_nodes, n_nodes)
        Square adjacency matrix.

    Returns
    -------
    float
        Number of undirected edges. Returned as a float to stay compatible
        with the previous implementation, which accumulated in steps of 0.5.

    Raises
    ------
    ValueError
        If ``g`` is not a square 2D matrix.
    """
    adj = np.asarray(g)

    if adj.ndim != 2 or adj.shape[0] != adj.shape[1]:
        raise ValueError(
            f'`g` should be a square 2D matrix. Received shape {adj.shape}.'
        )

    # True wherever an edge is present in both directions
    bidirectional = (adj == 1) & (adj.T == 1)

    # The strict upper triangle visits every unordered pair exactly once and
    # skips the diagonal, so a self-loop is never mistaken for half an edge
    return float(np.triu(bidirectional, k=1).sum())

In [None]:
get_n_undirected(pred_dag)

In [None]:
np.tril(pred_dag)

### Constraint-Based Causal Discovery

In [None]:
# Build a DAG adj matrix
pc_dag = np.array([
    [0, 0, 1, 0],
    [0, 0, 1, 0],
    [0, 0, 0, 1],
    [0, 0, 0, 0]
])

In [None]:
# Sample N observations from the structural equations of the DAG above
N = 1000

# P and Q are the two independent root causes (standard normal draws)
p = np.random.randn(N)
q = np.random.randn(N)

# R is the sum of both roots, S is a scaled copy of R;
# each gets a small amount of additive Gaussian noise
r = (p + q) + np.random.randn(N) * .1
s = r * .7 + np.random.randn(N) * .1

# Assemble one column per variable -> array of shape (N, 4)
pc_dataset = np.column_stack((p, q, r, s))

In [None]:
# Draw the ground-truth graph

# Turn the ground-truth adjacency matrix into a directed graph
true_graph = nx.DiGraph(pc_dag)

# Replace the integer node ids 0..3 with the variable names
MAPPING = dict(enumerate(['P', 'Q', 'R', 'S']))
true_graph = nx.relabel_nodes(true_graph, MAPPING, copy=True)

plt.figure(figsize=(7, 3))
nx.draw(
    true_graph,
    pos=nx.circular_layout(true_graph),
    with_labels=True,
    node_color=COLORS[0],
    node_size=1200,
    arrowsize=17,
    font_color='white',
    font_size=21
)

### PC

In [None]:
# Initialize PC
pc = PC()

# Fit 
pc.learn(pc_dataset)

# Display the learned matrix
pc.causal_matrix

In [None]:
# Visualize
GraphDAG(
    est_dag=pc.causal_matrix, 
    true_dag=pc_dag)

plt.show()

In [None]:
# Get metrics
MetricsDAG(
    B_est=pc.causal_matrix,
    B_true=pc_dag
).metrics

### PC-Stable

In [2]:
# PC-stable
pc_stable = PC(variant='stable')

# Fit 
pc_stable.learn(pc_dataset)

# Visualize
GraphDAG(
    est_dag=pc_stable.causal_matrix, 
    true_dag=pc_dag)

plt.show()

### PC-Parallel

In [None]:
# PC-parallel
pc_parallel = PC(variant='parallel')

# Fit on the same dataset used for the other PC variants
pc_parallel.learn(pc_dataset)

# Visualize the estimated graph against the ground-truth DAG
GraphDAG(
    est_dag=pc_parallel.causal_matrix, 
    true_dag=pc_dag)

plt.show()

### PC with Categorical Data

In [None]:
# Generate the data
a = np.random.binomial(4, .5, N)
b = np.random.binomial(4, .5, N)
c = ((a + b + np.random.normal(0, 1, N)) > 2).astype(int)

# To matrix
pc_cat_dataset = np.vstack([a, b, c]).T

In [None]:
pc_cat = PC(ci_test='chi2')

In [None]:
pc_cat.learn(pc_cat_dataset)

In [None]:
pc_cat.causal_matrix

### PC with Alternative Independence Tests

In [None]:
from castle.common.independence_tests import CITest

In [None]:
pc_cat_alt = PC(ci_test=CITest.cressie_read)

In [None]:
pc_cat_alt.learn(pc_cat_dataset)

In [None]:
pc_cat_alt.causal_matrix

### Score-Based Causal Discovery

In [None]:
# Instantiate GES
ges = GES(criterion='bic')

In [None]:
# Train
ges.learn(pc_dataset)

In [None]:
# Visualize
GraphDAG(
    est_dag=ges.causal_matrix, 
    true_dag=pc_dag)

plt.show()

## Function-Based Causal Discovery

### ANM Model (from scratch)

In [None]:
# Define helpers
from pygam import LinearGAM


class GAM:
    """Minimal fit/predict wrapper around pyGAM's ``LinearGAM``.

    Parameters
    ----------
    n_splines : int
        Value forwarded to ``LinearGAM(n_splines=...)``.
    """

    def __init__(self, n_splines):
        self.n_splines = n_splines
        # Set by `fit`; `None` marks an instance that has not been fitted yet
        self.model = None

    def fit(self, x, y):
        """Fit a ``LinearGAM`` on ``(x, y)`` via its ``gridsearch`` method.

        Parameters
        ----------
        x : array-like, 2D
            Predictor matrix; a single feature must be passed as a column
            (e.g. ``x.reshape(-1, 1)``).
        y : array-like
            Target values.

        Returns
        -------
        GAM
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``x`` is not two-dimensional.
        """
        x = np.asarray(x)

        # Explicit raise instead of `assert`: assertions vanish under `python -O`
        if x.ndim != 2:
            raise ValueError(
                f'`x` should be 2D array. Received {x.ndim} dimensional array.'
            )

        # Keep whatever `gridsearch` returns as the fitted model
        self.model = LinearGAM(n_splines=self.n_splines).gridsearch(x, y)
        return self

    def predict(self, x):
        """Return the fitted model's predictions for ``x``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.model is None:
            raise RuntimeError(
                'This GAM instance is not fitted yet. Call `fit` before `predict`.'
            )
        return self.model.predict(x)

In [None]:
# Create data
x = np.random.randn(1000)
y = x**3 + np.random.randn(1000)

In [None]:
# Plot data
plt.style.use('fivethirtyeight')

plt.figure(figsize=(10, 7))

plt.scatter(x, y, alpha=.5, color=COLORS[0])

plt.xlabel('$X$')
plt.ylabel('$Y$')

plt.show()

In [None]:
# Define params
n_splines = 150

# Instantiate the models 
model_xy = GAM(n_splines=n_splines)
model_yx = GAM(n_splines=n_splines)

# Fit the models
model_xy.fit(x.reshape(-1, 1), y)
model_yx.fit(y.reshape(-1, 1), x)

# Generate predictions
y_pred = model_xy.predict(x.reshape(-1, 1))
x_pred = model_yx.predict(y.reshape(-1, 1))

In [None]:
# Visualize the fitted model
plt.figure(figsize=(10, 7))
plt.scatter(x, y, alpha=.5, color=COLORS[0], label='Data')
plt.scatter(x, y_pred, alpha=.5, color=COLORS[1], label='Causal model')
plt.scatter(x_pred, y, alpha=.5, color=COLORS[2], label='Anti-causal model')
plt.legend()
plt.show()

In [None]:
# Get the residuals
residuals_xy = y - y_pred
residuals_yx = x - x_pred

In [None]:
plt.figure(figsize=(15, 7))
plt.subplot(121)
plt.scatter(x, residuals_xy, alpha=.5, color=COLORS[0])
plt.xlabel('$X$', fontsize=14)
plt.ylabel('$Y-residuals$', fontsize=14)

plt.subplot(122)
plt.scatter(residuals_yx, y, alpha=.5, color=COLORS[0])
plt.xlabel('$X-residuals$', fontsize=14)
plt.ylabel('$Y$', fontsize=14)

plt.show()

In [None]:
# Compute HSIC
is_indep_xy = hsic_test(
    x = x.reshape(-1, 1), 
    y = residuals_xy.reshape(-1, 1),
    alpha=.05
) 

is_indep_yx = hsic_test(
    x = y.reshape(-1, 1), 
    y = residuals_yx.reshape(-1, 1),
    alpha=.05
)

In [None]:
is_indep_xy, is_indep_yx

### ANM model (gCastle)

In [None]:
# Instantiate 
anm = ANMNonlinear(alpha=.1)

In [None]:
# Train
nonlinear_dataset = np.vstack([x, y]).T
anm.learn(nonlinear_dataset)

In [None]:
anm.causal_matrix

## LiNGAM Time!

In [None]:
# Generate data
SAMPLE_SIZE = 1000

x_gauss = np.random.normal(0, 1, SAMPLE_SIZE)
y_gauss = x_gauss + 0.3 * np.random.normal(0, 1, SAMPLE_SIZE)

x_ngauss = np.random.uniform(0, 1, SAMPLE_SIZE)
y_ngauss = x_ngauss + 0.3 * np.random.uniform(0, 1, SAMPLE_SIZE)

In [None]:
# Regress each dataset in both directions and keep predictions + residuals
results = {}

for name, v in {
    'Gaussian': (x_gauss, y_gauss),
    'non-Gaussian': (x_ngauss, y_ngauss),
}.items():

    # Each direction maps to the (predictor, target) positions inside `v`,
    # where v[0] is x and v[1] is y
    for direction, (predictor_idx, target_idx) in {
        'y ~ x': (0, 1),
        'x ~ y': (1, 0),
    }.items():

        key = f'{name} | {direction}'
        print(key)

        # Fit a one-feature linear model (the predictor must be a column vector)
        lr = LinearRegression().fit(v[predictor_idx].reshape(-1, 1), v[target_idx])

        # In-sample predictions and what the model leaves unexplained
        preds = lr.predict(v[predictor_idx].reshape(-1, 1))
        residuals = v[target_idx] - preds

        results[key] = (preds, residuals)

In [None]:
# Marker transparency shared by every scatter plot in this figure
ALPHA = .3

# 2 x 4 grid: top row = Gaussian data, bottom row = non-Gaussian data;
# columns = raw Y~X, residuals Y~X, raw X~Y, residuals X~Y
plt.figure(figsize=(20, 10))

# Plot X vs Y + regression lines
plt.subplot(241)
plt.scatter(x_gauss, y_gauss, label='Linear Gaussian', alpha=ALPHA, color=COLORS[0])
plt.plot(x_gauss, results['Gaussian | y ~ x'][0], color=COLORS[1], label='Fitted model')
plt.legend()
plt.ylabel('$Y$', alpha=.7, fontsize=14)
plt.title('Raw data\nY ~ X')

plt.subplot(245)
plt.scatter(x_ngauss, y_ngauss, label='Linear non-Gaussian', alpha=ALPHA, color=COLORS[0])
plt.plot(x_ngauss, results['non-Gaussian | y ~ x'][0], color=COLORS[1], label='Fitted model')
plt.xlabel('$X$', alpha=.7, fontsize=14)
plt.ylabel('$Y$', alpha=.7, fontsize=14)
plt.legend()


# Plot residuals Y ~ X
# NOTE: labels containing `\hat` are raw strings; in a normal string literal
# `\h` is an invalid escape sequence and triggers a warning on recent Pythons
plt.subplot(242)
plt.scatter(x_gauss, results['Gaussian | y ~ x'][1], label='Linear Gaussian', color=COLORS[0], alpha=ALPHA)
plt.legend()
plt.ylabel(r'$Y - \hat{Y}$', alpha=.7, fontsize=14)
plt.xlabel('$X$', alpha=.7, fontsize=14)
plt.title('Residuals\nY ~ X')

plt.subplot(246)
plt.scatter(x_ngauss, results['non-Gaussian | y ~ x'][1], label='Linear non-Gaussian', color=COLORS[0], alpha=ALPHA)
plt.legend()
plt.ylabel(r'$Y - \hat{Y}$', alpha=.7, fontsize=14)
plt.xlabel('$X$', alpha=.7, fontsize=14)


# Plot Y vs X + regression lines
plt.subplot(243)
plt.scatter(y_gauss, x_gauss, label='Linear Gaussian', alpha=ALPHA, color=COLORS[0])
plt.plot(y_gauss, results['Gaussian | x ~ y'][0], color=COLORS[1], label='Fitted model')
plt.legend()
plt.ylabel('$X$', alpha=.7, fontsize=14)
plt.title('Raw data\nX ~ Y')

plt.subplot(247)
plt.scatter(y_ngauss, x_ngauss, label='Linear non-Gaussian', alpha=ALPHA, color=COLORS[0])
plt.plot(y_ngauss, results['non-Gaussian | x ~ y'][0], color=COLORS[1], label='Fitted model')
plt.xlabel('$Y$', alpha=.7, fontsize=14)
plt.ylabel('$X$', alpha=.7, fontsize=14)
plt.legend()


# Plot residuals X ~ Y
plt.subplot(244)
plt.scatter(y_gauss, results['Gaussian | x ~ y'][1], label='Linear Gaussian', color=COLORS[0], alpha=ALPHA)
plt.legend()
plt.xlabel('$Y$', alpha=.7, fontsize=14)
plt.ylabel(r'$X - \hat{X}$', alpha=.7, fontsize=14)
plt.title('Residuals\nX ~ Y')

plt.subplot(248)
plt.scatter(y_ngauss, results['non-Gaussian | x ~ y'][1], label='Linear non-Gaussian', color=COLORS[0], alpha=ALPHA)
plt.legend()
plt.xlabel('$Y$', alpha=.7, fontsize=14)
plt.ylabel(r'$X - \hat{X}$', alpha=.7, fontsize=14)

plt.tight_layout()
plt.show()

### LiNGAM in Action