In [3]:
import pickle
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
import joblib

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score

from d2c.benchmark import D2CWrapper

from d2c.descriptors.loader import DataLoader

from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score


In [6]:
# Load the persisted model object that is handed to D2CWrapper further down.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load a model.pkl that comes from a trusted source.
# NOTE(review): absolute, user-specific path; the notebook will not run on
# another machine without editing it.
model = joblib.load('/home/jpalombarini/td2c/notebooks/contributions/Real_data_validation/model.pkl')
def comma_to_dot(x):
    """Parse a number written with a decimal comma (e.g. ``"1,5"``) into a float.

    Used as a per-column converter for ``np.loadtxt``. Depending on the NumPy
    version and the ``encoding`` argument, converters receive either ``bytes``
    or ``str``, so both are accepted here. The same function name is also used
    as a ``pd.read_csv`` converter later in the notebook, which passes ``str``.

    Parameters
    ----------
    x : bytes or str
        Raw field text, using ``,`` as the decimal separator.

    Returns
    -------
    float
        The parsed value.

    Raises
    ------
    ValueError
        If the text is not a valid number after the comma is replaced.
    """
    if isinstance(x, (bytes, bytearray)):
        # latin1 maps every byte to a character, so decoding itself never fails.
        x = x.decode('latin1')
    return float(x.replace(',', '.'))

# Read file columns 1 and 2 of the semicolon-separated text file into a numeric
# array, skipping the header row and parsing decimal commas with comma_to_dot.
ts = np.loadtxt(
    '/home/jpalombarini/td2c/notebooks/contributions/Real_data_validation/data/veilleux/veilleux_subset_CC05a.txt',
    delimiter=';',
    skiprows=1,
    usecols=(1, 2),
    converters={1: comma_to_dot, 2: comma_to_dot},
)

In [8]:
# Configure D2C for the single series loaded above: two variables, one lag,
# and the model loaded earlier in the notebook.
# NOTE(review): the meaning of full / quantiles / cmi / mb_estimator is defined
# by d2c.benchmark.D2CWrapper and is not visible here; confirm against its docs.
d2cwrapper = D2CWrapper(
    ts_list=[ts],
    n_variables=2,
    model=model,
    maxlags=1,
    n_jobs=1,
    full=True,
    quantiles=True,
    filename='d2c_results',
    normalize=True,
    cmi='original',
    mb_estimator='original',
)

# Execute the analysis, then collect the per-series result tables.
d2cwrapper.run()

causal_df = d2cwrapper.get_causal_dfs()

Shape of X: (70, 3), Shape of Y: (70,)
Shape of X: (70, 3), Shape of Y: (70,)
Shape of X: (70, 3), Shape of Y: (70,)
Shape of X: (70, 3), Shape of Y: (70,)
Shape of X: (70, 3), Shape of Y: (70,)
Shape of X: (70, 3), Shape of Y: (70,)
Shape of X: (70, 3), Shape of Y: (70,)
Shape of X: (70, 3), Shape of Y: (70,)


In [10]:
# Display the object returned by get_causal_dfs().
causal_df

{0:    from  to effect p_value  probability  is_causal
 0     3   1   None    None         0.28      False
 1     2   0   None    None         0.44      False
 2     2   1   None    None         0.26      False
 3     3   0   None    None         0.16      False}

In [11]:
# Take the result table stored under key 0 and order its rows by the 'from'
# column first, then by the 'to' column.
df = causal_df[0].sort_values(by=['from', 'to'])
df

Unnamed: 0,from,to,effect,p_value,probability,is_causal
1,2,0,,,0.44,False
2,2,1,,,0.26,False
3,3,0,,,0.16,False
0,3,1,,,0.28,False


In [12]:
# Persist the sorted result table without its index column.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# results folder exists before writing.
causal_df_csv_path = '/home/jpalombarini/td2c/notebooks/contributions/Real_data_validation/data/veilleux/results/causal_df.csv'
os.makedirs(os.path.dirname(causal_df_csv_path), exist_ok=True)
df.to_csv(causal_df_csv_path, index=False)

In [14]:
def comma_to_dot(x):
    """Parse a number written with a decimal comma (e.g. ``"1,5"``) into a float.

    Used as a per-column converter for ``pd.read_csv``, which passes ``str``.
    The same function name is also used as an ``np.loadtxt`` converter earlier
    in the notebook, where the field may arrive as ``bytes``; this definition
    replaces that name in the kernel, so both input types are accepted.

    Parameters
    ----------
    x : str or bytes
        Raw field text, using ``,`` as the decimal separator.

    Returns
    -------
    float
        The parsed value.

    Raises
    ------
    ValueError
        If the text is not a valid number after the comma is replaced.
    """
    if isinstance(x, (bytes, bytearray)):
        # latin1 maps every byte to a character, so decoding itself never fails.
        x = x.decode('latin1')
    return float(x.replace(',', '.'))

# Re-read the same file with pandas, keeping the header row so the column
# names are available; columns 1 and 2 are parsed with comma_to_dot.
ts2 = pd.read_csv(
    '/home/jpalombarini/td2c/notebooks/contributions/Real_data_validation/data/veilleux/veilleux_subset_CC05a.txt',
    delimiter=';',
    converters={1: comma_to_dot, 2: comma_to_dot},
)

# Discard the first column; only the remaining columns are used below.
ts2 = ts2.drop(columns=ts2.columns[0])

# Column labels taken from the file's header row.
names = ts2.columns

# Number the variables 1, 2, ... in column order.
name_to_number = {name: number for number, name in enumerate(names, start=1)}

name_to_number

{'Paramecium': 1, 'Didinium': 2}

In [15]:
# Show the distinct values in the 'from' column of df, to see which source
# indices need relabelling.
df['from'].unique()

array([2, 3])

In [16]:
# Show the distinct values in the 'to' column of df, to see which target
# indices need relabelling.
df['to'].unique()

array([0, 1])

In [17]:
# Relabel the source indices (2 and 3) as the 1-based variable numbers used by
# name_to_number.
# NOTE(review): presumably 2 and 3 are the lag-1 copies of variables 0 and 1
# (the run uses n_variables=2, maxlags=1); confirm against the D2C docs.
mapping = {2: 1, 3: 2}

# Apply the mapping to the untouched values in causal_df[0] (assignment aligns
# on the row index) instead of to df['from'] itself. Re-running this cell then
# cannot remap already-remapped values: a second pass over df['from'] would
# turn 2 into 1 again and collapse both variables onto 1.
df['from'] = causal_df[0]['from'].replace(mapping)

In [18]:
# Relabel the target indices (0 and 1) as the 1-based variable numbers used by
# name_to_number.
mapping = {0: 1, 1: 2}

# Apply the mapping to the untouched values in causal_df[0] (assignment aligns
# on the row index) instead of to df['to'] itself. Re-running this cell then
# cannot remap already-remapped values: a second pass over df['to'] would turn
# 1 into 2 again and collapse both variables onto 2.
df['to'] = causal_df[0]['to'].replace(mapping)

In [19]:
# Display df after the 'from' and 'to' columns were relabelled.
df

Unnamed: 0,from,to,effect,p_value,probability,is_causal
1,1,1,,,0.44,False
2,1,2,,,0.26,False
3,2,1,,,0.16,False
0,2,2,,,0.28,False


In [32]:
# Show only the df rows whose 'probability' exceeds 0.2.
# NOTE: this is a manual threshold; the 'is_causal' column is not used here.
df[df['probability'] > 0.2]

Unnamed: 0,from,to,effect,p_value,probability,is_causal
1,1,1,,,0.44,False
2,1,2,,,0.26,False
0,2,2,,,0.28,False


In [61]:
# Keep the 'from' and 'to' columns of the rows whose probability exceeds 0.2.
# Selecting rows and columns in one .loc call and taking an explicit copy gives
# an independent frame, so the column assignments below do not write into a
# slice of df (which pandas reports as SettingWithCopyWarning).
caus = df.loc[df['probability'] > 0.2, ['from', 'to']].copy()

# Invert name_to_number so variable numbers can be turned back into names.
number_to_name = {v: k for k, v in name_to_number.items()}

# Replace the variable numbers with their names in both columns.
caus['from'] = caus['from'].replace(number_to_name)
caus['to'] = caus['to'].replace(number_to_name)

caus

Unnamed: 0,from,to
1,Paramecium,Paramecium
2,Paramecium,Didinium
0,Didinium,Didinium


In [62]:
# Save caus to a CSV file without its index column.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# results folder exists before writing.
causal_relations_csv_path = '/home/jpalombarini/td2c/notebooks/contributions/Real_data_validation/data/veilleux/results/causal_relations.csv'
os.makedirs(os.path.dirname(causal_relations_csv_path), exist_ok=True)
caus.to_csv(causal_relations_csv_path, index=False)

In [63]:
# Load the ground-truth relations from a text file into a DataFrame; the file's
# first line is used as the header, and 'From -> To' is the column read below.
# NOTE(review): absolute, user-specific path; adjust when running elsewhere.
gt = pd.read_csv('/home/jpalombarini/td2c/notebooks/contributions/Real_data_validation/data/veilleux/ground_truth.txt')
gt

Unnamed: 0,From -> To
0,Paramecium -> Didinium


In [64]:
# Join each row's 'from' and 'to' names into a single 'From -> To' label (the
# same column name the ground-truth frame uses) and keep only that column.
caus = (
    caus
    .assign(**{'From -> To': caus['from'] + ' -> ' + caus['to']})
    .drop(columns=['from', 'to'])
)
caus

Unnamed: 0,From -> To
1,Paramecium -> Paramecium
2,Paramecium -> Didinium
0,Didinium -> Didinium


In [65]:
# Count the predicted paths that also appear in the ground truth, computed once
# and reused for both lines of the report.
# NOTE: the denominator is the number of ground-truth paths, so predicted paths
# that are absent from the ground truth do not lower this score.
n_correct = int(caus["From -> To"].isin(gt["From -> To"]).sum())
n_truth = gt.shape[0]

# An empty ground truth has no defined percentage; report nan instead of
# dividing by zero.
pct_correct = round((n_correct / n_truth) * 100, 2) if n_truth else float('nan')

print(f'Numbers of correctly estimated causal paths: {n_correct} / {n_truth}')
print(f'Percentage of correctly estimated causal paths: {pct_correct}%')

Numbers of correctly estimated causal paths: 1 / 1
Percentage of correctly estimated causal paths: 100.0%
