In [1]:
import pickle
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
import joblib

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score

from d2c.benchmark import D2CWrapper

from d2c.descriptors.loader import DataLoader

from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score


In [7]:
# NOTE: joblib.load unpickles arbitrary objects -- only load model files from a trusted source.
# NOTE(review): both paths are absolute and machine-specific; adjust them when running elsewhere.
model_path = '/home/jpalombarini/td2c/notebooks/contributions/Real_data_validation/model.pkl'
series_path = '/home/jpalombarini/td2c/notebooks/contributions/Real_data_validation/data/Dairy_markets/dairy_markets_merged.txt'

# Previously saved model object, later passed to D2CWrapper
model = joblib.load(model_path)

# Skip the header row and keep columns 1, 2 and 3 only (column 0 is not read)
ts = np.loadtxt(series_path, delimiter=',', skiprows=1, usecols=(1, 2, 3))

In [10]:
# Keyword settings for the D2C run on the single loaded series
d2c_settings = dict(
    ts_list=[ts],
    n_variables=3,
    model=model,
    maxlags=1,
    n_jobs=1,
    full=True,
    quantiles=True,
    filename='d2c_results',
    normalize=True,
    cmi='original',
    mb_estimator='original',
)

d2cwrapper = D2CWrapper(**d2c_settings)
d2cwrapper.run()

# Collect the result tables produced by the run
causal_df = d2cwrapper.get_causal_dfs()

Shape of X: (238, 5), Shape of Y: (238,)
Shape of X: (238, 5), Shape of Y: (238,)
Shape of X: (238, 5), Shape of Y: (238,)
Shape of X: (238, 5), Shape of Y: (238,)
Shape of X: (238, 5), Shape of Y: (238,)
Shape of X: (238, 5), Shape of Y: (238,)
Shape of X: (238, 5), Shape of Y: (238,)
Shape of X: (238, 5), Shape of Y: (238,)
Shape of X: (238, 5), Shape of Y: (238,)
Shape of X: (238, 5), Shape of Y: (238,)
Shape of X: (238, 5), Shape of Y: (238,)
Shape of X: (238, 5), Shape of Y: (238,)
Shape of X: (238, 5), Shape of Y: (238,)
Shape of X: (238, 5), Shape of Y: (238,)
Shape of X: (238, 5), Shape of Y: (238,)
Shape of X: (238, 5), Shape of Y: (238,)
Shape of X: (238, 5), Shape of Y: (238,)
Shape of X: (238, 5), Shape of Y: (238,)


In [11]:
# Display the object returned by get_causal_dfs(): a dict whose only key here is 0
causal_df

{0:    from  to effect p_value  probability  is_causal
 0     4   0   None    None         0.02      False
 1     3   1   None    None         0.00      False
 2     5   1   None    None         0.02      False
 3     4   2   None    None         0.02      False
 4     3   0   None    None         0.52       True
 5     5   0   None    None         0.00      False
 6     3   2   None    None         0.00      False
 7     4   1   None    None         0.58       True
 8     5   2   None    None         0.56       True}

In [12]:
# Take the result table stored under key 0 and order it by the 'from' and 'to' columns
df = causal_df[0].sort_values(by=['from', 'to'])
df

Unnamed: 0,from,to,effect,p_value,probability,is_causal
4,3,0,,,0.52,True
1,3,1,,,0.0,False
6,3,2,,,0.0,False
0,4,0,,,0.02,False
7,4,1,,,0.58,True
3,4,2,,,0.02,False
5,5,0,,,0.0,False
2,5,1,,,0.02,False
8,5,2,,,0.56,True


In [13]:
# Persist the sorted result table; index=False leaves the row index out of the file
causal_csv_path = '/home/jpalombarini/td2c/notebooks/contributions/Real_data_validation/data/Dairy_markets/results/causal_df.csv'
df.to_csv(causal_csv_path, index=False)

In [15]:
# Read the data file with pandas so that the header row (the column names) is kept
ts2 = pd.read_csv('/home/jpalombarini/td2c/notebooks/contributions/Real_data_validation/data/Dairy_markets/dairy_markets_merged.txt')

# Discard the first column; only the remaining columns are numbered below
first_column = ts2.columns[0]
ts2 = ts2.drop(columns=first_column)

# The remaining header entries are the variable names
names = ts2.columns

# Number the names 1..N following their column order
name_to_number = {name: position for position, name in enumerate(names, start=1)}

name_to_number

{'Butter': 1, 'Cheese': 2, 'Milk': 3}

In [16]:
# Show the distinct values that occur in the 'from' column of df
df['from'].unique()

array([3, 4, 5])

In [17]:
# Show the distinct values that occur in the 'to' column of df
df['to'].unique()

array([0, 1, 2])

In [18]:
# Node indices found in the 'from' column -> 1-based variable numbers.
# NOTE(review): 3, 4, 5 are presumably the lag-1 copies of variables 0, 1, 2
# (n_variables=3, maxlags=1) -- confirm against D2CWrapper.
mapping = {3: 1, 4: 2, 5: 3}

# Apply the mapping to the untouched column of causal_df[0] rather than to df['from'].
# The assignment aligns on the row index, so the sorted order of df is irrelevant, and
# re-running this cell can no longer remap an already-remapped value (e.g. 3 -> 1 again).
df['from'] = causal_df[0]['from'].replace(mapping)

In [19]:
# 0-based indices found in the 'to' column -> 1-based variable numbers
mapping = {0: 1, 1: 2, 2: 3}

# Apply the mapping to the untouched column of causal_df[0] rather than to df['to'].
# The assignment aligns on the row index, so the sorted order of df is irrelevant, and
# re-running this cell can no longer shift already-shifted values (1 -> 2, 2 -> 3).
df['to'] = causal_df[0]['to'].replace(mapping)

In [20]:
# Display df to inspect the 'from' and 'to' columns after the replace() steps
df

Unnamed: 0,from,to,effect,p_value,probability,is_causal
4,1,1,,,0.52,True
1,1,2,,,0.0,False
6,1,3,,,0.0,False
0,2,1,,,0.02,False
7,2,2,,,0.58,True
3,2,3,,,0.02,False
5,3,1,,,0.0,False
2,3,2,,,0.02,False
8,3,3,,,0.56,True


In [21]:
# Keep just the rows whose 'is_causal' flag equals True
df.loc[df['is_causal'].eq(True)]

Unnamed: 0,from,to,effect,p_value,probability,is_causal
4,1,1,,,0.52,True
7,2,2,,,0.58,True
8,3,3,,,0.56,True


In [30]:
# Invert the name -> number lookup so numeric ids can be turned back into names
number_to_name = {number: name for name, number in name_to_number.items()}

# Select the causal rows and the two endpoint columns in a single .loc call and take an
# explicit copy: the column assignments below then act on an independent frame instead of
# on the result of chained indexing (the pattern behind pandas' SettingWithCopyWarning).
caus = df.loc[df['is_causal'] == True, ['from', 'to']].copy()

# Translate the numeric ids in both endpoint columns to variable names
caus['from'] = caus['from'].replace(number_to_name)
caus['to'] = caus['to'].replace(number_to_name)

caus

Unnamed: 0,from,to
4,Butter,Butter
7,Cheese,Cheese
8,Milk,Milk


In [31]:
# Write the named causal pairs to a csv file, leaving out the row index
relations_csv_path = '/home/jpalombarini/td2c/notebooks/contributions/Real_data_validation/data/Dairy_markets/results/causal_relations.csv'
caus.to_csv(relations_csv_path, index=False)

In [32]:
# Read the ground-truth text file into a DataFrame (default comma separator)
ground_truth_path = '/home/jpalombarini/td2c/notebooks/contributions/Real_data_validation/data/Dairy_markets/ground_truth.txt'
gt = pd.read_csv(ground_truth_path)
gt

Unnamed: 0,From -> To
0,Milk -> Butter
1,Milk -> Cheese


In [33]:
# Join 'from' and 'to' into a single 'From -> To' label and keep only that column
edge_labels = caus['from'] + ' -> ' + caus['to']
caus = edge_labels.to_frame(name='From -> To')
caus

Unnamed: 0,From -> To
4,Butter -> Butter
7,Cheese -> Cheese
8,Milk -> Milk


In [34]:
# Count the estimated paths whose label also appears in the ground truth (exact string match).
# Computed once and reused by both messages.
n_correct = int(caus["From -> To"].isin(gt["From -> To"]).sum())
n_true = gt.shape[0]

# Guard against an empty ground-truth table, which would otherwise divide by zero
pct_correct = round((n_correct / n_true) * 100, 2) if n_true else 0.0

print(f'Numbers of correctly estimated causal paths: {n_correct} / {n_true}')
print(f'Percentage of correctly estimated causal paths: {pct_correct}%')

Numbers of correctly estimated causal paths: 0 / 2
Percentage of correctly estimated causal paths: 0.0%
