In [24]:
import pickle
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
import joblib

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score

from d2c.benchmark import D2CWrapper

from d2c.descriptors_generation.loader import DataLoader

from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score


In [25]:
# Input locations for this analysis.
# NOTE(review): joblib.load unpickles arbitrary objects -- only point it at a model file you trust.
model_path = '/home/jpalombarini/td2c/notebooks/contributions/Real_data_validation/model.pkl'
series_path = '/home/jpalombarini/td2c/notebooks/contributions/Real_data_validation/data/Antivirus_activity/preprocessed_1.txt'

# Previously saved model object; it is handed to D2CWrapper further down.
model = joblib.load(model_path)

# Comma-separated numeric matrix; the first line of the file (the header) is skipped.
ts = np.loadtxt(series_path, skiprows=1, delimiter=',')

In [49]:
# Configure the D2C run on the single loaded series.
# n_variables is taken from the array itself rather than typed in by hand, so it
# cannot drift from the number of columns in the data file (13 for this dataset).
d2cwrapper = D2CWrapper(
    ts_list=[ts],
    n_variables=ts.shape[1],
    model=model,
    maxlags=1,
    n_jobs=1,
    full=True,
    quantiles=True,
    filename='d2c_results',
    normalize=True,
    cmi='original',
    mb_estimator='ts',
)

d2cwrapper

<d2c.benchmark.d2c_wrapper.D2CWrapper at 0x7f456e187a60>

In [50]:
# Execute the configured D2C analysis. The recorded output shows an
# "Estimating MB for node ..." / "Markov Blanket: ..." pair for each node processed.
d2cwrapper.run()

Estimating MB for node 3
Markov Blanket: [5. 1.]
Estimating MB for node 1
Markov Blanket: [3.]
Estimating MB for node 2
Markov Blanket: [4. 0.]
Estimating MB for node 0
Markov Blanket: [2.]
Estimating MB for node 2
Markov Blanket: [4. 0.]
Estimating MB for node 1
Markov Blanket: [3.]
Estimating MB for node 3
Markov Blanket: [5. 1.]
Estimating MB for node 0
Markov Blanket: [2.]


<d2c.benchmark.d2c_wrapper.D2CWrapper at 0x7f456e187a60>

In [None]:
# Collect the results of the run. The recorded display of this variable shows a dict
# with a single key 0 (presumably the position of the series in ts_list -- confirm)
# whose value is a DataFrame of candidate edges.
causal_df = d2cwrapper.get_causal_dfs()

In [27]:
# Inspect the raw result mapping
# (DataFrame columns: from, to, effect, p_value, probability, is_causal).
causal_df

{0:      from  to effect p_value  probability  is_causal
 0      23   4   None    None         0.06      False
 1      17   3   None    None         0.04      False
 2      19   0   None    None         0.04      False
 3      17  12   None    None         0.04      False
 4      19   9   None    None         0.18      False
 ..    ...  ..    ...     ...          ...        ...
 164    15   3   None    None         0.18      False
 165    15  12   None    None         0.06      False
 166    16  11   None    None         0.16      False
 167    18   8   None    None         0.16      False
 168    21   7   None    None         0.14      False
 
 [169 rows x 6 columns]}

In [28]:
# Edge table of the only series, ordered by the 'from' column and then the 'to' column
df = causal_df[0].sort_values(by=['from', 'to'])
df

Unnamed: 0,from,to,effect,p_value,probability,is_causal
79,13,0,,,0.32,False
16,13,1,,,0.20,False
94,13,2,,,0.26,False
30,13,3,,,0.18,False
110,13,4,,,0.10,False
...,...,...,...,...,...,...
86,25,8,,,0.12,False
151,25,9,,,0.16,False
99,25,10,,,0.04,False
39,25,11,,,0.10,False


In [29]:
# Persist the sorted edge table as CSV (the row index is not written).
causal_df_path = '/home/jpalombarini/td2c/notebooks/contributions/Real_data_validation/data/Antivirus_activity/results/causal_df.csv'

# Create the results folder on demand so a fresh checkout does not fail on a missing directory.
os.makedirs(os.path.dirname(causal_df_path), exist_ok=True)
df.to_csv(causal_df_path, index=False)

In [30]:
# Read the preprocessed file once more, this time with pandas, to get at its header row
ts2 = pd.read_csv('/home/jpalombarini/td2c/notebooks/contributions/Real_data_validation/data/Antivirus_activity/preprocessed_1.txt')

# Column labels exactly as they appear in the header
names = ts2.columns

# 1-based position of every column, keyed by its label
name_to_number = dict(zip(names, range(1, len(names) + 1)))

name_to_number

{'memory_usage_Portal': 1,
 'cpu_usage_Portal': 2,
 'Physical_Memory_prct_used_Portal': 3,
 'cpu_prct_used_Portal': 4,
 '0_C_read_Portal': 5,
 'memory_usage_VDI': 6,
 'cpu_usage_VDI': 7,
 'Physical_Memory_prct_used_VDI': 8,
 'cpu_prct_used_VDI': 9,
 '0_C_read_VDI': 10,
 'Chargement_portail': 11,
 'Chargement_IE': 12,
 'Default_Transaction': 13}

In [31]:
# Distinct node ids that occur in the 'from' column of df
df['from'].unique()

array([13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25])

In [32]:
# Distinct node ids that occur in the 'to' column of df
df['to'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [33]:
# 'from' ids run 13..25; shift them to the 1-based variable numbers 1..13 used by
# name_to_number. The lookup table is generated instead of typed out by hand.
n_vars = 13
mapping = {node: node - n_vars + 1 for node in range(n_vars, 2 * n_vars)}

# Apply the mapping to the untouched ids in causal_df[0] (aligned on df's index)
# rather than to df['from'] itself: the old and new id ranges overlap at 13, so
# re-running this cell on already-converted values would silently turn 13 into 1.
df['from'] = causal_df[0].loc[df.index, 'from'].replace(mapping)

In [34]:
# 'to' ids run 0..12; shift them to the 1-based variable numbers 1..13 used by
# name_to_number. The lookup table is generated instead of typed out by hand.
n_vars = 13
mapping = {node: node + 1 for node in range(n_vars)}

# Apply the mapping to the untouched ids in causal_df[0] (aligned on df's index)
# rather than to df['to'] itself: old and new id ranges overlap, so re-running this
# cell on already-converted values would shift every id up by one again.
df['to'] = causal_df[0].loc[df.index, 'to'].replace(mapping)

In [35]:
# Edge table after the 'from' and 'to' id columns were replaced via the two mappings
df

Unnamed: 0,from,to,effect,p_value,probability,is_causal
79,1,1,,,0.32,False
16,1,2,,,0.20,False
94,1,3,,,0.26,False
30,1,4,,,0.18,False
110,1,5,,,0.10,False
...,...,...,...,...,...,...
86,13,9,,,0.12,False
151,13,10,,,0.16,False
99,13,11,,,0.04,False
39,13,12,,,0.10,False


In [36]:
# Keep just the edges whose 'is_causal' flag is set
df.loc[df['is_causal'].eq(True)]

Unnamed: 0,from,to,effect,p_value,probability,is_causal
78,5,5,,,0.62,True
74,10,10,,,0.52,True


In [42]:
# Candidate edges: every pair whose probability exceeds the cutoff, reduced to the
# 'from' and 'to' columns.
# NOTE(review): this cutoff is looser than the 'is_causal' flag (the flag marks two
# rows in the recorded output, this filter keeps ten) -- confirm 0.4 is intended.
PROBABILITY_THRESHOLD = 0.4

# One .loc selection plus an explicit copy, so the column assignments below write to
# an independent frame instead of a chained-indexing result (SettingWithCopyWarning).
caus = df.loc[df['probability'] > PROBABILITY_THRESHOLD, ['from', 'to']].copy()

# Inverse lookup: variable number -> column name
number_to_name = {v: k for k, v in name_to_number.items()}

# Swap the numeric ids for the variable names
caus['from'] = caus['from'].replace(number_to_name)
caus['to'] = caus['to'].replace(number_to_name)

caus

Unnamed: 0,from,to
79,memory_usage_Portal,memory_usage_Portal
94,memory_usage_Portal,Physical_Memory_prct_used_Portal
47,memory_usage_Portal,memory_usage_VDI
42,cpu_usage_Portal,cpu_usage_Portal
6,Physical_Memory_prct_used_Portal,memory_usage_VDI
66,cpu_prct_used_Portal,cpu_prct_used_Portal
78,0_C_read_Portal,0_C_read_Portal
85,memory_usage_VDI,Physical_Memory_prct_used_Portal
37,memory_usage_VDI,memory_usage_VDI
46,cpu_usage_VDI,cpu_usage_VDI


In [43]:
# Save the selected edges as CSV (the row index is not written).
causal_relations_path = '/home/jpalombarini/td2c/notebooks/contributions/Real_data_validation/data/Antivirus_activity/results/causal_relations.csv'

# Create the results folder on demand so a fresh checkout does not fail on a missing directory.
os.makedirs(os.path.dirname(causal_relations_path), exist_ok=True)
caus.to_csv(causal_relations_path, index=False)

In [44]:
# Ground-truth edges, read with pandas from the reference text file
gt_path = '/home/jpalombarini/td2c/notebooks/contributions/Real_data_validation/data/Antivirus_activity/ground_truth.txt'
gt = pd.read_csv(gt_path)
gt

Unnamed: 0,From -> To
0,memory_usage_Portal -> Physical_Memory_prct_us...
1,cpu_usage_Portal -> cpu_prct_used_Portal
2,Physical_Memory_prct_used_Portal -> 0_C_read_P...
3,cpu_prct_used_Portal -> 0_C_read_Portal
4,memory_usage_VDI -> Physical_Memory_prct_used_VDI
5,cpu_usage_VDI -> cpu_prct_used_VDI
6,Physical_Memory_prct_used_VDI -> 0_C_read_VDI
7,cpu_prct_used_VDI -> 0_C_read_VDI
8,Physical_Memory_prct_used_Portal -> Chargement...
9,cpu_prct_used_Portal -> Chargement_portail


In [45]:
# Collapse 'from' and 'to' into a single 'From -> To' label column (the column name
# used by the ground-truth frame) and drop the two source columns.
# Guarded so that re-running the cell is a no-op instead of a KeyError on the
# already-dropped 'from'/'to' columns.
if {'from', 'to'}.issubset(caus.columns):
    caus = (
        caus
        .assign(**{'From -> To': caus['from'] + ' -> ' + caus['to']})
        .drop(columns=['from', 'to'])
    )
caus

Unnamed: 0,From -> To
79,memory_usage_Portal -> memory_usage_Portal
94,memory_usage_Portal -> Physical_Memory_prct_us...
47,memory_usage_Portal -> memory_usage_VDI
42,cpu_usage_Portal -> cpu_usage_Portal
6,Physical_Memory_prct_used_Portal -> memory_usa...
66,cpu_prct_used_Portal -> cpu_prct_used_Portal
78,0_C_read_Portal -> 0_C_read_Portal
85,memory_usage_VDI -> Physical_Memory_prct_used_...
37,memory_usage_VDI -> memory_usage_VDI
46,cpu_usage_VDI -> cpu_usage_VDI


In [46]:
# Number of predicted 'From -> To' labels that also occur in the ground truth,
# computed once and reused by both report lines.
n_correct = int(caus["From -> To"].isin(gt["From -> To"]).sum())
n_truth = gt.shape[0]

# Share of ground-truth edges that were recovered; reported as 0 when the ground
# truth is empty so the cell does not fail with a division by zero.
pct_correct = round((n_correct / n_truth) * 100, 2) if n_truth else 0.0

print(f'Numbers of correctly estimated causal paths: {n_correct} / {n_truth}')
print(f'Percentage of correctly estimated causal paths: {pct_correct}%')

Numbers of correctly estimated causal paths: 1 / 16
Percentage of correctly estimated causal paths: 6.25%
