Running the D2C wrapper (and therefore computing all possible couples)

In [2]:
import pickle 
import os
import pandas as pd
from tqdm import tqdm
import numpy as np

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score

from d2c.benchmark import D2CWrapper

from d2c.descriptors.loader import DataLoader

from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read every pickled descriptor table found under `root` (in sorted file-name
# order) and stack them into one frame with a fresh 0..n-1 index.
root = './descriptors_old_original/'
pickle_names = [name for name in sorted(os.listdir(root)) if name.endswith('.pkl')]
dfs = [pd.read_pickle(root + name) for name in pickle_names]

data = pd.concat(dfs, axis=0).reset_index(drop=True)

In [3]:
# Inspect the columns of the concatenated descriptor table.
data.columns

Index(['process_id', 'graph_id', 'n_variables', 'max_neighborhood_size',
       'noise_std', 'edge_source', 'edge_dest', 'is_causal', 'coeff_cause',
       'coeff_eff', 'HOC_3_1', 'HOC_1_2', 'HOC_2_1', 'HOC_1_3', 'kurtosis_ca',
       'kurtosis_ef', 'mca_mef_cau_q0', 'mca_mef_cau_q1', 'mca_mef_cau_q2',
       'mca_mef_cau_q3', 'mca_mef_cau_q4', 'mca_mef_cau_q5', 'mca_mef_cau_q6',
       'mca_mef_eff_q0', 'mca_mef_eff_q1', 'mca_mef_eff_q2', 'mca_mef_eff_q3',
       'mca_mef_eff_q4', 'mca_mef_eff_q5', 'mca_mef_eff_q6', 'cau_m_eff_q0',
       'cau_m_eff_q1', 'cau_m_eff_q2', 'cau_m_eff_q3', 'cau_m_eff_q4',
       'cau_m_eff_q5', 'cau_m_eff_q6', 'eff_m_cau_q0', 'eff_m_cau_q1',
       'eff_m_cau_q2', 'eff_m_cau_q3', 'eff_m_cau_q4', 'eff_m_cau_q5',
       'eff_m_cau_q6', 'm_cau_q0', 'm_cau_q1', 'm_cau_q2', 'm_cau_q3',
       'm_cau_q4', 'm_cau_q5', 'm_cau_q6', 'com_cau', 'cau_eff', 'eff_cau',
       'eff_cau_mbeff', 'cau_eff_mbcau', 'eff_cau_mbcau_plus_q0',
       'eff_cau_mbcau_plus_q1', 'ef

In [3]:
root = '../data/'

# Select the pickles whose file name encodes noise_std == 0.01 and
# max_neighborhood_size == 2. Names are parsed as
# P<process>_N<variables>_Nj<neighborhood>_n<noise>.pkl.
to_dos = []
for file in sorted(os.listdir(root)):
    if not file.endswith('.pkl'):
        continue

    name_fields = file.split('_')
    gen_process_number = int(name_fields[0][1:])
    n_variables = int(name_fields[1][1:])
    max_neighborhood_size = int(name_fields[2][2:])
    noise_std = float(name_fields[3][1:-4])

    if noise_std == 0.01 and max_neighborhood_size == 2:
        to_dos.append(file)


def _n_variables_of(name):
    """Return the variable count encoded in the second '_'-separated field of `name`."""
    return int(name.split('_')[1][1:])


# Group the selected files by number of variables.
to_dos_5_variables = [name for name in to_dos if _n_variables_of(name) == 5]
to_dos_10_variables = [name for name in to_dos if _n_variables_of(name) == 10]
to_dos_25_variables = [name for name in to_dos if _n_variables_of(name) == 25]
to_dos_50_variables = [name for name in to_dos if _n_variables_of(name) == 50]

In [7]:
# Getting a sample of 5 variables
file = to_dos_5_variables[0]
# Parse the variable count from the selected file name (second '_'-separated
# field, without its leading 'N') instead of relying on whatever value an
# earlier loop happened to leave in `n_variables`.
n_variables = int(file.split('_')[1][1:])
dataloader = DataLoader(n_variables=n_variables,
                        maxlags=5)
dataloader.from_pickle(root + file)
observations = dataloader.get_original_observations()
true_causal_dfs = dataloader.get_true_causal_dfs()
dags = dataloader.get_dags()

# Display the edges of the first DAG returned by the loader.
dags[0].edges()

OutEdgeView([(25, 20), (25, 22), (26, 21), (27, 22), (28, 23), (29, 21), (29, 23), (29, 24), (20, 15), (20, 17), (21, 16), (22, 17), (23, 18), (24, 16), (24, 18), (24, 19), (15, 10), (15, 12), (16, 11), (17, 12), (18, 13), (19, 11), (19, 13), (19, 14), (10, 5), (10, 7), (11, 6), (12, 7), (13, 8), (14, 6), (14, 8), (14, 9), (5, 0), (5, 2), (6, 1), (7, 2), (8, 3), (9, 1), (9, 3), (9, 4)])

The following cells recompute all possible pairs with the 'big' D2C.

In [5]:
destination_root = './d2c_benchmark_old/'
# Check which input files have no entry of the same name in the destination folder.
# The folder is listed once into a set instead of once per candidate file.
# NOTE(review): `to_do` is not read by any later cell visible in this notebook.
existing_outputs = set(os.listdir(destination_root))
to_do = [
    file
    for file in to_dos_5_variables + to_dos_10_variables + to_dos_25_variables
    if file not in existing_outputs
]

In [6]:

# For every CSV in the destination folder, rebuild a name from its first four
# '_'-separated fields and collect the distinct results.
pickle_original_files = set()
for file in os.listdir(destination_root):
    if not file.endswith('.csv'):
        continue
    name_fields = file.split('_')
    pickle_original_file = '_'.join(
        [name_fields[0], name_fields[1], name_fields[2], name_fields[3]]
    )
    pickle_original_files.add(pickle_original_file)




In [7]:
# Files from the 5/10/25-variable lists that have no matching name in pickle_original_files.
list(set(to_dos_5_variables+to_dos_10_variables+to_dos_25_variables).difference(pickle_original_files))

['P4_N25_Nj2_n0.01.pkl',
 'P6_N25_Nj2_n0.01.pkl',
 'P7_N25_Nj2_n0.01.pkl',
 'P3_N25_Nj2_n0.01.pkl',
 'P20_N25_Nj2_n0.01.pkl',
 'P9_N25_Nj2_n0.01.pkl',
 'P8_N25_Nj2_n0.01.pkl',
 'P18_N25_Nj2_n0.01.pkl',
 'P14_N25_Nj2_n0.01.pkl']

Remaining files (5, 10 and 25 vars)

In [8]:
# Display the list of selected 25-variable files.
to_dos_25_variables

['P10_N25_Nj2_n0.01.pkl',
 'P11_N25_Nj2_n0.01.pkl',
 'P12_N25_Nj2_n0.01.pkl',
 'P13_N25_Nj2_n0.01.pkl',
 'P14_N25_Nj2_n0.01.pkl',
 'P15_N25_Nj2_n0.01.pkl',
 'P16_N25_Nj2_n0.01.pkl',
 'P18_N25_Nj2_n0.01.pkl',
 'P19_N25_Nj2_n0.01.pkl',
 'P1_N25_Nj2_n0.01.pkl',
 'P20_N25_Nj2_n0.01.pkl',
 'P2_N25_Nj2_n0.01.pkl',
 'P3_N25_Nj2_n0.01.pkl',
 'P4_N25_Nj2_n0.01.pkl',
 'P6_N25_Nj2_n0.01.pkl',
 'P7_N25_Nj2_n0.01.pkl',
 'P8_N25_Nj2_n0.01.pkl',
 'P9_N25_Nj2_n0.01.pkl']

In [8]:
N_JOBS = 40
destination_root = './d2c_benchmark_old/'
maxlags = 5
for file in tqdm(list(set(to_dos_5_variables+to_dos_10_variables+to_dos_25_variables).difference(pickle_original_files))):
    if not file.endswith('.pkl'):
        continue

    # Skip files that already have a CSV in the destination folder. The folder is
    # re-listed for each file so CSVs created while this loop runs are seen too.
    # The skip has to `continue` the outer loop: a `continue` placed inside the
    # CSV scan would only advance that scan and never skip the file.
    already_processed = any(
        file_csv.endswith('.csv') and '_'.join(file_csv.split('_')[:4]) == file
        for file_csv in os.listdir(destination_root)
    )
    if already_processed:
        continue

    # File names are parsed as P<process>_N<variables>_Nj<neighborhood>_n<noise>.pkl
    name_fields = file.split('_')
    gen_process_number = int(name_fields[0][1:])
    n_variables = int(name_fields[1][1:])
    max_neighborhood_size = int(name_fields[2][2:])
    noise_std = float(name_fields[3][1:-4])

    filename = f'{destination_root}/P{gen_process_number}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std}.pkl'

    # Train only on descriptors coming from the other generative processes.
    training_data = data.loc[data['process_id'] != gen_process_number]
    X_train = training_data.drop(columns=['process_id', 'graph_id', 'n_variables', 'max_neighborhood_size','noise_std', 'edge_source', 'edge_dest', 'is_causal',])
    y_train = training_data['is_causal']

    model = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=1)
    model.fit(X_train, y_train)

    dataloader = DataLoader(n_variables=n_variables,
                            maxlags=maxlags)
    dataloader.from_pickle(root + file)
    observations = dataloader.get_original_observations()
    true_causal_dfs = dataloader.get_true_causal_dfs()

    # NOTE(review): `filename` is handed to D2CWrapper and is also the path the
    # final pickle is written to below — confirm the wrapper does not write to
    # that exact path itself.
    d2cwrapper = D2CWrapper(ts_list=observations,
                            n_variables=n_variables,
                            model=model,
                            maxlags=maxlags,
                            n_jobs=N_JOBS,
                            full=True,
                            quantiles=True,
                            filename=filename,
                            normalize=True,
                            cmi='original',
                            mb_estimator='original')

    d2cwrapper.run()

    causal_df = d2cwrapper.get_causal_dfs()

    # Store (causal_df, true_causal_dfs) together as one pickled tuple.
    with open(filename, 'wb') as f:
        pickle.dump((causal_df, true_causal_dfs), f)
    

100%|██████████| 9/9 [7:09:25<00:00, 2862.88s/it]  


10 vars

In [6]:
root = '../../data/new_data/'
destination_root = './'
destination = 'd2c_all_couples_MB5_full'
# exist_ok=True replaces the separate exists() check and its check-then-create race.
output_dir = os.path.join(destination_root, destination)
os.makedirs(output_dir, exist_ok=True)
maxlags = 5
# empty folder ../../data/new_benchmark/
# NOTE(review): `to_dos_10_variables` holds file names gathered from another
# cell's `root`; confirm the same names exist under the `root` set above.
for file in tqdm(to_dos_10_variables):
    if not file.endswith('.pkl'):
        continue

    # File names are parsed as P<process>_N<variables>_Nj<neighborhood>_n<noise>.pkl
    name_fields = file.split('_')
    gen_process_number = int(name_fields[0][1:])
    n_variables = int(name_fields[1][1:])
    max_neighborhood_size = int(name_fields[2][2:])
    noise_std = float(name_fields[3][1:-4])

    filename = os.path.join(
        output_dir,
        f'P{gen_process_number}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std}.pkl',
    )

    # Train only on descriptors coming from the other generative processes.
    training_data = data.loc[data['process_id'] != gen_process_number]
    X_train = training_data.drop(columns=['process_id', 'graph_id', 'n_variables', 'max_neighborhood_size','noise_std', 'edge_source', 'edge_dest', 'is_causal',])
    y_train = training_data['is_causal']

    model = BalancedRandomForestClassifier(n_estimators=50, random_state=0, n_jobs=1, replacement=True, sampling_strategy='all')
    model.fit(X_train, y_train)

    dataloader = DataLoader(n_variables=n_variables,
                            maxlags=maxlags)
    dataloader.from_pickle(root + file)
    observations = dataloader.get_original_observations()
    true_causal_dfs = dataloader.get_true_causal_dfs()

    d2cwrapper = D2CWrapper(ts_list=observations, n_variables=n_variables, model=model, maxlags=maxlags, n_jobs=55, full=True)

    d2cwrapper.run()

    causal_df = d2cwrapper.get_causal_dfs()

    # Store (causal_df, true_causal_dfs) together as one pickled tuple.
    with open(filename, 'wb') as f:
        pickle.dump((causal_df, true_causal_dfs), f)
    

100%|██████████| 19/19 [11:07:54<00:00, 2109.20s/it] 


25 vars

In [7]:
root = '../../data/new_data/'
destination_root = '../../data/new_benchmark'
destination = 'd2c_all_couples_MB5_full'

# exist_ok=True replaces the separate exists() check and its check-then-create race.
output_dir = os.path.join(destination_root, destination)
os.makedirs(output_dir, exist_ok=True)
maxlags = 5
# empty folder ../../data/new_benchmark/

# NOTE(review): `to_dos_25_variables` holds file names gathered from another
# cell's `root`; confirm the same names exist under the `root` set above.
for file in tqdm(to_dos_25_variables):
    if not file.endswith('.pkl'):
        continue

    # File names are parsed as P<process>_N<variables>_Nj<neighborhood>_n<noise>.pkl
    name_fields = file.split('_')
    gen_process_number = int(name_fields[0][1:])
    n_variables = int(name_fields[1][1:])
    max_neighborhood_size = int(name_fields[2][2:])
    noise_std = float(name_fields[3][1:-4])

    filename = os.path.join(
        output_dir,
        f'P{gen_process_number}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std}.pkl',
    )

    # Train only on descriptors coming from the other generative processes.
    training_data = data.loc[data['process_id'] != gen_process_number]
    X_train = training_data.drop(columns=['process_id', 'graph_id', 'n_variables', 'max_neighborhood_size','noise_std', 'edge_source', 'edge_dest', 'is_causal',])
    y_train = training_data['is_causal']

    model = BalancedRandomForestClassifier(n_estimators=50, random_state=0, n_jobs=1, replacement=True, sampling_strategy='all')
    model.fit(X_train, y_train)

    dataloader = DataLoader(n_variables=n_variables,
                            maxlags=maxlags)
    dataloader.from_pickle(root + file)
    observations = dataloader.get_original_observations()
    true_causal_dfs = dataloader.get_true_causal_dfs()

    d2cwrapper = D2CWrapper(ts_list=observations, n_variables=n_variables, model=model, maxlags=maxlags, n_jobs=55, full=True)

    d2cwrapper.run()

    causal_df = d2cwrapper.get_causal_dfs()

    # Store (causal_df, true_causal_dfs) together as one pickled tuple.
    with open(filename, 'wb') as f:
        pickle.dump((causal_df, true_causal_dfs), f)
    

 74%|███████▎  | 14/19 [15:35:46<5:45:07, 4141.58s/it]

In [6]:
# handle the two missing files separately
missing_files = ['P8_N25_Nj8_n0.005.pkl','P9_N25_Nj8_n0.005.pkl']

root = '../../data/new_data/'
destination_root = '../../data/new_benchmark'
destination = 'd2c_all_couples_MB5_full'

# Create the output folder up front so a missing directory is not discovered
# only when the result is written, after the computation has already run.
output_dir = os.path.join(destination_root, destination)
os.makedirs(output_dir, exist_ok=True)

maxlags = 5
# empty folder ../../data/new_benchmark/


for file in tqdm(missing_files):
    if not file.endswith('.pkl'):
        continue

    # File names are parsed as P<process>_N<variables>_Nj<neighborhood>_n<noise>.pkl
    name_fields = file.split('_')
    gen_process_number = int(name_fields[0][1:])
    n_variables = int(name_fields[1][1:])
    max_neighborhood_size = int(name_fields[2][2:])
    noise_std = float(name_fields[3][1:-4])

    filename = os.path.join(
        output_dir,
        f'P{gen_process_number}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std}.pkl',
    )

    # Train only on descriptors coming from the other generative processes.
    training_data = data.loc[data['process_id'] != gen_process_number]
    X_train = training_data.drop(columns=['process_id', 'graph_id', 'n_variables', 'max_neighborhood_size','noise_std', 'edge_source', 'edge_dest', 'is_causal',])
    y_train = training_data['is_causal']

    model = BalancedRandomForestClassifier(n_estimators=50, random_state=0, n_jobs=1, replacement=True, sampling_strategy='all')
    model.fit(X_train, y_train)

    dataloader = DataLoader(n_variables=n_variables,
                            maxlags=maxlags)
    dataloader.from_pickle(root + file)
    observations = dataloader.get_original_observations()
    true_causal_dfs = dataloader.get_true_causal_dfs()

    d2cwrapper = D2CWrapper(ts_list=observations, n_variables=n_variables, model=model, maxlags=maxlags, n_jobs=55, full=True)

    d2cwrapper.run()

    causal_df = d2cwrapper.get_causal_dfs()

    # Store (causal_df, true_causal_dfs) together as one pickled tuple.
    with open(filename, 'wb') as f:
        pickle.dump((causal_df, true_causal_dfs), f)
    

100%|██████████| 2/2 [2:07:35<00:00, 3827.78s/it]  


Running 50 variables is too computationally expensive, so the cell below is left commented out

In [7]:
# root = '../../data/new_data/'
# destination_root = '../../data/new_benchmark'
# destination = 'd2c_all_couples_MB5_full'

# if not os.path.exists(destination_root+'/'+destination):
#     os.makedirs(destination_root+'/'+destination)
# maxlags = 5
# # empty folder ../../data/new_benchmark/

# for todo in [to_dos_50_variables]:
#     for file in tqdm(todo):
#         if file.endswith('.pkl'):
#             gen_process_number = int(file.split('_')[0][1:])
#             n_variables = int(file.split('_')[1][1:])
#             max_neighborhood_size = int(file.split('_')[2][2:])
#             noise_std = float(file.split('_')[3][1:-4])
                
#             filename = f'{destination_root}/{destination}/P{gen_process_number}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std}.pkl'

#             training_data = data.loc[data['process_id'] != gen_process_number]
#             X_train = training_data.drop(columns=['process_id', 'graph_id', 'n_variables', 'max_neighborhood_size','noise_std', 'edge_source', 'edge_dest', 'is_causal',])
#             y_train = training_data['is_causal']

#             model = BalancedRandomForestClassifier(n_estimators=50, random_state=0, n_jobs=1, replacement=True, sampling_strategy='all')

#             model.fit(X_train, y_train)

#             dataloader = DataLoader(n_variables = n_variables,
#                             maxlags = maxlags)
#             dataloader.from_pickle(root+file)
#             observations = dataloader.get_original_observations()
#             true_causal_dfs = dataloader.get_true_causal_dfs()


#             d2cwrapper = D2CWrapper(ts_list=observations, n_variables=n_variables, model=model, maxlags=maxlags, n_jobs = 55, full=True)

#             d2cwrapper.run()

#             causal_df = d2cwrapper.get_causal_dfs()

#             with open(filename, 'wb') as f:
#                     pickle.dump((
#                                 causal_df, 
#                                 true_causal_dfs), f)     
    

  0%|          | 0/10 [1:39:35<?, ?it/s]


KeyboardInterrupt: 