Create the 'predictions' for LOPO D2C on the same files as the other methods

In [1]:
import pickle 
import os
import pandas as pd
from tqdm import tqdm
import numpy as np

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score

from d2c.benchmark import D2CWrapper

from d2c.descriptors.loader import DataLoader

from sklearn.ensemble import RandomForestClassifier



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
dfs = []
root = './descriptors/'
for file in sorted(os.listdir(root)):
    if file.endswith('.pkl'):
        df = pd.read_pickle(root + file)
        dfs.append(df)

descriptors = pd.concat(dfs, axis=0).reset_index(drop=True)

In [3]:
root = './data/'

# if any exception write it on disk 

to_dos = []
for file in sorted(os.listdir(root)):
    if file.endswith('.pkl'):
        gen_process_number = int(file.split('_')[0][1:])
        n_variables = int(file.split('_')[1][1:])
        max_neighborhood_size = int(file.split('_')[2][2:])
        noise_std = float(file.split('_')[3][1:-4])
        
        if noise_std != 0.005:
            continue

        if max_neighborhood_size != 8:
            continue

        to_dos.append(file)

# sort to_dos by number of variables
to_dos_5_variables = [file for file in to_dos if int(file.split('_')[1][1:]) == 5]
to_dos_10_variables = [file for file in to_dos if int(file.split('_')[1][1:]) == 10]
to_dos_25_variables = [file for file in to_dos if int(file.split('_')[1][1:]) == 25]
to_dos_50_variables = [file for file in to_dos if int(file.split('_')[1][1:]) == 50]

In [6]:
from imblearn.ensemble import BalancedRandomForestClassifier
# silence SettingWithCopyWarning
import warnings
warnings.filterwarnings('ignore')

root = './data/'
destination_root = './d2c_subset_benchmark/'
maxlags = 5
for file in tqdm(to_dos):
    if file.endswith('.pkl'):
        gen_process_number = int(file.split('_')[0][1:])
        n_variables = int(file.split('_')[1][1:])
        max_neighborhood_size = int(file.split('_')[2][2:])
        noise_std = float(file.split('_')[3][1:-4])
        
        causal_dfs = []
        true_causal_dfs = []

        training_data = descriptors.loc[descriptors['process_id'] != gen_process_number]
        X_train = training_data.drop(columns=['process_id', 'graph_id', 'n_variables', 'max_neighborhood_size','noise_std', 'edge_source', 'edge_dest', 'is_causal'])
        y_train = training_data['is_causal']
        
        model = BalancedRandomForestClassifier(n_estimators=110, random_state=0, n_jobs=55, replacement=True, sampling_strategy='all')

        model.fit(X_train, y_train)
        
        for graph_id in range(20):
        
            testing_data = descriptors.loc[(descriptors['process_id'] == gen_process_number) & (descriptors['graph_id'] == graph_id) & (descriptors['n_variables'] == n_variables) & (descriptors['max_neighborhood_size'] == max_neighborhood_size) & (descriptors['noise_std'] == noise_std)]

            from_ = testing_data['edge_source']
            to = testing_data['edge_dest']
            X_test = testing_data.drop(columns=['process_id', 'graph_id', 'n_variables', 'max_neighborhood_size','noise_std', 'edge_source', 'edge_dest', 'is_causal'])

            y_test = testing_data['is_causal']

            y_pred_proba = model.predict_proba(X_test)[:,1]
            y_pred = y_pred_proba > 0.5

            X_test['probability'] = y_pred_proba
            X_test['is_causal'] = y_pred
            X_test['edge_source'] = from_
            X_test['edge_dest'] = to
            results = X_test[['edge_source','edge_dest','probability','is_causal']]

            results.rename(columns={'edge_source':'from', 'edge_dest':'to'}, inplace=True)

            results['p_value'] = None
            results['effect'] = None

            causal_df = results[['from','to','effect','p_value','probability','is_causal']]

            true_causal_df = causal_df.drop(columns=['is_causal'])
            true_causal_df['is_causal'] = y_test

            causal_dfs.append(causal_df)
            true_causal_dfs.append(true_causal_df)

        filename = f'{destination_root}/{destination}/P{gen_process_number}_N{n_variables}_Nj{max_neighborhood_size}_n{noise_std}.pkl'

        with open(filename, 'wb') as f:
                pickle.dump((
                            causal_dfs, 
                            true_causal_dfs), f)     

  0%|          | 0/35 [00:00<?, ?it/s]

100%|██████████| 35/35 [40:34<00:00, 69.57s/it]
