In [13]:
import os
import shutil
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import dill
from sys import argv
from sklearn.manifold import MDS
from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay, roc_curve

from argparse import ArgumentParser
import warnings
from pathlib import Path

from bio_data.bio_data_preprocess import BioDataPreprocess
from model.cross_validated_model import CrossValidatedModel
from model.feature_selection import FeatureSelectionCV
import shap

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.utils.class_weight import compute_sample_weight

import matplotlib.pyplot as plt

from utils.utils import scores_with_optimal_cutoff, load_data, load_model, get_config, simplify_cross_val_result
# NOTE(review): with no category/module filter this suppresses every warning
# for the rest of the session, including convergence and deprecation warnings
# from the modelling libraries imported above. Consider narrowing it.
warnings.filterwarnings("ignore")


In [2]:
# Tabular inputs: `data_path` is read into `data`, and `augmentation_path`
# into `augmented_data`, by the CSV-loading cell.
data_path = '../data/tabular/coronal-REBLs.csv'
augmentation_path = '../data/tabular/axial-REBLs.csv'

# Run configuration file handed to `get_config`.
config_path = Path('./params/config.yaml')

In [3]:
# Read the primary table and the augmentation table with default CSV settings.
data, augmented_data = (
    pd.read_csv(csv_path) for csv_path in (data_path, augmentation_path)
)

In [4]:
# Load the run configuration from `config_path` via the project helper.
config = get_config(config_path)
# Keep the hyper-parameter grid under its own name for later use.
param_grid = config['param_grid']

In [5]:
# Display the loaded configuration so the run settings are recorded with the outputs.
config

{'cv': 5,
 'random_state': 0,
 'n_jobs': 8,
 'do_feature_selection': False,
 'model_name': 'svc',
 'base_model': CalibratedBinarySVC(cv=5, n_jobs=8),
 'param_grid': {'classifier__kernel': ['rbf'],
  'classifier__gamma': [0.1, 1, 10, 100],
  'classifier__C': [0.01, 0.1, 1, 10]},
 'balancing_method': 'null',
 'weighted': False,
 'preprocess': {'smote': False,
  'target_column': 'Label',
  'normalizer': 'standard',
  'drop_threshold': 0.3,
  'categorical_impute': 'external',
  'real_impute': 'iterative'}}

In [6]:
# Preprocess the primary and augmentation tables together and build the model
# pipeline around the configured base estimator.
# `X`/`y` and `bigX`/`bigY` are the two feature/label sets returned;
# presumably `bigX` is `X` plus the augmentation rows — TODO confirm.
# NOTE(review): the method is spelled `prerocess_and_create_pipeline` (sic) in
# the project API as called here; it is kept unchanged.
X, y, bigX, bigY, index_pairs, pipeline = BioDataPreprocess(
    data,
    augmented_data,
    base_model=config['base_model'],
    random_state=config['random_state'],
    **config['preprocess'],
).prerocess_and_create_pipeline()
									

DONE


In [7]:
# Sanity check: compare the sizes of the base and the enlarged feature matrices.
print(X.shape, bigX.shape)

(92, 1197) (118, 1197)


In [19]:
# Shuffled, stratified splitter used to fold the base samples.
# NOTE(review): the fold count and seed are hard-coded here, while the loaded
# config also carries 'cv' and 'random_state' entries — confirm whether this
# splitter is meant to follow the config instead.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Show the index pairs returned by preprocessing.
print(index_pairs)

[(3, 98), (5, 114), (6, 112), (7, 96), (8, 92), (9, 103), (12, 115), (25, 106), (27, 104), (30, 110), (31, 107), (34, 101), (39, 97), (43, 108), (45, 94), (47, 99), (49, 113), (51, 111), (56, 116), (57, 102), (67, 105), (72, 95), (76, 109), (79, 100), (83, 117), (90, 93)]


In [31]:
def augment_train_indices(train_idx, index_pairs):
    """Extend a fold's training indices with the partners of paired samples.

    For every ``(first, second)`` pair in ``index_pairs``, ``second`` is added
    to the result when ``first`` is already among the training indices.
    Pairs are processed in order and an added index counts as present for
    later pairs, which matches appending to the array one pair at a time.

    Presumably ``first`` indexes a base sample and ``second`` its augmented
    counterpart, so a counterpart only enters training together with its
    source sample — TODO confirm against ``BioDataPreprocess``.

    Parameters
    ----------
    train_idx : array-like of int
        Training indices for one fold.
    index_pairs : iterable of (int, int)
        Pairs of linked sample indices.

    Returns
    -------
    numpy.ndarray
        ``train_idx`` followed by the selected partner indices, in pair order.
    """
    train_idx = np.asarray(train_idx)
    # Set membership replaces a full array scan per pair.
    present = set(train_idx.tolist())
    partners = []
    for first, second in index_pairs:
        if first in present:
            partners.append(second)
            present.add(second)
    # Concatenate once instead of reallocating the array for every pair;
    # the explicit dtype keeps an empty `partners` list from upcasting to float.
    return np.concatenate([train_idx, np.asarray(partners, dtype=train_idx.dtype)])


for i, (train, test) in enumerate(cv.split(X, y)):
    # Extend the fold's training indices through the helper above;
    # test indices are left untouched.
    train = augment_train_indices(train, index_pairs)

    print(i, test)


0 [ 2  3  7 10 14 18 20 23 28 29 33 51 58 66 69 72 78 83 87]
1 [ 6  8 12 13 22 24 37 45 49 50 53 54 57 68 75 80 81 85 89]
2 [16 17 21 25 30 32 36 39 44 52 59 62 63 71 73 79 84 90]
3 [ 4  9 19 26 34 35 42 43 46 55 61 64 65 67 74 76 88 91]
4 [ 0  1  5 11 15 27 31 38 40 41 47 48 56 60 70 77 82 86]
