In [3]:
import os
import json
import re
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl
from tqdm import tqdm
import warnings
from collections import Counter
import scipy.stats as st
from sklearn.feature_selection import mutual_info_classif
import xgboost as xgb

### Init

In [5]:
def LoadData(path):
    """Load every per-patient JSON file under ``path`` into one table.

    Each directory entry is opened and parsed with ``json.load``; one parsed
    record becomes one row of the resulting frame.

    Parameters
    ----------
    path : str
        Directory whose entries are JSON files, one per patient. The records
        must provide a ``'Name'`` key and a numeric ``'Label'`` key.

    Returns
    -------
    df : pandas.DataFrame
        All records, one row per file, in sorted file-name order.
    case : pandas.Series
        The ``'Name'`` column of ``df``.
    X : pandas.DataFrame
        The numeric columns of ``df`` with ``'Label'`` removed.
    y : pandas.Series
        The ``'Label'`` column of ``df``.

    Raises
    ------
    ValueError
        If ``path`` contains no entries.
    KeyError
        If ``'Label'`` or ``'Name'`` is missing, or ``'Label'`` is not numeric.
    """
    # os.listdir gives entries in arbitrary order; sorting makes the row order
    # (and everything derived from it) reproducible across runs and machines.
    patients = sorted(os.listdir(path))
    if not patients:
        raise ValueError(f'No patient files found in {path!r}')

    # Collect the raw records first and build the frame once, instead of
    # creating a one-row DataFrame per file.
    records = []
    for patient in patients:
        with open(os.path.join(path, patient), 'r') as file:
            records.append(json.load(file))

    df = pd.DataFrame(records)
    y = df['Label']
    X = df.select_dtypes(include=[np.number]).drop(['Label'], axis = 1)
    case = df['Name']

    return df, case, X, y

In [5]:
def GroupData(X_train, X_test, group = ['original', 'log', 'wavelet']):
    """Restrict both frames to the columns whose name starts with a given prefix.

    The column list is derived from ``X_train`` only and then applied to both
    frames, so ``X_test`` must contain every selected column.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables with string column names.
    group : list of str
        Column-name prefixes to keep. A column is kept (once) if it starts
        with at least one of them.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``X_train`` and ``X_test`` reduced to the matching columns, in the
        column order of ``X_train``.
    """
    keep = [
        name
        for name in X_train.columns
        if any(name.startswith(prefix) for prefix in group)
    ]
    return X_train[keep], X_test[keep]

In [8]:
def readDataset(path):
    """Read one saved split file and return its four parts.

    The file is expected to hold a single dict with the keys ``'X_train'``,
    ``'y_train'``, ``'X_test'`` and ``'y_test'``.

    Parameters
    ----------
    path : str
        Location of the ``.npy`` file to read.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` as stored in the file.
    """
    # allow_pickle=True is what lets NumPy restore the stored dict; unpickling
    # can execute arbitrary code, so only point this at files you trust.
    splits = np.load(path, allow_pickle = True).item()
    wanted = ('X_train', 'y_train', 'X_test', 'y_test')
    return tuple(splits[key] for key in wanted)

In [None]:
# Column-name prefixes to keep together: each family alone, each pair,
# and all three at once. One output file is written per entry.
groups = [
    ['original'],
    ['log'],
    ['wavelet'],
    ['original', 'log'],
    ['original', 'wavelet'],
    ['log', 'wavelet'],
    ['original', 'log', 'wavelet'],
]

# Root of the per-fold input data (read as <path>/<fold>/train and .../test).
path = './path'
# Root under which the per-fold .npy split files are written.
npy_path = './npy_path'
# Not referenced in the visible cells -- presumably used further down; TODO confirm.
json_path = './json_path'

In [16]:
# Initial
# Entries stored inside the 'fold0' directory of npy_path.
filter_type = os.listdir(os.path.join(npy_path, 'fold0'))

# Fold directory names in sorted order; the first one is the reference fold.
# NOTE(review): folds are listed from npy_path (where the split files are
# written) while init_fold is later joined onto `path` -- confirm both
# directories always contain the same fold names.
folds = sorted(os.listdir(npy_path))
init_fold = folds[0]

### Split data and save as .npy

In [None]:
# Features are kept when their mutual information with the label reaches this value.
MI_THRESHOLD = 0.12
# mutual_info_classif uses a randomized estimator; a fixed seed keeps the
# selected feature set identical between runs.
MI_SEED = 0

for group in groups:
    group_str = '_'.join(group)

    # LoadData returns (df, case, X, y); only the feature table and the labels
    # are needed here.
    _, _, trainX, trainY = LoadData(os.path.join(path, init_fold, 'train'))
    _, _, testX, testY = LoadData(os.path.join(path, init_fold, 'test'))

    X_train, X_test = GroupData(trainX, testX, group)

    # NOTE(review): the scores below are computed on train AND test rows of the
    # reference fold, so test labels influence which features every fold keeps.
    # That is a form of data leakage -- consider scoring on training rows only.
    X = np.concatenate((X_train.to_numpy(), X_test.to_numpy()), axis=0)
    Y = np.concatenate((trainY.to_numpy(), testY.to_numpy()), axis=0)

    mi = mutual_info_classif(X, Y, random_state=MI_SEED)
    # Column positions to keep; identical for every fold, so computed once.
    keep = np.where(mi >= MI_THRESHOLD)[0]

    for fold in sorted(os.listdir(path)):
        des = os.path.join(npy_path, fold)
        # np.save does not create missing directories.
        os.makedirs(des, exist_ok=True)

        _, _, trainX, trainY = LoadData(os.path.join(path, fold, 'train'))
        _, _, testX, testY = LoadData(os.path.join(path, fold, 'test'))
        y_train = trainY.to_numpy()
        y_test = testY.to_numpy()

        X_train, X_test = GroupData(trainX, testX, group)
        # GroupData returns DataFrames; convert to arrays before positional
        # column selection (DataFrame[:, idx] is not valid indexing).
        X_train = X_train.to_numpy()[:, keep]
        X_test = X_test.to_numpy()[:, keep]

        splits = {
            'X_train': X_train,
            'X_test': X_test,
            'y_train': y_train,
            'y_test': y_test
        }

        if fold == 'fold0':
            print(group_str, X_train.shape, X_test.shape)
        np.save(os.path.join(des, f'{group_str}.npy'), splits)

original (84, 14) (26, 14)
log (84, 47) (26, 47)
wavelet (84, 61) (26, 61)
original_log (84, 61) (26, 61)
original_wavelet (84, 75) (26, 75)
log_wavelet (84, 108) (26, 108)
original_log_wavelet (84, 123) (26, 123)
