In [None]:
import random
import numpy as np  # manipulate N-dimensional arrays
import pandas as pd  # data frame
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

## Utils

In [None]:
def generate_indices(data, prop=1, val_prop=0.15, test_prop=0.15, rs=0):
    """Generate stratified train, validation and test indices that will be used
    in the dataloaders.

    Args:
        data (numpy.ndarray): 2-dimensional array of the dataset. The first column has
            to contain the class (ex: cancer / no cancer) information.
        prop (float, optional): Proportion of the training indices that is kept,
            applied separately to every class. Defaults to 1 (keep everything).
        val_prop (float, optional): Proportion of data dedicated to the validation set,
            expressed as a fraction of the whole dataset. Defaults to 0.15.
        test_prop (float, optional): Proportion of data dedicated to the test set.
            When 0, no test split is made and an empty list is returned for it.
            Defaults to 0.15.
        rs (int, optional): Random state. Defaults to 0.

    Returns:
        (list, list, list): Train, validation and test indices (row positions
        in ``data``).
    """
    indices = list(range(len(data)))
    classes = data[:, 0]

    if test_prop != 0:
        train_idx, test_idx = train_test_split(
            indices, test_size=test_prop, stratify=classes, random_state=rs
        )
        # The second split operates on the remaining (1 - test_prop) share of the
        # data, so val_prop is rescaled to stay a fraction of the whole dataset.
        train_idx, val_idx = train_test_split(
            train_idx,
            test_size=val_prop / (1 - test_prop),
            stratify=data[train_idx, 0],
            random_state=rs,
        )
    else:
        # Only the index list is split here. Passing the class column as a second
        # positional array would make train_test_split return four lists and break
        # the two-name unpacking below.
        train_idx, val_idx = train_test_split(
            indices,
            test_size=val_prop,
            stratify=classes,
            random_state=rs,
        )
        test_idx = []

    if prop != 1:
        train_arr = np.array(train_idx)
        modes = data[train_idx, 0]
        subtrain_idx = []
        for mode in np.unique(modes):
            candidates = train_arr[modes == mode]
            # train_idx comes out of train_test_split already shuffled, so keeping
            # the head of each class is a reproducible per-class subsample.
            selected_idx = candidates[: round(len(candidates) * prop)]
            subtrain_idx += selected_idx.tolist()
        train_idx = subtrain_idx
        # Report the size of the reduced training set.
        print(len(train_idx))

    return (train_idx, val_idx, test_idx)

In [None]:
def read_process_data_ARCHS4(data_path, label_path):
    """Load the ARCHS4 data and label parquet files into one standardized array.

    Args:
        data_path: Path of the parquet file holding the feature table.
        label_path: Path of the parquet file holding the metadata; its
            "labels" column provides the class of each row.

    Returns:
        numpy.ndarray: float32 array whose first column is the integer-encoded
        label and whose remaining columns are the features, standardized with a
        ``StandardScaler`` fitted on the whole table.
    """
    metadata_frame = pd.read_parquet(label_path)
    feature_frame = pd.read_parquet(data_path)

    # Encode the class names as integers and store them as the first column.
    encoder = preprocessing.LabelEncoder()
    encoded_labels = encoder.fit_transform(metadata_frame["labels"])
    feature_frame.insert(0, 'labels', encoded_labels)
    print(feature_frame.columns)

    matrix = feature_frame.to_numpy(dtype=np.float32)

    # Standardize every feature column; the label column (index 0) is left as is.
    matrix[:, 1:] = preprocessing.StandardScaler().fit_transform(matrix[:, 1:])
    return matrix


## Import data

In [None]:
# Build the model-ready array: encoded label in column 0, standardized features after it.
dataset = read_process_data_ARCHS4(
    data_path='specific_data_corrected.v2.parquet.gzip',
    label_path='specific_metadata.parquet.gzip',
)

In [None]:
# Re-read the raw tables: the metadata rows and the original column names are
# reattached to the split arrays further down.
metadata = pd.read_parquet('specific_metadata.parquet.gzip')
data_orig = pd.read_parquet('specific_data_corrected.v2.parquet.gzip')

## Split

In [None]:
# Hold out 2.5% of the rows for validation and 2.5% for test; the rest is the
# pretraining set. idx is (train, validation, test).
idx = generate_indices(
    dataset,
    prop=1,
    val_prop=0.025,
    test_prop=0.025,
    rs=0,
)
# Size of each split.
tuple(map(len, idx))

In [None]:
# Column 0 of `dataset` is the encoded label, so only columns 1: are kept and the
# original column names are reattached. Both frames get a fresh default index.
pretrain_data = pd.DataFrame(
    data=dataset[idx[0], 1:],
    columns=data_orig.columns,
)
# Validation and test rows are merged into a single held-out frame.
nopretrain_data = pd.DataFrame(
    data=dataset[idx[1] + idx[2], 1:],
    columns=data_orig.columns,
)

In [None]:
# idx holds row positions, hence positional selection with .iloc; the held-out
# metadata combines the validation and test rows, matching nopretrain_data.
nopretrain_metadata = metadata.iloc[idx[1] + idx[2]]
pretrain_metadata = metadata.iloc[idx[0]]

In [None]:
# Sanity check: data and metadata should have matching row counts per split.
tuple(
    frame.shape
    for frame in (pretrain_data, nopretrain_data, pretrain_metadata, nopretrain_metadata)
)

In [None]:
# Persist the standardized feature tables of both splits as gzip-compressed parquet.
nopretrain_data.to_parquet('nopretrain_specific_data.2.parquet.gzip', compression='gzip')
pretrain_data.to_parquet('pretrain_specific_data.2.parquet.gzip', compression='gzip')

In [None]:
# Persist the metadata rows of both splits as gzip-compressed parquet.
nopretrain_metadata.to_parquet('nopretrain_specific_metadata.2.parquet.gzip', compression='gzip')
pretrain_metadata.to_parquet('pretrain_specific_metadata.2.parquet.gzip', compression='gzip')