In [10]:
import numpy as np
import pandas as pd 
import random 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

## Utils

In [19]:
def generate_indices(data, prop=1, val_prop=0.15, test_prop=0.15, rs=0):
    """Generate stratified train, validation and test indices for the dataloaders.

    The test set is split off first, then the validation set is split off the
    remainder. Both splits are stratified on the class column. ``val_prop`` and
    ``test_prop`` are both expressed as fractions of the full dataset.

    Args:
        data (numpy.ndarray): 2-dimensional array of the dataset. The first column has
            to contain the class (ex: cancer / no cancer) information.
        prop (float, optional): Fraction of the training indices to keep, applied
            separately to each class. Defaults to 1 (keep everything).
        val_prop (float, optional): Proportion of data dedicated to the validation set.
            0 disables the validation split. Defaults to 0.15.
        test_prop (float, optional): Proportion of data dedicated to the test set.
            0 disables the test split. Defaults to 0.15.
        rs (int, optional): Random state. Defaults to 0.

    Returns:
        (list, list, list): Train, validation and test row indices. A split whose
        proportion is 0 is returned as an empty list.
    """
    labels = data[:, 0]
    indices = list(range(len(data)))

    # Step 1: carve out the test set (skipped when no test set is requested).
    if test_prop != 0:
        train_idx, test_idx = train_test_split(
            indices, test_size=test_prop, stratify=labels, random_state=rs
        )
    else:
        train_idx, test_idx = indices, []

    # Step 2: carve the validation set out of what is left. Only the index list is
    # passed to train_test_split: passing a second array makes it return four
    # values, which cannot be unpacked into (train_idx, val_idx).
    if val_prop != 0:
        train_idx, val_idx = train_test_split(
            train_idx,
            # Rescale so val_prop stays a fraction of the *full* dataset.
            test_size=val_prop / (1 - test_prop),
            stratify=labels[train_idx],
            random_state=rs,
        )
    else:
        val_idx = []

    # Step 3: optionally keep only a fraction of the training indices per class.
    if prop != 1:
        train_arr = np.asarray(train_idx, dtype=int)
        train_labels = labels[train_arr]
        subtrain_idx = []
        for mode in np.unique(train_labels):
            candidates = train_arr[train_labels == mode]
            # Deterministic: keep the first k candidates in the order produced
            # by the split above.
            subtrain_idx += candidates[: round(len(candidates) * prop)].tolist()
        train_idx = subtrain_idx

    return (train_idx, val_idx, test_idx)

In [3]:
def read_process_data_TCGA(
    data_path,
    label_path,
    coding_genes=False,
    coding_genes_file='../data/protein-coding_gene.txt'
):
    """Reads and processes (including a normal standardization) the TCGA data.

    The label table and the expression table are joined on ``caseID``, the
    ``cancer_type`` column is integer-encoded, and every remaining column is
    standardized with ``StandardScaler``.

    Args:
        data_path (str) : path to dataset (parquet file with a ``caseID`` column)
        label_path (str) : path to classes (parquet file with ``cases`` and
            ``cancer_type`` columns)
        coding_genes (bool, optional): If True, keep only the dataset columns whose
            name appears in the ``ensembl_gene_id`` column of ``coding_genes_file``
            (plus ``caseID``). Defaults to False.
        coding_genes_file (str, optional): Tab-separated file listing the genes to
            keep when ``coding_genes`` is True.

    Returns:
        numpy.ndarray: float32 array of the processed data.
    """
    class_df = pd.read_parquet(label_path)
    data_df = pd.read_parquet(data_path)

    if coding_genes:
        # `sep` has to be given by keyword: pandas 2.x rejects it positionally.
        protein_coding = pd.read_csv(coding_genes_file, sep='\t')
        # Drop missing IDs explicitly instead of popping the last sorted value,
        # which would discard a real gene whenever the file has no missing entry.
        wanted = set(protein_coding['ensembl_gene_id'].dropna()) | {'caseID'}
        # Filter in the dataset's own column order so the output layout is
        # identical from one run to the next (a set intersection is unordered).
        data_df = data_df[[col for col in data_df.columns if col in wanted]]

    # merging the dataframes based on "caseID" (second "|"-separated field of `cases`)
    class_df = class_df.assign(
        caseID=class_df["cases"].map(lambda cases: cases.split("|")[1])
    )
    df = class_df.merge(data_df, on="caseID", how="inner")

    # Columns management: positional drop of columns 0-6, 8 and 9, keeping column 7.
    # NOTE(review): this assumes a fixed label-table layout with `cancer_type` at
    # position 7 — TODO confirm against the label file.
    df = df.drop(columns=list(df.columns[:7]) + [df.columns[8]] + [df.columns[9]])

    # encoding cancer names to integers
    le = preprocessing.LabelEncoder()
    df["cancer_type"] = le.fit_transform(df["cancer_type"])
    print(df.columns)
    np_dataset = df.to_numpy(dtype=np.float32)

    # normal standardization of every column except the first one
    # NOTE(review): the scaler is fit on every row passed in, so its statistics
    # include any rows that are later used for validation or testing.
    scaler = preprocessing.StandardScaler()
    np_dataset[:, 1:] = scaler.fit_transform(np_dataset[:, 1:])

    return np_dataset

## Import Data

In [7]:
# Full processed dataset: labels joined with the expression table and standardized.
dataset = read_process_data_TCGA(
    data_path='mRNA.omics.parquet',
    label_path='label.parquet',
)

Index(['cancer_type', 'ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419',
       'ENSG00000000457', 'ENSG00000000460', 'ENSG00000000938',
       'ENSG00000000971', 'ENSG00000001036', 'ENSG00000001084',
       ...
       'ENSG00000288658', 'ENSG00000288659', 'ENSG00000288660',
       'ENSG00000288662', 'ENSG00000288663', 'ENSG00000288667',
       'ENSG00000288669', 'ENSG00000288670', 'ENSG00000288674',
       'ENSG00000288675'],
      dtype='object', length=56903)


In [26]:
# Raw expression table, reloaded without the label merge or the standardization.
data_orig = pd.read_parquet(path='mRNA.omics.parquet')

In [28]:
# Compare the shapes of the raw and the processed tables.
tuple(table.shape for table in (data_orig, dataset))

((9349, 56903), (9349, 56903))

In [20]:
# Stratified split: 11% validation, 11% test, the remainder for training.
idx = generate_indices(data=dataset, prop=1, val_prop=0.11, test_prop=0.11, rs=0)
# Sizes of the (train, validation, test) index lists.
tuple(len(part) for part in idx)

(7291, 1029, 1029)

In [39]:
# Training rows go to the pre-training table; validation + test rows are held out.
# NOTE(review): `idx` holds row positions of `dataset` (built through a merge) and is
# applied positionally to `data_orig` — this assumes both share the same row order;
# TODO confirm.
pretrain_data = data_orig.iloc[idx[0]]
nopretrain_data = data_orig.iloc[idx[1] + idx[2]]

In [41]:
# Persist both subsets so they can be reloaded independently.
pretrain_data.to_parquet(path='pretrain_data.2.parquet')
nopretrain_data.to_parquet(path='nopretrain_data.2.parquet')

In [42]:
# Process the held-out rows on their own; the standardization is re-fit on this subset.
nopretrain_data_p = read_process_data_TCGA(
    data_path='nopretrain_data.2.parquet',
    label_path='label.parquet',
)

Index(['cancer_type', 'ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419',
       'ENSG00000000457', 'ENSG00000000460', 'ENSG00000000938',
       'ENSG00000000971', 'ENSG00000001036', 'ENSG00000001084',
       ...
       'ENSG00000288658', 'ENSG00000288659', 'ENSG00000288660',
       'ENSG00000288662', 'ENSG00000288663', 'ENSG00000288667',
       'ENSG00000288669', 'ENSG00000288670', 'ENSG00000288674',
       'ENSG00000288675'],
      dtype='object', length=56903)
