In [None]:
import os
import time
from collections import Counter

import archs4py as a4
import h5py
import numpy as np
import pandas as pd
import qnorm
from sklearn.preprocessing import StandardScaler, normalize

# Extract tissue sample

In [None]:
# Path of the HDF5 expression file; every archs4py call in the extraction cells reads from it.
file = "human_gene_v2.2.h5"
# Inspect the file with archs4py's `ls` helper (presumably lists its groups/datasets -- confirm).
a4.ls(file)

In [None]:
# Extract the count matrix and the matching sample metadata.

# Ensembl gene identifiers stored in the file; used below as column labels.
gene_names = a4.meta.field(file, "ensembl_gene_id")

# Counts of every sample returned by the "tissue" metadata search.
# (remove_sc=True is forwarded to archs4py; presumably it drops single-cell samples -- confirm.)
# The result is transposed and its columns are then relabelled with the Ensembl ids.
samples_counts = a4.data.meta(file, "tissue", remove_sc=True).T
samples_counts.columns = gene_names

# The row index of the transposed matrix is handed to archs4py as the list of sample ids.
id_list = list(samples_counts.index)
samples_meta = a4.meta.samples(file, id_list)

# Optional: persist the raw extraction.
#samples_counts.to_hdf("raw_tissue_matrix.h5", key="data", mode="w")
#samples_meta.to_hdf("raw_tissue_matrix.h5", key="meta")

In [None]:
# shapes
# NOTE: no transpose here. The extraction cell already transposed the counts,
# labelled the columns with Ensembl ids and used the index as the sample-id list.
# Flipping the matrix again would make the column-wise cleaning below act on
# samples instead of transcripts, and would flip once more on every re-run of this cell.
samples_counts.shape, samples_meta.shape

# Clean data

In [None]:
# Work on the count matrix under the shorter name `df` (same object, not a copy).
df = samples_counts

## Remove zero-expression transcripts

In [None]:
def _has_expression(column):
    """Return True for columns to keep: object-dtype columns, or columns whose sum is positive."""
    if column.dtype == 'object':
        return True
    return column.sum() > 0


# Despite its name, the mask is True for the columns that are KEPT (non-zero total).
zero_expr_mask = df.apply(_has_expression)
zero_expr_mask

In [None]:
# Keep only the columns flagged True in zero_expr_mask and report how many were dropped.
kept_columns = df.loc[:, zero_expr_mask]
n_filtered = df.shape[1] - kept_columns.shape[1]
print(f'{n_filtered} number of the transcripts are filtered out because of the zero expression.')
df = kept_columns
df.head()

## Replace missing values

In [None]:
# replace by mean
# Fill value per column: the column mean, rounded and stored as a Python int.
dict_normal = {
    column_name: int(np.round(np.mean(df[column_name])))
    for column_name in df.columns
}

In [None]:
# replace missing values with values in dict
# dict_normal maps column label -> fill value, so each column is filled with its own rounded mean.
df = df.fillna(value=dict_normal)

In [None]:
# Preview the first rows of the cleaned matrix.
df.head()

## Save 

In [None]:
# Expression matrix after zero-expression filtering and missing-value imputation.
df.to_parquet("clean_expression_matrix.parquet.gzip", compression="gzip")
# Sample annotations, written under the file name that the "Isolate tissue type"
# section reads back. Previously `df` was written a second time here, so the
# metadata was never saved and the file expected downstream was never created.
samples_meta.to_parquet("clean_expression_matrix_metadata.parquet.gzip", compression="gzip")

# Log normalization + Quantile normalization

In [None]:
# Open file
# Reload the cleaned expression matrix written in the "Save" step.
data_df = pd.read_parquet("clean_expression_matrix.parquet.gzip")
# Column labels, captured before the frame is converted to a bare array in the next cell.
gene_list = data_df.columns

In [None]:
# Convert to numpy
# From here on `data_df` is a NumPy array, not a DataFrame: row and column labels are dropped.
data_df = data_df.to_numpy()

In [None]:
# log normalization
# log2(x + 1): the +1 keeps zero entries finite (log2(1) == 0).
data_df = np.log2(data_df+1)

# quantile normalization
# NOTE(review): qnorm is called with its default `axis`. Which axis is correct depends on
# whether rows or columns hold the samples at this point -- confirm against qnorm's documentation.
data_df = qnorm.quantile_normalize(data_df, ncpus=4)

In [None]:
# save
# `data_df` is a NumPy array here (converted above), so it is stored as a compressed .npz archive
# rather than through the DataFrame-only call kept below for reference.
#data_df.to_parquet('normalized_expression_matrix.parquet.gzip', compression='gzip')
# The array is passed positionally; the loading cell reads it back under the key 'arr_0'.
np.savez_compressed('normalized_expression_matrix.npz', data_df)

# Convert to DataFrame

In [None]:
# load data
# Read the normalized array back as float32; the context manager closes the archive afterwards.
with np.load('normalized_expression_matrix.npz') as data_npz:
    data = data_npz['arr_0'].astype('float32')

In [None]:
# Re-read the cleaned expression matrix only to recover its column labels.
# NOTE: despite its name, `metadata` holds the expression matrix here, not sample annotations.
metadata = pd.read_parquet('clean_expression_matrix.parquet.gzip')
genes_list = metadata.columns

In [None]:
# Wrap the normalized array in a DataFrame labelled with the recovered column names.
# No index is passed, so the rows get a default integer index.
df = pd.DataFrame(data, columns=genes_list)
df.to_parquet('normalized_expression_data.parquet.gzip', compression='gzip')

# Isolate tissue type

In [None]:
# load data
# Read the sample metadata and the normalized matrix, then print the elapsed time in minutes.
load_started = time.time()
metadata = pd.read_parquet('clean_expression_matrix_metadata.parquet.gzip')
data = pd.read_parquet('normalized_expression_data.parquet.gzip')
elapsed_minutes = (time.time() - load_started) / 60
print(elapsed_minutes)

## Counts for each type

In [None]:
# Keywords searched (case-sensitively, as substrings) in each sample's `characteristics_ch1` text.
# Position i of this list pairs with position i of `selection_list` and `mask_list`,
# so 'SQUAMOUS' feeds the sarcoma_* lists and 'MOUTH' feeds the headneck_* lists.
# An earlier 16-keyword version of this list had no 'NEURON', 'HEART' or 'PANCREAS'.
organ_list = ['BRAIN','SKIN','NEURON','LUNG','COLON','LIVER','BREAST','KIDNEY','BLADDER','CERVIX','OVARY','PROSTATE','SQUAMOUS','MOUTH','STOMACH','THYROID','UTERINE','HEART','PANCREAS']

# One list of matched annotation strings per keyword (same order as organ_list).
(brain_selection, skin_selection, neuron_selection, lung_selection, colon_selection,
 liver_selection, breast_selection, kidney_selection, bladder_selection, cervix_selection,
 ovary_selection, prostate_selection, sarcoma_selection, headneck_selection, stomach_selection,
 thyroid_selection, uterine_selection, heart_selection, pancreas_selection) = selection_list = [[] for _ in organ_list]

# One 0/1 flag list per keyword (same order as organ_list); each receives one entry per metadata row.
(brain_mask, skin_mask, neuron_mask, lung_mask, colon_mask,
 liver_mask, breast_mask, kidney_mask, bladder_mask, cervix_mask,
 ovary_mask, prostate_mask, sarcoma_mask, headneck_mask, stomach_mask,
 thyroid_mask, uterine_mask, heart_mask, pancreas_mask) = mask_list = [[] for _ in organ_list]

# Declared but not filled in this cell.
selection_mask = []
organ_label = []

# Matched keywords in scan order; a row matching several keywords contributes several entries.
labels = []

for annotation in metadata['characteristics_ch1']:
    for keyword, hits, flags in zip(organ_list, selection_list, mask_list):
        found = keyword in annotation
        flags.append(1 if found else 0)
        if found:
            hits.append(annotation)
            labels.append(keyword)

In [None]:
# Number of per-organ containers created below (one data frame and one metadata frame each).
_N_ORGANS = 19

# Empty per-organ expression frames sharing the columns of `data`; each name gets its own object.
(brain_data, skin_data, neuron_data, lung_data, colon_data,
 pancreas_data, kidney_data, heart_data, liver_data, breast_data,
 bladder_data, cervix_data, ovary_data, prostate_data, sarcoma_data,
 headneck_data, stomach_data, thyroid_data, uterine_data) = (
    pd.DataFrame(columns=data.columns) for _ in range(_N_ORGANS)
)

# Empty per-organ metadata frames sharing the columns of `metadata`.
(brain_metadata, skin_metadata, neuron_metadata, lung_metadata, colon_metadata,
 pancreas_metadata, kidney_metadata, heart_metadata, liver_metadata, breast_metadata,
 bladder_metadata, cervix_metadata, ovary_metadata, prostate_metadata, sarcoma_metadata,
 headneck_metadata, stomach_metadata, thyroid_metadata, uterine_metadata) = (
    pd.DataFrame(columns=metadata.columns) for _ in range(_N_ORGANS)
)

In [None]:
# Per-organ containers gathered in one shared order (brain ... uterine) so they can be zipped together.
data_list = [brain_data,skin_data,neuron_data,lung_data,colon_data,pancreas_data,kidney_data,heart_data,liver_data,breast_data,
            bladder_data,cervix_data,ovary_data,prostate_data,sarcoma_data,headneck_data,stomach_data,thyroid_data,uterine_data]
metadata_list = [brain_metadata,skin_metadata,neuron_metadata,lung_metadata,colon_metadata,pancreas_metadata,kidney_metadata,heart_metadata,liver_metadata,breast_metadata,
            bladder_metadata,cervix_metadata,ovary_metadata,prostate_metadata,sarcoma_metadata,headneck_metadata,stomach_metadata,thyroid_metadata,uterine_metadata]
# NOTE: mask_list is rebound here in the data_list order, which differs from the order used while
# the masks were being filled; the mask objects themselves are unchanged.
mask_list = [brain_mask,skin_mask,neuron_mask,lung_mask,colon_mask,pancreas_mask,kidney_mask,heart_mask,liver_mask,breast_mask,
            bladder_mask,cervix_mask,ovary_mask,prostate_mask,sarcoma_mask,headneck_mask,stomach_mask,thyroid_mask,uterine_mask]

In [None]:
start = time.time()

# Pull all flagged rows of each organ with a single positional lookup.
# The previous version appended rows one at a time with `.loc[len(frame)] = ...`, which
# re-allocates the frame on every insert (quadratic cost), leaves object-dtype columns,
# and duplicates rows whenever the cell is re-run. This version is idempotent.
for slot, data_mask in enumerate(mask_list):
    # Positions of the rows flagged 1 for this organ (masks hold one 0/1 entry per metadata row).
    row_positions = np.flatnonzero(np.asarray(data_mask) == 1)
    data_list[slot] = data.iloc[row_positions].reset_index(drop=True)
    metadata_list[slot] = metadata.iloc[row_positions].reset_index(drop=True)

# Re-bind the per-organ names so they refer to the filled frames, in the same order as the lists.
(brain_data, skin_data, neuron_data, lung_data, colon_data,
 pancreas_data, kidney_data, heart_data, liver_data, breast_data,
 bladder_data, cervix_data, ovary_data, prostate_data, sarcoma_data,
 headneck_data, stomach_data, thyroid_data, uterine_data) = data_list
(brain_metadata, skin_metadata, neuron_metadata, lung_metadata, colon_metadata,
 pancreas_metadata, kidney_metadata, heart_metadata, liver_metadata, breast_metadata,
 bladder_metadata, cervix_metadata, ovary_metadata, prostate_metadata, sarcoma_metadata,
 headneck_metadata, stomach_metadata, thyroid_metadata, uterine_metadata) = metadata_list

print(time.time()-start)

## Save each tissue type (if necessary)

In [None]:
# File-name stems, in the same order as data_list / metadata_list.
# These must be strings: the bare names used before (brain, skin, ...) were never defined.
organ_list = ['brain','skin','neuron','lung','colon','pancreas','kidney','heart','liver','breast',
              'bladder','cervix','ovary','prostate','sarcoma','headneck','stomach','thyroid','uterine']

# Make sure the output directory exists before writing into it.
os.makedirs('specific', exist_ok=True)

for organ_data, organ_metadata, organ in zip(data_list, metadata_list, organ_list):
    organ_data.to_parquet(f'specific/{organ}_data.parquet.gzip', compression='gzip')
    organ_metadata.to_parquet(f'specific/{organ}_metadata.parquet.gzip', compression='gzip')

## Add labels to metadata

### Tissue type

In [None]:
# load if necessary
# File-name stems of the per-organ parquet files, in the order the names are unpacked below.
_organ_names = ('brain', 'skin', 'neuron', 'lung', 'colon', 'pancreas', 'kidney', 'heart', 'liver', 'breast',
                'bladder', 'cervix', 'ovary', 'prostate', 'sarcoma', 'headneck', 'stomach', 'thyroid', 'uterine')

# Per-organ expression frames.
(brain_data, skin_data, neuron_data, lung_data, colon_data,
 pancreas_data, kidney_data, heart_data, liver_data, breast_data,
 bladder_data, cervix_data, ovary_data, prostate_data, sarcoma_data,
 headneck_data, stomach_data, thyroid_data, uterine_data) = (
    pd.read_parquet(f'specific/{organ_name}_data.parquet.gzip') for organ_name in _organ_names
)

# Per-organ metadata frames.
(brain_metadata, skin_metadata, neuron_metadata, lung_metadata, colon_metadata,
 pancreas_metadata, kidney_metadata, heart_metadata, liver_metadata, breast_metadata,
 bladder_metadata, cervix_metadata, ovary_metadata, prostate_metadata, sarcoma_metadata,
 headneck_metadata, stomach_metadata, thyroid_metadata, uterine_metadata) = (
    pd.read_parquet(f'specific/{organ_name}_metadata.parquet.gzip') for organ_name in _organ_names
)

In [None]:
# One tissue label per row of each organ's expression frame.
brain_labels = ['brain'] * len(brain_data)
skin_labels = ['skin'] * len(skin_data)
neuron_labels = ['neuron'] * len(neuron_data)
lung_labels = ['lung'] * len(lung_data)
colon_labels = ['colon'] * len(colon_data)
pancreas_labels = ['pancreas'] * len(pancreas_data)
kidney_labels = ['kidney'] * len(kidney_data)
heart_labels = ['heart'] * len(heart_data)
liver_labels = ['liver'] * len(liver_data)
breast_labels = ['breast'] * len(breast_data)

bladder_labels = ['bladder'] * len(bladder_data)
cervix_labels = ['cervix'] * len(cervix_data)
ovary_labels = ['ovary'] * len(ovary_data)
prostate_labels = ['prostate'] * len(prostate_data)
sarcoma_labels = ['sarcoma'] * len(sarcoma_data)
headneck_labels = ['headneck'] * len(headneck_data)
stomach_labels = ['stomach'] * len(stomach_data)
thyroid_labels = ['thyroid'] * len(thyroid_data)
uterine_labels = ['uterine'] * len(uterine_data)

In [None]:
# Label lists gathered in the same organ order as data_list / metadata_list (brain ... uterine).
labels_list = [brain_labels,skin_labels,neuron_labels,lung_labels,colon_labels,pancreas_labels,kidney_labels,heart_labels,liver_labels,breast_labels,
              bladder_labels,cervix_labels,ovary_labels,prostate_labels,sarcoma_labels,headneck_labels,stomach_labels,thyroid_labels,uterine_labels]

In [None]:
# Rebuild the list from the current *_metadata variables. After the optional reload cell those
# names point at freshly read frames, while an older metadata_list would still hold the previous
# objects (or not exist at all in a fresh kernel). The merge step concatenates the *_metadata
# names, so the new column must land on those objects.
metadata_list = [brain_metadata,skin_metadata,neuron_metadata,lung_metadata,colon_metadata,pancreas_metadata,kidney_metadata,heart_metadata,liver_metadata,breast_metadata,
                 bladder_metadata,cervix_metadata,ovary_metadata,prostate_metadata,sarcoma_metadata,headneck_metadata,stomach_metadata,thyroid_metadata,uterine_metadata]

# Attach the tissue label column; labels_list follows the same organ order.
for organ_metadata, organ_labels in zip(metadata_list, labels_list):
    organ_metadata['labels'] = organ_labels

### Cancer type

In [None]:
for organ_metadata in metadata_list:
    # Classify the rows of THIS organ's metadata. The previous version scanned the global
    # `metadata` frame, producing one entry per sample of the whole dataset, which does not
    # match the length of an individual organ frame.
    cancer_type = [
        'cancer' if ('CANCER' in line or 'TUMOR' in line) else 'normal'
        for line in organ_metadata['characteristics_ch1']
    ]
    organ_metadata['cancer_type'] = cancer_type

### Save new metadata

In [None]:
# File-name stems, in the same order as metadata_list.
# These must be strings: the bare names used before (brain, skin, ...) were never defined.
organ_list = ['brain','skin','neuron','lung','colon','pancreas','kidney','heart','liver','breast',
              'bladder','cervix','ovary','prostate','sarcoma','headneck','stomach','thyroid','uterine']

# Make sure the output directory exists before writing into it.
os.makedirs('specific', exist_ok=True)

for organ_metadata, organ in zip(metadata_list, organ_list):
    organ_metadata.to_parquet(f'specific/{organ}_metadata.parquet.gzip', compression='gzip')

# Apply batch effect correction (BEC)

In [None]:
from combat.pycombat import pycombat
import numpy as np

In [None]:
# add noise
# little noise must be injected to apply pycombat
NOISE_SEED = 0        # fixed seed so the injected noise is reproducible across runs
NOISE_SCALE = 0.0001  # standard deviation of the Gaussian noise

# Re-created on every execution of this cell, so re-running the cell yields the same noise.
_noise_rng = np.random.default_rng(NOISE_SEED)


def _add_jitter(frame, rng=_noise_rng, scale=NOISE_SCALE):
    """Return `frame` plus zero-mean Gaussian noise of standard deviation `scale`.

    The noise array has the same shape as `frame`; `frame` itself is not modified.
    """
    return frame + rng.normal(0, scale, frame.shape)


lung_data_noise = _add_jitter(lung_data)
skin_data_noise = _add_jitter(skin_data)
brain_data_noise = _add_jitter(brain_data)
breast_data_noise = _add_jitter(breast_data)
colon_data_noise = _add_jitter(colon_data)
heart_data_noise = _add_jitter(heart_data)
liver_data_noise = _add_jitter(liver_data)
neuron_data_noise = _add_jitter(neuron_data)
pancreas_data_noise = _add_jitter(pancreas_data)
kidney_data_noise = _add_jitter(kidney_data)

bladder_data_noise = _add_jitter(bladder_data)
cervix_data_noise = _add_jitter(cervix_data)
ovary_data_noise = _add_jitter(ovary_data)
prostate_data_noise = _add_jitter(prostate_data)
sarcoma_data_noise = _add_jitter(sarcoma_data)
headneck_data_noise = _add_jitter(headneck_data)
stomach_data_noise = _add_jitter(stomach_data)
thyroid_data_noise = _add_jitter(thyroid_data)
uterine_data_noise = _add_jitter(uterine_data)

In [None]:
def _combat_mean_only(noisy_frame, batches):
    """Run pycombat on the transposed frame with the given batch labels and mean_only=True."""
    return pycombat(noisy_frame.T, list(batches), mean_only=True)


# NOTE(review): none of the *_batches names used below is defined anywhere in the visible
# notebook; they must be provided (one batch label per row of the matching frame) before
# this cell can run -- confirm where they are meant to come from.
lung_data_corrected = _combat_mean_only(lung_data_noise, lung_batches)
skin_data_corrected = _combat_mean_only(skin_data_noise, skin_batches)
brain_data_corrected = _combat_mean_only(brain_data_noise, brain_batches)
breast_data_corrected = _combat_mean_only(breast_data_noise, breast_batches)
colon_data_corrected = _combat_mean_only(colon_data_noise, colon_batches)
heart_data_corrected = _combat_mean_only(heart_data_noise, heart_batches)
liver_data_corrected = _combat_mean_only(liver_data_noise, liver_batches)
neuron_data_corrected = _combat_mean_only(neuron_data_noise, neuron_batches)
pancreas_data_corrected = _combat_mean_only(pancreas_data_noise, pancreas_batches)
kidney_data_corrected = _combat_mean_only(kidney_data_noise, kidney_batches)

bladder_data_corrected = _combat_mean_only(bladder_data_noise, bladder_batches)
cervix_data_corrected = _combat_mean_only(cervix_data_noise, cervix_batches)
ovary_data_corrected = _combat_mean_only(ovary_data_noise, ovary_batches)
prostate_data_corrected = _combat_mean_only(prostate_data_noise, prostate_batches)
sarcoma_data_corrected = _combat_mean_only(sarcoma_data_noise, sarcoma_batches)
headneck_data_corrected = _combat_mean_only(headneck_data_noise, headneck_batches)
stomach_data_corrected = _combat_mean_only(stomach_data_noise, stomach_batches)
thyroid_data_corrected = _combat_mean_only(thyroid_data_noise, thyroid_batches)
uterine_data_corrected = _combat_mean_only(uterine_data_noise, uterine_batches)

In [None]:
# Transpose every corrected frame, undoing the transpose applied when it was passed to pycombat.
# NOTE: this cell is not idempotent -- running it a second time flips the frames again.
(lung_data_corrected, skin_data_corrected, brain_data_corrected, breast_data_corrected,
 colon_data_corrected, heart_data_corrected, liver_data_corrected, neuron_data_corrected,
 pancreas_data_corrected, kidney_data_corrected,
 bladder_data_corrected, cervix_data_corrected, ovary_data_corrected, prostate_data_corrected,
 sarcoma_data_corrected, headneck_data_corrected, stomach_data_corrected, thyroid_data_corrected,
 uterine_data_corrected) = (
    corrected.transpose()
    for corrected in (
        lung_data_corrected, skin_data_corrected, brain_data_corrected, breast_data_corrected,
        colon_data_corrected, heart_data_corrected, liver_data_corrected, neuron_data_corrected,
        pancreas_data_corrected, kidney_data_corrected,
        bladder_data_corrected, cervix_data_corrected, ovary_data_corrected, prostate_data_corrected,
        sarcoma_data_corrected, headneck_data_corrected, stomach_data_corrected, thyroid_data_corrected,
        uterine_data_corrected,
    )
)

# Merge tissue type datasets and metadata

In [None]:
# Corrected expression frames and metadata frames, keyed by organ.
# (The previous code referenced `colon_meatadata`, a misspelling of colon_metadata.)
corrected_by_organ = {
    'brain': brain_data_corrected, 'skin': skin_data_corrected, 'neuron': neuron_data_corrected,
    'lung': lung_data_corrected, 'colon': colon_data_corrected, 'pancreas': pancreas_data_corrected,
    'kidney': kidney_data_corrected, 'heart': heart_data_corrected, 'liver': liver_data_corrected,
    'breast': breast_data_corrected, 'bladder': bladder_data_corrected, 'cervix': cervix_data_corrected,
    'ovary': ovary_data_corrected, 'prostate': prostate_data_corrected, 'sarcoma': sarcoma_data_corrected,
    'headneck': headneck_data_corrected, 'stomach': stomach_data_corrected, 'thyroid': thyroid_data_corrected,
    'uterine': uterine_data_corrected,
}
metadata_by_organ = {
    'brain': brain_metadata, 'skin': skin_metadata, 'neuron': neuron_metadata,
    'lung': lung_metadata, 'colon': colon_metadata, 'pancreas': pancreas_metadata,
    'kidney': kidney_metadata, 'heart': heart_metadata, 'liver': liver_metadata,
    'breast': breast_metadata, 'bladder': bladder_metadata, 'cervix': cervix_metadata,
    'ovary': ovary_metadata, 'prostate': prostate_metadata, 'sarcoma': sarcoma_metadata,
    'headneck': headneck_metadata, 'stomach': stomach_metadata, 'thyroid': thyroid_metadata,
    'uterine': uterine_metadata,
}

# Organ groups, in the concatenation order used for each variant.
base_organs = ['brain', 'skin', 'neuron', 'lung', 'colon', 'pancreas', 'kidney', 'heart', 'liver', 'breast']
tcga_organs = ['bladder', 'cervix', 'ovary', 'prostate', 'sarcoma', 'headneck', 'stomach', 'thyroid', 'uterine']

merge_variants = {
    '10classes': base_organs,                 # 10 classes version
    '19classes': base_organs + tcga_organs,   # 19 classes version (include TCGA classes)
    'new_classes': tcga_organs,               # new classes version (only missing TCGA classes)
}

# Exactly one variant is merged. Previously the three variants were assigned one after another to
# the same two names, so only the last one survived. The default matches the "10classes" file
# names used by the save cell; change it together with those file names.
MERGE_VARIANT = '10classes'

selected_organs = merge_variants[MERGE_VARIANT]
merged_dataset = pd.concat([corrected_by_organ[organ] for organ in selected_organs])
merged_metadata = pd.concat([metadata_by_organ[organ] for organ in selected_organs])

# Save final datasets

In [None]:
# Write the merged expression matrix and its metadata as gzip-compressed parquet files.
# NOTE(review): the file names say "10classes" regardless of which merge variant
# merged_dataset / merged_metadata currently hold -- confirm they match before saving.
merged_dataset.to_parquet("specific/archs4_10classes_data.parquet.gzip", compression="gzip")
merged_metadata.to_parquet("specific/archs4_10classes_metadata.parquet.gzip", compression="gzip")