# Generate Mapping of TCGA Data and MNIST Images

Here we:
1. Load the preprocessed data from Exp6 ()
2. Load the MNIST dataset
3. Map Cancer Types to Digits


### 1. Load TCGA data
Run Exp6 first (see reproducibility repo)

In [2]:
os.getcwd()

'/Users/maximilianjoas/development/autoencodix-reproducibility'

In [1]:
import pandas as pd
import pyarrow
import os

base_dir = "data/processed/Exp6_TCGA_METH_RNA/"
rna_path = "RNA_data.parquet"
meth_path = "METH_data.parquet"
clin_path = "ANNO_data.parquet"

rdf = pd.read_parquet(os.path.join(base_dir, rna_path))

mdf = pd.read_parquet(os.path.join(base_dir, meth_path))

cdf = pd.read_parquet(os.path.join(base_dir, clin_path))

print(f"rdf shape: {rdf.shape}")

print(f"mdf shape: {mdf.shape}")

print(f"cdf shape: {cdf.shape}")

FileNotFoundError: [Errno 2] No such file or directory: 'data/processed/Exp6_TCGA_METH_RNA/RNA_data.parquet'

In [None]:
tcga_mapping = pd.read_csv("data/raw/tcga_mappings.txt", sep="\t")
print(tcga_mapping.shape)

(3230, 4)


In [None]:
common_id = cdf.index.intersection(rdf.index)
print(len(common_id))
clin_data_filtered = cdf.loc[common_id]
print(f"filtered shape: {clin_data_filtered.shape}")


3529
filtered shape: (3529, 56)


In [None]:
tcga_mapping.head()

Unnamed: 0,sample_ids,img_paths,extra_class_labels,CANCER_TYPE_ACRONYM
0,TCGA-05-4244-01,0_label_1.png,Non-Small Cell Lung Cancer,LUAD
1,TCGA-05-4249-01,1_label_1.png,Non-Small Cell Lung Cancer,LUAD
2,TCGA-05-4250-01,2_label_1.png,Non-Small Cell Lung Cancer,LUAD
3,TCGA-05-4382-01,3_label_1.png,Non-Small Cell Lung Cancer,LUAD
4,TCGA-05-4384-01,4_label_1.png,Non-Small Cell Lung Cancer,LUAD


### 2-3. Load MNIST Data and Create Mapping

In [None]:
import os
from PIL import Image
import numpy as np

import keras

(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

assert x_train.shape == (60000, 28, 28)

assert x_test.shape == (10000, 28, 28)

assert y_train.shape == (60000,)

assert y_test.shape == (10000,)
TCGA_DIR = "data/raw/images/tcga_fake"
mapping = {
    "Breast Cancer": 0,
    "Non-Small Cell Lung Cancer": 1,
    "Colorectal Cancer": 2,
    "Endometrial Cancer": 3,
    "Ovarian Epithelial Tumor": 4,
}


x_data = np.concatenate((x_train, x_test), axis=0)
y_data = np.concatenate((y_train, y_test), axis=0)
mapping_revers = {v: k for k, v in mapping.items()}


selected_labels = []

selected_images = []

label_mapping = {"sample_ids": [], "image_name": []}

used_indices = np.zeros(len(y_data), dtype=bool)

for i, cancertype in enumerate(clin_data_filtered["CANCER_TYPE"]):
    matching_label = mapping[cancertype]

    # Find the index of the first occurrence of the matching label in y_data that has not been used yet
    indices = np.where((y_data == matching_label) & (~used_indices))[0]

    if len(indices) == 0:
        raise ValueError(f"No more images left for label {matching_label}")

    index = indices[0]
    used_indices[index] = True  # Mark this index as used

    # Select matching label from y_data
    selected_label = y_data[index]
    selected_labels.append(selected_label)

    image_name = f"{i}_label_{selected_label}.png"
    label_mapping["sample_ids"].append(clin_data_filtered.index[i])
    label_mapping["image_name"].append(image_name)

    # Select matching image from x_data
    selected_image = x_data[index]
    image = Image.fromarray(selected_image)
    image.save(os.path.join(TCGA_DIR, image_name))

# Print results for verification
print(f"Selected labels: {selected_labels}")
print(f"Label mapping: {label_mapping}")

Selected labels: [np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint8(0), np.uint

In [None]:
tcga_mapping.head()

Unnamed: 0,sample_ids,img_paths,extra_class_labels,CANCER_TYPE_ACRONYM
0,TCGA-05-4244-01,0_label_1.png,Non-Small Cell Lung Cancer,LUAD
1,TCGA-05-4249-01,1_label_1.png,Non-Small Cell Lung Cancer,LUAD
2,TCGA-05-4250-01,2_label_1.png,Non-Small Cell Lung Cancer,LUAD
3,TCGA-05-4382-01,3_label_1.png,Non-Small Cell Lung Cancer,LUAD
4,TCGA-05-4384-01,4_label_1.png,Non-Small Cell Lung Cancer,LUAD


In [None]:
label_mapping.keys()

dict_keys(['sample_id', 'image_name'])

In [None]:
tcga_new = pd.DataFrame(label_mapping)

In [None]:
tcga_new.head()

Unnamed: 0,sample_id,image_name
0,TCGA-3C-AAAU-01,0_label_0.png
1,TCGA-3C-AALI-01,1_label_0.png
2,TCGA-3C-AALJ-01,2_label_0.png
3,TCGA-3C-AALK-01,3_label_0.png
4,TCGA-4H-AAAK-01,4_label_0.png


In [None]:
tcga_new["extra_class_labels"] = clin_data_filtered.loc[tcga_new.sample_id]["CANCER_TYPE"].to_list()

tcga_new["CANCER_TYPE_ACRONYM"] = clin_data_filtered.loc[tcga_new.sample_id]["CANCER_TYPE_ACRONYM"].to_list()
print(tcga_new.head())

         sample_id     image_name extra_class_labels CANCER_TYPE_ACRONYM
0  TCGA-3C-AAAU-01  0_label_0.png      Breast Cancer                BRCA
1  TCGA-3C-AALI-01  1_label_0.png      Breast Cancer                BRCA
2  TCGA-3C-AALJ-01  2_label_0.png      Breast Cancer                BRCA
3  TCGA-3C-AALK-01  3_label_0.png      Breast Cancer                BRCA
4  TCGA-4H-AAAK-01  4_label_0.png      Breast Cancer                BRCA


In [None]:
tcga_new.to_csv("data/raw/tcga_mappings.txt", sep="\t", index=False)

In [None]:
test = pd.read_csv("data/raw/tcga_mappings.txt", sep="\t")
test.head()

Unnamed: 0,sample_id,image_name,extra_class_labels,CANCER_TYPE_ACRONYM
0,TCGA-3C-AAAU-01,0_label_0.png,Breast Cancer,BRCA
1,TCGA-3C-AALI-01,1_label_0.png,Breast Cancer,BRCA
2,TCGA-3C-AALJ-01,2_label_0.png,Breast Cancer,BRCA
3,TCGA-3C-AALK-01,3_label_0.png,Breast Cancer,BRCA
4,TCGA-4H-AAAK-01,4_label_0.png,Breast Cancer,BRCA
