In [1]:
from os.path import join

from openslide import OpenSlide
import numpy as np
import pandas as pd

In [2]:
# Reproducibility: a single seeded generator shared by the sampling code below.
SEED = 0
RNG = np.random.default_rng(seed=SEED)

# Inclusive bounds on the number of slides sampled per diagnosis (project_name).
MIN_N_SAMPLES_PER_DIAG, MAX_N_SAMPLES_PER_DIAG = 50, 75

# Root directory of the slide files; each slide is read from BASE_DIR/<id>/<file_name>.
BASE_DIR = "/home/data/GDC_BBCLL/"

# Sampling

In [3]:
# Randomly sample slides from a single sub-type (one "project_name" group).
# The number of samples deliberately differs between sub-types in order to introduce class imbalance.
def sample_n(df):
    """Randomly down-sample the rows of one sub-type to a randomly chosen size.

    The sample size is drawn with the shared ``RNG`` from the inclusive range
    [MIN_N_SAMPLES_PER_DIAG, MAX_N_SAMPLES_PER_DIAG], capped at the number of
    rows available, so different sub-types end up with different sizes.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to a single group (as passed by ``groupby(...).apply``).

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when it has fewer than MIN_N_SAMPLES_PER_DIAG rows,
        otherwise a random subset of its rows drawn without replacement.
    """
    n_available = len(df)
    if n_available < MIN_N_SAMPLES_PER_DIAG:
        return df
    # Cap the upper bound at the group size: DataFrame.sample draws without
    # replacement and raises ValueError when asked for more rows than exist,
    # which could happen for groups holding between MIN and MAX - 1 rows.
    upper = min(MAX_N_SAMPLES_PER_DIAG, n_available)
    n = RNG.integers(low=MIN_N_SAMPLES_PER_DIAG, high=upper, endpoint=True)
    return df.sample(n=n, random_state=RNG)

In [4]:
metadata = pd.read_csv("metadata.csv")

# Keep only slides whose sample type is "Primary Solid Tumor" (sample_type_id == 1)
# https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes
solid_tumor_mask = metadata["sample_type_id"] == 1
metadata = metadata[solid_tumor_mask]

# Getting insight into the filtered metadata: slide counts per (primary_site, project_name)
diag_counts = metadata.groupby("primary_site")["project_name"].value_counts()
print(diag_counts)

# Keep only sub-types with at least MIN_N_SAMPLES_PER_DIAG slides.
# Counts are per (primary_site, project_name) pair; if a project qualifies under
# several sites, the dict keeps the count of the last qualifying pair.
diags_count_dict = {
    diag: count
    for (_site, diag), count in diag_counts.items()
    if count >= MIN_N_SAMPLES_PER_DIAG
}
print(diags_count_dict)

diags = list(diags_count_dict.keys())

# Only choosing slides with project_name in diags
diags_mask = metadata["project_name"].isin(diags)
metadata = metadata[diags_mask]

# Shuffle before eliminating repeated patient_ids so that the slide kept for each
# patient is random. DataFrame.sample returns a new frame, so it must be assigned.
metadata = metadata.sample(frac=1, random_state=RNG)

# Ensuring only unique patient_id entries (first occurrence after the shuffle wins)
metadata = metadata.drop_duplicates(subset="patient_id", keep="first")

# Sampling a random number of slides per sub-type.
# NOTE: the size threshold above was applied before de-duplication, so a sub-type
# may now hold fewer than MIN_N_SAMPLES_PER_DIAG rows; sample_n keeps such groups whole.
metadata = metadata.groupby("project_name", group_keys=False).apply(sample_n)
metadata = metadata.reset_index(drop=True)

# Final metadata summary
print(len(metadata))
diag_counts = metadata.groupby("primary_site")["project_name"].value_counts()
print(diag_counts)

primary_site                       project_name                                   
Brain                              Glioblastoma Multiforme                            2040
                                   Brain Lower Grade Glioma                           1543
                                   Lymphoid Neoplasm Diffuse Large B-cell Lymphoma       4
Breast                             Breast Invasive Carcinoma                          2704
                                   Lymphoid Neoplasm Diffuse Large B-cell Lymphoma       2
Bronchus and lung                  Lung Adenocarcinoma                                1359
                                   Lung Squamous Cell Carcinoma                       1265
                                   Mesothelioma                                          2
Colon                              Colon Adenocarcinoma                               1307
                                   Rectum Adenocarcinoma                                18
       

# Finding Zoom Level

In [5]:
def get_magnification(row):
    """Read the objective power stored in the slide file described by ``row``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "id" and "file_name"; the slide is opened from
        ``BASE_DIR/<id>/<file_name>``.

    Returns
    -------
    int or None
        The 'openslide.objective-power' property converted to ``int``, or the
        unconverted falsy value (``None`` when the property is missing).

    Notes
    -----
    ``int()`` raises ValueError if the stored property is not an integer
    string; errors raised while opening the slide propagate unchanged.
    """
    path = join(BASE_DIR, row["id"], row["file_name"])
    # Use the slide as a context manager so the underlying handle is closed
    # right away; this function runs once per metadata row, and leaving every
    # slide open would leak a handle per row.
    with OpenSlide(path) as slide:
        magnification = slide.properties.get('openslide.objective-power')
    if magnification:
        magnification = int(magnification)
    return magnification

In [6]:
# Call get_magnification once per row (axis=1) and store the result in a new
# "magnification" column; each call opens the corresponding slide file.
metadata["magnification"] = metadata.apply(get_magnification, axis=1)

# Saving

In [7]:
# Write the sampled metadata to 'sampled_metadata.csv' in the working directory,
# omitting the row index.
metadata.to_csv('sampled_metadata.csv', index=False)