In [1]:
from os import makedirs
from os.path import join, basename, splitext
from statistics import mode, mean
from collections import Counter
import random
import glob
import json
import pickle
import datetime
import sys
import json
import yaml

import numpy as np
from numpy.linalg import norm
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import openslide
from matplotlib import pyplot as plt
import h5py
from PIL import Image
from tqdm.notebook import tqdm

In [2]:
# Full diagnosis name (the value looked up from the metadata "project_name"
# column) -> short code written to the result tables.
diagnoses_dict = dict([
    ("Brain Lower Grade Glioma", "LGG"),
    ("Glioblastoma Multiforme", "GBM"),
    ("Breast Invasive Carcinoma", "BRCA"),
    ("Lung Adenocarcinoma", "LUAD"),
    ("Lung Squamous Cell Carcinoma", "LUSC"),
    ("Colon Adenocarcinoma", "COAD"),
    ("Liver Hepatocellular Carcinoma", "LIHC"),
    ("Cholangiocarcinoma", "CHOL"),
])

# Anatomic site label (the value looked up from the metadata "primary_site"
# column) -> lower-case site name written to the result tables.
sites_dict = dict([
    ("Brain", "brain"),
    ("Breast", "breast"),
    ("Bronchus and lung", "lung"),
    ("Colon", "colon"),
    ("Liver and intrahepatic bile ducts", "liver"),
])

In [3]:
# Directory holding the reference database files, including the slide metadata CSV.
DATA_DIR = "FEATURES/DATABASE/"

# Earlier configuration, kept for reference (it also listed the UPENN experiment
# with .ndpi slides):
# DATASETS = ["brain", "breast", "breast", "colon", "liver", "lung"]
# EXPERIMENTS = ["UCLA", "READER_STUDY", "BRCA_HER2", "BRCA_TRASTUZUMAB", "GBM_MICROSCOPE_CPTAC", "GBM_MICROSCOPE_UPENN"]
# EXTENSIONS = ["svs", "svs", "svs", "svs", "svs", "ndpi"]

# NOTE(review): DATASETS has six entries while EXPERIMENTS/EXTENSIONS have five,
# and it is not referenced in the cells shown here — confirm it is still needed.
DATASETS = ["brain", "breast", "breast", "colon", "liver", "lung"]
# NOTE(review): "GBM_MICROSCOPE_CPTACC" is spelled "GBM_MICROSCOPE_CPTAC" in the
# commented-out list above — confirm which spelling matches the directory under
# FEATURES/TEST_DATA/ before changing either.
EXPERIMENTS = ["UCLA", "READER_STUDY", "BRCA_HER2", "BRCA_TRASTUZUMAB", "GBM_MICROSCOPE_CPTACC"]
EXTENSIONS = ["svs", "svs", "svs", "svs", "svs"]

# The export cells iterate zip(EXPERIMENTS, EXTENSIONS); zip stops at the shorter
# list, so a length mismatch would silently skip experiments. Fail loudly instead.
if len(EXPERIMENTS) != len(EXTENSIONS):
    raise ValueError(
        f"EXPERIMENTS has {len(EXPERIMENTS)} entries but EXTENSIONS has {len(EXTENSIONS)}"
    )

# Slide metadata indexed by file name so retrieved slides can be looked up by name.
# Built from DATA_DIR rather than repeating the directory literal.
metadata = pd.read_csv(join(DATA_DIR, "sampled_metadata.csv")).set_index('file_name')

In [4]:
# Export, for every experiment, one CSV row per query slide listing the query's
# site/diagnosis followed by its top-k retrieved slides (name, site, diagnosis,
# distance). Output: TEST_DATA_RESULTS/<experiment>/site.csv
dataset = "organ"  # results sub-directory read by this cell
k = 10  # number of top-ranked retrievals exported per query slide

# CSV header: three query fields, then four fields for each of the k ranks.
# It does not depend on the experiment, so it is built once.
columns = ["query_name", "query_site", "query_diagnosis"]
for rank in range(1, k + 1):
    columns.extend([f"ret_{rank}_name", f"ret_{rank}_site", f"ret_{rank}_diagnosis", f"ret_{rank}_dist"])

# `extension` is not used below; the zip is kept so the set of experiments
# iterated stays exactly what it was (zip stops at the shorter list).
for experiment, extension in zip(EXPERIMENTS, EXTENSIONS):
    # Per-experiment lookups keyed by query slide name.
    with open(f"FEATURES/TEST_DATA/{experiment}/query_slides.yaml", 'r') as f:
        QUERY_SLIDES = yaml.safe_load(f)
    with open(f"FEATURES/TEST_DATA/{experiment}/query_subtypes.yaml", 'r') as f:
        QUERY_SUBTYPES = yaml.safe_load(f)

    save_dir = join("TEST_DATA_RESULTS", experiment)
    makedirs(save_dir, exist_ok=True)

    results_path = join("FEATURES/TEST_DATA/", experiment, "results", dataset, "Results.pkl")
    # NOTE(security): pickle.load can execute arbitrary code while loading; only
    # read result files produced by a trusted pipeline.
    with open(results_path, "rb") as file:
        results = pickle.load(file)

    records = []
    for query_name, ret_final in results.items():
        record = [query_name, QUERY_SLIDES[query_name], QUERY_SUBTYPES[query_name]]

        # Each retrieval entry unpacks as (path, sim, mean_sim); only the path
        # and mean_sim are exported.
        for path, sim, mean_sim in ret_final[:k]:
            result_name = basename(path)
            result_site = sites_dict[metadata.loc[result_name, "primary_site"]]
            result_diagnosis = diagnoses_dict[metadata.loc[result_name, "project_name"]]
            # NOTE(review): mean_sim is written to the "ret_<i>_dist" column —
            # confirm whether it is a similarity or a distance.
            record.extend([result_name, result_site, result_diagnosis, mean_sim])

        # Queries with fewer than k retrievals are padded with None so every
        # row matches the header width.
        record.extend([None] * (len(columns) - len(record)))
        records.append(record)

    df = pd.DataFrame.from_records(records, columns=columns)
    save_path = join(save_dir, "site.csv")
    df.to_csv(save_path, index=False)

In [11]:
# Export, for every experiment, one CSV row per query slide listing the query's
# site/diagnosis followed by its top-k retrieved slides (name, site, diagnosis,
# distance). Results are read from one directory per query slide, named
# "<slide stem>_<site>". Output: TEST_DATA_RESULTS/<experiment>/sub_type.csv
k = 5  # number of top-ranked retrievals exported per query slide

# CSV header: three query fields, then four fields for each of the k ranks.
# It does not depend on the experiment, so it is built once.
columns = ["query_name", "query_site", "query_diagnosis"]
for rank in range(1, k + 1):
    columns.extend([f"ret_{rank}_name", f"ret_{rank}_site", f"ret_{rank}_diagnosis", f"ret_{rank}_dist"])

# `extension` is not used below; the zip is kept so the set of experiments
# iterated stays exactly what it was (zip stops at the shorter list).
for experiment, extension in zip(EXPERIMENTS, EXTENSIONS):
    # Per-experiment lookups keyed by query slide name.
    with open(f"FEATURES/TEST_DATA/{experiment}/query_slides.yaml", 'r') as f:
        QUERY_SLIDES = yaml.safe_load(f)
    with open(f"FEATURES/TEST_DATA/{experiment}/query_subtypes.yaml", 'r') as f:
        QUERY_SUBTYPES = yaml.safe_load(f)

    save_dir = join("TEST_DATA_RESULTS", experiment)
    makedirs(save_dir, exist_ok=True)

    records = []
    for slide, site in QUERY_SLIDES.items():
        results_path = join("FEATURES/TEST_DATA/", experiment, "results", f"{splitext(slide)[0]}_{site}", "Results.pkl")
        # NOTE(security): pickle.load can execute arbitrary code while loading;
        # only read result files produced by a trusted pipeline.
        with open(results_path, "rb") as file:
            results = pickle.load(file)

        # Every query stored in this slide's result file becomes one row.
        for query_name, ret_final in results.items():
            record = [query_name, QUERY_SLIDES[query_name], QUERY_SUBTYPES[query_name]]

            # Each retrieval entry unpacks as (path, sim, mean_sim); only the
            # path and mean_sim are exported.
            for path, sim, mean_sim in ret_final[:k]:
                result_name = basename(path)
                result_site = sites_dict[metadata.loc[result_name, "primary_site"]]
                result_diagnosis = diagnoses_dict[metadata.loc[result_name, "project_name"]]
                # NOTE(review): mean_sim is written to the "ret_<i>_dist"
                # column — confirm whether it is a similarity or a distance.
                record.extend([result_name, result_site, result_diagnosis, mean_sim])

            # Queries with fewer than k retrievals are padded with None so
            # every row matches the header width.
            record.extend([None] * (len(columns) - len(record)))
            records.append(record)

    df = pd.DataFrame.from_records(records, columns=columns)
    save_path = join(save_dir, "sub_type.csv")
    df.to_csv(save_path, index=False)