In [1]:
import os
import subprocess
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from PIL import Image

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from torchmetrics import Accuracy
import torchvision.transforms as T
import torchvision.transforms.functional as F
from torch.optim.lr_scheduler import CyclicLR, ReduceLROnPlateau
import pytorch_lightning as pl
from openslide import OpenSlide

from torchvision.models import resnet50
from torch.utils.data import Sampler, WeightedRandomSampler

from tqdm import tqdm
import h5py
from sklearn.model_selection import train_test_split
import ngsci

In [2]:
# Filesystem layout for the contest data. Everything hangs off the user's
# home directory, so only these two roots need to change on another machine.
_home = Path.home()
_dataset_root = _home / 'datasets' / 'brca-psj-path'

# Contest phase 2 inputs.
brca_dir = _dataset_root / "contest-phase-2"
image_dir = brca_dir.joinpath("basic-downsampling", "v2-subsample-a")
table_dir = brca_dir.joinpath("csv-train")
ndpi_dir = _dataset_root / 'ndpi'
clam_train_dir = brca_dir.joinpath('clam-preprocessing-train')
clam_test_dir = brca_dir.joinpath('clam-preprocessing-holdout')

# CLAM preprocessing artefacts (training split unless suffixed with _test).
masks_dir = clam_train_dir.joinpath('masks')
patches_dir = clam_train_dir.joinpath('patches')
patches_dir_test = clam_test_dir.joinpath('patches')
stitches_dir = clam_train_dir.joinpath('stitches')
features_h5_dir = clam_train_dir.joinpath('resnet50-features', 'h5_files')
features_pt_dir = clam_train_dir.joinpath('resnet50-features', 'pt_files')

# Where training/evaluation logs are written.
logger_dir = _home / "logs"

In [3]:
# Rebuild the ResNet-50 classifier with a 5-way head and restore the weights
# saved in a Lightning checkpoint, then switch it to evaluation mode.
weights_path = "lightning_checkpoints/lightning_logs/version_7/checkpoints/epoch=31-step=14624.ckpt"

backbone = resnet50(pretrained=False)
backbone.fc = nn.Linear(backbone.fc.in_features, 5)

# Only the 'state_dict' entry of the checkpoint is kept; tensors are mapped to CPU.
state_dict = torch.load(weights_path, map_location=torch.device('cpu'))['state_dict']

# Every key loses its first six characters so that it matches the bare
# ResNet parameter names.
# NOTE(review): presumably this strips a 6-character wrapper prefix such as
# "model." added by the LightningModule — confirm against the checkpoint keys.
renamed_state_dict = {key[6:]: value for key, value in state_dict.items()}

backbone.load_state_dict(renamed_state_dict)
backbone.eval()

# Inference-time preprocessing (tensor conversion only) and the softmax that
# turns the 5 logits into per-class probabilities.
test_augs = T.Compose([T.ToTensor()])
soft = nn.Softmax(dim=1)



In [5]:
class BreastBiopsy(Dataset):
    """Slide-level dataset yielding randomly sampled tiles with their labels.

    Each item corresponds to one row of the mapping CSV. For that row's slide,
    patch coordinates are read from ``patches_dir/<slide_id>.h5`` (dataset
    ``'coords'``) and ``num_tiles`` of them are drawn at random; the matching
    regions are then read from ``ndpi_dir/<slide_id>.ndpi`` at level 0.

    Args:
        mapping_file: Path to a CSV containing at least the columns
            ``slide_id``, ``stage_int`` and ``race``.
        transform: Optional callable applied to each tile after it has been
            converted to RGB. When omitted, the image returned by
            ``read_region`` is kept as is (no RGB conversion).
        target_transform: Optional callable applied to the label of each tile.
        num_tiles: Number of tiles sampled per slide (default 10).
        tile_size: Edge length in pixels of each square tile (default 256).

    Notes:
        Sampling uses ``np.random.choice`` with its default ``replace=True``,
        so the same coordinate can be drawn more than once and slides with
        fewer than ``num_tiles`` patches still yield ``num_tiles`` tiles.
        Row lookup uses ``.loc[idx, ...]`` and therefore relies on the default
        integer index produced by ``pd.read_csv``.
    """

    def __init__(self, mapping_file, transform=None, target_transform=None,
                 num_tiles=10, tile_size=256):
        self.dataframe = pd.read_csv(mapping_file)
        self.target_transform = target_transform
        self.transform = transform
        self.demographic_column = "race"
        self.num_tiles = num_tiles
        self.tile_size = tile_size

    def __len__(self):
        """Return the number of slides (rows in the mapping CSV)."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(tiles, labels, demos)`` for the slide at row ``idx``.

        All three are lists of length ``num_tiles``; ``labels`` holds
        ``torch.tensor`` scalars and ``demos`` repeats the slide's value from
        the demographic column.
        """
        # Per-slide values are identical for every tile, so look them up once.
        slide_id = self.dataframe.loc[idx, "slide_id"]
        slide_label = self.dataframe.loc[idx, 'stage_int']
        demo = self.dataframe.loc[idx, self.demographic_column]

        with h5py.File(patches_dir / f'{slide_id}.h5', 'r') as f:
            # NOTE(review): assumes each row is a level-0 (x, y) location
            # accepted by OpenSlide.read_region — confirm against the CLAM output.
            coords = f['coords'][:]

        random_indices = np.random.choice(coords.shape[0], size=self.num_tiles)
        random_rows = coords[random_indices]

        tiles = []
        labels = []
        demos = []

        with OpenSlide(ndpi_dir / f'{slide_id}.ndpi') as slide:
            for row in random_rows:
                tile_img = slide.read_region(
                    location=row,
                    level=0,
                    size=(self.tile_size, self.tile_size)
                )

                if self.transform:
                    tile_img = self.transform(tile_img.convert('RGB'))

                # target_transform is still applied once per tile, as before.
                label = slide_label
                if self.target_transform:
                    label = self.target_transform(label)

                tiles.append(tile_img)
                labels.append(torch.tensor(label))
                demos.append(demo)

        return tiles, labels, demos
    
class BreastBiopsyTest(Dataset):
    """Unlabelled holdout dataset yielding randomly sampled tiles per slide.

    Each item corresponds to one row of the manifest CSV (resolved relative
    to ``brca_dir``). Patch coordinates come from
    ``patches_dir_test/<slide_id>.h5`` (dataset ``'coords'``) and the tiles
    are read at level 0 from the ``ndpi-holdout`` directory.

    Args:
        mapping_file: Manifest CSV path relative to ``brca_dir``; must contain
            a ``slide_id`` column.
        transform: Optional callable applied to each tile after it has been
            converted to RGB. When omitted, the image returned by
            ``read_region`` is kept as is (no RGB conversion).
        target_transform: Accepted for signature parity with the labelled
            dataset; stored but not used because there are no labels.
        num_tiles: Number of tiles sampled per slide (default 10).
        tile_size: Edge length in pixels of each square tile (default 256).

    Notes:
        Sampling uses ``np.random.choice`` with its default ``replace=True``,
        so duplicate tiles are possible.
    """

    def __init__(self, mapping_file, transform=None, target_transform=None,
                 num_tiles=10, tile_size=256):
        self.dataframe = pd.read_csv(brca_dir / mapping_file)
        self.target_transform = target_transform
        self.transform = transform
        self.demographic_column = "race"
        self.num_tiles = num_tiles
        self.tile_size = tile_size
        # Holdout slides live in their own directory; build the path once
        # instead of on every item access.
        self.ndpi_dir = Path.home() / 'datasets' / 'brca-psj-path' / 'ndpi-holdout'

    def __len__(self):
        """Return the number of slides (rows in the manifest CSV)."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return a list of ``num_tiles`` tiles for the slide at row ``idx``."""
        slide_id = self.dataframe.loc[idx, "slide_id"]

        with h5py.File(patches_dir_test / f'{slide_id}.h5', 'r') as f:
            # NOTE(review): assumes each row is a level-0 (x, y) location
            # accepted by OpenSlide.read_region — confirm against the CLAM output.
            coords = f['coords'][:]

        random_indices = np.random.choice(coords.shape[0], size=self.num_tiles)
        random_rows = coords[random_indices]

        tiles = []
        with OpenSlide(self.ndpi_dir / f'{slide_id}.ndpi') as slide:
            for row in random_rows:
                tile_img = slide.read_region(
                    location=row,
                    level=0,
                    size=(self.tile_size, self.tile_size)
                )

                if self.transform:
                    tile_img = self.transform(tile_img.convert('RGB'))

                tiles.append(tile_img)

        return tiles
        
# Both loaders serve one slide per batch through 8 worker processes. No
# `shuffle` argument is passed, so batches follow the row order of the CSV,
# which the later per-row column assignments rely on.
_loader_options = dict(batch_size=1, num_workers=8)

# Labelled validation slides.
validation_data = BreastBiopsy(mapping_file='validation.csv', transform=test_augs)
validation_loader = DataLoader(validation_data, **_loader_options)

# Unlabelled holdout slides used for the contest submission.
testing_data = BreastBiopsyTest(mapping_file='slide-manifest-holdout.csv', transform=test_augs)
testing_loader = DataLoader(testing_data, **_loader_options)

In [6]:
# Run the classifier over every holdout slide and collect per-tile class
# probabilities.
prediction_batches = []

for tiles in testing_loader:
    # `tiles` is a list with one tensor per sampled tile, each carrying a
    # leading batch axis from the default collate function. Stacking on
    # axis 1 and merging the first two axes yields (batch * tiles, C, H, W)
    # in slide-major order. Unlike a bare `.squeeze()`, this stays correct
    # for batch_size > 1 and never drops an unrelated size-1 axis.
    tile_batch = torch.stack(tiles, dim=1).flatten(0, 1)

    with torch.no_grad():
        logits = backbone(tile_batch)
        outputs = soft(logits)

    prediction_batches.append(outputs.cpu().numpy())

# One row of class probabilities per tile, slides in loader order.
predictions_test = np.concatenate(prediction_batches)

In [7]:
# Load the holdout slide manifest as a DataFrame; predictions are attached
# to it row by row further down.
holdout_manifest_path = brca_dir / 'slide-manifest-holdout.csv'
testing_csv = pd.read_csv(holdout_manifest_path)

In [8]:
# predictions_test already is a 2-D array with one row of 5 class
# probabilities per sampled tile; asarray just guarantees the ndarray type.
predictions_test_array = np.asarray(predictions_test)

# Regroup to (slides, 10 tiles per slide, 5 classes). -1 lets NumPy infer the
# slide count instead of hard-coding the size of one particular manifest.
predictions_test_array = predictions_test_array.reshape(-1, 10, 5)

# The averaged rows are assigned to testing_csv positionally, so the slide
# count has to match the manifest exactly.
assert len(predictions_test_array) == len(testing_csv), (
    f"{len(predictions_test_array)} slide predictions for {len(testing_csv)} manifest rows"
)

# Average the tile probabilities within each slide -> (slides, 5).
mean_predictions_test = np.mean(predictions_test_array, axis=1)

In [9]:
# Names of the per-stage probability columns, shared by both steps below.
prob_columns = ['prob_stage_0', 'prob_stage_1', 'prob_stage_2', 'prob_stage_3', 'prob_stage_4']

# Attach the slide-level mean probabilities to the manifest rows (positional).
testing_csv[prob_columns] = mean_predictions_test

# Collapse slides to biopsies by averaging each probability column.
grouped_test = testing_csv.groupby('biopsy_id')[prob_columns].mean()

In [10]:
# Predicted stage per biopsy = position of the largest of the five averaged
# class probabilities.
stage_probabilities = grouped_test[
    ['prob_stage_0', 'prob_stage_1', 'prob_stage_2', 'prob_stage_3', 'prob_stage_4']
].to_numpy()
pred_stage = stage_probabilities.argmax(axis=1)

In [11]:
# Store the arg-max stage index next to the per-class probability columns.
grouped_test['stage_pred'] = pred_stage

In [12]:
# The groupby left `biopsy_id` as the index; move it back into a regular
# column so it is written as the first field of the submission file.
# Guarded so that re-running this cell does not call reset_index() a second
# time and inject a spurious `index` column into the submission.
if 'biopsy_id' not in grouped_test.columns:
    grouped_test = grouped_test.reset_index()

filepath = "submission2.csv"

# Written without header or index column.
# NOTE(review): assumes the contest expects headerless rows of
# biopsy_id, five stage probabilities, predicted stage — confirm.
grouped_test.to_csv(filepath, index=False, header=False)

In [13]:
# Ask the ngsci helper to stop this instance.
# NOTE(review): presumably shuts down the running compute instance — confirm
# against the ngsci documentation. This cell's execution count (In [13]) is
# higher than that of the submission cell that follows (In [2]), so the
# submission appears to have been run in a later session.
ngsci.stop_instance()

(<Result.SUCCESS: 1>, 'success')

In [2]:
# Submit the CSV written by the export cell as a contest entry. The file name
# is repeated literally here rather than read from `filepath`.
ngsci.submit_contest_entry(
    "submission2.csv", description="Submission 2"
)

(<Result.SUCCESS: 1>, 'Success')

In [85]:
# Number of holdout slides (manifest rows); the prediction array must provide
# exactly this many slide-level rows.
testing_csv.shape[0]

14466

In [81]:
# Back up the holdout prediction array to data.csv as comma-separated text.
# NOTE(review): presumably the tile-level probabilities produced by the
# holdout inference loop — confirm which state `predictions_test` held when
# this was run (execution count In [81]).
np.savetxt('data.csv', predictions_test, delimiter=',')

In [47]:
# Run the classifier over every validation slide and collect per-tile class
# probabilities.
prediction_batches = []

for tiles, labels, demos in validation_loader:
    # `labels` and `demos` are not needed for inference, so they are no
    # longer stacked: the results were never used, and torch.stack raises if
    # the collated `race` values are not tensors (e.g. strings).
    #
    # `tiles` is a list with one tensor per sampled tile, each carrying a
    # leading batch axis from the default collate function. Stacking on
    # axis 1 and merging the first two axes yields (batch * tiles, C, H, W)
    # in slide-major order. Unlike a bare `.squeeze()`, this stays correct
    # for batch_size > 1 and never drops an unrelated size-1 axis.
    tile_batch = torch.stack(tiles, dim=1).flatten(0, 1)

    with torch.no_grad():
        logits = backbone(tile_batch)
        outputs = soft(logits)

    prediction_batches.append(outputs.cpu().numpy())

# One row of class probabilities per tile, slides in loader order.
predictions = np.concatenate(prediction_batches)

Exception ignored in: <function _ConnectionBase.__del__ at 0x7fd9fd7a1990>
Traceback (most recent call last):
  File "/home/ngsci/.asdf/installs/python/3.10.9/lib/python3.10/multiprocessing/connection.py", line 132, in __del__
Traceback (most recent call last):
  File "/home/ngsci/.asdf/installs/python/3.10.9/lib/python3.10/multiprocessing/queues.py", line 239, in _feed
    reader_close()
  File "/home/ngsci/.asdf/installs/python/3.10.9/lib/python3.10/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/home/ngsci/.asdf/installs/python/3.10.9/lib/python3.10/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor
    self._close()
  File "/home/ngsci/.asdf/installs/python/3.10.9/lib/python3.10/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor
Exception ignored in: <function _ConnectionBase.__del__ at 0x7fd9fd7a1990>
Traceback (most recent

TypeError: 'BreastBiopsy' object does not support item assignment

    self._target(*self._args, **self._kwargs)
  File "/home/ngsci/.asdf/installs/python/3.10.9/lib/python3.10/multiprocessing/queues.py", line 271, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times
    queue_sem.release()
ValueError: semaphore or lock released too many times


In [65]:
# Make sure we are working with an ndarray of per-tile class probabilities.
predictions_array = np.asarray(predictions)

# Regroup to (slides, 10 tiles per slide, 5 classes). -1 lets NumPy infer the
# slide count instead of hard-coding the size of one particular validation file.
predictions_array = predictions_array.reshape(-1, 10, 5)

# Average the tile probabilities within each slide -> (slides, 5).
mean_predictions = np.mean(predictions_array, axis=1)

In [67]:
# Attach the slide-level mean class probabilities to the validation rows
# (positional: row i of mean_predictions goes to row i of validation_csv).
prediction_columns = ['prediction_0', 'prediction_1', 'prediction_2', 'prediction_3', 'prediction_4']
validation_csv[prediction_columns] = mean_predictions

In [70]:
# Collapse slides to biopsies: average the label column and the five class
# probabilities over all slides that share a biopsy_id.
biopsy_level_columns = [
    'stage_int',
    'prediction_0', 'prediction_1', 'prediction_2', 'prediction_3', 'prediction_4',
]
grouped = validation_csv.groupby('biopsy_id')[biopsy_level_columns].mean()

In [71]:
from sklearn.metrics import roc_auc_score

# Biopsy-level one-vs-rest ROC AUC: true stage against the averaged class
# probabilities.
biopsy_stage = grouped['stage_int']
biopsy_scores = grouped[['prediction_0', 'prediction_1', 'prediction_2', 'prediction_3', 'prediction_4']]
roc_auc_score(biopsy_stage, biopsy_scores, multi_class='ovr')

0.6179475174782653

In [64]:
from sklearn.metrics import roc_auc_score

# Slide-level one-vs-rest ROC AUC: true stage per validation row against the
# slide-averaged class probabilities.
# NOTE(review): the recorded output of this cell is a ValueError saying the
# scores do not sum to 1 over classes. Its execution count (In [64]) precedes
# the cell that computes mean_predictions (In [65]), so the error likely
# reflects an earlier state of that variable — re-run in order to confirm.
roc_auc_score(validation_csv['stage_int'], mean_predictions, multi_class='ovr')

ValueError: Target scores need to be probabilities for multiclass roc_auc, i.e. they should sum up to 1.0 over classes

In [56]:
# Load the validation mapping as a DataFrame for attaching predictions.
# NOTE(review): cells earlier in the file (In [67], In [70], In [64]) already
# use `validation_csv`; this cell must move above them for a clean
# Restart & Run All.
validation_csv = pd.read_csv('validation.csv')

In [57]:
# List the distinct stage labels present in the validation set.
validation_csv['stage_int'].unique()

array([2, 1, 0, 4, 3])

In [50]:
# Number of validation slides (rows in validation.csv).
validation_csv.shape[0]

3547

In [45]:
# Inspect the raw predictions collected so far.
# NOTE(review): this displays the whole object; a slice such as
# predictions[:2] would keep the saved output small.
predictions

[array([[0.13921665, 0.27859384, 0.18984735, 0.3679208 , 0.02442137],
        [0.11886358, 0.2591849 , 0.41398185, 0.18147205, 0.02649759],
        [0.11014639, 0.338629  , 0.21688011, 0.3209696 , 0.01337486],
        [0.08049978, 0.18591887, 0.32845303, 0.3612298 , 0.04389848],
        [0.10968459, 0.2817299 , 0.32332706, 0.23210652, 0.05315192],
        [0.10704922, 0.18278946, 0.2564581 , 0.41714498, 0.0365582 ],
        [0.13280453, 0.26732203, 0.28716496, 0.2608255 , 0.05188296],
        [0.09743032, 0.30762702, 0.3797851 , 0.19496673, 0.02019078],
        [0.0756904 , 0.27659202, 0.27705577, 0.3036023 , 0.06705955],
        [0.01527485, 0.28739876, 0.56771684, 0.09824431, 0.0313652 ]],
       dtype=float32),
 array([[0.030879  , 0.3550534 , 0.45118526, 0.14929941, 0.01358293],
        [0.05000063, 0.35959402, 0.43473846, 0.13185345, 0.02381345],
        [0.0480096 , 0.60008514, 0.26854303, 0.0798969 , 0.00346532],
        [0.05267908, 0.4256208 , 0.39088497, 0.11035256, 0.0204626

In [40]:
# Sanity check of the model input shape after stacking the per-tile tensors
# and dropping size-1 axes. Relies on `tiles` being left over from a loader
# iteration in another cell.
torch.stack(tiles).squeeze().shape

torch.Size([10, 3, 256, 256])

In [6]:
def process_single_slide(slide_id):
    """Classify every extracted patch of one training slide.

    Reads all patch coordinates from ``patches_dir/<slide_id>.h5`` (dataset
    ``'coords'``), cuts a 256x256 level-0 region at each coordinate from
    ``ndpi_dir/<slide_id>.ndpi`` and runs it through ``backbone`` followed by
    the softmax ``soft``.

    Args:
        slide_id: Identifier used to build the ``.h5`` and ``.ndpi`` file names.

    Returns:
        list: One tensor of class probabilities per coordinate, each with a
        leading batch axis of size 1, in the order of the stored coordinates.
    """
    with h5py.File(patches_dir / f'{slide_id}.h5', 'r') as f:
        coords = f['coords'][:]

    preds = []
    # Open the slide once for all tiles instead of once per coordinate, and
    # disable autograd: this is inference only, and without no_grad every
    # stored prediction would keep its whole computation graph alive.
    with OpenSlide(ndpi_dir / f'{slide_id}.ndpi') as slide, torch.no_grad():
        for x in coords:
            tile_img = slide.read_region(
                location=x,
                level=0,
                size=(256, 256)
            )

            tile_tensor = test_augs(tile_img.convert('RGB')).unsqueeze(0)
            preds.append(soft(backbone(tile_tensor)))

    return preds

In [7]:
# Smoke test on the first two validation rows only (nrows=2).
validation = pd.read_csv('validation.csv', nrows=2)

# Run the per-slide routine for each slide_id and keep the returned list of
# per-tile predictions in a new column.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt.
validation['prediction'] = validation['slide_id'].apply(process_single_slide)

KeyboardInterrupt: 