In [1]:
import os
import cv2
import glob
import timm
import random
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, RandomSampler

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from torchmetrics.functional import mean_squared_error, mean_absolute_error

batch_size = 128

# Load the clinical spreadsheet and keep only the patient ID and the kPa_fib
# regression target for rows that pass the AST/ALT filter.
# NOTE(review): `|` keeps a row when EITHER enzyme is below 100 — confirm that
# `&` (both below 100) was not the intended exclusion rule.
df = pd.read_excel("data/US_fibrosis_stage_dataset.xlsx", engine="openpyxl")
df = (
    df[(df["AST"] < 100) | (df["ALT"] < 100)][["ID", "kPa_fib"]]
    .dropna()
    .reset_index(drop=True)
)

# Cap the target at 25 (values >= 25 become exactly 25).
df["kPa_fib"] = df["kPa_fib"].clip(upper=25)

def generate_target(x):
    """Binarise a kPa value: 1 when it is strictly greater than 12, otherwise 0."""
    return int(x > 12)
    
# Binary label derived from the (capped) kPa_fib value; used for stratified
# splitting and class-balanced sampling.
df.loc[:, "target"] = df.kPa_fib.map(generate_target)
# IDs are compared as zero-padded 8-character strings.
df.ID = df.ID.map(lambda x: str(x).zfill(8))

# NOTE(review): `flist` / `id_list` are not used by any later cell visible in
# this notebook, and they split on "_" while the ROI files shown in the output
# are named "<ID>-<n>.jpg" — confirm whether they can be dropped.
flist = os.listdir("data/roi/")
id_list = list(map(lambda x: x.split("_")[0].zfill(8), flist))

# One row per ROI image; the patient ID is the file-name part before the first "-".
image_df = pd.DataFrame(glob.glob(os.path.join("data", "roi", "*.jpg")), columns=["image_path"])
# os.path.basename instead of split("/") so the ID is extracted correctly
# whatever path separator glob returns on the current platform.
image_df.loc[:, "ID"] = image_df.image_path.map(lambda x: os.path.basename(x).split("-")[0])

# Keep only patients that have both a label and at least one image.
df = pd.merge(df, image_df, on="ID", how="inner")
df.head()

Unnamed: 0,ID,kPa_fib,target,image_path
0,266195,3.8,0,data/roi/00266195-0.jpg
1,266195,3.8,0,data/roi/00266195-1.jpg
2,266195,3.8,0,data/roi/00266195-10.jpg
3,266195,3.8,0,data/roi/00266195-11.jpg
4,266195,3.8,0,data/roi/00266195-12.jpg


In [2]:
# Split at patient level (one row per ID) so that all images of a patient land
# in the same partition; both splits are stratified on the binary target.
ids = df[["ID", "target"]].drop_duplicates().reset_index(drop=True)

# 15% of the patients are held out for testing ...
trainval_ids, test_ids = train_test_split(
    ids, test_size=0.15, stratify=ids["target"], random_state=42
)
trainval_ids = trainval_ids.reset_index(drop=True)

# ... and 10% of the remainder is used for validation.
train_ids, valid_ids = train_test_split(
    trainval_ids, test_size=0.1, stratify=trainval_ids["target"], random_state=42
)

train_id, valid_id, test_id = train_ids["ID"], valid_ids["ID"], test_ids["ID"]

# Expand the patient-level partitions back to image-level frames.
train_df, valid_df, test_df = (
    df[df["ID"].isin(split_ids)].reset_index(drop=True)
    for split_ids in (train_id, valid_id, test_id)
)

# Report the number of distinct patients per partition.
for split_label, split_frame in (("Train: ", train_df), ("Valid: ", valid_df), ("Test: ", test_df)):
    print(split_label, split_frame["ID"].nunique())

Train:  902
Valid:  101
Test:  177


In [3]:
from typing import Callable

class ImbalancedDatasetSampler(torch.utils.data.sampler.Sampler):
    """Sample indices with replacement so that every class is drawn equally often.

    Each element receives the weight ``1 / (number of elements sharing its
    label)``; indices are then drawn with ``torch.multinomial``.

    Arguments:
        dataset: the dataset to sample from (only ``len(dataset)`` is required
            when ``labels`` is given)
        labels: optional sequence with one label per entry of ``indices``
        indices: a list of indices; defaults to ``range(len(dataset))``
        num_samples: number of samples to draw per iteration; defaults to
            ``len(indices)``
        callback_get_label: optional function ``f(dataset) -> labels`` used
            when ``labels`` is not given

    Raises:
        NotImplementedError: if no labels can be determined, i.e. ``labels``
            and ``callback_get_label`` are both ``None`` and the dataset has
            no ``df`` frame with a ``"target"`` column.
    """

    def __init__(
        self,
        dataset,
        labels: list = None,
        indices: list = None,
        num_samples: int = None,
        callback_get_label: Callable = None,
    ):
        # if indices is not provided, all elements in the dataset will be considered
        self.indices = list(range(len(dataset))) if indices is None else indices

        # keep the user-supplied callback (may be None); it is only consulted
        # when no explicit labels are passed
        self.callback_get_label = callback_get_label

        # if num_samples is not provided, draw `len(indices)` samples in each iteration
        self.num_samples = len(self.indices) if num_samples is None else num_samples

        # distribution of classes in the dataset
        df = pd.DataFrame()
        df["label"] = self._get_labels(dataset) if labels is None else labels
        df.index = self.indices
        df = df.sort_index()

        label_to_count = df["label"].value_counts()

        # per-element weight = inverse frequency of the element's class
        weights = 1.0 / df["label"].map(label_to_count)

        self.weights = torch.DoubleTensor(weights.to_list())

    def _get_labels(self, dataset):
        """Resolve labels from the callback or from ``dataset.df["target"]``."""
        if self.callback_get_label is not None:
            return self.callback_get_label(dataset)

        frame = getattr(dataset, "df", None)
        if frame is not None and "target" in frame:
            return frame["target"]

        raise NotImplementedError(
            "Cannot infer labels: pass `labels`, pass `callback_get_label`, "
            "or use a dataset exposing a `df` DataFrame with a 'target' column."
        )

    def __iter__(self):
        drawn = torch.multinomial(self.weights, self.num_samples, replacement=True)
        return (self.indices[i] for i in drawn.tolist())

    def __len__(self):
        return self.num_samples
    

def define_augmentation(w, h):
    """Build the albumentations pipelines used for training and evaluation.

    Args:
        w: output image width passed to ``A.Resize``.
        h: output image height passed to ``A.Resize``.

    Returns:
        ``(train_transforms, valid_transforms)``. Both resize, normalise with
        the ``A.Normalize`` defaults and convert to a tensor; the training
        pipeline additionally applies random downscale, horizontal flip,
        affine and brightness/contrast augmentations.
    """
    train_transforms = A.Compose([
        A.Resize(width=w, height=h, p=1.0),
        A.OneOf([
            A.Downscale(),
        ], p=0.5),

        A.HorizontalFlip(p=0.5),

        A.Affine(p=0.8),

        # Exactly one of: brightness+contrast, brightness only, contrast only.
        # The last two replace the deprecated A.RandomBrightness() and
        # A.RandomContrast(), which were RandomBrightnessContrast with the
        # other limit fixed to 0.
        A.OneOf([
            A.RandomBrightnessContrast(),
            A.RandomBrightnessContrast(contrast_limit=0),
            A.RandomBrightnessContrast(brightness_limit=0),
        ], p=0.5),

        A.Normalize(p=1.0),
        ToTensorV2()
    ])

    valid_transforms = A.Compose([
        A.Resize(width=w, height=h, p=1.0),
        A.Normalize(p=1.0),
        ToTensorV2()
    ])

    return train_transforms, valid_transforms


class SonographyDataset(Dataset):
    """Image/target dataset backed by a DataFrame.

    Args:
        df: frame with an ``image_path`` column and a ``kPa_fib`` column.
        transform: callable invoked as ``transform(image=...)`` that returns a
            mapping with an ``"image"`` entry (e.g. an albumentations Compose).
        train_mode: stored on the instance; currently not used by
            ``__getitem__``.
    """

    def __init__(self, df, transform, train_mode=False):
        self.df = df
        self.transform = transform
        self.train_mode = train_mode

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(transformed image, log(kPa_fib) as float32 tensor)``.

        Raises:
            FileNotFoundError: if the image at ``image_path`` cannot be read.
        """
        # Positional access: samplers hand out positions in [0, len), which
        # only coincide with index labels for a freshly reset RangeIndex.
        image_path = self.df["image_path"].iloc[idx]

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {image_path}")
        # NOTE(review): cv2.imread returns channels in BGR order — confirm this
        # is acceptable for the normalisation / pretrained backbone in use.
        image = self.transform(image=image)

        y = self.df["kPa_fib"].iloc[idx]

        # The model is trained on the natural log of the target.
        return image['image'], torch.tensor(y).log().float()
    
    
# All images are resized to 224x224 before being fed to the network.
train_transform, valid_transform = define_augmentation(w=224, h=224)

# Only the training split gets the augmenting transform.
train_dataset = SonographyDataset(train_df, train_transform, train_mode=True)
valid_dataset = SonographyDataset(valid_df, valid_transform)
test_dataset = SonographyDataset(test_df, valid_transform)

# Loader settings shared by all three splits.
loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=16,
    prefetch_factor=10,
    pin_memory=True,
)

# Training batches are drawn class-balanced (by the binary target) with replacement.
train_sampler = ImbalancedDatasetSampler(train_dataset, labels=train_dataset.df.target)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, **loader_kwargs)

# Validation and test are iterated sequentially, without resampling.
valid_dataloader = DataLoader(valid_dataset, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, **loader_kwargs)



In [4]:
class KpaPredictor(pl.LightningModule):
    """LightningModule that trains a backbone to regress one scalar per image.

    The wrapped ``model`` must map an image batch to a tensor whose trailing
    dimension can be squeezed away. Training minimises the L1 loss between the
    predictions and the targets delivered by the dataloader; RMSE is logged as
    an additional metric.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x):
        return self.model(x)

    def step(self, batch):
        """Run the model on an ``(x, y)`` batch.

        Returns:
            ``(preds, loss, rmse)`` where ``loss`` is the L1 loss and ``rmse``
            is the square root of the batch mean squared error.
        """
        # x: image, y: kpa
        x, y = batch
        preds = torch.squeeze(self(x), -1)

        loss = F.l1_loss(preds, y)
        rmse = torch.sqrt(mean_squared_error(preds, y))

        return preds, loss, rmse

    def _log_metrics(self, stage, loss, rmse):
        """Log ``<stage>_loss`` and ``<stage>_rmse`` once per epoch."""
        self.log(f'{stage}_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log(f'{stage}_rmse', rmse, on_step=False, on_epoch=True, prog_bar=True)

    def training_step(self, batch, batch_idx):
        _, loss, rmse = self.step(batch)
        self._log_metrics('train', loss, rmse)

        return loss

    def validation_step(self, batch, batch_idx):
        _, loss, rmse = self.step(batch)
        self._log_metrics('valid', loss, rmse)

    def test_step(self, batch, batch_idx):
        _, loss, rmse = self.step(batch)
        self._log_metrics('test', loss, rmse)

    def predict_step(self, batch, batch_idx):
        """Return predictions only.

        Accepts either a bare image tensor or a list/tuple whose first element
        is the image batch, so no target is required and extra batch fields
        (IDs, file names, ...) are ignored.
        """
        x = batch[0] if isinstance(batch, (list, tuple)) else batch

        return torch.squeeze(self(x), -1)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=5e-4)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

        return {"optimizer": optimizer, "lr_scheduler": scheduler}

    def lr_scheduler_step(self, scheduler, optimizer_idx, metric):
        # NOTE(review): passing `epoch` to `scheduler.step` is believed to be
        # deprecated in recent PyTorch releases — confirm, and consider a plain
        # `scheduler.step()` when retraining (this shifts the schedule).
        scheduler.step(epoch=self.current_epoch)

    
    
# Keep the three checkpoints with the lowest validation loss.
callbacks = [
    ModelCheckpoint(monitor='valid_loss', save_top_k=3, dirpath='weights/ResNet152_regression', filename='kpa_predictor-{epoch:03d}-{valid_loss:.4f}-{valid_rmse:.4f}'),
]


# Pretrained ResNet-152 from timm with a single-output head for regression.
model = timm.create_model("resnet152", num_classes=1, pretrained=True)
kpa_predictor = KpaPredictor(model)

# `accelerator="gpu", devices=[1]` is the non-deprecated spelling of
# `gpus=[1]`: train on the GPU with index 1, using 16-bit mixed precision.
trainer = pl.Trainer(max_epochs=50, accelerator="gpu", devices=[1],
                     enable_progress_bar=True,
                     callbacks=callbacks, precision=16)

  rank_zero_deprecation(
Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Fit the model on train_dataloader, using valid_dataloader for validation.
trainer.fit(kpa_predictor, train_dataloader, valid_dataloader)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name  | Type   | Params
---------------------------------
0 | model | ResNet | 58.1 M
---------------------------------
58.1 M    Trainable params
0         Non-trainable params
58.1 M    Total params
116.292   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]



Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [6]:
# Checkpoint selected for evaluation (file name encodes epoch and validation metrics).
ckpt_fname = "kpa_predictor-epoch=025-valid_loss=0.4188-valid_rmse=0.5472.ckpt"
# load_from_checkpoint is a classmethod: call it on the class, not on an
# existing instance. `model` is forwarded to KpaPredictor.__init__.
kpa_predictor = KpaPredictor.load_from_checkpoint(
    os.path.join("weights", "ResNet152_regression", ckpt_fname), model=model
)

trainer.test(kpa_predictor, test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           0.4280899465084076
        test_rmse           0.5596672296524048
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.4280899465084076, 'test_rmse': 0.5596672296524048}]

In [7]:
from tqdm import tqdm

class PredictDataset(Dataset):
    """Inference dataset that also returns the patient ID and file name.

    Args:
        df: frame with ``ID``, ``image_path`` and the target column.
        transform: callable invoked as ``transform(image=...)`` that returns a
            mapping with an ``"image"`` entry.
        target_col: name of the column holding the reference value
            (defaults to ``"kPa_fib"``).
    """

    def __init__(self, df, transform, target_col="kPa_fib"):
        self.df = df
        self.transform = transform
        self.target_col = target_col

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, log(target) as float32 tensor, ID, image_path)``.

        Raises:
            FileNotFoundError: if the image cannot be read.
        """
        # Positional access so the dataset does not depend on a RangeIndex.
        ID = self.df["ID"].iloc[idx]
        fname = self.df["image_path"].iloc[idx]

        image = cv2.imread(fname)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {fname}")
        image = self.transform(image=image)

        y = self.df[self.target_col].iloc[idx]

        return image['image'], torch.tensor(y).log().float(), ID, fname
    
# Run inference on every validation and test image.
predict_df = pd.concat([valid_df, test_df], axis=0).reset_index(drop=True)

predict_dataset = PredictDataset(predict_df, valid_transform)
predict_dataloader = DataLoader(predict_dataset, batch_size=1)

results = []

kpa_predictor.model.eval()
kpa_predictor.model.to("cuda")

# Inference only: disable autograd so no graph is built per image.
with torch.no_grad():
    for batch in predict_dataloader:
        image, y, ID, fname = batch

        pred = kpa_predictor(image.to("cuda"))
        # The network predicts log(kPa); exponentiate to get back to kPa.
        results.append([ID[0], np.exp(pred.detach().to("cpu").numpy()[0][0])])

results = pd.DataFrame(results, columns=['ID', 'pred'])
# Keep the first three predictions per patient (in dataframe order), then list
# them per ID sorted from largest to smallest.
# NOTE(review): this is "first three images", not "three largest predictions";
# the column expansion below also assumes at least one patient has three images.
results = results.groupby("ID").head(3).sort_values(["ID", "pred"], ascending=False).groupby("ID").agg(list).reset_index()
results = pd.concat([results['ID'], pd.DataFrame(results['pred'].tolist(), columns=['v1', 'v2', 'v3'])], axis=1)

# Clinical covariates live in a separate name so the image-level `df` built in
# the first cell is not overwritten.
clinical_df = pd.read_excel("data/US_fibrosis_stage_dataset.xlsx", engine="openpyxl")
clinical_df = clinical_df.loc[:, ["ID", "age", "AST", "ALT", "PLT"]]
clinical_df.ID = clinical_df.ID.map(lambda x: str(x).zfill(8))

# Collapse to one row per patient: reference kPa, the three predictions, covariates.
predict_df = pd.merge(predict_df, clinical_df, on="ID", how="left")
predict_df = pd.merge(predict_df, results, on="ID", how="left")
predict_df = predict_df.loc[:, ["ID", "kPa_fib", "v1", "v2", "v3", "age", "AST", "ALT", "PLT"]]
predict_df = predict_df.drop_duplicates().reset_index(drop=True)

predict_df.head(40)

# predict_df.to_csv("data/fibroscan_predict_df.csv", index=False)

Unnamed: 0,ID,kPa_fib,v1,v2,v3,age,AST,ALT,PLT
0,883439,7.3,20.191383,5.098398,4.340386,59,42.0,22.0,158.0
1,1034390,5.1,6.011755,5.820227,4.723373,72,25.0,18.0,139.0
2,1062934,11.7,4.694145,4.607368,4.416852,72,34.0,21.0,139.0
3,1067391,8.9,9.413785,8.260871,5.150177,62,55.0,47.0,259.0
4,1092354,3.3,4.840446,4.682412,4.656144,48,22.0,13.0,199.0
5,1131003,5.8,6.161881,4.332696,4.260494,59,22.0,12.0,236.0
6,1166636,10.7,8.23795,5.574651,4.677879,65,53.0,13.0,63.0
7,1168622,7.1,4.90064,4.89407,4.821332,75,26.0,11.0,205.0
8,1191410,10.4,5.126828,4.716825,4.13588,62,33.0,28.0,153.0
9,1193976,4.4,9.86424,4.744425,4.319278,52,26.0,24.0,180.0


In [8]:
# Write the prediction table to CSV without the index column.
predict_df.to_csv("data/fibroscan_predict_df.csv", index=False)

In [9]:
# Patients with an MRE stiffness value (kPa_mre), keyed by zero-padded ID.
mre_df = pd.read_excel("data/US_fibrosis_stage_dataset.xlsx", engine="openpyxl")
mre_df = mre_df.loc[:, ["ID", "kPa_mre"]].dropna().reset_index(drop=True)
mre_df.ID = mre_df.ID.map(lambda x: str(x).zfill(8))
# NOTE(review): `flist` / `id_list` are not used by any later cell visible in
# this notebook, and they split on "_" while the files shown in the output are
# named "<ID>-<n>.jpg" — confirm whether they can be dropped.
flist = os.listdir("data/roi_sampled/")
id_list = list(map(lambda x: x.split("_")[0].zfill(8), flist))

# One row per sampled ROI image; the ID is the file-name part before the first "-".
image_df = pd.DataFrame(glob.glob(os.path.join("data", "roi_sampled", "*.jpg")), columns=["image_path"])
# os.path.basename instead of split("/") so the ID is extracted correctly
# whatever path separator glob returns on the current platform.
image_df.loc[:, "ID"] = image_df.image_path.map(lambda x: os.path.basename(x).split("-")[0])

mre_df = pd.merge(mre_df, image_df, on="ID", how="inner")
mre_df.head()


Unnamed: 0,ID,kPa_mre,image_path
0,8960,4.34,data/roi_sampled/00008960-6.jpg
1,8960,4.34,data/roi_sampled/00008960-0.jpg
2,8960,4.34,data/roi_sampled/00008960-1.jpg
3,8960,4.34,data/roi_sampled/00008960-10.jpg
4,8960,4.34,data/roi_sampled/00008960-11.jpg


In [12]:
class PredictDataset(Dataset):
    """Inference dataset that also returns the patient ID and file name.

    Args:
        df: frame with ``ID``, ``image_path`` and the target column.
        transform: callable invoked as ``transform(image=...)`` that returns a
            mapping with an ``"image"`` entry.
        target_col: name of the column holding the reference value
            (defaults to ``"kPa_mre"``).
    """

    def __init__(self, df, transform, target_col="kPa_mre"):
        self.df = df
        self.transform = transform
        self.target_col = target_col

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, log(target) as float32 tensor, ID, image_path)``.

        Raises:
            FileNotFoundError: if the image cannot be read.
        """
        # Positional access so the dataset does not depend on a RangeIndex.
        ID = self.df["ID"].iloc[idx]
        fname = self.df["image_path"].iloc[idx]

        image = cv2.imread(fname)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {fname}")
        image = self.transform(image=image)

        y = self.df[self.target_col].iloc[idx]

        return image['image'], torch.tensor(y).log().float(), ID, fname

# Run the trained model on every sampled ROI image of the MRE cohort.
mre_dataset = PredictDataset(mre_df, valid_transform)
mre_dataloader = DataLoader(mre_dataset, batch_size=1)

results = []

kpa_predictor.model.eval()
kpa_predictor.model.to("cuda")

# Inference only: disable autograd so no graph is built per image.
with torch.no_grad():
    for batch in mre_dataloader:
        image, y, ID, fname = batch

        pred = kpa_predictor(image.to("cuda"))
        # The network predicts log(kPa); exponentiate to get back to kPa.
        results.append([ID[0], np.exp(pred.detach().to("cpu").numpy()[0][0])])

results = pd.DataFrame(results, columns=['ID', 'pred'])
# Keep the first three predictions per patient (in dataframe order), then list
# them per ID sorted from largest to smallest.
# NOTE(review): this is "first three images", not "three largest predictions";
# the column expansion below also assumes at least one patient has three images.
results = results.groupby("ID").head(3).sort_values(["ID", "pred"], ascending=False).groupby("ID").agg(list).reset_index()
results = pd.concat([results['ID'], pd.DataFrame(results['pred'].tolist(), columns=['v1', 'v2', 'v3'])], axis=1)

# Clinical covariates live in a separate name so the image-level `df` built in
# the first cell is not overwritten.
clinical_df = pd.read_excel("data/US_fibrosis_stage_dataset.xlsx", engine="openpyxl")
clinical_df = clinical_df.loc[:, ["ID", "age", "AST", "ALT", "PLT"]]
clinical_df.ID = clinical_df.ID.map(lambda x: str(x).zfill(8))

# Collapse to one row per patient: reference kPa_mre, the three predictions, covariates.
mre_df = pd.merge(mre_df, clinical_df, on="ID", how="left")
mre_df = pd.merge(mre_df, results, on="ID", how="left")
mre_df = mre_df.loc[:, ["ID", "kPa_mre", "v1", "v2", "v3", "age", "AST", "ALT", "PLT"]]
mre_df = mre_df.drop_duplicates().reset_index(drop=True)

mre_df.to_csv("data/mre_predict_df.csv", index=False)
