In [35]:
import os
import cv2
import glob
import timm
import random
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from torchmetrics.functional import mean_squared_error, mean_absolute_error

batch_size = 128

df = pd.read_excel("data/US_fibrosis_stage_dataset.xlsx", engine="openpyxl")
# only HBV patients
# df = df[df.Etiology == 1].reset_index(drop=True)
df = df.loc[(df.AST < 100) | (df.ALT < 100)].reset_index(drop=True)
df = df.loc[:, ["ID", "kPa_fib"]].dropna().reset_index(drop=True)
df.kPa_fib = df.kPa_fib.map(lambda x: 30 if x >= 30 else x)

df.ID = df.ID.map(lambda x: str(x).zfill(8))

flist = os.listdir("data/roi_sampled/")
id_list = list(map(lambda x: x.split("_")[0].zfill(8), flist))

image_df = pd.DataFrame(glob.glob(os.path.join("data", "roi_sampled", "*.jpg")), columns=["image_path"])
image_df.loc[:, "ID"] = image_df.image_path.map(lambda x: x.split("/")[-1].split("-")[0])

df = pd.merge(df, image_df, on="ID", how="inner")
df.head()

Unnamed: 0,ID,kPa_fib,image_path
0,266195,3.8,data/roi_sampled/00266195-0.jpg
1,266195,3.8,data/roi_sampled/00266195-1.jpg
2,266195,3.8,data/roi_sampled/00266195-10.jpg
3,266195,3.8,data/roi_sampled/00266195-11.jpg
4,266195,3.8,data/roi_sampled/00266195-12.jpg


In [36]:
ids = df.ID.drop_duplicates().reset_index(drop=True)

train_id, test_id = train_test_split(ids, test_size=0.15, random_state=42)
train_id, valid_id = train_test_split(train_id, test_size=0.15, random_state=42)

train_df = df[df.ID.isin(train_id)].reset_index(drop=True)
valid_df = df[df.ID.isin(valid_id)].reset_index(drop=True)
test_df = df[df.ID.isin(test_id)].reset_index(drop=True)

print("Train: ", len(train_df.ID.drop_duplicates()))
print("Valid: ", len(valid_df.ID.drop_duplicates()))
print("Test: ", len(test_df.ID.drop_duplicates()))

Train:  852
Valid:  151
Test:  177


In [3]:
def define_augmentation(w, h):
    train_transforms = A.Compose([ 
        A.Resize(width=w, height=h, p=1.0),
        A.OneOf([
            A.Downscale(),
        ], p=0.5),        
        
        A.HorizontalFlip(p=0.5),
        
        A.Affine(p=0.8),
        
        A.OneOf([
            A.RandomBrightnessContrast(),
            A.RandomBrightness(),
            A.RandomContrast()
        ], p=0.5),
        
        A.Normalize(p=1.0),
        ToTensorV2()
    ])

    valid_transforms = A.Compose([ 
        A.Resize(width=w, height=h, p=1.0),
        A.Normalize(p=1.0),
        ToTensorV2()
    ])

    return train_transforms, valid_transforms


class SonographyDataset(Dataset):
    def __init__(self, df, transform, train_mode=False):
        self.df = df
        self.transform = transform
        self.train_mode = train_mode
        
        
    def __len__(self):
        return len(self.df)

    
    def __getitem__(self, idx):
        image = cv2.imread(self.df.loc[idx, "image_path"])
        image = self.transform(image=image)
        
        y = self.df.loc[idx, "kPa_fib"]
        if self.train_mode:
            y += np.random.rand(1)[0] - 0.5

        return image['image'], torch.tensor(y).log().float()
    
    
train_transform, valid_transform = define_augmentation(w=224, h=224)

train_dataset = SonographyDataset(train_df, train_transform, train_mode=True)
valid_dataset = SonographyDataset(valid_df, valid_transform)
test_dataset = SonographyDataset(test_df, valid_transform)


train_dataloader = DataLoader(train_dataset, batch_size=batch_size, 
                              num_workers=14, prefetch_factor=10,
                              pin_memory=True)

valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, 
                              num_workers=14, prefetch_factor=10,
                              pin_memory=True)

test_dataloader = DataLoader(test_dataset, batch_size=batch_size, 
                              num_workers=14, prefetch_factor=10,
                              pin_memory=True)



In [4]:
class KpaPredictor(pl.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = model
        
        
    def forward(self, x):
        return self.model(x)
    
    
    def step(self, batch):
        # x: image, y: kpa
        x, y = batch
        preds = torch.squeeze(self(x), -1)
        loss = F.l1_loss(preds, y)
        mse = mean_squared_error(preds, y)
        
        return preds, loss, mse
    
    
    def training_step(self, batch, batch_idx):
        _, loss, mse = self.step(batch)
        
        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log('train_mse', mse, on_step=False, on_epoch=True, prog_bar=True)
        
        return loss
    
    
    def validation_step(self, batch, batch_idx):
        _, loss, mse = self.step(batch)
        
        self.log('valid_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log('valid_mse', mse, on_step=False, on_epoch=True, prog_bar=True)
        
    
    def test_step(self, batch, batch_idx):
        _, loss, mse = self.step(batch)
        
        self.log('test_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log('test_mse', mse, on_step=False, on_epoch=True, prog_bar=True)
    
    
    def predict_step(self, batch, batch_idx):
        preds, _, _ = self.step(batch)
        
        return preds

    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=6e-4)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=30)
        
        return {"optimizer": optimizer, "lr_scheduler": scheduler}
    
    
    def lr_scheduler_step(self, scheduler, optimizer_idx, metric):
        scheduler.step(epoch=self.current_epoch)

    
    
callbacks = [
    ModelCheckpoint(monitor='valid_loss', save_top_k=3, dirpath='weights/regression_res152_wo_bound', filename='kpa_predictor-{epoch:03d}-{valid_loss:.4f}-{valid_mse:.4f}'),
]


model = timm.create_model("resnet152", num_classes=1, pretrained=True)
kpa_predictor = KpaPredictor(model)

trainer = pl.Trainer(max_epochs=100, gpus=[1], 
                     enable_progress_bar=True, 
                     callbacks=callbacks, precision=16)

  rank_zero_deprecation(
Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [5]:
trainer.fit(kpa_predictor, train_dataloader, valid_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name  | Type   | Params
---------------------------------
0 | model | ResNet | 58.1 M
---------------------------------
58.1 M    Trainable params
0         Non-trainable params
58.1 M    Total params
116.292   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]



Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [6]:
ckpt_fname = "kpa_predictor-epoch=027-valid_loss=0.4691-valid_mse=0.3680.ckpt"
kpa_predictor = kpa_predictor.load_from_checkpoint("weights/regression_res152_wo_bound/" + ckpt_fname, model=model)

trainer.test(kpa_predictor, test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           0.4161258935928345
        test_mse            0.3028821349143982
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.4161258935928345, 'test_mse': 0.3028821349143982}]

In [54]:
from tqdm import tqdm

class PredictDataset(Dataset):
    def __init__(self, df, transform):
        self.df = df
        self.transform = transform
        
        
    def __len__(self):
        return len(self.df)

    
    def __getitem__(self, idx):
        ID = self.df.loc[idx, "ID"]
        fname = self.df.loc[idx, "image_path"]
        
        image = cv2.imread(self.df.loc[idx, "image_path"])
        image = self.transform(image=image)
        
        y = self.df.loc[idx, "kPa_fib"]
        
        return image['image'], torch.tensor(y).log().float(), ID, fname
    
predict_df = pd.concat([valid_df, test_df], axis=0).reset_index(drop=True)
    
predict_dataset = PredictDataset(predict_df, valid_transform)
predict_dataloader = DataLoader(predict_dataset, batch_size=1)

results = []

kpa_predictor.model.eval()
kpa_predictor.model.to("cuda")

for batch in predict_dataloader:
    image, y, ID, fname = batch
    
    pred = kpa_predictor(image.to("cuda"))
    results.append([ID[0], np.exp(pred.detach().to("cpu").numpy()[0][0])])
    
results = pd.DataFrame(results, columns=['ID', 'pred'])
results = results.groupby("ID").head(5).sort_values(["ID", "pred"], ascending=False).groupby("ID").agg(list).reset_index()
results = pd.concat([results['ID'], pd.DataFrame(results['pred'].tolist(), columns=['v1', 'v2', 'v3', 'v4', 'v5'])], axis=1)

df = pd.read_excel("data/US_fibrosis_stage_dataset.xlsx", engine="openpyxl")
df = df.loc[:, ["ID", "age", "AST", "ALT", "PLT"]]
df.ID = df.ID.map(lambda x: str(x).zfill(8))

predict_df = pd.merge(predict_df, df, on="ID", how="left")
predict_df = pd.merge(predict_df, results, on="ID", how="left")
predict_df = predict_df.loc[:, ["ID", "kPa_fib", "v1", "v2", "v3", "v4", "v5", "age", "AST", "ALT", "PLT"]]
predict_df = predict_df.drop_duplicates().reset_index(drop=True)

predict_df.to_csv("data/fibroscan_predict_df.csv", index=False)

Unnamed: 0,ID,kPa_fib,v1,v2,v3,v4,v5,age,AST,ALT,PLT
0,652843,12.0,7.471465,7.395196,6.274761,5.935843,5.564222,79,31.0,17.0,139.0
1,1061554,21.3,12.710855,9.921928,7.731797,7.617371,6.973564,74,69.0,19.0,19.0
2,1100790,4.5,9.618956,6.810647,6.08664,5.637625,5.123699,69,51.0,62.0,164.0
3,1122103,5.3,7.335764,6.027724,5.289991,4.760654,4.748064,46,27.0,37.0,153.0
4,1165085,4.7,7.502084,6.94823,6.869021,5.553672,5.308191,57,50.0,37.0,274.0
5,1168622,7.1,6.309327,5.989295,5.768185,5.151645,5.119466,75,26.0,11.0,205.0
6,1297629,12.6,12.590362,7.559417,7.504156,6.276116,5.218731,48,31.0,23.0,86.0
7,1305221,7.4,5.608263,5.448699,5.399424,5.378242,4.181996,33,40.0,64.0,289.0
8,1332746,18.8,9.591213,8.712101,6.336744,5.714222,5.060771,60,43.0,66.0,119.0
9,1380232,2.3,5.313112,5.091123,4.959963,4.882387,4.582383,53,28.0,14.0,238.0


In [59]:
mre_df = pd.read_excel("data/US_fibrosis_stage_dataset.xlsx", engine="openpyxl")
mre_df = mre_df.loc[:, ["ID", "kPa_mre"]].dropna().reset_index(drop=True)
mre_df.ID = mre_df.ID.map(lambda x: str(x).zfill(8))
flist = os.listdir("data/roi_sampled/")
id_list = list(map(lambda x: x.split("_")[0].zfill(8), flist))

image_df = pd.DataFrame(glob.glob(os.path.join("data", "roi_sampled", "*.jpg")), columns=["image_path"])
image_df.loc[:, "ID"] = image_df.image_path.map(lambda x: x.split("/")[-1].split("-")[0])

mre_df = pd.merge(mre_df, image_df, on="ID", how="inner")
mre_df.head()


Unnamed: 0,ID,kPa_mre,image_path
0,8960,4.34,data/roi_sampled/00008960-6.jpg
1,8960,4.34,data/roi_sampled/00008960-0.jpg
2,8960,4.34,data/roi_sampled/00008960-1.jpg
3,8960,4.34,data/roi_sampled/00008960-10.jpg
4,8960,4.34,data/roi_sampled/00008960-11.jpg


In [62]:
class PredictDataset(Dataset):
    def __init__(self, df, transform):
        self.df = df
        self.transform = transform
        
        
    def __len__(self):
        return len(self.df)

    
    def __getitem__(self, idx):
        ID = self.df.loc[idx, "ID"]
        fname = self.df.loc[idx, "image_path"]
        
        image = cv2.imread(self.df.loc[idx, "image_path"])
        image = self.transform(image=image)
        
        y = self.df.loc[idx, "kPa_mre"]
        
        return image['image'], torch.tensor(y).log().float(), ID, fname

mre_dataset = PredictDataset(mre_df, valid_transform)
mre_dataloader = DataLoader(mre_dataset, batch_size=1)

results = []

kpa_predictor.model.eval()
kpa_predictor.model.to("cuda")

for batch in mre_dataloader:
    image, y, ID, fname = batch
    
    pred = kpa_predictor(image.to("cuda"))
    results.append([ID[0], np.exp(pred.detach().to("cpu").numpy()[0][0])])
    
results = pd.DataFrame(results, columns=['ID', 'pred'])
results = results.groupby("ID").head(5).sort_values(["ID", "pred"], ascending=False).groupby("ID").agg(list).reset_index()
results = pd.concat([results['ID'], pd.DataFrame(results['pred'].tolist(), columns=['v1', 'v2', 'v3', 'v4', 'v5'])], axis=1)

df = pd.read_excel("data/US_fibrosis_stage_dataset.xlsx", engine="openpyxl")
df = df.loc[:, ["ID", "age", "AST", "ALT", "PLT"]]
df.ID = df.ID.map(lambda x: str(x).zfill(8))

mre_df = pd.merge(mre_df, df, on="ID", how="left")
mre_df = pd.merge(mre_df, results, on="ID", how="left")
mre_df = mre_df.loc[:, ["ID", "kPa_mre", "v1", "v2", "v3", "v4", "v5", "age", "AST", "ALT", "PLT"]]
mre_df = mre_df.drop_duplicates().reset_index(drop=True)

mre_df.to_csv("data/mre_predict_df.csv", index=False)
