# Notes
Added a pre-converted JPEG dataset to this kernel, taken from https://www.kaggle.com/vaillant/discussion; see that discussion for an explanation of what the RGB channels contain.<BR>
Starter code is from https://www.kaggle.com/orkatz2/pulmonary-embolism-pytorch-train

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import torch
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
import torchvision
from torchvision import models
from torch.utils.data import Dataset,DataLoader
import cv2
import albumentations as albu
import functools
import glob
from albumentations.pytorch.transforms import ToTensorV2

from tqdm.auto import tqdm
import gc

ModuleNotFoundError: No module named 'albumentations'

# Data

In [3]:
#get the data
train_df = pd.read_csv("../input/rsna-str-pulmonary-embolism-detection/train.csv")
# test_df = pd.read_csv("../input/rsna-str-pulmonary-embolism-detection/test.csv")
# train.head()
# len(train_df.values)

In [4]:
from snippets import reduce_mem_usage
train_df = reduce_mem_usage(train_df)

Memory usage of dataframe is 232.24 MB
Memory usage after optimization is: 131.97 MB
Decreased by 43.2%


In [6]:
# Locations of the competition data and of the pre-converted JPEG slices.
PATH = '../input/rsna-str-pulmonary-embolism-detection'
PATH_TRAIN = f"{PATH}/train/"
jpeg_dir = '../input/rsna-str-pe-detection-jpeg-256/train-jpegs'

# Create Dataset

In [7]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dev

device(type='cuda')

In [8]:
def get_training_augmentation(y=256,x=256):
    """Build the augmentation pipeline used for training images.

    The pipeline applies, in order: a random brightness/contrast change
    (p=0.3), a vertical flip (p=0.5), a horizontal flip (p=0.5), a
    downscale that is always applied (p=1.0, scale between 0.35 and 0.75)
    and finally ``albu.Resize(y, x)``.

    Args:
        y: First argument forwarded to ``albu.Resize``.
        x: Second argument forwarded to ``albu.Resize``.

    Returns:
        An ``albu.Compose`` wrapping the steps above.
    """
    steps = []
    steps.append(albu.RandomBrightnessContrast(p=0.3))
    steps.append(albu.VerticalFlip(p=0.5))
    steps.append(albu.HorizontalFlip(p=0.5))
    steps.append(albu.Downscale(p=1.0, scale_min=0.35, scale_max=0.75))
    steps.append(albu.Resize(y, x))
    return albu.Compose(steps)


# Normalisation settings that are unpacked as keyword arguments into
# preprocess_input (via functools.partial further down in the notebook).
formatted_settings = dict(
    input_size=[3, 224, 224],
    input_range=[0, 1],
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

def preprocess_input(x, mean=None, std=None, input_space="RGB", input_range=None, **kwargs):
    """Normalise an image array.

    Steps, each applied only when its argument asks for it:

    1. ``input_space == "BGR"``: reverse the order of the last axis.
    2. ``input_range`` given, its upper bound is 1 and ``x.max() > 1``:
       divide by 255.
    3. ``mean`` given: subtract it (broadcast against ``x``).
    4. ``std`` given: divide by it (broadcast against ``x``).

    Args:
        x: Array-like image; indexed with ``x[..., ::-1]`` and ``x.max()``.
        mean: Optional sequence subtracted from ``x``.
        std: Optional sequence ``x`` is divided by.
        input_space: ``"BGR"`` triggers the last-axis reversal.
        input_range: Optional ``[low, high]`` pair; only ``high`` is read.
        **kwargs: Accepted and ignored, so extra settings can be passed.

    Returns:
        The normalised array.
    """
    if input_space == "BGR":
        # Reverse the last axis and materialise the result as a new array.
        x = x[..., ::-1].copy()
        gc.collect()

    # Short-circuit order matches the nested checks: range first, then data.
    wants_unit_range = input_range is not None and x.max() > 1 and input_range[1] == 1
    if wants_unit_range:
        x = x / 255.0

    if mean is not None:
        x = x - np.array(mean)

    if std is not None:
        x = x / np.array(std)

    return x

def get_preprocessing(preprocessing_fn):
    """Build the preprocessing pipeline applied after augmentation.

    Args:
        preprocessing_fn: Callable applied to the image by the first
            ``albu.Lambda`` step.

    Returns:
        An ``albu.Compose`` of two ``albu.Lambda`` steps: ``preprocessing_fn``
        on the image, then ``to_tensor`` on both image and mask.
    """
    normalise_step = albu.Lambda(image=preprocessing_fn)
    layout_step = albu.Lambda(image=to_tensor, mask=to_tensor)
    return albu.Compose([normalise_step, layout_step])

def get_validation_augmentation(y=256,x=256):
    """Build the validation pipeline: a single ``albu.Resize(y, x)`` step.

    No padding or random augmentation is added here.

    Args:
        y: First argument forwarded to ``albu.Resize``.
        x: Second argument forwarded to ``albu.Resize``.

    Returns:
        An ``albu.Compose`` containing only the resize step.
    """
    return albu.Compose([albu.Resize(y, x)])

def to_tensor(x, **kwargs):
    """Move the last axis of a 3-D array to the front and cast to float32.

    Args:
        x: Object exposing ``transpose(2, 0, 1)`` and ``astype`` (e.g. a
            NumPy array).
        **kwargs: Accepted and ignored.

    Returns:
        The transposed array with dtype ``float32``.
    """
    channels_first = x.transpose(2, 0, 1)
    return channels_first.astype('float32')


class CTDataset2D(Dataset):
    """Dataset of single JPEG slices with their label vector.

    Each row of the backing array is read positionally: ``row[0]`` and
    ``row[1]`` are directory levels under ``jpeg_dir``, ``row[2]`` is the
    suffix of the JPEG file name, and ``row[3:]`` are the labels. Column 3 is
    the flag used for class balancing (``pe_present_on_image`` according to
    the original comment in this class).

    Args:
        df: DataFrame whose ``.values`` follow the layout above.
        transforms: Augmentation callable invoked as ``transforms(image=img)``;
            a falsy value disables it. NOTE(review): the default contains a
            ``HorizontalFlip`` even in ``'val'`` mode — pass the pipeline
            explicitly if that is not wanted.
        preprocessing: Optional callable invoked as ``preprocessing(image=img)``.
        size: Stored on the instance; not used by this class.
        mode: ``'val'`` uses every row; any other value builds a balanced
            subset via ``generate_balanced_set``.
    """

    def __init__(self,df,transforms = albu.Compose([albu.HorizontalFlip()]),preprocessing=None,size=256,mode='val'):

        # Work on a NumPy view of the DataFrame for fast positional access.
        self.df_main = df.values

        # Either use all rows as given (validation) or a balanced subset.
        if mode=='val':
            self.df = self.df_main
        else:
            self.generate_balanced_set()

        self.transforms = transforms
        self.preprocessing = preprocessing
        self.size=size


    def __getitem__(self, idx):
        """Return ``(image, labels)`` for row ``idx``.

        Raises:
            FileNotFoundError: No JPEG matches the row's path pattern.
            OSError: The matched file could not be read by ``cv2.imread``.
        """
        row = self.df[idx]
        pattern = f"{jpeg_dir}/{row[0]}/{row[1]}/*{row[2]}.jpg"
        matches = glob.glob(pattern)
        if not matches:
            # Fail with the offending pattern instead of a bare IndexError.
            raise FileNotFoundError(f"No JPEG found for pattern {pattern!r}")
        img = cv2.imread(matches[0])
        if img is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise OSError(f"cv2.imread could not read {matches[0]!r}")

        # astype() copies, so the masking below never touches self.df.
        label = row[3:].astype(int)
        if label[0] != 1:
            # When the first label is not set, zero every label from index 2 on.
            label[2:] = 0

        if self.transforms:
            img = self.transforms(image=img)['image']
        if self.preprocessing:
            img = self.preprocessing(image=img)['image']
        return img,torch.from_numpy(label.reshape(-1))

    def __len__(self):
        """Return the number of rows currently in use."""
        return len(self.df)

    #this function gets a balanced set, 1/2 have pe_present_on_image=1, 1/2 have pe_present_on_image=0
    #note that we discard a bunch of images that have no pe present
    def generate_balanced_set(self):
        """Rebuild ``self.df`` from all column-3 positives plus as many negatives.

        Negatives are shuffled with ``np.random.shuffle`` before truncation,
        so each call draws a different negative subset. Boolean indexing
        returns copies, so ``self.df_main`` is left untouched.
        """
        flag = self.df_main[:,3]
        df0 = self.df_main[flag==0]
        df1 = self.df_main[flag==1]
        np.random.shuffle(df0)
        self.df = np.concatenate([df0[:len(df1)],df1],axis=0)
        

def norm(img):
    """Min-max scale ``img`` to the range [0, 1] without modifying the input.

    Args:
        img: Array-like supporting ``min()``, ``max()``, subtraction and
            division (e.g. a NumPy array).

    Returns:
        A new array ``(img - img.min()) / (img.max() - img.min())``. When every
        element is equal the result is all zeros instead of NaN.
    """
    # Subtract into a new array so the caller's data is left intact.
    shifted = img - img.min()
    peak = shifted.max()
    # A constant image has zero range; divide by 1 to avoid 0/0.
    scale = peak if peak != 0 else 1
    return shifted / scale

In [7]:
#see what above class does
# t_df.head()
# df_tmp=t_df.values
# df_tmp.shape

# df0 = df_tmp[df_tmp[:,3]==0]
# df1 = df_tmp[df_tmp[:,3]==1]
# print(len(df0))
# print(len(df1))

# df_tmp_balanced = np.concatenate([df0[:len(df1)],df1],axis=0)
# print(len(df_tmp_balanced))
# print(sum(df_tmp_balanced[:,3]==0))
# print(sum(df_tmp_balanced[:,3]==1))
# # df0 = self.df_main[self.df_main[:,3]==0]
# #         df1 = self.df_main[self.df_main[:,3]==1]
# #         np.random.shuffle(df0)
# #         self.df = np.concatenate([df0[:len(df1)],df1],axis=0)

In [9]:
# Unique study ids, sorted so the train/val split below is the same on every
# run (the iteration order of a set of strings varies between sessions).
StudyInstanceUID = sorted(set(train_df['StudyInstanceUID']))
# print(len(StudyInstanceUID))

# Create train and val sets, split by study so that no study has slices in both.
# The first 6000 studies go to training, the remainder to validation.
#TODO change back to train on full dataset, or to do an even mix of PE and nonPE images
t_df = train_df[train_df['StudyInstanceUID'].isin(StudyInstanceUID[0:6000])]
v_df = train_df[train_df['StudyInstanceUID'].isin(StudyInstanceUID[6000:])]
len(t_df)

1473582

In [10]:
class config:
    """Run settings; defining the class also creates the checkpoint directory."""
    model_name="resnet18"   # used in the checkpoint file name
    batch_size = 64
    WORKERS = 4             # DataLoader worker count
    classes =14
    resume = False
    epochs = 10
    MODEL_PATH = 'log/cpt'  # directory the best checkpoint is written to
    # exist_ok avoids the check-then-create race and is a no-op when present.
    os.makedirs(MODEL_PATH, exist_ok=True)

## Create Datasets and Dataloader

In [28]:
!pip install memory_profiler
%load_ext memory_profiler



In [11]:

# Bind the normalisation settings once; both datasets share the same function.
preprocessing_fn = functools.partial(preprocess_input, **formatted_settings)

# Training set: random augmentation, balanced subset (mode='train').
train_dataset = CTDataset2D(
    t_df,
    transforms=get_training_augmentation(),
    preprocessing=get_preprocessing(preprocessing_fn),
    mode='train',
)

# Validation set: resize only, default mode uses every row.
val_dataset = CTDataset2D(
    v_df,
    transforms=get_validation_augmentation(),
    preprocessing=get_preprocessing(preprocessing_fn),
)
gc.collect()

60

In [12]:
del t_df
del v_df
gc.collect()

20

In [13]:
# Training batches are reshuffled; validation keeps order and uses a batch
# twice as large.
train = DataLoader(
    train_dataset,
    batch_size=config.batch_size,
    shuffle=True,
    num_workers=config.WORKERS,
    pin_memory=True,
)
val = DataLoader(
    val_dataset,
    batch_size=config.batch_size*2,
    shuffle=False,
    num_workers=config.WORKERS,
    pin_memory=True,
)

In [12]:
x,y = train_dataset[-400]
x.shape,len(y),y,len(train_dataset)

((3, 256, 256), 14, tensor([1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0]), 159718)

# Create Model

In [14]:
class config:
    """Run settings; defining the class also creates the checkpoint directory.

    Running this cell rebinds the name ``config``, replacing any earlier
    definition for code that reads it afterwards.
    """
    model_name="resnet18"   # used in the checkpoint file name
    batch_size = 128
    WORKERS = 4             # DataLoader worker count
    numb_classes =14        # size of the model's output layer
    resume = False
    epochs = 10
    MODEL_PATH = 'log/cpt'  # directory the best checkpoint is written to
    # exist_ok avoids the check-then-create race and is a no-op when present.
    os.makedirs(MODEL_PATH, exist_ok=True)


In [15]:
# classes = len(target_columns)
# Load a ResNet-18 with pretrained weights (downloaded on first use).
model = models.resnet18(pretrained=True)
# Swap the final fully connected layer for one with config.numb_classes outputs.
in_features = model.fc.in_features
model.fc = nn.Linear(in_features,config.numb_classes)
# Move the model to the selected device.
model=model.to(dev)

Downloading: "https://download.pytorch.org/models/resnet18-5c106cde.pth" to /root/.cache/torch/hub/checkpoints/resnet18-5c106cde.pth


HBox(children=(FloatProgress(value=0.0, max=46827520.0), HTML(value='')))




In [16]:
# Adam with a small weight decay.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=5e-4,
    weight_decay=1e-5,
)
# Cosine annealing of the learning rate down to eta_min, with T_max=300.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=300,
    eta_min=1e-6,
)
# Binary cross-entropy on raw logits.
loss_fn = torch.nn.BCEWithLogitsLoss()

In [17]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [18]:
class trainer:
    """Training, validation and inference driver around a single model.

    Inputs are moved to the device the model's parameters live on. ``run``
    and ``load_best_model`` read the module-level ``config`` class
    (``epochs``, ``MODEL_PATH``, ``model_name``); the epoch loops use the
    module-level ``tqdm``.

    Args:
        loss_fn: Callable ``loss_fn(logits, targets)`` returning a scalar tensor.
        model: The ``torch.nn.Module`` to train / evaluate.
        optimizer: Optimizer over the model's parameters.
        scheduler: LR scheduler, stepped once per training batch.
    """

    def __init__(self,loss_fn,model,optimizer,scheduler):
        self.loss_fn = loss_fn
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler

    def _device(self):
        """Return the device of the model's first parameter (CPU if it has none)."""
        first_param = next(self.model.parameters(), None)
        if first_param is None:
            return torch.device("cpu")
        return first_param.device

    @staticmethod
    def _best_checkpoint_path():
        """Return the path of the best-checkpoint file derived from ``config``."""
        return config.MODEL_PATH+"/{}_best.pth".format(config.model_name)

    def batch_train(self, batch_imgs, batch_labels, batch_idx):
        """Run one optimisation step; return ``(loss_value, logits)``.

        The model is expected to already be in train mode (``train_epoch``
        takes care of that). ``batch_idx`` is accepted but not used.
        """
        device = self._device()
        self.optimizer.zero_grad()
        batch_imgs = batch_imgs.to(device).float()
        batch_labels = batch_labels.to(device).float()
        predicted = self.model(batch_imgs)
        loss = self.loss_fn(predicted.float(), batch_labels)

        # Drop the local references to the inputs before the backward pass.
        del batch_imgs
        del batch_labels
        gc.collect()

        loss.backward()
        self.optimizer.step()

        return loss.item(), predicted

    def batch_valid(self, batch_imgs,get_fet):
        """Return sigmoid probabilities for a batch, computed without gradients.

        The model's train/eval mode is left as the caller set it.
        ``get_fet`` is accepted but not used.
        """
        # Cast to float like the training path so integer/double inputs work.
        batch_imgs = batch_imgs.to(self._device()).float()

        with torch.no_grad():
            res= torch.sigmoid(self.model(batch_imgs))

        del batch_imgs
        gc.collect()
        return res

    def train_epoch(self, loader):
        """Train for one pass over ``loader``; return the running mean loss."""
        self.model.train()
        tqdm_loader = tqdm(loader)
        current_loss_mean = 0
        for batch_idx, (imgs,labels) in enumerate(tqdm_loader):
            loss, predicted = self.batch_train(imgs, labels, batch_idx)
            # Incremental mean over the batches seen so far.
            current_loss_mean = (current_loss_mean * batch_idx + loss) / (batch_idx + 1)
            tqdm_loader.set_description('loss: {:.4} lr:{:.6}'.format(
                    current_loss_mean, self.optimizer.param_groups[0]['lr']))
            # NOTE(review): passing an explicit index to scheduler.step() looks
            # deprecated in recent PyTorch; it is kept because it makes the
            # schedule restart from batch 0 every epoch — confirm the intent
            # before switching to a plain step().
            self.scheduler.step(batch_idx)
        return current_loss_mean

    def valid_epoch(self, loader,name="valid"):
        """Evaluate on ``loader``; print and return ``1 - mean loss``.

        ``name`` is accepted but not used.
        """
        self.model.eval()
        device = self._device()
        tqdm_loader = tqdm(loader)
        current_loss_mean = 0
        for batch_idx, (imgs,labels) in enumerate(tqdm_loader):
            with torch.no_grad():
                batch_imgs = imgs.to(device).float()
                batch_labels = labels.to(device)
                predicted = self.model(batch_imgs)
                loss = self.loss_fn(predicted.float(),batch_labels.float()).item()
                current_loss_mean = (current_loss_mean * batch_idx + loss) / (batch_idx + 1)
        score = 1-current_loss_mean
        print('metric {}'.format(score))
        return score

    def run(self,train_loder,val_loder):
        """Train for ``config.epochs`` epochs, saving the best-scoring weights.

        After every training epoch the training dataset's
        ``generate_balanced_set`` is called, so the next epoch uses a freshly
        balanced subset.
        """
        best_score = -100000
        for e in range(config.epochs):
            print("----------Epoch {}-----------".format(e))
            current_loss_mean = self.train_epoch(train_loder)
            train_loder.dataset.generate_balanced_set()
            score = self.valid_epoch(val_loder)
            if best_score < score:
                best_score = score
                torch.save(self.model.state_dict(), self._best_checkpoint_path())

    def batch_valid_tta(self, batch_imgs):
        """Return flip-averaged probabilities for a batch as a NumPy array.

        Logits of the original batch and of the batch flipped along each of
        its last two axes are averaged, then passed through a sigmoid. The
        outputs are class logits, so they are not flipped back. The model's
        train/eval mode is left as the caller set it.
        """
        batch_imgs = batch_imgs.to(self._device()).float()
        gc.collect()
        tta_flip = [[-1],[-2]]
        # no_grad keeps the result detached so .numpy() is allowed.
        with torch.no_grad():
            predicted = self.model(batch_imgs)
            for axis in tta_flip:
                predicted = predicted + self.model(torch.flip(batch_imgs, axis))
        predicted = predicted/(1+len(tta_flip))
        predicted = torch.sigmoid(predicted)
        return predicted.cpu().numpy()

    def load_best_model(self):
        """Load the best checkpoint into the model if the file exists."""
        checkpoint = self._best_checkpoint_path()
        if os.path.exists(checkpoint):
            # map_location lets a checkpoint saved on another device load here.
            state = torch.load(checkpoint, map_location=self._device())
            self.model.load_state_dict(state)

    def predict(self,imgs_tensor,get_fet = False):
        """Return sigmoid probabilities for ``imgs_tensor`` in eval mode."""
        # Inference must not use batch statistics or dropout.
        self.model.eval()
        with torch.no_grad():
            return self.batch_valid(imgs_tensor,get_fet=get_fet)
            

In [19]:
Trainer = trainer(loss_fn,model,optimizer,scheduler)

In [None]:
%load_ext memory_profiler
%mprun -f Trainer.run(train,val)

----------Epoch 0-----------


HBox(children=(FloatProgress(value=0.0, max=2477.0), HTML(value='')))

In [20]:
Trainer.run(train,val)

----------Epoch 0-----------


HBox(children=(FloatProgress(value=0.0, max=2477.0), HTML(value='')))






KeyboardInterrupt: 

In [22]:
gc.collect()
import sys
sys.getsizeof(model)
# model
#torch.cuda.empty_cache()

# !nvidia-smi

64

# Junk

In [None]:
# import matplotlib.pyplot as plt
# row = train_dataset.df[0]
# print(f"{jpeg_dir}/{row[0]}/{row[1]}/*{row[2]}.jpg")
# img = cv2.imread(glob.glob(f"{jpeg_dir}/{row[0]}/{row[1]}/*{row[2]}.jpg")[0])
# plt.imshow(img)#discard

#how many?
# tot_instances=0

# studies=(os.listdir(PATH_TRAIN))
# studies=sorted(studies)
# print("tot_studies= "+str(len(studies)))

# tot_series=0
# for study in studies:
#     pth=os.path.join(PATH_TRAIN,study)
# #     print(pth)
#     series=os.listdir(pth)
#     tot_series+=len(series)
#     for serie in series:
#         tot_instances+=len(os.listdir(os.path.join(pth,serie)))
# print("tot_series= "+str(tot_series))
# print("tot_instances= "+str(tot_instances))


# class RsnaDataset(Dataset):
    
#     def __init__(self,df,transforms):
#         super().__init__()
#         self.df = df
#         self.transforms = transforms
    
#     def __getitem__(self,index):      
#         image_path = self.df.image_paths[index]
#         data = self.df[self.df['ImagePath']==image_path]
#         labels = data[target_columns].values.reshape(-1)
#         image = get_img(image_path)
#         image = convert_to_rgb(image)
        
#         if self.transforms:
#             image = self.transforms(image=image)['image']
            
#         image = torch.tensor(image,dtype=torch.float)        
#         labels = torch.tensor(labels,dtype=torch.float)
        
#         return image,labels
           
#     def __len__(self):
#         return self.image_paths.shape[0]