# Notes
Added a preconverted jpg data library to kernel from https://www.kaggle.com/vaillant/discussion see explanation for RGB channels<BR>
Starter code from https://www.kaggle.com/orkatz2/pulmonary-embolism-pytorch-train

In [8]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import torch
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
import torchvision
from torchvision import models
from torchvision import transforms
from torch.utils.data import Dataset,DataLoader
import cv2
import albumentations as albu
import functools
import glob
from albumentations.pytorch.transforms import ToTensorV2

from tqdm.auto import tqdm
import gc

In [2]:
# !jupyter nbextension enable --py widgetsnbextension
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager

# Data

In [3]:
#get the data
train_df = pd.read_csv("../input/rsna-str-pulmonary-embolism-detection/train.csv")

# train_df.head()

In [4]:
from snippets import reduce_mem_usage
train_df = reduce_mem_usage(train_df)

Memory usage of dataframe is 232.24 MB
Memory usage after optimization is: 131.97 MB
Decreased by 43.2%


In [5]:
#lets get the precomputed images
# PATH = '../input/rsna-str-pulmonary-embolism-detection'
# PATH_TRAIN=PATH+'/train/'
jpeg_dir = '../input/rsna-str-pe-detection-jpeg-256/train-jpegs'

# Create Dataset

In [20]:
# def get_training_augmentation(y=256,x=256):
# #     train_transform = [albu.RandomBrightnessContrast(p=0.3),
# #                            albu.VerticalFlip(p=0.5),
# #                            albu.HorizontalFlip(p=0.5),
# #                            albu.Downscale(p=1.0,scale_min=0.35,scale_max=0.75,),
# #                            albu.Resize(y, x)]
#     train_transform = [albu.RandomBrightnessContrast(p=0.3),
#                            albu.HorizontalFlip(p=0.5),
#                            albu.Resize(y, x)]
#     return albu.Compose(train_transform)

# def get_validation_augmentation(y=256,x=256):
#     """Add paddings to make image shape divisible by 32"""
#     test_transform = [albu.Resize(y, x)]
#     return albu.Compose(test_transform)

# formatted_settings = {
#             'input_size': [3, 224, 224],
#             'input_range': [0, 1],
#             'mean': [0.485, 0.456, 0.406],
#             'std': [0.229, 0.224, 0.225],}

# def preprocess_input(x, mean=None, std=None, input_space="RGB", input_range=None, **kwargs):
#     if input_space == "BGR":
#         x = x[..., ::-1].copy()
#         gc.collect()

#     if input_range is not None:
#         if x.max() > 1 and input_range[1] == 1:
#             x = x / 255.0

#     if mean is not None:
#         mean = np.array(mean)
#         x = x - mean

#     if std is not None:
#         std = np.array(std)
#         x = x / std

#     return x

# def get_preprocessing(preprocessing_fn):
#     _transform = [
#         albu.Lambda(image=preprocessing_fn),
#         albu.Lambda(image=to_tensor, mask=to_tensor),
#     ]
#     return albu.Compose(_transform)



# def to_tensor(x, **kwargs):
#     """
#     Convert image or mask.
#     """
#     return x.transpose(2, 0, 1).astype('float32')

In [17]:
preprocessing_train = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
])
preprocessing_val = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
])

class CTDatasetJPEG(Dataset):
    def __init__(self,df,path,transforms=None,preprocessing=None,size=254,mode='val'):
        
        #get a numpy representation of the pandas dataframe
        self.df_main = df.values
        self.path = path
        self.transforms = transforms
        self.preprocessing = preprocessing
        self.size=size
    
        #either use all the validation data as given
        #or generate a balanced set
        if mode=='val':
            self.df = self.df_main
        else:
            self.generate_balanced_set()

    def __getitem__(self, idx):
        '''
        returns the image and a label
        '''
        row = self.df[idx]
        img = cv2.imread(glob.glob(f'{self.path}/{row[0]}/{row[1]}/*{row[2]}.jpg')[0])
        
        #lets flip from BGR to RGB
        img=cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
      
        label = row[3:].astype(int)#pe_present_on_image 
        label[2:] = label[2:] if label[0]==1 else 0
        if self.transforms:
            img = self.transforms(img)
        if self.preprocessing:
            img = self.preprocessing(img)
            
        return img,torch.from_numpy(label.reshape(-1))

    def __len__(self):
        return len(self.df)
    
    #this function gets a balanced set, 1/2 have pe_present_on_image=1, 1/2 have pe_present_on_image=0
    #note that we discard a bunch of images that have no pe present
    def generate_balanced_set(self):
        df0 = self.df_main[self.df_main[:,3]==0]
        df1 = self.df_main[self.df_main[:,3]==1]
        np.random.shuffle(df0)
        self.df = np.concatenate([df0[:len(df1)],df1],axis=0)

In [18]:
#get a list of unque Studies
StudyInstanceUID = list(set(train_df['StudyInstanceUID']))
# print(len(StudyInstanceUID))

#create train and val sets
#TODO change back to train on full dataset, or to do an even mix of PE and nonPE images
t_df = train_df[train_df['StudyInstanceUID'].isin(StudyInstanceUID[0:6200])]
v_df = train_df[train_df['StudyInstanceUID'].isin(StudyInstanceUID[6200:])]
print(len(t_df))
print(len(v_df))

1527158
263436


## Create Datasets and Dataloader

In [19]:
train_dataset = CTDatasetJPEG(t_df,jpeg_dir,
                            transforms=preprocessing_train,mode='train')
val_dataset = CTDatasetJPEG(v_df,jpeg_dir,
                            transforms=preprocessing_val,mode='val')
gc.collect()

28

In [20]:
img,label=val_dataset[0]
img.shape

torch.Size([3, 224, 224])

In [21]:
img

tensor([[[-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
         [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
         [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
         ...,
         [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
         [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
         [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179]],

        [[-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
         [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
         [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
         ...,
         [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
         [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
         [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357]],

        [[-1.8044, -1.8044, -1.8044,  ..., -1.8044, -1.8044, -1.8044],
         [-1.8044, -1.8044, -1.8044,  ..., -1

In [23]:
t_df.head()

Unnamed: 0,StudyInstanceUID,SeriesInstanceUID,SOPInstanceUID,pe_present_on_image,negative_exam_for_pe,qa_motion,qa_contrast,flow_artifact,rv_lv_ratio_gte_1,rv_lv_ratio_lt_1,leftsided_pe,chronic_pe,true_filling_defect_not_pe,rightsided_pe,acute_and_chronic_pe,central_pe,indeterminate
0,6897fa9de148,2bfbb7fd2e8b,c0f3cb036d06,0,0,0,0,0,0,1,1,0,0,1,0,0,0
1,6897fa9de148,2bfbb7fd2e8b,f57ffd3883b6,0,0,0,0,0,0,1,1,0,0,1,0,0,0
2,6897fa9de148,2bfbb7fd2e8b,41220fda34a3,0,0,0,0,0,0,1,1,0,0,1,0,0,0
3,6897fa9de148,2bfbb7fd2e8b,13b685b4b14f,0,0,0,0,0,0,1,1,0,0,1,0,0,0
4,6897fa9de148,2bfbb7fd2e8b,be0b7524ffb4,0,0,0,0,0,0,1,1,0,0,1,0,0,0


In [8]:
print(len(train_dataset))
print(len(val_dataset))

165642
269828


In [12]:
class config:
    model_name="resnet18"
    batch_size = 256
    WORKERS = 4
    numb_classes =14
    resume = False
    epochs = 10
    MODEL_PATH = 'log/cpt'
    
if not os.path.exists(config.MODEL_PATH):
        os.makedirs(config.MODEL_PATH)


In [10]:
train_dl = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=config.WORKERS, pin_memory=True)
val_dl = DataLoader(val_dataset, batch_size=config.batch_size*2, shuffle=False, num_workers=config.WORKERS, pin_memory=True)

# len(val_dl)
# x,y = train_dataset[-400]
# print(x.shape,len(y),y,len(train_dataset))

# del x
# del y

# !free -m

# len(train_dl)

# Create, train and save Model

In [11]:
#where should it go
dev = torch.device(
    "cuda") if torch.cuda.is_available() else torch.device("cpu")
dev

device(type='cuda')

In [12]:
# classes = len(target_columns)
model = models.resnet18(pretrained=True)
in_features = model.fc.in_features
model.fc = nn.Linear(in_features,config.numb_classes)
model=model.to(dev)

In [13]:
optimizer = torch.optim.Adam(model.parameters(),lr=5e-4,weight_decay= 0.00001)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,T_max= 300,eta_min= 0.000001)
criterion = torch.nn.BCEWithLogitsLoss()

In [14]:
in_features

512

In [15]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [16]:

class ValMonitor():
    def __init__(self, max_consecutive_increases):
        '''
        stops validation when loss has increased max_consecutive_increases
        also indicates if its time to save the model
        :param max_consecutive_increases: how many val increases before we should stop
        '''
        self.max_consecutive_increases=max_consecutive_increases
        self.stop_counter=0
        self.old_val = None
        self.should_stop_training=False

    def time_to_save(self, new_val):
        '''
        determines if its time to save model
        :param new_val:
        :return:
        '''
        should_save_model = False
        if( self.old_val is None):
            self.old_val = new_val+1

        if(new_val<self.old_val):
            self.old_val=new_val
            if(self.stop_counter>0):
                self.stop_counter-=1
            should_save_model = True    #save if validation is better than last
        else:
            self.stop_counter+=1

        #stop if validtion loss is rising
        self.should_stop_training= (self.stop_counter>=self.max_consecutive_increases)
        return should_save_model

    def time_to_stop(self):
        '''
        determines if its time to stop training
        :param new_val:
        :return:
        '''
        return self.should_stop_training
    

class Trainer():
    
    def __init__(self,model, criterian,optimizer,scheduler,load_old_model=True, num_epochs=10):
        self.model = model
        self.load_old_model = load_old_model
        self.criterian = criterian      
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.num_epochs=num_epochs
        self.vm= ValMonitor(2)
        
    def _train_epoch(self, loader):
        self.model.train()
        tqdm_loader = tqdm(loader)
        current_loss_mean = 0
        for batch_idx, (batch_imgs,batch_labels) in enumerate(tqdm_loader):
            self.optimizer.zero_grad()
            batch_imgs, batch_labels = batch_imgs.to(dev).float(), batch_labels.to(dev).float()        
            predicted = self.model(batch_imgs)
            loss = self.criterian(predicted.float(), batch_labels)       
            loss.backward()
            self.optimizer.step()
            
            del batch_imgs
            del batch_labels
 
            current_loss_mean = (current_loss_mean * batch_idx + loss.item()) / (batch_idx + 1)
            tqdm_loader.set_description('loss: {:.4} lr:{:.6}'.format(
                    current_loss_mean, self.optimizer.param_groups[0]['lr']))
            self.scheduler.step(batch_idx)
            del loss
        return current_loss_mean 

    def _val_epoch(self, loader):
        self.model.eval()
        tqdm_loader = tqdm(loader)
        current_loss_mean = 0
        for batch_idx, (imgs,labels) in enumerate(tqdm_loader):
            with torch.no_grad():
                batch_imgs = imgs.to(dev).float()
                batch_labels = labels.to(dev).float()
                predicted = self.model(batch_imgs)
                loss = self.criterian(predicted.float(),batch_labels.float()).item()
                current_loss_mean = (current_loss_mean * batch_idx + loss) / (batch_idx + 1)
                
                del batch_imgs
                del batch_labels
        print(f'Validation loss {current_loss_mean}')
        return current_loss_mean   
    
    
#     def predict(self,imgs_tensor,get_fet = False):
#         batch_imgs = batch_imgs.to(dev)
        
#         with torch.no_grad():
#             res= torch.sigmoid(self.model(batch_imgs)) 
#         return predicted.cpu().numpy()
    
    
    def train(self,train_loader, val_loader):
#         old_current_loss_mean_val = None#initialize to bus low number
        best_val_loss = None
        val_rise_tracker=1 #initialize
        BAIL_WHEN_VAL_RISES_THIS_MANY_TIMES =2
        
        for epoch in range(self.num_epochs):
            print(f'----- Epoch {epoch} -----')           
            # Each epoch has a training and validation phase      
            train_loader.dataset.generate_balanced_set()
            
            current_loss_mean_train = self._train_epoch(train_loader)          
            current_loss_mean_val = self._val_epoch(val_loader)
             
            #initialize
            if( self.vm.time_to_save(current_loss_mean_val)):
                print("saving model")
                torch.save(self.model.state_dict(),config.MODEL_PATH+"/{}_best.pth".format(config.model_name))
            
            if(self.vm.time_to_stop()):
                return

In [17]:
trainer = Trainer(model, criterion,optimizer,scheduler)


In [18]:
trainer.train(train_dl,val_dl)

----- Epoch 0 -----


HBox(children=(FloatProgress(value=0.0, max=648.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=528.0), HTML(value='')))


Validation loss 0.14332724610435296
saving model
----- Epoch 1 -----


HBox(children=(FloatProgress(value=0.0, max=648.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=528.0), HTML(value='')))


Validation loss 0.1527683661746173
----- Epoch 2 -----


HBox(children=(FloatProgress(value=0.0, max=648.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=528.0), HTML(value='')))


Validation loss 0.1259488269360824
saving model
----- Epoch 3 -----


HBox(children=(FloatProgress(value=0.0, max=648.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=528.0), HTML(value='')))


Validation loss 0.12377547187806516
saving model
----- Epoch 4 -----


HBox(children=(FloatProgress(value=0.0, max=648.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=528.0), HTML(value='')))


Validation loss 0.12734036495914366
----- Epoch 5 -----


HBox(children=(FloatProgress(value=0.0, max=648.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=528.0), HTML(value='')))


Validation loss 0.13471229909919194


# Prediction generator

See KP_RSNA_gen_submission.ipynb 

# Junk

In [None]:
# for img, lbl in train_dl:
#     print(img[0].shape)
#     print(len(lbl[0]))
#     break
# # tmp=next((train_dl))

# import matplotlib.pyplot as plt
# row = train_dataset.df[0]
# print(f"{jpeg_dir}/{row[0]}/{row[1]}/*{row[2]}.jpg")
# img = cv2.imread(glob.glob(f"{jpeg_dir}/{row[0]}/{row[1]}/*{row[2]}.jpg")[0])
# plt.imshow(img)#discard

#how many?
# tot_instances=0

# studies=(os.listdir(PATH_TRAIN))
# studies=sorted(studies)
# print("tot_studies= "+str(len(studies)))

# tot_series=0
# for study in studies:
#     pth=os.path.join(PATH_TRAIN,study)
# #     print(pth)
#     series=os.listdir(pth)
#     tot_series+=len(series)
#     for serie in series:
#         tot_instances+=len(os.listdir(os.path.join(pth,serie)))
# print("tot_series= "+str(tot_series))
# print("tot_instances= "+str(tot_instances))


# class RsnaDataset(Dataset):
    
#     def __init__(self,df,transforms):
#         super().__init__()
#         self.df = df
#         self.transforms = transforms
    
#     def __getitem__(self,index):      
#         image_path = self.df.image_paths[index]
#         data = self.df[self.df['ImagePath']==image_path]
#         labels = data[target_columns].values.reshape(-1)
#         image = get_img(image_path)
#         image = convert_to_rgb(image)
        
#         if self.transforms:
#             image = self.transforms(image=image)['image']
            
#         image = torch.tensor(image,dtype=torch.float)        
#         labels = torch.tensor(labels,dtype=torch.float)
        
#         return image,labels
           
#     def __len__(self):
#         return self.image_paths.shape[0]

#see what above class does
# t_df.head()
# df_tmp=t_df.values
# df_tmp.shape

# df0 = df_tmp[df_tmp[:,3]==0]
# df1 = df_tmp[df_tmp[:,3]==1]
# print(len(df0))
# print(len(df1))

# df_tmp_balanced = np.concatenate([df0[:len(df1)],df1],axis=0)
# print(len(df_tmp_balanced))
# print(sum(df_tmp_balanced[:,3]==0))
# print(sum(df_tmp_balanced[:,3]==1))
# # df0 = self.df_main[self.df_main[:,3]==0]
# #         df1 = self.df_main[self.df_main[:,3]==1]
# #         np.random.shuffle(df0)
# #         self.df = np.concatenate([df0[:len(df1)],df1],axis=0)