In [1]:

import os
import sys
import math
from timeit import default_timer as timer
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, matthews_corrcoef, auc, accuracy_score, recall_score, precision_score, f1_score
from sklearn.utils import shuffle
from PIL import Image
from tqdm import tqdm
import seaborn as sns
import torchvision
from torchvision import transforms, datasets, models
import torch
from torch import optim, cuda
from torch.utils.data import DataLoader, sampler
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset
import time
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
from collections import OrderedDict
from torchsummary import summary
from torchvision.transforms import InterpolationMode


In [2]:
data_df = pd.read_csv('fitzpatrick17k.csv').drop(['Unnamed: 0'],axis=1)
data_df = data_df[data_df.fitzpatrick!=-1].reset_index(drop=True)
data_df = data_df.dropna(subset=['url']).reset_index(drop=True)

In [3]:
file_list = os.listdir('fitzpatrick17k_dataset_url_download/')
file_list = [x[:-4] for x in file_list]
data_df = data_df[data_df.md5hash.isin(file_list)].reset_index(drop=True)

In [4]:
data_df.insert(2,'path','x')

In [5]:
data_df.path =  '/tf/notebooks/SSD_data/dermatology_fitzpatrick17k_dataset/fitzpatrick17k_dataset_url_download/' + data_df.md5hash + '.png'

In [6]:
data_df.shape

(15964, 9)

In [7]:
data_df.fitzpatrick.value_counts()

2    4796
3    3296
1    2941
4    2776
5    1527
6     628
Name: fitzpatrick, dtype: int64

In [8]:
#data_df.insert(2,"actinic keratosis",0)
data_df.insert(2,"basal cell carcinoma",0)
data_df.insert(2,"melanoma",0)
data_df.insert(2,"squamous cell carcinoma",0)

In [9]:
bcc = ['basal cell carcinoma',
       'basal cell carcinoma morpheiform',
       'solid cystic basal cell carcinoma']
data_df['basal cell carcinoma'].loc[data_df.label.isin(bcc)]=1
melanoma = ['malignant melanoma',
            'melanoma',
            'superficial spreading melanoma ssm',
            'lentigo maligna']
data_df['melanoma'].loc[data_df.label.isin(melanoma)]=1
data_df['squamous cell carcinoma'].loc[data_df.label=="squamous cell carcinoma"]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [10]:
test_df = shuffle(data_df, random_state=42).reset_index(drop=True)[12000:]
data_df = shuffle(data_df, random_state=42).reset_index(drop=True)[:12000]

In [11]:
len(data_df)

12000

In [12]:
light_df = data_df[data_df.fitzpatrick.isin([1,2,3])]
dark_df = data_df[data_df.fitzpatrick.isin([4,5,6])]

In [13]:
disease_list = data_df.label.unique().tolist()

In [14]:
dark_combine_df = list(range(0,len(disease_list)))
light_combine_df = list(range(0,len(disease_list)))

for i in tqdm(range(0,len(disease_list))):
    dark_combine_df[i] = shuffle(dark_df[dark_df.label==disease_list[i]], random_state=42)
    light_combine_df[i] = shuffle(light_df[light_df.label==disease_list[i]], random_state=42)
    dark_combine_df[i] = dark_combine_df[i][:len(light_combine_df[i])]
    light_combine_df[i] = light_combine_df[i][:len(dark_combine_df[i])]
    

100%|██████████| 114/114 [00:00<00:00, 343.97it/s]


In [15]:
dark_append_df = dark_df[:0]
light_append_df = light_df[:0]

In [16]:
for i in tqdm(range(0,len(dark_combine_df))):
    dark_append_df = pd.concat([dark_append_df, dark_combine_df[i]])
    light_append_df = pd.concat([light_append_df, light_combine_df[i]])

100%|██████████| 114/114 [00:00<00:00, 313.24it/s]


In [17]:
data_df = shuffle(pd.concat([dark_append_df, light_append_df]), random_state=42).reset_index(drop=True)

In [18]:
data_df.shape

(6852, 12)

In [19]:
train_df = data_df[:4941]
validate_df = data_df[4941:5600]
test_df = data_df[5600:]

In [20]:
imgtransResize = (512, 512)

In [21]:

#TRANSFORM DATA

normalize = transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225])
transformList = []
transformList.append(transforms.Resize(imgtransResize,interpolation=InterpolationMode.BILINEAR))
transformList.append(transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05))
transformList.append(transforms.RandomHorizontalFlip())
transformList.append(transforms.RandomVerticalFlip())
transformList.append(torchvision.transforms.RandomRotation(90, interpolation=InterpolationMode.BILINEAR))
transformList.append(transforms.ToTensor())
transformList.append(normalize)      
transformSequence=transforms.Compose(transformList)


val_transformList = []
val_transformList.append(transforms.Resize(imgtransResize,interpolation=InterpolationMode.BILINEAR))
val_transformList.append(transforms.ToTensor())
val_transformList.append(normalize)   
val_transformSequence=transforms.Compose(val_transformList)



In [22]:

y_list = [
'basal cell carcinoma',
'melanoma',
'squamous cell carcinoma'
]


In [23]:
label_list = y_list
class_names = y_list

In [24]:
class_names

['basal cell carcinoma', 'melanoma', 'squamous cell carcinoma']

In [25]:
class DermDataSet(Dataset):
    def __init__(self, image_list_file, transform=None, policy="ones"):
        """
        image_list_file: path to the file containing images with corresponding labels.
        transform: optional transform to be applied on a sample.
        Upolicy: name the policy with regard to the uncertain labels
        """
        image_names = []
        labels = []

        for row, line in image_list_file.iterrows():
            #k+=1
            image_name = line['path']
            label = line[class_names]
            
            for i in range(len(label)):
                label[i] = line[class_names[i]]
            
            image_names.append(image_name)
            labels.append(label)

        self.image_names = image_names
        self.labels = labels
        self.transform = transform

    def __getitem__(self, index):
        """Take the index of item and returns the image and its labels"""
 
        image_name = self.image_names[index]
        image = Image.open(image_name).convert("RGB")
        label = self.labels[index]
        if self.transform is not None:
            image = self.transform(image)
        return image, torch.FloatTensor(label)

    def __len__(self):
        return len(self.image_names)

In [26]:
trBatchSize = 96

In [27]:
#LOAD DATASET

datasetTrain = DermDataSet(train_df, transformSequence, policy="ones")
datasetValid = DermDataSet(validate_df, val_transformSequence, policy="ones")
print("next")

#why can't I do shuffle true???
dataLoaderTrain = DataLoader(dataset=datasetTrain, batch_size=trBatchSize, shuffle=True,  num_workers=32, pin_memory=True)
dataLoaderVal = DataLoader(dataset=datasetValid, batch_size=trBatchSize, shuffle=False, num_workers=32, pin_memory=True)


next


In [28]:
class DermTrainer():

    def train (model, dataLoaderTrain, dataLoaderVal, nnClassCount, trMaxEpoch, launchTimestamp, checkpoint):
        
        overall_start = timer()
        #SETTINGS: OPTIMIZER & SCHEDULER
        optimizer = optim.Adam(model.parameters(), lr=3e-4, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=0, min_lr=1e-5, verbose=True)
        epochs_no_improve = 0 
        epoch_early_stop = 3
        #SETTINGS: LOSS
        loss = torch.nn.BCELoss()

        #LOAD CHECKPOINT 
        if checkpoint != None and use_gpu:
            modelCheckpoint = torch.load(checkpoint)
            model.load_state_dict(modelCheckpoint['state_dict'])
            optimizer.load_state_dict(modelCheckpoint['optimizer'])

        
        #TRAIN THE NETWORK
        lossMIN = 100000
        start = timer()
        dir_path = dir_str+launchTimestamp
        #os.mkdir(dir_path)
        for epochID in range(0, trMaxEpoch):
            
            timestampTime = time.strftime("%H%M%S")
            timestampDate = time.strftime("%d%m%Y")
            timestampSTART = timestampDate + '-' + timestampTime
            
            batchs, losst = DermTrainer.epochTrain(model, dataLoaderTrain, optimizer, trMaxEpoch, epochID, nnClassCount, loss, start, lossMIN, launchTimestamp, dir_path)
            print("\n")
            outLoss, ground_truth, prediction, rocauc, aurocIndividual = DermTrainer.test(model, dataLoaderVal, nnClassCount, None, class_names, loss)
            outLoss = outLoss.cpu().detach().numpy()
            print("val loss: " + str(outLoss))
            print("\n")

            timestampTime = time.strftime("%H%M%S")
            timestampDate = time.strftime("%d%m%Y")
            timestampEND = timestampDate + '-' + timestampTime
            scheduler.step(outLoss)
            
            torch.save({'epoch': epochID + 1, 'state_dict': model.state_dict(), 'optimizer' : optimizer.state_dict()}, 'saved_models/dermatology_model_fitzpatrick_bin_' + str(epochID) + '_' + str(rocauc) + '.pth.tar')
            #if lossVal < lossMIN:
            #    best_epoch = epochID
            #    lossMIN = lossVal    
            #    torch.save({'epoch': epochID + 1, 'state_dict': model.state_dict(), 'best_loss': lossMIN, 'optimizer' : optimizer.state_dict()}, 'm-epoch_448x448_'+str(epochID)+'-' + launchTimestamp + '.pth.tar')
                #print ('Epoch [' + str(epochID + 1) + '] [save] [' + timestampEND + '] loss= ' + str(lossVal))
            #else:
            #    print ('Epoch [' + str(epochID + 1) + '] [----] [' + timestampEND + '] loss= ' + str(lossVal))
            
   
        return batchs, losst        
    #-------------------------------------------------------------------------------- 
       
    def epochTrain(model, dataLoader, optimizer, epochMax, epochID, classCount, loss, start, lossMIN, launchTimestamp, dir_path):
        
        batch = []
        losstrain = []
        losseval = []
        
        
        
        model.train()

        for batchID, (varInput, target) in enumerate(dataLoaderTrain):
            
            varTarget = target.cuda(non_blocking = True)
            
            #varTarget = target.cuda()         


            varOutput = model(varInput)
            lossvalue = loss(varOutput, varTarget)
                       
            optimizer.zero_grad()
            lossvalue.backward()
            optimizer.step()

            
            l = lossvalue.item()
            losstrain.append(l)
            print(
                f'Epoch: {epochID}\t{100 * (batchID / (len(datasetTrain)//trBatchSize)):.1f}% complete. {timer() - start:.1f} seconds elapsed in epoch. Training loss: ' + str(round(np.mean(losstrain), 4)),
                end='\r')
            
            #if batchID==0:
               # print("begin")
           
        
        return batch, losstrain
    
    #-------------------------------------------------------------------------------- 
    
    def epochVal(model, dataLoader, optimizer, loss):
        
        model.eval()
        
        lossVal = 0
        lossValNorm = 0
        
        outGT = torch.FloatTensor().cuda()
        outPRED = torch.FloatTensor().cuda()

        with torch.no_grad():
            for i, (varInput, target) in enumerate(dataLoader):
                
                target = target.cuda(non_blocking = True)
                varOutput = model(varInput)
                losstensor = loss(varOutput, target)
                lossVal += losstensor
                lossValNorm += 1
                
                outGT = torch.cat((outGT, target), 0).cuda()
                outPRED = torch.cat((outPRED, varOutput), 0)
        
        outLoss = lossVal / lossValNorm

        
        outAUROC = []
        
        datanpGT = outGT.cpu().numpy()
        datanpPRED = outPRED.cpu().numpy()
        
        for i in range(len(class_names)):
            print(class_names[i])
            #try:
            outAUROC.append(roc_auc_score(datanpGT[:, i], datanpPRED[:, i]))
            #except ValueError:
            #    pass
        
        aurocMean = np.array(outAUROC).mean()
        #print('validation_loss : ' + str(outLoss))
        print ('\nAUROC mean ', round(aurocMean, 5))
        
        for i in range (0, len(outAUROC)):
            print (class_names[i], ' ', round(outAUROC[i],5))
            print("")
        print("\n")
        print(class_names)
        return outLoss, datanpGT, datanpPRED, aurocMean
    
    
    #--------------------------------------------------------------------------------     
     
    #---- Computes area under ROC curve 
    #---- dataGT - ground truth data
    #---- dataPRED - predicted data
    #---- classCount - number of classes
    
    def computeAUROC (dataGT, dataPRED, classCount):
        
        outAUROC = []
        
        datanpGT = dataGT.cpu().numpy()
        datanpPRED = dataPRED.cpu().numpy()
        
        for i in range(classCount):
            #print(class_names[i])
            #try:
            outAUROC.append(roc_auc_score(datanpGT[:, i], datanpPRED[:, i]))
            #except ValueError:
            #    pass
        return outAUROC
        
        
    #-------------------------------------------------------------------------------- 
    
    
    def test(model, dataLoaderTest, nnClassCount, checkpoint, class_names, loss):   
        
        #cudnn.benchmark = True
        
        if checkpoint != None and use_gpu:
            modelCheckpoint = torch.load(checkpoint)
            model.load_state_dict(modelCheckpoint['state_dict'])

        #if use_gpu:
        outGT = torch.FloatTensor().cuda()
        outPRED = torch.FloatTensor().cuda()
        #else:
         #   outGT = torch.FloatTensor()
          #  outPRED = torch.FloatTensor()
        lossVal = 0
        lossValNorm = 0
        
        model.eval()
        
        with torch.no_grad():
            for i, (varInput, target) in enumerate(dataLoaderTest):
                
                target = target.cuda(non_blocking = True)
                varOutput = model(varInput)
                losstensor = loss(varOutput, target)
                lossVal += losstensor
                lossValNorm += 1
                
                outGT = torch.cat((outGT, target), 0).cuda()
                outPRED = torch.cat((outPRED, varOutput), 0)
        
        outLoss = lossVal / lossValNorm
        

        aurocIndividual = DermTrainer.computeAUROC(outGT, outPRED, nnClassCount)
        aurocMean = np.array(aurocIndividual).mean()
        
        print ('AUROC mean ', round(aurocMean, 5))
        
        for i in range (0, len(aurocIndividual)):
            print (class_names[i], ' ', round(aurocIndividual[i],5))
       
        return outLoss, outGT, outPRED, aurocMean, aurocIndividual


In [29]:
nnClassCount = len(class_names)

In [30]:
class ResNet18(nn.Module):

    def __init__(self, out_size):
        super(ResNet18, self).__init__()
        self.resnet18 = models.resnet18(pretrained=True)
        num_ftrs = self.resnet18.fc.in_features
        self.resnet18.fc = nn.Sequential(
            nn.Linear(num_ftrs, out_size),
            nn.Sigmoid()
        )
            
        print(out_size)

    def forward(self, x):
        x = self.resnet18(x)
        return x

In [31]:

model = ResNet18(nnClassCount).cuda()
model = torch.nn.DataParallel(model).cuda()


3


In [32]:
trMaxEpoch = 20

In [33]:

timestampTime = time.strftime("%H%M%S")
timestampDate = time.strftime("%d%m%Y")
timestampLaunch = timestampDate + '-' + timestampTime


In [34]:
dir_str = 'dermatology_2022_09_23_'

In [None]:
print(timestampLaunch)
batch, losst = DermTrainer.train(model, dataLoaderTrain, dataLoaderVal, nnClassCount, trMaxEpoch, timestampLaunch, checkpoint = None)


04102022-222849


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch: 0	100.0% complete. 35.7 seconds elapsed in epoch. Training loss: 0.159

AUROC mean  0.85611
basal cell carcinoma   0.8511
melanoma   0.88133
squamous cell carcinoma   0.83592
val loss: 0.11917649


Epoch: 1	100.0% complete. 66.7 seconds elapsed in epoch. Training loss: 0.0907

AUROC mean  0.87464
basal cell carcinoma   0.90313
melanoma   0.86864
squamous cell carcinoma   0.85216
val loss: 0.11422281


Epoch: 2	100.0% complete. 98.6 seconds elapsed in epoch. Training loss: 0.0848

AUROC mean  0.87347
basal cell carcinoma   0.91917
melanoma   0.81766
squamous cell carcinoma   0.88357
val loss: 0.11103947


Epoch: 3	0.0% complete. 117.8 seconds elapsed in epoch. Training loss: 0.0851