In [3]:
import os
import glob
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import imageio
from skimage.transform import rescale, resize, downscale_local_mean
from skimage.restoration import (denoise_wavelet)
from skimage import exposure
from torch.utils import data
import pickle
from torchvision import transforms
from kymatio import Scattering2D
import torch
from PIL import Image
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torchvision.models as models
from sklearn.metrics import roc_auc_score
from torch.optim import lr_scheduler
from torch.autograd import Variable

In [4]:
import torchvision

In [6]:
# Record the PyTorch version this notebook was executed with.
print(torch.__version__)

0.4.0


In [5]:
# Record the torchvision version this notebook was executed with.
print(torchvision.__version__)

0.2.2


In [2]:
class cancer(data.Dataset):
    """Image dataset built from ``<parent_dir>/{malignant,benign}/**/40X/*.png``.

    Labels are 1 for files under ``malignant`` and 0 for files under ``benign``.
    The decoded images are cached in ``<phase>.pickle`` in the current working
    directory so later runs can skip decoding the PNG files.
    """

    def __init__(self, parent_dir,samplerate,transform=None,phase = 'trainc',load_prob = True,green = False):
        """
        Args:
            parent_dir: Directory containing the ``malignant`` and ``benign`` folders.
            samplerate: Fraction of the shuffled file list assigned to the
                training split; the remainder is the test split.
                Note: whenever you change samplerate, you need to re-save the
                pickle (construct with ``load_prob=False``).
            transform (optional): Transform applied to each resized PIL image.
            phase: ``'trainc'`` selects the training split, any other value the
                test split. Also used as the pickle file stem.
            load_prob: If True, load ``<phase>.pickle`` instead of reading the
                image files; if False, read the files and (re)write the pickle.
            green: If True, only take the green channel.
        """
        self.green = green
        self.transform = transform

        # Collect (filename, label) pairs before the random split.
        data_list = []
        for class_dir, label in (('malignant', 1), ('benign', 0)):
            pattern = os.path.join(parent_dir, class_dir, '**', '40X', '*.png')
            data_list.extend((path, label) for path in glob.glob(pattern, recursive=True))

        self.data_list = data_list  # All the (filename, label) pairs, shuffled below

        # Fixed seed so the train and test instances see the same permutation.
        # Note that this reseeds the global `random` generator.
        random.seed(3)
        random.shuffle(data_list)
        split_at = round(samplerate * len(data_list))
        jpg_list = data_list[:split_at] if phase == 'trainc' else data_list[split_at:]

        cache_path = phase + '.pickle'
        if load_prob:
            # NOTE: pickle.load can execute arbitrary code; only load cache
            # files that were produced by this class.
            with open(cache_path, 'rb') as handle:
                self.image_data_dict = pickle.load(handle)
        else:
            # index -> [pixel array (RGB), label]
            self.image_data_dict = {
                i: [imageio.imread(path), label]
                for i, (path, label) in enumerate(jpg_list)
            }
            with open(cache_path, 'wb') as handle:
                pickle.dump(self.image_data_dict, handle)

    def __len__(self):
        """Number of cached samples in this split."""
        return len(self.image_data_dict)

    def __getitem__(self,index):
        '''
        Return a tuple containing the image and corresponding class for the given index.

        Parameter:
        index: Key of the sample in the dict built (or loaded) by __init__.

        The image is resized to 60x60 and then passed through ``transform``
        when one was given. Raises ValueError when ``index`` is not a key.
        '''
        if index not in self.image_data_dict:
            raise ValueError('Index out of bound')
        img,tag = self.image_data_dict[index]

        # Isolate the green channel.
        if self.green:
            img = img[:,:,1]

        img = transforms.ToPILImage()(img)
        img = transforms.functional.resize(img,(60,60))

        if self.transform:
            img = self.transform(img)

        return (img, tag)


In [3]:
def save_model(epoch,model):
    """Write the model's state_dict to ``Cancer_CNN_trial_<epoch>.model`` in the working directory."""
    checkpoint_name = "Cancer_CNN_trial_{}.model".format(epoch)
    weights = model.state_dict()
    torch.save(weights, checkpoint_name)
    print("Model saved!")

In [4]:
class ConvNet(nn.Module):
    """Single conv block (conv 3x3 -> ReLU -> 2x2 max-pool -> batch norm) followed by a linear classifier.

    Args:
        in_channels: Number of input image channels (3 for RGB, 1 for a
            single-channel image).
        num_classes: Number of output logits.
        input_size: Height/width of the (square) input images. Used to size the
            linear layer; the defaults give 16 * 29 * 29 = 13456 features.

    ``forward`` returns raw logits (no softmax), as expected by
    ``nn.CrossEntropyLoss``.
    """

    def __init__(self, in_channels=3, num_classes=2, input_size=60):
        super(ConvNet, self).__init__()
        # An unpadded 3x3 conv shrinks each side by 2; the 2x2 pool then halves it.
        pooled_size = (input_size - 2) // 2
        if pooled_size < 1:
            raise ValueError('input_size must be at least 4, got {}'.format(input_size))

        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels, 16, kernel_size = 3, stride = 1, padding = 0, groups=1, bias=True),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size = 2, stride = 2))
        self.batchnorm1 = nn.BatchNorm2d(16, momentum = 0.1)
        # NOTE: the dropout module is registered but is not applied in forward().
        self.drop_out = nn.Dropout()
        self.fc_layer = nn.Sequential(nn.Linear(16 * pooled_size * pooled_size, num_classes))

    def forward(self, x):
        """Map a (batch, in_channels, input_size, input_size) tensor to (batch, num_classes) logits."""
        output = self.layer1(x)
        output = self.batchnorm1(output)
        output = output.view(output.size(0), -1) # flatten
        output = self.fc_layer(output)
        return output

In [5]:
# Per-channel normalization constants (presumably measured on each split):
#   train mean 0.8058, 0.6531, 0.7753   train std 0.08847197, 0.13046794, 0.0852273
#   test  mean 0.8009, 0.6541, 0.7711   test  std 0.11159356, 0.12982282, 0.10046051
# NOTE(review): the test set is normalized with its own statistics rather than
# the training statistics; kept as-is so the reported numbers stay comparable.
TRAIN_MEAN, TRAIN_STD = (0.8058, 0.6531, 0.7753), (0.08847197, 0.13046794, 0.0852273)
TEST_MEAN, TEST_STD = (0.8009, 0.6541, 0.7711), (0.11159356, 0.12982282, 0.1004605)

# Metrics collected for the last epochs (epoch >= 10), keyed by split rate.
allauc = {}
allrecall = {}
allacc = {}
pdir = '/scratch/jx1047/project/Scattering-colonography/breast'
for rate in [0.9]:
    # Rebuild both pickle caches for this split rate (constructed only for the
    # side effect of writing trainc.pickle / testc.pickle).
    cancer(pdir,samplerate = rate,phase =  'trainc',load_prob = False)
    cancer(pdir,samplerate = rate,phase =  'testc',load_prob = False)

    # Load the freshly written caches, now with tensor conversion + normalization.
    trainimg = cancer(pdir,samplerate = 1,phase = 'trainc',load_prob = True,
                      transform = transforms.Compose([transforms.ToTensor(),
                                                      transforms.Normalize(TRAIN_MEAN, TRAIN_STD)]))
    testimg = cancer(pdir,samplerate = 1,phase = 'testc',load_prob = True,
                     transform = transforms.Compose([transforms.ToTensor(),
                                                     transforms.Normalize(TEST_MEAN, TEST_STD)]))

    use_cuda = torch.cuda.is_available()
    # torch.device (not the torch.cuda.device context manager) describes where
    # the model and the batches should live.
    device = torch.device("cuda:0" if use_cuda else "cpu")
    params = {'batch_size': 32,
              'shuffle': True,
              'num_workers': 4}
    patch_training_generator = data.DataLoader(trainimg, **params)
    # Evaluation does not need a random order.
    patch_test_generator = data.DataLoader(testimg, **dict(params, shuffle=False))

    train_model = ConvNet().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(train_model.parameters(), lr=0.01)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=5)

    num_epochs = 15

    for epoch in range(num_epochs):
        # ---- training pass ----
        train_model.train()
        train_correct = 0
        lentra = 0
        for images, label in patch_training_generator:
            images = images.to(device)
            label = label.to(device)

            # Forward pass, then one SGD step.
            outputs = train_model(images)
            loss = criterion(outputs, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Running count of correct training predictions.
            _, predicted = torch.max(outputs.detach(), 1)
            train_correct += (predicted == label).sum().item()
            lentra += images.size(0)

        train_acc = float(train_correct) / lentra
        scheduler.step()

        # ---- evaluation pass ----
        train_model.eval()
        # NOTE(review): this flips track_running_stats on the BatchNorm layer
        # and never restores it, so it also applies to all later training
        # epochs. Its effect on which statistics are used in eval mode depends
        # on the PyTorch version -- confirm this is intended.
        for child in train_model.children():
            if type(child)==nn.BatchNorm2d:
                child.track_running_stats = False

        ta = 0
        lente = 0
        predall = []   # arg-max class per sample
        predprob = []  # predicted probability of class 1 per sample
        truelabel = []

        with torch.no_grad():
            for image, label in patch_test_generator:
                images = image.to(device)
                label = label.to(device)
                out = train_model(images)
                pos_proba = torch.nn.functional.softmax(out, dim=1)[:,1]
                _, prediction = torch.max(out,1)

                # Move to CPU so the results can be converted to numpy below.
                predprob.append(pos_proba.cpu())
                truelabel.append(label.cpu())
                predall.append(prediction.cpu())
                ta += (prediction == label).sum().item()
                lente += image.size(0)

        labels = torch.cat(truelabel).numpy()
        preds = torch.cat(predprob).numpy()
        bpreds = torch.cat(predall).numpy()
        ta = float(ta) / lente #test size
        auc = roc_auc_score(labels, preds)
        # Recall on class 1: correctly predicted positives / all positives.
        recall = np.sum((bpreds == labels )&(labels == 1))/np.sum(labels == 1)
        print('auc = {}'.format(auc))
        print('recall = {}'.format(recall))
        print('accuracy = {}'.format(ta))
        print('accuracy from bpreds = {}'.format(np.sum(bpreds == labels)/len(labels)))
        print("Epoch {}: train_accuracy is {}, test_accuracy is {}".format(epoch, train_acc, ta))

        # Only the last epochs (10 and later) are kept for the summary.
        if epoch >= 10:
            allauc.setdefault(rate, []).append(auc)
            allrecall.setdefault(rate, []).append(recall)
            allacc.setdefault(rate, []).append(ta)


    
    
    
    
    



auc = 0.847889537494387
recall = 0.8549618320610687
accuracy = 0.8040201005025126
accuracy from bpreds = 0.8040201005025126
Epoch 0: train_accuracy is 0.7383073496659243, test_accuracy is 0.8040201005025126


Process Process-9:
Process Process-10:
Process Process-11:
Process Process-12:
Traceback (most recent call last):
  File "/home/jx1047/.conda/envs/image/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/jx1047/.conda/envs/image/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/jx1047/.conda/envs/image/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 57, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/jx1047/.conda/envs/image/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 57, in <listcomp>
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "<ipython-input-2-8f48a4e391aa>", line 78, in __getitem__
    img = self.transform(img)
  File "/home/jx1047/.conda/envs/image/lib/python3.6/site-packages/torchvision/transforms/transforms.py", line 60, in __call__
    img = t(img)
  File "/h

KeyboardInterrupt: 

In [6]:
# Shape of the batch most recently bound to `images` by the training/evaluation loop;
# this cell only works after that loop has run at least one batch.
images.size()

torch.Size([32, 3, 60, 60])

In [None]:
# Mean of each collected metric per split rate (averaged over the recorded epochs).
avauc = dict((rate, np.mean(scores)) for rate, scores in allauc.items())
avrecall = dict((rate, np.mean(scores)) for rate, scores in allrecall.items())
avacc = dict((rate, np.mean(scores)) for rate, scores in allacc.items())

In [10]:
# Test AUC values recorded for epochs >= 10, keyed by split rate.
allauc

{0.9: [0.87876066457117208,
  0.88358778625954193,
  0.88078132016165245,
  0.88224068253255505,
  0.88370004490345755]}

In [11]:
# Test accuracy values recorded for epochs >= 10, keyed by split rate.
allacc

{0.9: [0.8241206030150754,
  0.8140703517587939,
  0.8140703517587939,
  0.8090452261306532,
  0.8190954773869347]}

In [12]:
# Test recall (class 1) values recorded for epochs >= 10, keyed by split rate.
allrecall

{0.9: [0.8854961832061069,
  0.87786259541984735,
  0.87786259541984735,
  0.87786259541984735,
  0.89312977099236646]}