In [10]:
import numpy as np
import csv
import os
import matplotlib.pyplot as plt
%matplotlib inline 
# import re
import pandas as pd
import gc
import time
import torch.utils.data as D
from torch import from_numpy
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import nbimporter
from UNetModel import UNet

In [11]:
'''
Custom dataset for the processed lung images:
    - Takes in the corresponding meta csv file, nodule csv file, and processed img directory
    - When getting an item (image), creates the label and returns 
      a dict with the image as a tensor and label as a tensor
'''
class LungsDataset(D.Dataset):
    '''
    Dataset of processed lung scans paired with voxel-wise candidate labels.

    Parameters:
        - meta_file: csv with one row per scan; must provide the columns
          Name, OriginX/Y/Z and SpacingX/Y/Z
        - nodule_file: csv with the columns seriesuid, coordX, coordY, coordZ
        - img_dir: directory holding the processed scans, stored as
          'p_<Name>.npy'

    Each item is a dict {'img': tensor, 'label': tensor}; the label has the
    same shape as the image and is 1 at every listed coordinate, 0 elsewhere.
    '''
    def __init__(self, meta_file, nodule_file, img_dir):
        self.meta = pd.read_csv(meta_file)
        self.cands = pd.read_csv(nodule_file)
        self.img_dir = img_dir

    def __len__(self):
        return len(self.meta)

    def __getitem__(self, idx):
        '''
        Load scan number `idx` and build its label volume.

        Raises:
            - IndexError if `idx` is outside the meta table
            - FileNotFoundError if the processed image for the scan is missing
        '''
        # use the requested row (previously row 0 was returned for every idx)
        row = self.meta.iloc[idx]
        name = row['Name']

        # meta information for the scan
        origin = np.array([row['OriginX'], row['OriginY'], row['OriginZ']], dtype=float)
        spacing = np.array([row['SpacingX'], row['SpacingY'], row['SpacingZ']], dtype=float)

        # coordinates listed for this scan, converted with (coord - origin) / spacing
        # and truncated to ints; done on a fresh array so the shared candidates
        # table is never written to
        # NOTE(review): every row matching the seriesuid is used, regardless of
        # any other column in the csv -- confirm this is the intended labelling
        is_scan = self.cands['seriesuid'] == name
        world = self.cands.loc[is_scan, ['coordX', 'coordY', 'coordZ']].values.astype(float)
        voxels = ((world - origin) / spacing).astype(int)

        # processed image (numpy array)
        img_path = os.path.join(self.img_dir, 'p_' + name + '.npy')
        if not os.path.isfile(img_path):
            raise FileNotFoundError('processed image not found: ' + img_path)
        img = np.load(img_path)

        # convert nodules to 1-hot; the label is indexed [z, y, x]
        label = np.zeros(img.shape)
        label[voxels[:, 2], voxels[:, 1], voxels[:, 0]] = 1

        # convert img, label into tensors
        return {'img': from_numpy(img), 'label': from_numpy(label)}

In [12]:
# ds_0 = LungsDataset('/Volumes/KaneData/processed0/meta_0.csv', 
#                     '/Volumes/KaneData/candidates_V2.csv', 
#                     '/Volumes/KaneData/processed0/')
# dl_0 = D.DataLoader(ds_0, shuffle = True)

In [13]:
'''
Build optimizer, loss function family.
'''

# network to be trained
model = UNet()

# loss used to compare the network output against the label
criterion = nn.CrossEntropyLoss()

# plain SGD over all of the network's parameters
optimizer = optim.SGD(params=model.parameters(), lr=1e-3)

In [14]:
'''
Train the model.
'''

def train(model, criterion, optimizer, data_loader, num_epochs=2):
    '''
    Run the training loop and print the mean batch loss after every epoch.

    Parameters:
        - model: callable network; called as model(img)
        - criterion: loss callable, called as criterion(output, label)
        - optimizer: optimizer over the model's parameters
        - data_loader: iterable of batches, each a dict with 'img' and 'label'
        - num_epochs: number of passes over data_loader (default 2)

    Returns nothing; the model is updated in place through the optimizer.
    '''
    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0

        # currently, this code only works for batch size = 1, can change with inner for loop
        for batch in data_loader:
            tr_img = batch['img']
            tr_label = batch['label']

            optimizer.zero_grad()

            # call the module itself rather than .forward() so registered hooks run
            output = model(tr_img)
            loss = criterion(output, tr_label)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        # average over the batches actually seen this epoch; counting them here
        # avoids relying on an outside variable and tolerates an empty loader
        mean_loss = running_loss / max(num_batches, 1)
        print("loss for epoch of the processed batch: " + str(mean_loss))