In [13]:
from sklearn.preprocessing import minmax_scale
import pandas as pd
import numpy as np

In [1]:
def batchGeneratorRegression(xPos, xNeg, ixPos, ixNeg, 
                             batchSize, xFP=None, ixFP=None, 
                             fp=False, posFraction=0.5,
                             annotationsPath='/home/katya/data/CSVFILES/annotations_enhanced.csv'):
    """Endlessly yield (inputs, targets) batches for the four regression outputs.

    Parameters
    ----------
    xPos, xNeg : array-like
        Positive / negative samples, indexed along axis 0.
    ixPos, ixNeg : array-like
        Per-sample row indices into the annotations CSV. Must be an integer
        array because it is used to index the target vectors. The values
        -1 and -2 mark non-nodule / false-positive samples, whose targets
        are forced to 0.
    batchSize : int
        Total number of samples per batch.
    xFP, ixFP : array-like, optional
        False-positive samples and their indices; required when ``fp`` is true.
    fp : bool
        When true, the non-positive part of the batch is split between
        negatives and false positives; otherwise it is filled with negatives.
    posFraction : float
        Fraction of the batch drawn from the positive samples.
    annotationsPath : str
        CSV file providing the 'malignancy', 'diameter_mm', 'lobulation'
        and 'spiculation' columns.

    Yields
    ------
    (xBatch, targets)
        ``xBatch`` is min-max normalised over the whole batch to [0, 1];
        ``targets`` is a dict with keys 'Malignancy', 'Diameter',
        'Lobulation' and 'Spiculation'.

    Raises
    ------
    ValueError
        On the first ``next()`` if ``fp`` is true but ``xFP``/``ixFP`` is
        missing (this is a generator, so nothing runs before that).
    """
    if fp and (xFP is None or ixFP is None):
        raise ValueError('xFP and ixFP must be provided when fp=True')

    df = pd.read_csv(annotationsPath)

    # Scale every target column to [0, 1] once, up front.
    scaledTargets = {
        'Malignancy': minmax_scale(df['malignancy'].values),
        'Diameter': minmax_scale(df['diameter_mm'].values),
        'Lobulation': minmax_scale(df['lobulation'].values),
        'Spiculation': minmax_scale(df['spiculation'].values),
    }

    # Batch composition does not change between iterations, so compute it once.
    # Without false positives the negatives fill the whole remainder; with
    # them the remainder is split and any odd sample goes to the FP share, so
    # the batch always contains exactly batchSize samples.
    pSize = int(posFraction * batchSize)
    remainder = batchSize - pSize
    if fp:
        nSize = remainder // 2
        fpSize = remainder - nSize
    else:
        nSize = remainder
        fpSize = 0

    while True:

        pInds = np.random.choice(xPos.shape[0], size=pSize, replace=False)
        nInds = np.random.choice(xNeg.shape[0], size=nSize, replace=False)

        xParts = [xPos[pInds], xNeg[nInds]]
        ixParts = [ixPos[pInds], ixNeg[nInds]]

        if fp:
            fpInds = np.random.choice(xFP.shape[0], size=fpSize, replace=False)
            xParts.append(xFP[fpInds])
            ixParts.append(ixFP[fpInds])

        # np.concatenate returns new arrays, so the relabelling below never
        # touches the caller's index arrays.
        xBatch = np.concatenate(xParts, axis=0)
        ixBatch = np.concatenate(ixParts, axis=0)

        # Label false-positive samples (-2) the same as all non-nodule samples (-1).
        ixBatch[ixBatch == -2] = -1
        nonNodule = ixBatch == -1

        # Note: no colour-channel axis is added here; xBatch keeps the rank
        # of the input sample arrays.

        # Normalise the batch to [0, 1]. A constant batch has zero range; divide
        # by 1 in that case so it becomes all zeros instead of NaN.
        lowest = xBatch.min()
        span = xBatch.max() - lowest
        xBatch = (xBatch - lowest) / (span if span > 0 else 1)
        xBatch = np.clip(xBatch, 0, 1)

        yBatch = {}
        for name, scaled in scaledTargets.items():
            # Fancy indexing copies, so zeroing does not alter the scaled column.
            # Index -1 temporarily picks the last row; it is overwritten below.
            target = scaled[ixBatch]
            # Zero all parameters of non-nodule samples.
            target[nonNodule] = 0.0
            yBatch[name] = target

        yield xBatch, yBatch

In [2]:
def loadCategory(category, dataDir='/home/katya/data', nSubsets=10):
    """Load and stack the voxel and index arrays of every subset of a category.

    Files are read from
    ``<dataDir>/voxels_<category>64/subset<k>X<category>.npy`` (samples) and
    ``<dataDir>/voxels_<category>64/subset<k>IX<category>.npy`` (indices)
    for ``k`` in ``0 .. nSubsets - 1``.

    Parameters
    ----------
    category : str
        Category tag embedded in the directory and file names.
    dataDir : str
        Root directory that holds the ``voxels_<category>64`` folders.
    nSubsets : int
        Number of consecutive subsets (starting at 0) to load.

    Returns
    -------
    (x, ix)
        All subsets concatenated along axis 0, in subset order.
    """
    pattern = '%s/voxels_%s64/subset%d%s%s.npy'

    # Collect the pieces first and concatenate once: concatenating inside the
    # loop re-copies everything loaded so far on every iteration.
    xParts, ixParts = [], []
    for subset in range(nSubsets):
        xParts.append(np.load(pattern % (dataDir, category, subset, 'X', category)))
        ixParts.append(np.load(pattern % (dataDir, category, subset, 'IX', category)))

    x = np.concatenate(xParts, axis=0)
    ix = np.concatenate(ixParts, axis=0)

    # Drop the per-subset references so only the stacked arrays stay alive.
    del xParts, ixParts

    print ('The number of samples for %s category constitutes %d' % (category, len(ix)))

    return x, ix

In [3]:
def nodulePredictor(category, modelPath, validInd, threshold=0.5):
    """Run a saved model over one category and keep the samples it calls nodules.

    Parameters
    ----------
    category : str
        Category to load via ``loadCategory``; also the key into ``validInd``.
    modelPath : str
        Path handed to ``load_model``.
        NOTE(review): ``load_model`` is not imported in the visible cells;
        presumably Keras' ``load_model`` - confirm it is in scope.
    validInd : mapping
        ``validInd[category]`` holds the integer positions of the validation
        samples; every other sample is treated as training data.
    threshold : float
        A sample is kept when column 1 of the model output exceeds this value.

    Returns
    -------
    (nodulesTrain, iNodulesTrain, nodulesVal, iNodulesVal)
        Predicted-positive samples and their indices for the training and
        validation parts.
    """
    model = load_model(modelPath)
    x, ix = loadCategory(category)

    # Insert a singleton axis at position 1 before feeding the model.
    x = np.expand_dims(x, 1)

    valInds = validInd[category]
    xVal = x[valInds]
    ixVal = ix[valInds]

    # Boolean mask instead of a per-sample "i not in valInds" scan: one
    # vectorised pass, and an empty training part keeps the sample shape.
    # Assumes x and ix have the same length (paired arrays) - confirm.
    isTrain = np.ones(x.shape[0], dtype=bool)
    isTrain[valInds] = False
    xTrain = x[isTrain]
    ixTrain = ix[isTrain]

    print ('Predicting...')

    yValidHat = model.predict(xVal, batch_size=10, verbose=1)
    yTrainHat = model.predict(xTrain, batch_size=10, verbose=1)

    # Column 1 of the prediction is compared against the threshold.
    isPosVal = yValidHat[:, 1] > threshold
    isPosTrain = yTrainHat[:, 1] > threshold

    nodulesVal = xVal[isPosVal]
    iNodulesVal = ixVal[isPosVal]

    print ('Number of predicted validation nodules is %d' % iNodulesVal.shape[0])

    nodulesTrain = xTrain[isPosTrain]
    iNodulesTrain = ixTrain[isPosTrain]

    print ('Number of predicted train nodules is %d' % iNodulesTrain.shape[0])
    print ('-------------------------------------------------------------------')

    return nodulesTrain, iNodulesTrain, nodulesVal, iNodulesVal