In [1]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# Limit this process to a 0.48 fraction of GPU memory, then register a
# session built with that limit as the one Keras uses.
config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.48)
)
set_session(tf.Session(config=config))

from keras.models import load_model
# Load the two saved Keras models used by the cells below; one path constant
# per model so neither path is overwritten by the other.
PATH_MODEL_CLASS = "../Models/LUNA_model_v3_class.h5"
PATH_MODEL_REG = "../Models/LUNA_model_v3_regression.h5"

model_class = load_model(PATH_MODEL_CLASS)
model_reg = load_model(PATH_MODEL_REG)

# PATH_MODEL is left pointing at the last model loaded (the regression model).
PATH_MODEL = PATH_MODEL_REG

Using TensorFlow backend.
  return cls(**config)


In [2]:
import numpy as np
import pandas as pd
import os
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from matplotlib import pyplot as plt
%matplotlib inline

# Directory containing one saved array file per patient; later cells list it
# with os.listdir and open each entry with np.load.
PATH_VOXELS = '../../data/stage1_voxels_mask/'

In [None]:
# Build a single label table indexed by patient id from the two label files.
# NOTE(review): both paths are absolute and machine-specific; consider moving
# them into a configurable data directory.
df_labels_1 = pd.read_csv('/home/lin/data/stage1_labels.csv')
df_labels_2 = pd.read_csv('/home/lin/data/stage1_solution.csv')

# Drop 'Usage' before combining (presumably it is absent from the first file,
# so keeping it would add a mostly-empty column -- confirm).
# `axis` is passed by keyword: the positional form was removed in pandas 2.0.
df_labels_2 = df_labels_2.drop(['Usage'], axis=1)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_labels = pd.concat([df_labels_1, df_labels_2])
print ("Total %d labels"%df_labels.shape[0])

# Move 'id' from a column into the index so later cells can look a patient up
# with df_labels.loc[patient_id].
df_labels = df_labels.set_index('id')

Total 1595 labels


In [None]:
# Score every patient's candidate voxels with the classifier. For each patient
# record which candidates score above NODULE_THRESHOLD, and pool those scores
# by the patient's cancer label for the histogram plotted afterwards.
NODULE_THRESHOLD = 0.5  # cut-off applied to column 1 of the classifier output

# NOTE(review): os.listdir order is arbitrary but is deliberately left as-is;
# rows written out later are identified only by their position in `patients`.
patients = os.listdir(PATH_VOXELS)
print ("patient numbers: ", len(patients))

noduleDict = {}       # patient id -> indices of candidates above the cut-off
cancerDistr = []      # kept scores from patients whose 'cancer' label is 1
nonCancerDistr = []   # kept scores from every other patient

for num, patient in enumerate(patients):

    if num%100==0:
        print (num)

    # Patient id is the file name minus its 4-character extension.
    patient_id = patient[:-4]

    # Assumes each file is an .npz archive (it is indexed by key). The context
    # manager closes the underlying file handle instead of leaking one per
    # patient.
    with np.load(PATH_VOXELS + patient) as patient_array:
        voxels = patient_array['vox']

    if len(voxels) == 0:
        # Nothing to score: record "no candidates kept" and move on.
        noduleDict[patient_id] = np.arange(0)
        continue

    preds = np.array(model_class.predict(x= voxels))

    # Compute the threshold mask once and reuse it for indices and scores.
    scores = preds[:, 1]
    keep = scores > NODULE_THRESHOLD

    noduleDict[patient_id] = np.flatnonzero(keep)

    kept_scores = scores[keep].tolist()
    if df_labels.loc[patient_id, 'cancer'] == 1:
        cancerDistr.extend(kept_scores)
    else:
        nonCancerDistr.extend(kept_scores)

patient numbers:  1434
0


In [None]:
# Overlay the kept-candidate score distributions of cancer and non-cancer
# patients and save the figure to nodule_prob.png.
fig, ax = plt.subplots(figsize=[10,7])

# `density=True` replaces `normed=True`, which newer Matplotlib releases no
# longer accept. Each histogram is still normalised to unit area, so the two
# groups are comparable despite different sample counts.
ax.hist(cancerDistr, label='Cancer patients', alpha=0.5, color='c', density=True)
ax.hist(nonCancerDistr, label='Healthy individuals', alpha=0.5, color='k', density=True)

ax.set_xlabel('Probability of being a malignant nodule')
ax.set_ylabel('Density')
ax.legend()
ax.set_xlim([0,1])
fig.savefig('nodule_prob.png')

In [None]:
# Build one feature row per patient from the regression model's outputs on the
# candidates selected earlier (noduleDict), attach the cancer labels, and
# write the matrix to CSV.
#
# Row layout (14 values, matching the 14 column names below):
#   per-output max (4), per-output std (4),
#   normalised location of the candidate with the highest preds[0] value,
#   std of the normalised candidate locations.
N_FEATURES = 14
N_OUTPUTS = 4  # preds[0..3]: malignancy, spiculation, lobulation, diameter


def _patient_features(preds, normalized_locs, nodule_idx):
    """Return the feature vector for one patient with at least one candidate.

    preds           -- stacked regression outputs, indexed preds[output][candidate]
    normalized_locs -- per-candidate centroid divided by the scan shape, for
                       ALL candidates in the file (not only the selected ones)
    nodule_idx      -- indices (into the file's candidates) that were selected
    """
    maxima = [np.max(preds[k], axis=0) for k in range(N_OUTPUTS)]
    spreads = [np.std(preds[k], axis=0) for k in range(N_OUTPUTS)]

    # argmax is relative to the *selected* candidates, so map it back through
    # nodule_idx before indexing the unfiltered location table.
    # Assumes 'cents' rows align one-to-one with 'vox' rows -- TODO confirm.
    top_candidate = nodule_idx[np.argmax(preds[0])]

    # NOTE(review): the location std is still taken over all candidates in the
    # file, not just the selected ones; confirm that is intended.
    return np.concatenate(maxima + spreads +
                          [normalized_locs[top_candidate],
                           normalized_locs.std(axis=0)])


all_features = []
for num,patient in enumerate(patients):

    if num%100==0:
        print (num)

    # noduleDict is keyed by the file name minus its 4-character extension.
    nodule_idx = noduleDict[patient[:-4]]

    if len(nodule_idx) == 0:
        # No candidate was selected for this patient. np.max over an empty
        # array raises, so emit the all-zero row before predicting.
        all_features.append(np.zeros(N_FEATURES))
        continue

    # Assumes each file is an .npz archive; the context manager closes the
    # file handle once the needed arrays have been read.
    with np.load(PATH_VOXELS + patient) as patient_array:
        voxels = patient_array['vox'][nodule_idx]
        centroids = patient_array['cents']
        shape = patient_array['shape']

    preds = np.array(model_reg.predict(x= voxels))
    normalized_locs = centroids.astype('float32') / shape.astype('float32')

    all_features.append(_patient_features(preds, normalized_locs, nodule_idx))

X = np.stack(all_features)

# NOTE(review): column names are kept as-is because the CSV header may be read
# elsewhere, but 'xmax_lobulation' actually holds the lobulation std, and the
# six location columns all share the name 'a'.
col=['max_malig','max_spiculation','max_lobulation','max_diameter',\
     'xsd_malig', 'xsd_spiculation', 'xmax_lobulation','xsd_diameter','a','a','a','a','a','a']

df = pd.DataFrame(data=X, columns=col)

# Labels in the same order as the feature rows. `.values` strips the id index
# so the assignment is positional rather than aligned on df's integer index.
labels = df_labels.loc[[patient[:-4] for patient in patients], 'cancer'].values
df['labels'] = labels

df.to_csv('./model3_feature_matrix_kaggle1.csv')