In [1]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.48
set_session(tf.Session(config=config))

from keras.models import load_model
PATH_MODEL = "../Models/LUNA_model_v3_class.h5"
model_class = load_model(PATH_MODEL)

PATH_MODEL = "../Models/LUNA_model_v3_regression.h5"
model_reg = load_model(PATH_MODEL)

Using TensorFlow backend.
  return cls(**config)


In [2]:
import numpy as np
import pandas as pd
import os
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from matplotlib import pyplot as plt
%matplotlib inline

PATH_VOXELS = '../../data/stage1_voxels_mask/'

In [None]:
df_labels_1 = pd.read_csv('/home/lin/data/stage1_labels.csv')
df_labels_2 = pd.read_csv('/home/lin/data/stage1_solution.csv')
df_labels_2 = df_labels_2.drop(['Usage'],1)
df_labels = df_labels_1.append(df_labels_2)
print ("Total %d labels"%df_labels.shape[0])

df_labels = df_labels.set_index(df_labels['id'])
df_labels.drop(['id'],1,inplace=True)

Total 1595 labels


In [None]:
patients = [f for f in os.listdir(PATH_VOXELS)]
print ("patient numbers: ", len(patients))

noduleDict = {}
cancerDistr = []
nonCancerDistr = []

for num, patient in enumerate(patients):
    
    if num%100==0:
        print (num)
    
    patient_array = np.load(PATH_VOXELS + patient)
    voxels = patient_array['vox']  

    preds = np.array(model_class.predict(x= voxels))
    
    inds = np.array([x for x in range(preds.shape[0])])
    
    noduleDict[patient[:-4]] = inds[preds[:,1]>0.5]
    
    if df_labels.loc[patient[:-4]]['cancer']==1:
        cancerDistr.extend(list(preds[:,1][preds[:,1]>0.5]))
    else:
        nonCancerDistr.extend(list(preds[:,1][preds[:,1]>0.5]))

patient numbers:  1434
0
100
200
300
400
500
600
700
1000
1100
1200
1300
1400


In [None]:
plt.figure(figsize=[10,7])
plt.hist(cancerDistr, label='Cancer patients', alpha=0.5, color='c', normed=True)
plt.hist(nonCancerDistr, label='Healthy individuals', alpha=0.5, color='k', normed=True)
plt.xlabel('Probability of being a malignant nodule')
plt.legend()
plt.xlim([0,1])
plt.savefig('nodule_prob.png')

In [None]:
start = time.time()
all_features = []
for num,patient in enumerate(patients):
    patient_array = np.load(PATH_VOXELS + patient)

    voxels = patient_array['vox'][noduleDict[patient[:-4]]]
    
    preds = np.array(model_reg.predict(x= voxels))
    ixs = np.argmax(preds[0])
    
    xmax_malig = np.max(preds[0], axis=0)
    xmax_spiculation = np.max(preds[1], axis=0)
    xmax_lobulation = np.max(preds[2], axis=0)
    xmax_diameter = np.max(preds[3], axis=0)
    
    xsd_malig = np.std(preds[0], axis=0)
    xsd_spiculation = np.std(preds[1], axis=0)
    xsd_lobulation = np.std(preds[2], axis=0)
    xsd_diameter = np.std(preds[3], axis=0)
    
    centroids = patient_array['cents']
    shape = patient_array['shape']
    normalized_locs = centroids.astype('float32') / shape.astype('float32')
    
    if len(preds)==0:
        feats = np.zeros(14)
    else:
        feats = (np.concatenate([xmax_malig,xmax_spiculation,xmax_lobulation,xmax_diameter,\
               xsd_malig,xsd_spiculation,xsd_lobulation,xsd_diameter,\
               normalized_locs[ixs],normalized_locs.std(axis=0)]))        
    all_features.append(feats)    
    
    if num%100==0:
        print ("\t%4d patients predicted"%num, time.time()-start )
    
X = np.stack(all_features)

col=['max_malig','max_spiculation','max_lobulation','max_diameter',\
     'xsd_malig', 'xsd_spiculation', 'xmax_lobulation','xsd_diameter',\
     'loc_from_malig_x', 'loc_from_malig_y', 'loc_from_malig_z',\
     'std_locs_x', 'std_locs_y', 'std_locs_z']

df = pd.DataFrame(data=X, columns=col)
df['labels'] = labels

df.to_csv('./feature_matrix_model3_stage1.csv')

	   0 patients predicted 13.039426565170288
	 300 patients predicted 2794.9232683181763
	 700 patients predicted 6585.902639389038
	 800 patients predicted 7504.007497787476


In [12]:
df.to_csv('./feature_matrix_model3_stage1.csv')

In [13]:
df.shape

(1434, 14)

In [26]:
df['id']=[x[:-4] for x in patients]

In [27]:
df

Unnamed: 0,max_malig,max_spiculation,max_lobulation,max_diameter,xsd_malig,xsd_spiculation,xmax_lobulation,xsd_diameter,loc_from_malig_x,loc_from_malig_y,loc_from_malig_z,std_locs_x,std_locs_y,std_locs_z,id
0,0.407618,0.211248,0.229299,0.196713,0.060364,0.028035,0.030435,0.010581,0.114286,0.364286,0.326389,0.244258,0.163056,0.242655,d777a77cc7a2ec2f1eed68799cc9075c
1,0.301607,0.195614,0.219990,0.194113,0.067776,0.028539,0.034208,0.010117,0.088889,0.416667,0.102564,0.243736,0.159560,0.216008,1631637f08f27347e8f23d7a0e18c100
2,0.349085,0.231036,0.224010,0.202245,0.066410,0.035221,0.025599,0.008611,0.200000,0.339394,0.598726,0.231711,0.155614,0.217998,9065f2b133129c5747d42db18a424749
3,0.370535,0.214804,0.227077,0.195514,0.048811,0.028892,0.032502,0.009913,0.112281,0.294737,0.555556,0.233297,0.183184,0.207843,e3a9a6f8d21c6c459728066bcf18c615
4,0.318191,0.169963,0.229695,0.192455,0.060015,0.025715,0.038635,0.011385,0.094118,0.420588,0.278736,0.249184,0.183342,0.214586,f39a1e54d79731e4417aa8159d19b7d4
5,0.379130,0.198604,0.228618,0.194666,0.059033,0.032172,0.031038,0.009002,0.091429,0.171429,0.393836,0.219889,0.184321,0.200701,8987df2216ae2ab1a907d52bc9dc540d
6,0.329599,0.215058,0.229345,0.199101,0.059572,0.031746,0.032229,0.011190,0.114286,0.385714,0.315603,0.212833,0.155433,0.220319,7c2b72f9e0f5649c22902292febdc89f
7,0.412462,0.222255,0.228341,0.197242,0.068449,0.031100,0.028903,0.008404,0.090395,0.324859,0.423729,0.234608,0.177474,0.225888,e60d99ea9648e1ce859eb0b386365e26
8,0.265525,0.177683,0.221653,0.193369,0.052305,0.023806,0.036719,0.010748,0.087432,0.434426,0.831169,0.238636,0.172919,0.226779,8a2de07f6e9dbb8c6e4bfad7e83b3f0a
9,0.364786,0.219473,0.229088,0.191953,0.065329,0.033224,0.038511,0.011314,0.089888,0.258427,0.372727,0.236521,0.165068,0.206094,eaeebb7a63edc8a329a7c5fbc583a507


In [28]:
df_labels

Unnamed: 0_level_0,cancer
id,Unnamed: 1_level_1
0015ceb851d7251b8f399e39779d1e7d,1
0030a160d58723ff36d73f41b170ec21,0
003f41c78e6acfa92430a057ac0b306e,0
006b96310a37b36cccb2ab48d10b49a3,1
008464bb8521d09a42985dd8add3d0d2,1
0092c13f9e00a3717fdc940641f00015,0
00986bebc45e12038ef0ce3e9962b51a,0
00cba091fa4ad62cc3200a657aeb957e,0
00edff4f51a893d80dae2d42a7f45ad1,1
0121c2845f2b7df060945b072b2515d7,0


In [31]:
df_merged = pd.merge(left=df, right=df_labels_1.append(df_labels_2),how='outer',on='id').dropna()
print (df_merged.shape)
df_merged.head()

(1434, 16)


Unnamed: 0,max_malig,max_spiculation,max_lobulation,max_diameter,xsd_malig,xsd_spiculation,xmax_lobulation,xsd_diameter,loc_from_malig_x,loc_from_malig_y,loc_from_malig_z,std_locs_x,std_locs_y,std_locs_z,id,cancer
0,0.407618,0.211248,0.229299,0.196713,0.060364,0.028035,0.030435,0.010581,0.114286,0.364286,0.326389,0.244258,0.163056,0.242655,d777a77cc7a2ec2f1eed68799cc9075c,1
1,0.301607,0.195614,0.21999,0.194113,0.067776,0.028539,0.034208,0.010117,0.088889,0.416667,0.102564,0.243736,0.15956,0.216008,1631637f08f27347e8f23d7a0e18c100,0
2,0.349085,0.231036,0.22401,0.202245,0.06641,0.035221,0.025599,0.008611,0.2,0.339394,0.598726,0.231711,0.155614,0.217998,9065f2b133129c5747d42db18a424749,1
3,0.370535,0.214804,0.227077,0.195514,0.048811,0.028892,0.032502,0.009913,0.112281,0.294737,0.555556,0.233297,0.183184,0.207843,e3a9a6f8d21c6c459728066bcf18c615,0
4,0.318191,0.169963,0.229695,0.192455,0.060015,0.025715,0.038635,0.011385,0.094118,0.420588,0.278736,0.249184,0.183342,0.214586,f39a1e54d79731e4417aa8159d19b7d4,0


In [32]:
df_merged.to_csv('./feature_matrix_model3_stage1.csv')