https://github.com/dhammack/DSB2017/blob/master/scoring_code/score_ensemble2_final.py

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.cluster import DBSCAN
from sklearn.linear_model import LogisticRegression
# from keras.models import load_model

# import tensorflow as tf
# from keras.backend.tensorflow_backend import set_session
# config = tf.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.3
# set_session(tf.Session(config=config))

In [2]:
def get_loc_features(locs, malig_scores, sizes):

#     print ("locs",locs, "\nmalig_scores: ", malig_scores, "\nsizes: ", sizes)
    normalized_locs = locs.astype('float32') / sizes.astype('float32')
#     print ("normalized_locs: ", normalized_locs)
    #location of the most malignant tumor
    loc_from_malig = normalized_locs[ np.argmax(malig_scores) ]

    dist_mat = np.zeros((locs.shape[0], locs.shape[0]))
    for i,loc_a in enumerate(locs):
        for j,loc_b in enumerate(locs):
            dist_mat[i,j] = np.mean(np.abs(loc_a - loc_b))
    print ("dist_mat shape: ", dist_mat.shape)		
    dbs = DBSCAN(eps=60, min_samples=2, metric='precomputed', leaf_size=2).fit(dist_mat)
    num_clusters = np.max(dbs.labels_) + 1
    num_noise = (dbs.labels_ == -1).sum()
    print ("num_clusters", num_clusters)
    print ("num_noise: ", num_noise)
    #new feature: sum of malig_scores but normalizing by cluster.
    cluster_avgs = []
    for clusternum in range(num_clusters):
        cluster_avgs.append( malig_scores[dbs.labels_ == clusternum].mean())
    print ("cluster_avgs",cluster_avgs)
    print ("dbs.labels_", dbs.labels_)
    #now get the -1's
    for i,(clusterix,malig) in enumerate(zip(dbs.labels_,malig_scores)):
        if clusterix == -1:
            cluster_avgs.append(malig)

    weighted_sum_malig = np.sum(cluster_avgs)
    weighted_mean_malig = np.mean(cluster_avgs)

    #size of biggest cluster
    sizes = np.bincount(dbs.labels_[dbs.labels_ > 0])
    if len(sizes) > 0:
        maxsize = np.max(sizes)
    else:
        maxsize = 1
    n_nodules = float(locs.shape[0])

    
    print ("loc_from_malig: ",loc_from_malig)
    print ("normalized_locs.std: ",normalized_locs.std(axis=0))
    print ("(num_clusters) / n_nodules: ", float(num_clusters) / n_nodules)
    print ("(num_noise) / n_nodules: ",float(num_noise) / n_nodules)
    print ("weighted_mean_malig",weighted_mean_malig )
    print ("float(maxsize) / n_nodules: ", float(maxsize) / n_nodules)
    return np.concatenate([loc_from_malig, normalized_locs.std(axis=0), [float(num_clusters) / n_nodules, float(num_noise) / n_nodules, weighted_mean_malig, float(maxsize) / n_nodules]])


Let's see an example. What location features do we obtain for the first patient? Does it make sense?


In [3]:
import numpy as np
import pandas as pd
import os
from sklearn.cluster import DBSCAN
PATH_VOXELS = "../../data/stage1_TOP_voxels/"
patient = [f.replace('vox_', '') for f in os.listdir(PATH_VOXELS) if 'vox_' in f][60]

# patient = '006b96310a37b36cccb2ab48d10b49a3.npy'
print (patient)
patient_vox = np.load(os.path.join(PATH_VOXELS, 'vox_' + patient)) #voxels[filter]
patient_locs = np.load(os.path.join(PATH_VOXELS, 'cents_' + patient))#locations[filter]
patient_sizes = np.load(os.path.join(PATH_VOXELS, 'shapes_' + patient))#sizes[filter]
patient_nodule_preds = np.load(os.path.join(PATH_VOXELS, 'preds_' + patient))
location_feats = get_loc_features(patient_locs, patient_nodule_preds, patient_sizes)


8c2f9e7025d6070e9f4ab8a65c8e9dbb.npy
dist_mat shape:  (50, 50)
num_clusters 1
num_noise:  0
cluster_avgs [0.69251013]
dbs.labels_ [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
loc_from_malig:  [ 0.22260274  0.14222223  0.33584905]
normalized_locs.std:  [ 0.21578209  0.21080655  0.16353342]
(num_clusters) / n_nodules:  0.02
(num_noise) / n_nodules:  0.0
weighted_mean_malig 0.69251
float(maxsize) / n_nodules:  0.02


In [4]:
def get_feature_matrix(PATH_MODEL,name ):

    PATH_VOXELS = "../../data/stage1_TOP_voxels/"
#     model_v24 = load_model(PATH_MODEL)

    files = [f.replace('vox_', '') for f in os.listdir(PATH_VOXELS) if 'vox_' in f][:200]
    all_features = []
    preds = []
    n_TTA = 10

    for i,patient in enumerate(files):
        patient_vox = np.load(os.path.join(PATH_VOXELS, 'vox_' + patient)) #voxels[filter]
        patient_locs = np.load(os.path.join(PATH_VOXELS, 'cents_' + patient))#locations[filter]
        patient_sizes = np.load(os.path.join(PATH_VOXELS, 'shapes_' + patient))#sizes[filter]
        patient_nodule_preds = np.load(os.path.join(PATH_VOXELS, 'preds_' + patient))

    #     Xmean = np.mean(patient_nodule_preds, axis=0)
        xmax = patient_nodule_preds.max()
        xsd = np.std(patient_nodule_preds, axis=0)
        location_feats = get_loc_features(patient_locs, patient_nodule_preds, patient_sizes)

    #         print ("\nMin_pred: ", pred.min(), ", Max_pred: ", pred.max(), "\nLen of predict", len(pred), ", Mean:",Xmean, ", Size: ", size[0])

    # 0_xmax, 1_xsd, loc_from_malig, normalized_locs.std(axis=0), [float(num_clusters) / n_nodules, float(num_noise) / n_nodules, weighted_mean_malig, float(maxsize) / n_nodules]
        all_features.append(np.hstack([xmax, xsd, location_feats]))
        X = np.stack(all_features)
    df = pd.DataFrame(data=X,index=files)
    df.to_csv(name + '.csv')
    df.to_csv('model_v24_feature_matrix.csv')

In [5]:
PATH_MODEL = "../../../katya/LungCancer/Katya/CNN_v2/model_and_weights/model_LUNA_v2_24.h5"
get_feature_matrix(PATH_MODEL, 'model_v24_feature_matrix')

dist_mat shape:  (50, 50)
num_clusters 1
num_noise:  0
cluster_avgs [0.71922308]
dbs.labels_ [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
loc_from_malig:  [ 0.44333333  0.50205761  0.88104087]
normalized_locs.std:  [ 0.1940074   0.22990318  0.21854457]
(num_clusters) / n_nodules:  0.02
(num_noise) / n_nodules:  0.0
weighted_mean_malig 0.719223
float(maxsize) / n_nodules:  0.02
dist_mat shape:  (50, 50)
num_clusters 1
num_noise:  0
cluster_avgs [0.71027237]
dbs.labels_ [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
loc_from_malig:  [ 0.33110368  0.14349777  0.44522968]
normalized_locs.std:  [ 0.2428243   0.20628147  0.19213621]
(num_clusters) / n_nodules:  0.02
(num_noise) / n_nodules:  0.0
weighted_mean_malig 0.710272
float(maxsize) / n_nodules:  0.02
dist_mat shape:  (50, 50)
num_clusters 1
num_noise:  0
cluster_avgs [0.69027132]
dbs.labels_ [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

# baseline: LOGISTIC REGRESSION

In [6]:
import numpy as np
import pandas as pd
import os

df_model_v24 = pd.read_csv('./model_v24_feature_matrix.csv')
df_model_v24 = df_model_v24.rename(\
                                   columns={'Unnamed: 0':'patient_id',\
                                            '0':'max',\
                                            '1':'std',\
                                            '2':'loc_from_malig_x',\
                                            '3':'loc_from_malig_y',\
                                            '4':'loc_from_malig_z',\
                                            '5':'norm_locs_x',\
                                            '6':'norm_locs_y',\
                                            '7':'norm_locs_z',\
                                            '8':'num_clusters_norm',\
                                            '9':'num_noise_norm',\
                                            '10':'weighted_mean_malig',\
                                            '11':'maxsize_norm'})
df_model_v24['id'] = df_model_v24['patient_id'].apply(lambda x: x.split('.')[0])

In [7]:
df_model_v24.head()

Unnamed: 0,patient_id,max,std,loc_from_malig_x,loc_from_malig_y,loc_from_malig_z,norm_locs_x,norm_locs_y,norm_locs_z,num_clusters_norm,num_noise_norm,weighted_mean_malig,maxsize_norm,id
0,c0f0eb84e70b19544943bed0ea6bd374.npy,0.92228,0.058397,0.443333,0.502058,0.881041,0.194007,0.229903,0.218545,0.02,0.0,0.719223,0.02,c0f0eb84e70b19544943bed0ea6bd374
1,9065f2b133129c5747d42db18a424749.npy,0.813629,0.046162,0.331104,0.143498,0.44523,0.242824,0.206281,0.192136,0.02,0.0,0.710272,0.02,9065f2b133129c5747d42db18a424749
2,07b1defcfae5873ee1f03c90255eb170.npy,0.847212,0.05095,0.667785,0.379167,0.251969,0.247585,0.274627,0.228109,0.02,0.0,0.690271,0.02,07b1defcfae5873ee1f03c90255eb170
3,624a34fa8fd36847724e749877343847.npy,0.828911,0.047976,0.217105,0.662835,0.555944,0.217827,0.233123,0.233237,0.02,0.0,0.693843,0.02,624a34fa8fd36847724e749877343847
4,eb8d5136918d6859ca3cc3abafe369ac.npy,0.939379,0.060694,0.39759,0.660156,0.78481,0.211209,0.19108,0.225201,0.02,0.0,0.714184,0.02,eb8d5136918d6859ca3cc3abafe369ac


https://github.com/dhammack/DSB2017/blob/271eb5f8cd88a51ca0a335a847cbda09e3edd028/scoring_code/create_preds_from_model_outputs.py

In [8]:
data_dir = '/home/lin/data/stage1/'
patients = os.listdir(data_dir)
labels_df = pd.read_csv('/home/lin/data/stage1_labels.csv')
print (labels_df.shape)
labels_df.head()

(1397, 2)


Unnamed: 0,id,cancer
0,0015ceb851d7251b8f399e39779d1e7d,1
1,0030a160d58723ff36d73f41b170ec21,0
2,003f41c78e6acfa92430a057ac0b306e,0
3,006b96310a37b36cccb2ab48d10b49a3,1
4,008464bb8521d09a42985dd8add3d0d2,1


In [9]:
df_ens1 = pd.merge(left= df_model_v24, right= labels_df, how='outer',on='id')
print (labels_df.shape, df_ens1.shape)
df_ens1 = df_ens1.dropna() 
print (df_ens1.shape)
df_ens1.head()

(1397, 2) (1423, 15)
(174, 15)


Unnamed: 0,patient_id,max,std,loc_from_malig_x,loc_from_malig_y,loc_from_malig_z,norm_locs_x,norm_locs_y,norm_locs_z,num_clusters_norm,num_noise_norm,weighted_mean_malig,maxsize_norm,id,cancer
0,c0f0eb84e70b19544943bed0ea6bd374.npy,0.92228,0.058397,0.443333,0.502058,0.881041,0.194007,0.229903,0.218545,0.02,0.0,0.719223,0.02,c0f0eb84e70b19544943bed0ea6bd374,1.0
3,624a34fa8fd36847724e749877343847.npy,0.828911,0.047976,0.217105,0.662835,0.555944,0.217827,0.233123,0.233237,0.02,0.0,0.693843,0.02,624a34fa8fd36847724e749877343847,1.0
4,eb8d5136918d6859ca3cc3abafe369ac.npy,0.939379,0.060694,0.39759,0.660156,0.78481,0.211209,0.19108,0.225201,0.02,0.0,0.714184,0.02,eb8d5136918d6859ca3cc3abafe369ac,0.0
5,afc15e047f3e127871d13e39cde7557d.npy,0.800611,0.036036,0.666667,0.855204,0.887719,0.225471,0.231574,0.183772,0.02,0.0,0.696288,0.02,afc15e047f3e127871d13e39cde7557d,0.0
6,28824d52b6425841bb263393c3211693.npy,0.947508,0.052005,0.298507,0.5,0.881481,0.233023,0.2184,0.236282,0.02,0.0,0.712125,0.02,28824d52b6425841bb263393c3211693,0.0


In [28]:
ens1_cols = df_ens1.drop(['id', 'cancer', 'patient_id'],1).columns
X = df_ens1.loc[:][ens1_cols].values
Y = df_ens1.loc[:]['cancer'].values
print (X.shape, Y.shape)

(174, 12) (174,)


In [29]:
n_samples = len(Y)
X_train = X[ :int(0.9 * n_samples)]
Y_train = Y[:int(.9 * n_samples)]
X_test = X[int(0.9 * n_samples):]
Y_test = Y[int(0.9 * n_samples):]
lr = LogisticRegression()
print('LogisticRegression score: %f'% lr.fit(X_train, Y_train).score(X_test, Y_test))

LogisticRegression score: 0.611111


In [15]:
print (np.sum(Y), Y.shape, 1-np.sum(Y)/Y.shape)

47.0 (174,) [ 0.72988506]


This score is wayyyy too low!!! By guessing always 0, we would get a better score.
What if we ony some of the columns?


In [27]:
cols = ['num_noise_norm'] #, 'loc_from_malig_x','loc_from_malig_y','loc_from_malig_z']
X = df_ens1.loc[:][cols].values
Y = df_ens1.loc[:]['cancer'].values
print (X.shape, Y.shape)

lr = LogisticRegression()

n_samples = len(Y)
X_train = X[ :int(0.9 * n_samples)]
Y_train = Y[:int(0.9 * n_samples)]
X_test = X[int(0.9 * n_samples):]
Y_test = Y[int(0.9 * n_samples):]
lr = LogisticRegression()
l = lr.fit(X_train, Y_train)
p = l.predict(X_test)
print(p.shape)
print('LogisticRegression score: ', )
l.score(X_test, Y_test)

(174, 1) (174,)
(18,)
LogisticRegression score: 


0.61111111111111116

I am getting exactly the same score?? What is wrong here?

In [31]:
print(X_train.shape, Y_train.shape)
l = lr.fit(X_train, Y_train)
p = l.predict(X_test)

(156, 12) (156,)


In [32]:
for i, t in zip(p, Y_test):
    print( i, t)

0.0 1.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 1.0
0.0 1.0
0.0 1.0
0.0 0.0
0.0 0.0
0.0 1.0
0.0 0.0
0.0 1.0
0.0 0.0
0.0 1.0
0.0 0.0
0.0 0.0
0.0 0.0


In [35]:
1-np.sum(Y_test)/Y_test.shape[0]

0.61111111111111116