# Building a KNN model for Kaggle Data Science Bowl 2018 - Quentin

Load training set

In [1]:
%%time
# 02/12/18
import cv2
import numpy as np
import skimage
import os

# Number of training images to load. Every pixel of every loaded image becomes
# one training row, so this directly controls the size of X_train / y_train.
train_sample_num = 10

# import training set
# The context manager closes the file handle as soon as the lines are read.
with open('../input/stage1_train_labels.csv', 'r') as labels_file:
    training_set = labels_file.readlines()

# split the imageID and encodedPixels in the labels
# Strip only the line terminator (a blind line[:-1] would eat a real character
# when the last line has no trailing newline) and skip blank lines so the
# result stays a regular 2-D array.
training_set_split = np.array(
    [line.rstrip("\r\n").split(",") for line in training_set if line.strip()]
)

# find all unique imageIDs (670 images); row 0 is the CSV header
unique_imageIDs = np.unique(training_set_split[1:, 0])

# Per-image pieces are collected in lists and stacked once after the loop;
# stacking inside the loop re-copies everything accumulated so far each time.
X_train_parts = []
y_train_parts = []

# load training data X_train, y_train
# Slicing (rather than indexing by range) tolerates train_sample_num being
# larger than the number of available images.
for imageID in unique_imageIDs[:train_sample_num]:
    image_path = "../input/stage1_train/" + imageID + "/images/" + imageID + ".png"

    # opencv uses BGR color format
    im_3 = cv2.imread(image_path)
    if im_3 is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        raise IOError("could not read training image: " + image_path)

    # Image arrays are indexed (row, column): shape[0] is the height.
    height, width = im_3.shape[:2]

    # convert to grayscale and find intensity
    im_3_gray = cv2.cvtColor(im_3, cv2.COLOR_BGR2GRAY)

    # create feature-based input data X. Four features are: blue, green, red,
    # intensity. Each channel is flattened column by column ('F') so that row k
    # of X_train_new corresponds to label index k below.
    X_train_new = np.column_stack((
        im_3[:, :, 0].flatten('F'),
        im_3[:, :, 1].flatten('F'),
        im_3[:, :, 2].flatten('F'),
        im_3_gray.flatten('F'),
    ))
    X_train_parts.append(X_train_new)

    # Labels: 0 = background, 255 = nucleus, one entry per pixel.
    y_train_new = np.zeros((height * width,))

    # All encoded masks belonging to this image, joined into one token stream
    # of alternating (start, run-length) values.
    encoded_runs = training_set_split[training_set_split[:, 0] == imageID, 1]
    rle_values = np.array(
        [int(token) for token in " ".join(encoded_runs).split()], dtype=int
    )
    # reshape(-1, 2) pairs the values up without a division whose result type
    # depends on the Python version; an odd count still raises ValueError.
    rle_pairs = rle_values.reshape(-1, 2)

    # project the masks on y
    # Run starts are 1-based, hence the -1 to get Python indices. This assumes
    # the encoding numbers pixels column by column, matching flatten('F').
    for start, run_length in rle_pairs:
        y_train_new[start - 1:start - 1 + run_length] = 255

    y_train_parts.append(y_train_new)

# Stack once. The leading empty int arrays keep the dtypes (and the empty
# shapes when no image was loaded) the same as incremental stacking gave.
X_train = np.vstack([np.empty((0, 4), int)] + X_train_parts)
y_train = np.concatenate([np.empty((0), int)] + y_train_parts)

# Every feature row must have exactly one label.
assert X_train.shape[0] == y_train.shape[0]




    

    


CPU times: user 872 ms, sys: 166 ms, total: 1.04 s
Wall time: 1.08 s


Training

In [None]:
%%time
# Train a k-nearest-neighbours classifier on the per-pixel training data built
# in the loading cell: X_train holds one row per pixel with four features
# (blue, green, red, intensity) and y_train holds the matching 0 / 255 label.
# The classifier is constructed with no arguments, i.e. library defaults.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
# The fitted `knn` object is used by the prediction cell further down.
knn.fit(X_train, y_train)


CPU times: user 1min 10s, sys: 966 ms, total: 1min 10s
Wall time: 1min 14s


Predicting

In [None]:
%%time

# Project-local helper (script.py): converts an array of pixel indices into a
# run-length-encoded string. Imported once here instead of on every loop
# iteration, so a missing module fails before the output file is created.
from script import PixelsToRLenc

# Maximum number of test images to predict.
test_sample_num = 65

# Output file name records how many training / test images were requested.
outfname = str(train_sample_num) + 'train' + str(test_sample_num) + 'test.txt'

# load test set
# test set directory
test_dir = "../input/stage1_test/"
# find all imageIDs in test set (one sub-directory per image).
# os.listdir returns entries in arbitrary order; sorting makes both the
# selected subset and the order of the output rows reproducible.
imageIDs_test = sorted(
    name for name in os.listdir(test_dir) if os.path.isdir(test_dir + name)
)

# The context manager guarantees the file is closed even if prediction fails
# part-way through.
with open(outfname, 'w') as outf:
    # write
    # NOTE(review): rows are tab-separated; confirm whether the consumer of
    # this file expects comma-separated values instead.
    file_header = ('ImageId' + '\t' + 'EncodedPixels' + '\n')
    outf.write(file_header)
    outf.flush()

    # Slicing tolerates test_sample_num exceeding the number of test images.
    for imageID in imageIDs_test[:test_sample_num]:
        image_path = "../input/stage1_test/" + imageID + "/images/" + imageID + ".png"

        # opencv uses BGR color format
        im_3 = cv2.imread(image_path)
        if im_3 is None:
            # cv2.imread signals an unreadable/missing file by returning None.
            raise IOError("could not read test image: " + image_path)

        # convert to grayscale and find intensity
        im_3_gray = cv2.cvtColor(im_3, cv2.COLOR_BGR2GRAY)

        # create feature-based input data X. Four features are: blue, green,
        # red, intensity. Channels are flattened column by column ('F'), the
        # same layout the training features use.
        X_test = np.column_stack((
            im_3[:, :, 0].flatten('F'),
            im_3[:, :, 1].flatten('F'),
            im_3[:, :, 2].flatten('F'),
            im_3_gray.flatten('F'),
        ))

        # predict one label (0 or 255) per pixel
        y_pred = knn.predict(X_test)
        # Indices of pixels predicted as "nuclei". The +1 converts from
        # Python's 0-based indices to the 1-based indices of the encoding.
        mask_pred = np.where(y_pred == 255)[0] + 1

        # convert pixel array to a run-length string
        mask_pred_rl = PixelsToRLenc(mask_pred)
        outf.write(imageID + '\t' + mask_pred_rl + '\n')
        # Flush per image so partial results are on disk during long runs.
        outf.flush()

# Parenthesised form prints the same text on Python 2 and Python 3.
print("mission complete")
    
