In [1]:
import json
import os
import pickle

import matplotlib as mpl
import matplotlib.image as mpimg
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import skimage
import skimage.io
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from mpl_toolkits.mplot3d import Axes3D
from skimage import data
from skimage.color import label2rgb
from skimage.filters import threshold_otsu
from skimage.measure import label, regionprops
from skimage.morphology import closing, square
from skimage.segmentation import clear_border
from skimage.transform import resize
from skimage.transform import rotate

### Loading dataset and extracting label information

In [2]:
# Parse the training samples. The context manager closes the file handle as soon as
# parsing is done instead of leaving it open until garbage collection.
with open('data/processed/train.json') as train_file:
    train = json.load(train_file)
print('Number of data samples: ',len(train))

Number of data samples:  1604


In [None]:
def create_data(train, plot=True, seed=None):
    '''
    Build the per-band feature matrices, the labels and the incidence angles from the raw samples.

    This function does the following:
            1. Flattens the band 1 and band 2 pixel values of every sample into one row each.
               No normalization is applied here.
            2. Optionally plots the incidence angle of every sample whose angle is known,
               coloured by label (green: ship, red: iceberg), saves the figure to
               "inc_angle_labels.png" and shows it.
            3. Replaces every 'na' incidence angle by the known incidence angle of a randomly
               chosen ship (label 0) sample. The original author notes that all 'na' samples
               of this dataset were checked to be ships; the code itself does not verify that.

    Row i of every output describes train[i]. The angles and labels are filled in place rather
    than re-stacked as "known angles first, 'na' angles last", because re-stacking would detach
    them from the rows of the feature matrices whenever an 'na' sample is not at the end.

    Input:
            1. train - The JSON dataset: a sequence of dicts with the keys 'band_1' and 'band_2'
               (75*75 pixel values each), 'inc_angle' (a number or the string 'na') and
               'is_iceberg'.
            2. plot - If True (default), draw, save and show the angle-vs-label figure.
            3. seed - Optional seed for the random choice of donor angles. With the default
               None the global numpy random state is used.

    Output:
            1. X_band1 - Band 1 pixel values (samples in rows, predictors in columns)
            2. X_band2 - Band 2 pixel values (samples in rows, predictors in columns)
            3. IS_ICEBERG - Labels for all images including 'na' incidence angles (Column Vector)
            4. INC_ANGLES - Incidence angles for all images (Column Vector)

    Raises:
            ValueError - if an angle is 'na' but no ship sample with a known angle exists.
    '''

    n_samples = len(train)
    n_pixels = 75*75

    X_band1 = np.zeros((n_samples, n_pixels))
    X_band2 = np.zeros((n_samples, n_pixels))
    for row, samp in enumerate(train):
        X_band1[row, :] = np.array(samp['band_1']).reshape(n_pixels)
        X_band2[row, :] = np.array(samp['band_2']).reshape(n_pixels)

    raw_angles = [samp['inc_angle'] for samp in train]
    IS_ICEBERG = np.array([samp['is_iceberg'] for samp in train]).reshape(n_samples, 1)

    # Test every raw value on its own. Packing numbers and 'na' into one numpy array would
    # coerce everything to strings (or to floats when no 'na' is present at all).
    is_na = np.array([isinstance(a, str) and a == 'na' for a in raw_angles], dtype=bool)
    known = ~is_na

    INC_ANGLES = np.zeros((n_samples, 1))
    INC_ANGLES[known, 0] = [float(a) for a, na in zip(raw_angles, is_na) if not na]

    #Plotting angles vs labels (only the samples whose angle is known)
    if plot and known.any():
        ang = INC_ANGLES[known, 0]
        lab = IS_ICEBERG[known, 0]
        sample_no = np.arange(len(ang))
        ship = lab == 0

        plt.figure(figsize=(5,20))
        # One scatter call per class instead of one call per point
        plt.scatter(sample_no[ship], ang[ship], marker='.', color='green')
        plt.scatter(sample_no[~ship], ang[~ship], marker='.', color='red')

        plt.ylim(ymin=30,ymax=max(ang))
        plt.title('Variation of labels with incidence angles\n Green: Ship\n Red: Iceberg')
        plt.xlabel('Sample number')
        plt.ylabel('Incidence angle in degrees')
        plt.savefig("inc_angle_labels.png")
        plt.show()

    #Filling NA values with the known angle of a randomly chosen ship sample
    n_missing = int(is_na.sum())
    if n_missing:
        ships_angles = INC_ANGLES[known & (IS_ICEBERG[:, 0] == 0), 0]
        if ships_angles.size == 0:
            raise ValueError("cannot fill 'na' incidence angles: no ship sample has a known angle")

        rng = np.random if seed is None else np.random.RandomState(seed)
        donors = rng.randint(len(ships_angles), size=n_missing)
        INC_ANGLES[is_na, 0] = ships_angles[donors]

    return X_band1,X_band2,IS_ICEBERG,INC_ANGLES

# Main: build the feature matrices, labels and incidence angles from the loaded samples,
# then report the shape of each returned array on its own line.
X_band1, X_band2, is_iceberg, inc_angle = create_data(train)

print('Shapes: \n')
print(*(arr.shape for arr in (X_band1, X_band2, is_iceberg, inc_angle)), sep='\n')

### Data augmentation

In [23]:
def create_augment_data(X_band1,X_band2, is_iceberg, inc_angle, seed=None):
    '''
    Double the dataset by appending one randomly rotated copy of every image.

    Every row is viewed as a square image, rotated counter-clockwise by 90, 180 or 270 degrees
    (chosen at random, the same rotation for both bands of a sample) and flattened again.
    The rotation uses np.rot90, which only permutes pixels, so no interpolation is involved.

    Row n + i of every output is the rotated copy of row i (n = number of input samples).
    Note that a random row-wise split of the outputs can therefore place an image and its
    rotated copy in different subsets.

    Input:
        Output of the function create_data -
            1. X_band1 - Band 1 pixel values (samples in rows, predictors in columns)
            2. X_band2 - Band 2 pixel values (samples in rows, predictors in columns)
            3. is_iceberg - Labels for all images including 'na' incidence angles (Column Vector)
            4. inc_angles - Incidence angles for all images (Column Vector)
        Additionally -
            5. seed - Optional seed for the random choice of rotations. With the default None
               the global numpy random state is used.
    Output:
            1. X_band1_final - Band 1 Concatenated original and augmented data (Samples in rows,
               predictors in columns)
            2. X_band2_final -  Band 2 Concatenated original and augmented data (Samples in rows,
               predictors in columns)
            3. IS_ICEBERG - Labels concatenated original and augmented data(Column Vector)
            4. INC_ANGLES - Incidence angles original and augmented data (Column Vector)
    Raises:
            ValueError - if the two bands differ in shape or the rows are not square images.
    '''

    if X_band1.shape != X_band2.shape:
        raise ValueError('X_band1 and X_band2 must have the same shape')

    # Size everything from the arguments, not from a notebook-level variable.
    n_samples, n_pixels = X_band1.shape
    side = int(round(np.sqrt(n_pixels)))
    if side*side != n_pixels:
        raise ValueError('each row must hold a flattened square image')

    rng = np.random if seed is None else np.random.RandomState(seed)

    X_band1_aug = np.zeros((n_samples, n_pixels))
    X_band2_aug = np.zeros((n_samples, n_pixels))

    for row in range(n_samples):
        #Choose a random rotation from 90,180,270 degrees, i.e. 1, 2 or 3 quarter turns
        quarter_turns = rng.randint(0,3) + 1

        img1_rot = np.rot90(X_band1[row].reshape(side,side), k=quarter_turns)
        img2_rot = np.rot90(X_band2[row].reshape(side,side), k=quarter_turns)

        X_band1_aug[row,:] = img1_rot.reshape(n_pixels)
        X_band2_aug[row,:] = img2_rot.reshape(n_pixels)

    #Band 1 and band 2 concatenation
    X_band1_final = np.vstack([X_band1,X_band1_aug])
    X_band2_final = np.vstack([X_band2,X_band2_aug])

    #Output labels concatenation (a rotated copy keeps the label of its source image)
    IS_ICEBERG = np.vstack([is_iceberg, is_iceberg])

    #Incidence angles concatenations (a rotated copy keeps the angle of its source image)
    INC_ANGLES = np.vstack([inc_angle,inc_angle])

    return X_band1_final, X_band2_final, IS_ICEBERG, INC_ANGLES


# Main: append one rotated copy of every sample, then report the shape of each
# returned array on its own line.
Xband1, Xband2, Y, A = create_augment_data(X_band1, X_band2, is_iceberg, inc_angle)

print('Shapes: \n')
print(*(arr.shape for arr in (Xband1, Xband2, Y, A)), sep='\n')

Shapes: 

(3208, 5625)
(3208, 5625)
(3208, 1)
(3208, 1)


### Splitting data into training and test set

In [24]:
def train_test_split(Xband1, Xband2, Y, A, test, seed=None):
    '''
    Randomly split the rows of the four arrays into a training and a test part.

    The same row permutation is applied to all four arrays, so rows stay aligned.
    The training part holds floor((1 - test) * n) rows and the test part the rest.

    Input:
            1. Xband1 - Band 1 Original Feature Matrix (Final Augmented)
            2. Xband2 - Band 2 Original Feature Matrix (Final Augmented)
            3. Y - Output Label Vector
            4. A - Incidence angles
            5. test - Fraction of the dataset used for testing (between 0 and 1)
            6. seed - Optional seed for the shuffle. With the default None the global
               numpy random state is used.

    Output:
            1. X_band1_tr - Band 1 training set
            2. X_band2_tr - Band 2 training set
            3. Y_tr - Output label training set
            4. A_tr - Inc angles training
            5. X_band1_te - Band 1 testing set
            6. X_band2_te - Band 2 testing set
            7. Y_te - Output label test set
            8. A_te - Inc angles testing set

    Raises:
            ValueError - if test is outside [0, 1] or the arrays differ in their number of rows.
    '''

    n_samples = Xband1.shape[0]
    for name, arr in (('Xband2', Xband2), ('Y', Y), ('A', A)):
        if arr.shape[0] != n_samples:
            raise ValueError('%s has %d rows, expected %d' % (name, arr.shape[0], n_samples))
    if not 0 <= test <= 1:
        raise ValueError('test must be between 0 and 1')

    rng = np.random if seed is None else np.random.RandomState(seed)
    rand = rng.permutation(n_samples)

    # (1 - test) * n can land a hair below an integer (e.g. (1 - 0.9) * 10 = 0.9999999999999998),
    # which a plain int() would truncate to one row too few. The small tolerance absorbs that.
    split_ind = int(np.floor((1 - test)*n_samples + 1e-9))
    split_ind = min(max(split_ind, 0), n_samples)

    tr_rows = rand[:split_ind]
    te_rows = rand[split_ind:]

    X_band1_tr = Xband1[tr_rows]
    X_band2_tr = Xband2[tr_rows]
    Y_tr = Y[tr_rows]
    A_tr = A[tr_rows]

    X_band1_te = Xband1[te_rows]
    X_band2_te = Xband2[te_rows]
    Y_te = Y[te_rows]
    A_te = A[te_rows]

    return(X_band1_tr,X_band2_tr,Y_tr,A_tr, X_band1_te,X_band2_te,Y_te, A_te)

# Main: hold out 10% of the (augmented) rows for testing.
# NOTE(review): the split runs on the augmented arrays, so an image and its rotated copy
# can end up on different sides of the split - confirm this is intended.
(X_band1_tr, X_band2_tr, Y_tr, A_tr,
 X_band1_te, X_band2_te, Y_te, A_te) = train_test_split(Xband1, Xband2, Y, A, test=0.1)

# Report the shape of every split array (training arrays first, then test arrays).
print('Shapes: ')
print(*(arr.shape for arr in (X_band1_tr, X_band2_tr, Y_tr, A_tr,
                              X_band1_te, X_band2_te, Y_te, A_te)), sep='\n')

Shapes: 
(2887, 5625)
(2887, 5625)
(2887, 1)
(2887, 1)
(321, 5625)
(321, 5625)
(321, 1)
(321, 1)


In [25]:
#Saving
# np.save does not create missing folders, so make sure the output folder exists first;
# otherwise a fresh checkout fails here after all the preceding work.
SPLIT_DIR = 'aug_data_split'
os.makedirs(SPLIT_DIR, exist_ok=True)

# File stem -> array; every array is written to aug_data_split/<stem>.npy
split_arrays = {
    'Xb1_tr': X_band1_tr,
    'Xb2_tr': X_band2_tr,
    'Y_tr': Y_tr,
    'A_tr': A_tr,
    'Xb1_te': X_band1_te,
    'Xb2_te': X_band2_te,
    'Y_te': Y_te,
    'A_te': A_te,
}
for split_stem, split_array in split_arrays.items():
    np.save(os.path.join(SPLIT_DIR, split_stem + '.npy'), split_array)