**Hugo Queinnec - IMA205**
# Pap smear cells classification

# 0. Imports et chargement des données

In [49]:
import numpy as np
import pandas as pd
from skimage.io import imread

# for reading and displaying images
from skimage.io import imread
import matplotlib.pyplot as plt

# for creating validation set
from sklearn.model_selection import train_test_split

# methods
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

#dimension reduction
from sklearn.decomposition import PCA

#features
from skimage import measure

In [50]:
# to compute matthews_correlation_coefficient
from tensorflow.keras import backend as K

def matthews_correlation_coefficient(y_true, y_pred):
    """Matthews correlation coefficient built from Keras backend operations.

    Each confusion-matrix cell is obtained by multiplying the (possibly
    complemented) targets and predictions, clipping the product to [0, 1],
    rounding it and summing over all elements.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of predicted values, same shape as `y_true`.

    Returns:
        A scalar tensor: (tp*tn - fp*fn) / sqrt(product of marginals + epsilon).
    """
    def _count(product):
        # Clip to [0, 1], round to hard 0/1 values, then count them.
        return K.sum(K.round(K.clip(product, 0, 1)))

    neg_true = 1 - y_true
    neg_pred = 1 - y_pred

    true_pos = _count(y_true * y_pred)
    true_neg = _count(neg_true * neg_pred)
    false_pos = _count(neg_true * y_pred)
    false_neg = _count(y_true * neg_pred)

    numerator = true_pos * true_neg - false_pos * false_neg
    denominator = (true_pos + false_pos) * (true_pos + false_neg) * (true_neg + false_pos) * (true_neg + false_neg)
    # K.epsilon() keeps the square root strictly positive when a marginal is zero.
    return numerator / K.sqrt(denominator + K.epsilon())

In [51]:
# Load Train Data

#____ CHANGE WORKING DIRECTORY HERE________________________
# Prefix prepended to every data path used in this notebook.
Working_directory="./"
#__________________________________________________________


df = pd.read_csv(Working_directory+'metadataTrain.csv') # reading data
train_y = df['ABNORMAL'].values # 1 for abnormal cells and 0 for normal cells
class_names = ["normal","abnormal"]
N=train_y.shape[0]
# Labels are 0/1, so their sum is the number of abnormal samples.
print('Number of normal: {0}; Number of abnormal: {1}'.format((N-np.sum(train_y)), np.sum(train_y)))

Number of normal: 1302; Number of abnormal: 1619


# 1. Déterminer des features

In [52]:
# loading training images
# For every ID listed in the metadata, three files are read from Train/Train/:
# the cell image (<ID>.bmp) and its two segmentation files
# (<ID>_segCyt.bmp and <ID>_segNuc.bmp).
X_labels = df['ID'].values
# Pre-allocated lists, filled in the same order as X_labels / train_y.
train_img = [0]*N
train_imgSegCyt = [0]*N
train_imgSegNuc = [0]*N

i = 0
# NOTE(review): the loop variable `id` shadows the Python builtin of the same name.
for id in X_labels:
    # defining the image path
    image_path = Working_directory+'Train/Train/' + str(id) + '.bmp'
    image_pathSegCyt = Working_directory+'Train/Train/' + str(id) + '_segCyt.bmp'
    image_pathSegNuc = Working_directory+'Train/Train/' + str(id) + '_segNuc.bmp'

    img = imread(image_path)
    train_img[i]=img

    imgSegCyt = imread(image_pathSegCyt)
    train_imgSegCyt[i]=imgSegCyt

    imgSegNuc = imread(image_pathSegNuc)
    train_imgSegNuc[i]=imgSegNuc

    i+=1



In [53]:
def _asymmetryRatio(mask, flip):
    """Fraction of foreground pixels of `mask` that have no counterpart once the mask is mirrored by `flip`.

    The mask is converted to booleans (non-zero = foreground) so the result does
    not depend on the integer encoding of the mask. Returns NaN for an empty mask.
    """
    foreground = np.asarray(mask) != 0
    area = np.count_nonzero(foreground)
    if area == 0:
        return np.nan
    return np.count_nonzero(foreground & ~flip(foreground)) / area


def _shapeFeature(region, mask, feature):
    """Value of one entry of `features` for a region and the mask it was extracted from."""
    if feature == 'symmetry_lr':
        return _asymmetryRatio(mask, np.fliplr)
    if feature == 'symmetry_ud':
        return _asymmetryRatio(mask, np.flipud)
    if feature == 'perimeter_norm':
        return getattr(region, 'perimeter') / getattr(region, 'minor_axis_length')
    return getattr(region, feature)


def computeManualFeaturesNormalized(img, maskCyt, maskNuc, features, colorFeatures, featuresToNormalize, tupleFeatures):
    """Compute the hand-crafted feature vector of one cell image.

    Only the first labelled region of each mask is used. Layout of the
    returned row (offsets in columns):

    1. colour features of the first region of `maskCyt`, 3 values (one per
       image channel) per entry of `colorFeatures`;
    2. the same colour features for the first region of `maskNuc`;
    3. one nucleus/cytoplasm ratio per entry of `featuresToNormalize`;
    4. four ratios per entry of `tupleFeatures`, with (x1, y1) the cytoplasm
       value and (x2, y2) the nucleus value: x1/y1, x2/y2, x1/x2, y1/y2;
    5. one value per entry of `features` for the cytoplasm;
    6. one value per entry of `features` for the nucleus.

    `features` may contain regionprops attribute names and the special names
    'symmetry_lr', 'symmetry_ud' and 'perimeter_norm'.

    Any value that cannot be computed because a mask has no labelled region
    (or because the cytoplasm denominator of a ratio is zero) is set to NaN.

    Args:
        img: image array indexed as img[:, :, channel] with at least 3 channels.
        maskCyt: 2-D cytoplasm mask, zero meaning background.
        maskNuc: 2-D nucleus mask, zero meaning background.
        features, colorFeatures, featuresToNormalize, tupleFeatures: lists of
            feature names as described above.

    Returns:
        (row, errorCount): `row` is a (1, numberOfFeatures) float array and
        `errorCount` is the number of colour features that could not be
        computed (one per colour feature and per empty mask).
    """
    errorCount = 0

    numberOfColorDescriptors = 6  # 3 channels for each of the 2 masks
    nColor = len(colorFeatures)
    nRatio = len(featuresToNormalize)
    nTuple = len(tupleFeatures)
    nShape = len(features)
    numberOfFeatures = 2*nShape + numberOfColorDescriptors*nColor + nRatio + 4*nTuple

    train_img_features = np.zeros((1, numberOfFeatures))

    # One regionprops list per image channel, for each mask.
    labelsCyt = measure.label(np.round(maskCyt), background=0)
    regionsCyt = [measure.regionprops(labelsCyt, img[:, :, c]) for c in range(3)]

    labelsNuc = measure.label(np.round(maskNuc), background=0)
    regionsNuc = [measure.regionprops(labelsNuc, img[:, :, c]) for c in range(3)]

    hasCyt = len(regionsCyt[0]) != 0
    hasNuc = len(regionsNuc[0]) != 0

    # 1-2. Colour features: cytoplasm block first, nucleus block second.
    for offset, regions, hasRegion in ((0, regionsCyt, hasCyt), (3*nColor, regionsNuc, hasNuc)):
        for j, feature in enumerate(colorFeatures):
            for c in range(3):
                train_img_features[0, offset + 3*j + c] = getattr(regions[c][0], feature) if hasRegion else np.nan
            if not hasRegion:
                errorCount += 1

    # 3. Nucleus / cytoplasm ratios.
    ratioOffset = numberOfColorDescriptors*nColor
    for j, feature in enumerate(featuresToNormalize):
        value = np.nan
        if hasCyt and hasNuc:
            cytValue = getattr(regionsCyt[0][0], feature)
            if cytValue != 0:
                value = getattr(regionsNuc[0][0], feature) / cytValue
        train_img_features[0, ratioOffset + j] = value

    # 4. Ratios derived from 2-tuple properties (e.g. 'centroid').
    tupleOffset = ratioOffset + nRatio
    for j, feature in enumerate(tupleFeatures):
        base = tupleOffset + 4*j
        if hasCyt and hasNuc:
            x1, y1 = getattr(regionsCyt[0][0], feature)
            x2, y2 = getattr(regionsNuc[0][0], feature)
            train_img_features[0, base:base + 4] = (x1/y1, x2/y2, x1/x2, y1/y2)
        else:
            train_img_features[0, base:base + 4] = np.nan

    # 5-6. Shape features. Each block uses its OWN mask: the nucleus
    # 'symmetry_ud' used to be computed from the cytoplasm mask, and the
    # cytoplasm special features used to crash on an empty mask.
    shapeOffset = tupleOffset + 4*nTuple
    for offset, regions, hasRegion, mask in ((shapeOffset, regionsCyt, hasCyt, maskCyt),
                                             (shapeOffset + nShape, regionsNuc, hasNuc, maskNuc)):
        for j, feature in enumerate(features):
            train_img_features[0, offset + j] = _shapeFeature(regions[0][0], mask, feature) if hasRegion else np.nan

    return train_img_features, errorCount


# 2. Prédictions

In [54]:
def fitLinearRegression(train_img_features,train_y):
    """Fit a LinearRegression model on the given features and targets.

    Args:
        train_img_features: 2-D array of training features.
        train_y: training targets, one per row of `train_img_features`.

    Returns:
        The fitted LinearRegression instance.
    """
    regr = LinearRegression()
    regr.fit(train_img_features, train_y)
    return regr

def fitLDA(train_img_features,train_y):
    """Fit a LinearDiscriminantAnalysis classifier on the given features and labels.

    Args:
        train_img_features: 2-D array of training features.
        train_y: training labels, one per row of `train_img_features`.

    Returns:
        The fitted LinearDiscriminantAnalysis instance.
    """
    clf_LDA = LinearDiscriminantAnalysis()
    clf_LDA.fit(train_img_features, train_y)
    return clf_LDA

def fitQDA(train_img_features,train_y):
    """Fit a QuadraticDiscriminantAnalysis classifier on the given features and labels.

    Args:
        train_img_features: 2-D array of training features.
        train_y: training labels, one per row of `train_img_features`.

    Returns:
        The fitted QuadraticDiscriminantAnalysis instance.
    """
    clf_QDA = QuadraticDiscriminantAnalysis()
    clf_QDA.fit(train_img_features, train_y)
    return clf_QDA

def fitBayes(train_img_features,train_y):
    """Fit a Gaussian naive Bayes classifier on the given features and labels.

    Args:
        train_img_features: 2-D array of training features.
        train_y: training labels, one per row of `train_img_features`.

    Returns:
        The fitted GaussianNB instance.
    """
    clf_GNB = GaussianNB()
    clf_GNB.fit(train_img_features, train_y)
    return clf_GNB

def fitKNN(train_img_features,train_y):
    """Fit a 5-nearest-neighbours classifier on the given features and labels.

    Args:
        train_img_features: 2-D array of training features.
        train_y: training labels, one per row of `train_img_features`.

    Returns:
        The fitted KNeighborsClassifier instance.
    """
    # The neighbour count is passed to the constructor instead of being
    # patched onto the estimator after construction.
    clf_KNN = KNeighborsClassifier(n_neighbors=5)
    clf_KNN.fit(train_img_features, train_y)
    return clf_KNN

In [55]:
# Fonction de prédiction adaptée aux splits du training set

def predictForTestSplit(one_split_test_img, one_split_test_imgCyt, one_split_test_imgNuc, classifier):
    """Predict the label of a single sample with an already fitted classifier.

    The sample goes through the same steps as the training data: feature
    extraction, replacement of NaN features by `meanOfTrainingFeatures`, then
    scaling with the notebook-level `scaler`. The feature-name lists, the
    means and the scaler are read from the notebook's global namespace.

    Args:
        one_split_test_img: image of the sample.
        one_split_test_imgCyt: cytoplasm mask of the sample.
        one_split_test_imgNuc: nucleus mask of the sample.
        classifier: fitted estimator exposing `predict`.

    Returns:
        The first (and only) element of the classifier's prediction.
    """
    sample_features, _ = computeManualFeaturesNormalized(one_split_test_img, one_split_test_imgCyt, one_split_test_imgNuc, features, colorFeatures, featuresToNormalize, tupleFeatures)

    # Impute every missing feature with the corresponding training mean.
    row = sample_features[0]
    for idx in np.flatnonzero(np.isnan(row)):
        row[idx] = meanOfTrainingFeatures[idx]

    scaled = scaler.transform(sample_features)
    return classifier.predict(scaled)[0]

In [56]:
def predictEntireTestSplit(split_test_img, split_test_imgSegCyt, split_test_imgSegNuc, classifier):
    """Predict a whole set of samples in one call to `classifier.predict`.

    Each sample is featurised, its NaN features are replaced by
    `meanOfTrainingFeatures`, and the row is scaled with the notebook-level
    `scaler`. When `boolDimensionReduction` is set, the assembled matrix is
    projected with the notebook-level `pca` before prediction.

    Args:
        split_test_img: sequence of images.
        split_test_imgSegCyt: sequence of cytoplasm masks, aligned with the images.
        split_test_imgSegNuc: sequence of nucleus masks, aligned with the images.
        classifier: fitted estimator exposing `predict`.

    Returns:
        Whatever `classifier.predict` returns for the assembled feature matrix.
    """
    sample_count = len(split_test_img)
    allFeatures = np.zeros((sample_count, numberOfFeatures))

    for row_idx in range(sample_count):
        raw, _ = computeManualFeaturesNormalized(split_test_img[row_idx], split_test_imgSegCyt[row_idx], split_test_imgSegNuc[row_idx], features, colorFeatures, featuresToNormalize, tupleFeatures)

        # Impute every missing feature with the corresponding training mean.
        for col_idx in np.flatnonzero(np.isnan(raw[0])):
            raw[0][col_idx] = meanOfTrainingFeatures[col_idx]

        allFeatures[row_idx, :] = scaler.transform(raw)[0]

    model_input = pca.transform(allFeatures) if boolDimensionReduction else allFeatures
    return classifier.predict(model_input)
    

In [57]:
def class_int_round(z, n_class): #for Linear Regression
    """Turn real-valued regression outputs into class indices in [0, n_class - 1].

    Values are rounded to the nearest integer; inputs below 0 map to class 0
    and inputs above `n_class - 1` map to the last class.

    Args:
        z: a scalar or a numpy array of real values.
        n_class: number of classes.

    Returns:
        An integer array shaped like `z` when `z` is a numpy array, otherwise
        a single integer.
    """
    last_class = n_class - 1
    rounded = np.round(z).astype(int)

    if not isinstance(z, np.ndarray):
        # Scalar input: clamp the rounded value by hand.
        if rounded < 0:
            return 0
        if rounded > last_class:
            return last_class
        return rounded

    # Array input: the masks are built from the raw values, not the rounded ones.
    rounded[z < 0] = 0
    rounded[z > last_class] = last_class
    return rounded

# Export CSV

In [58]:
# create submission csv

def submissionCSV(fileName, classifier):
    """Predict the test images and write the predictions to a CSV file.

    The test IDs are read from 'SampleSubmission.csv'; for each ID the image
    and its two segmentation files are loaded from 'Test/Test/'. The output
    file has the integer columns 'ID' and 'ABNORMAL' and is written under
    `Working_directory`.

    Args:
        fileName: name of the CSV file to create.
        classifier: fitted estimator handed to `predictEntireTestSplit`.
    """
    sample = pd.read_csv(Working_directory+'SampleSubmission.csv') # reading data
    X_test_values = sample['ID'].values
    N = X_test_values.shape[0]

    test_img, test_imgSegCyt, test_imgSegNuc = [0]*N, [0]*N, [0]*N

    test_dir = Working_directory+'Test/Test/'
    for position, image_id in enumerate(X_test_values):
        stem = test_dir + str(image_id)
        test_img[position] = imread(stem + '.bmp')
        test_imgSegCyt[position] = imread(stem + '_segCyt.bmp')
        test_imgSegNuc[position] = imread(stem + '_segNuc.bmp')

    y_prediction = predictEntireTestSplit(test_img, test_imgSegCyt, test_imgSegNuc, classifier)

    # Integer table: first column the IDs, second column the predicted labels.
    two_columns = np.zeros((N, 2), dtype=int)
    two_columns[:,0] = X_test_values
    for row in range(N):
        two_columns[row,1] = y_prediction[row]

    computedValues = pd.DataFrame(two_columns, columns=['ID','ABNORMAL'])
    computedValues.to_csv(Working_directory+fileName, index=False)


In [59]:
# create submission csv

def submissionCSV_MLP(fileName, classifier):
    """Predict the test images with a model that outputs per-class scores and write a CSV.

    Same procedure as the plain CSV export, except that the prediction
    returned by `predictEntireTestSplit` is reduced to a class index with
    an argmax over axis 1 before being written.

    Args:
        fileName: name of the CSV file to create under `Working_directory`.
        classifier: fitted model handed to `predictEntireTestSplit`.
    """
    sample = pd.read_csv(Working_directory+'SampleSubmission.csv') # reading data
    X_test_values = sample['ID'].values
    N = X_test_values.shape[0]

    test_img, test_imgSegCyt, test_imgSegNuc = [0]*N, [0]*N, [0]*N

    test_dir = Working_directory+'Test/Test/'
    for position, image_id in enumerate(X_test_values):
        stem = test_dir + str(image_id)
        test_img[position] = imread(stem + '.bmp')
        test_imgSegCyt[position] = imread(stem + '_segCyt.bmp')
        test_imgSegNuc[position] = imread(stem + '_segNuc.bmp')

    class_scores = predictEntireTestSplit(test_img, test_imgSegCyt, test_imgSegNuc, classifier)
    y_prediction = np.argmax(class_scores, axis=1)

    # Integer table: first column the IDs, second column the predicted labels.
    two_columns = np.zeros((N, 2), dtype=int)
    two_columns[:,0] = X_test_values
    for row in range(N):
        two_columns[row,1] = y_prediction[row]

    computedValues = pd.DataFrame(two_columns, columns=['ID','ABNORMAL'])
    computedValues.to_csv(Working_directory+fileName, index=False)


# 3. Tests
On transforme le train initial en deux ensembles, train et test, pour obtenir plus facilement des scores sans devoir passer par Kaggle.

In [60]:
#____to modify______________________
boolSplitAndTestLocally = True #True : the training data is split and the following cells print Test and Train accuracy | False : the following cells print Train accuracy only, and the export cells write a CSV with the predictions of a chosen estimator on the Test images
boolDimensionReduction = False #True : fit a PCA on the training features; it is applied before the non-linear SVM and boosting models and inside predictEntireTestSplit
#___________________________________


In [61]:
# Split the initial training set into a local training set and a local test set

if boolSplitAndTestLocally:
    # Stratified 75/25 split with a fixed seed; images, both masks and labels are split consistently.
    split_train_img, split_test_img, split_train_imgSegCyt, split_test_imgSegCyt, split_train_imgSegNuc, split_test_imgSegNuc, split_train_y, split_test_y = train_test_split(train_img, train_imgSegCyt, train_imgSegNuc, train_y, random_state=42, test_size=0.25, stratify=train_y)
else:
    # No local test set: the split_test_* names are NOT defined in this mode.
    split_train_img, split_train_imgSegCyt, split_train_imgSegNuc, split_train_y = train_img, train_imgSegCyt, train_imgSegNuc, train_y

print("Training Set : "+str(np.sum(split_train_y))+" ABNORMAL, "+str(split_train_y.shape[0]-np.sum(split_train_y))+" NORMAL")
if boolSplitAndTestLocally:
    print("Testing Set : "+str(np.sum(split_test_y))+" ABNORMAL, "+str(split_test_y.shape[0]-np.sum(split_test_y))+" NORMAL")


Training Set : 1214 ABNORMAL, 976 NORMAL
Testing Set : 405 ABNORMAL, 326 NORMAL


## Calcul et pre-processing des features

In [62]:
# Number of samples in the (local) training set.
split_N = split_train_y.shape[0]

## LOT OF FEATURES
# colorFeatures = ['mean_intensity', 'max_intensity', 'min_intensity']
# featuresToNormalize = ['area', 'equivalent_diameter', 'perimeter', 'euler_number', 'convex_area', 'minor_axis_length', 'major_axis_length']
# tupleFeatures = ['centroid']
# features = ['solidity', 'eccentricity', 'extent','symmetry_lr', 'symmetry_ud']#, 'perimeter_norm']


## FEW BEST FEATURES
# These four lists are also read as globals by predictForTestSplit and predictEntireTestSplit.
colorFeatures = ['mean_intensity', 'max_intensity', 'min_intensity']
featuresToNormalize = ['area', 'equivalent_diameter', 'perimeter', 'euler_number']
tupleFeatures = ['centroid']
features = ['solidity']

# Same formula as inside computeManualFeaturesNormalized; the two must stay in sync.
numberOfFeatures = 2*len(features) + 6*len(colorFeatures) + len(featuresToNormalize) + 4*len(tupleFeatures)
# Raw feature matrix (may contain NaN), one row per training sample.
train_img_features_0 = np.zeros((split_N, numberOfFeatures))

errorCount = 0

for i in range(split_N):
    img = split_train_img[i]
    maskCyt = split_train_imgSegCyt[i]
    maskNuc = split_train_imgSegNuc[i]

    f, e = computeManualFeaturesNormalized(img, maskCyt, maskNuc, features, colorFeatures, featuresToNormalize, tupleFeatures)
    train_img_features_0[i] = f
    errorCount+=e

print("Errors in features: " + str(errorCount))


Errors in features: 36


In [63]:
def cleanNoneValues(train_img_features_0, verbose):
    """Replace missing (NaN) features by a per-column mean.

    The replacement value of each column is the mean over the rows that
    contain no NaN at all. If no such row exists, that mean would itself be
    NaN and nothing would be cleaned, so the column-wise mean over the
    non-NaN entries is used instead.

    Args:
        train_img_features_0: 2-D float array of features, NaN marking missing values.
        verbose: when true, print the column means before and after cleaning
            and the shapes involved.

    Returns:
        (train_img_features, m): a new array with NaN entries replaced (the
        input is not modified) and the vector of replacement values, one per column.
    """
    train_img_features_0 = np.asarray(train_img_features_0, dtype=float)
    missing = np.isnan(train_img_features_0)

    # replace missing values (None) of train_img_features
    if verbose: print("INITIALLY")
    if verbose: print(np.mean(train_img_features_0, axis=0))

    # compute mean of features
    if verbose: print("\nMEAN")
    featuresWithoutNone = train_img_features_0[~missing.any(axis=1)]
    if verbose: print("All features: "+str(train_img_features_0.shape)+" | Features without None: "+str(featuresWithoutNone.shape))
    if featuresWithoutNone.shape[0] > 0:
        m = np.mean(featuresWithoutNone, axis=0)
    else:
        # No complete row: fall back to the mean of the available values per column.
        m = np.nanmean(train_img_features_0, axis=0)
    if verbose: print(m)

    #replace missing values
    if verbose: print("\nFINALLY")
    # m (one value per column) is broadcast over the rows.
    train_img_features = np.where(missing, m, train_img_features_0)
    if verbose: print("Cleaned features: "+str(train_img_features.shape))
    if verbose: print(np.mean(train_img_features, axis=0))

    return train_img_features, m

train_img_features, meanOfTrainingFeatures = cleanNoneValues(train_img_features_0, True)

INITIALLY
[139.72334326 125.92519197 149.72425656 187.42009132 177.63607306
 196.12328767  84.18310502  74.04931507 102.16164384          nan
          nan          nan          nan          nan          nan
          nan          nan          nan          nan          nan
          nan          nan          nan          nan          nan
          nan   0.76432367          nan]

MEAN
All features: (2190, 28) | Features without None: (221, 28)
[141.65690285 132.71376483 155.08326071 186.81900452 176.21266968
 192.47963801  99.11312217  93.13574661 119.23529412  97.31950815
  85.12598488 122.20817908 140.47963801 127.61538462 158.76923077
  69.26696833  60.24886878  96.36199095   3.98145325   1.32395347
   0.80218177   0.74230769   1.01713392   1.02025948   0.96494403
   1.02092981   0.53421701   0.95132065]

FINALLY
Cleaned features: (2190, 28)
[139.72334326 125.92519197 149.72425656 187.42009132 177.63607306
 196.12328767  84.18310502  74.04931507 102.16164384  93.14887321
  77.9345329

In [64]:
# Scaler
# The fitted scaler is kept as a global: the prediction helpers reuse it on new samples.
scaler = StandardScaler()
scaler.fit(train_img_features)
# NOTE(review): this overwrites train_img_features with its scaled version, so
# re-running this cell alone re-fits the scaler on already scaled data.
train_img_features = scaler.transform(train_img_features)

# Sanity check: the printed column means should be close to 0 after scaling.
m1 = np.mean(train_img_features, axis=0)
print(m1)

[ 2.08055922e-16 -1.75221012e-15  7.21866440e-15  4.91235667e-17
 -2.34059347e-16 -3.15830568e-17  2.49977613e-16  2.48989059e-16
 -1.52738045e-16 -1.72791766e-15 -2.73138944e-15  2.89543250e-15
  6.01117329e-16  5.66099679e-16  1.10434556e-16 -1.33411800e-15
  3.36653587e-16 -3.15493451e-16  5.83183932e-17  6.24088554e-16
  2.16381802e-16 -1.70590823e-13 -7.76446386e-16 -5.69012113e-15
 -1.08695397e-14  3.03673880e-15 -1.04295061e-15 -4.12735295e-14]


## Méthodes Linéaires

In [65]:
# Linear Regression
# The regression output is real-valued, so it is converted to a class index
# with class_int_round before computing the accuracy.
regr = fitLinearRegression(train_img_features, split_train_y)

split_test_y_predicted = []

if boolSplitAndTestLocally:
    for i in range(split_test_y.shape[0]):
        f = computeManualFeaturesNormalized(split_test_img[i], split_test_imgSegCyt[i], split_test_imgSegNuc[i], features, colorFeatures, featuresToNormalize, tupleFeatures)[0]

        # The inner index is named j so it no longer reuses the sample index i.
        for j in range(len(f[0])): #cleaning None values
            if np.isnan(f[0][j]):
                f[0][j] = meanOfTrainingFeatures[j]

        f = scaler.transform(f) #scale
        # Keep the scalar label (not a 1-element array), as predictForTestSplit does.
        split_test_y_predicted.append(class_int_round(regr.predict(f), 2)[0])

    print("Linear Regression accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted)) + " | Training accuracy: "+ str(accuracy_score(split_train_y, class_int_round(regr.predict(train_img_features), 2))))
else:
    print("Linear Regression training accuracy: "+ str(accuracy_score(split_train_y, class_int_round(regr.predict(train_img_features), 2))))


# LDA
# Each of the three sections below fits one classifier on the scaled training
# features, then (in local-test mode) predicts the held-out samples one by one
# with predictForTestSplit and prints test and training accuracy.
clf_LDA = fitLDA(train_img_features, split_train_y)

split_test_y_predicted = []

if boolSplitAndTestLocally:
    for i in range(split_test_y.shape[0]):
        split_test_y_predicted.append(predictForTestSplit(split_test_img[i], split_test_imgSegCyt[i], split_test_imgSegNuc[i],clf_LDA))

    print("LDA accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf_LDA.predict(train_img_features))))
else:
    print("LDA training accuracy: "+ str(accuracy_score(split_train_y, clf_LDA.predict(train_img_features))))


# QDA
clf_QDA = fitQDA(train_img_features, split_train_y)

split_test_y_predicted = []

if boolSplitAndTestLocally:
    for i in range(split_test_y.shape[0]):
        split_test_y_predicted.append(predictForTestSplit(split_test_img[i], split_test_imgSegCyt[i], split_test_imgSegNuc[i],clf_QDA))

    print("QDA accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf_QDA.predict(train_img_features))))
else:
    print("QDA training accuracy: "+ str(accuracy_score(split_train_y, clf_QDA.predict(train_img_features))))


# Bayes
clf_Bayes = fitBayes(train_img_features, split_train_y)

split_test_y_predicted = []

if boolSplitAndTestLocally:
    for i in range(split_test_y.shape[0]):
        split_test_y_predicted.append(predictForTestSplit(split_test_img[i], split_test_imgSegCyt[i], split_test_imgSegNuc[i],clf_Bayes))

    print("Bayes accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf_Bayes.predict(train_img_features))))
else:
    print("Bayes training accuracy: "+ str(accuracy_score(split_train_y, clf_Bayes.predict(train_img_features))))


# KNN
# Fit the k-nearest-neighbours classifier and report its accuracy.
clf_KNN = fitKNN(train_img_features, split_train_y)

split_test_y_predicted = []

if boolSplitAndTestLocally:
    for i in range(split_test_y.shape[0]):
        split_test_y_predicted.append(predictForTestSplit(split_test_img[i], split_test_imgSegCyt[i], split_test_imgSegNuc[i],clf_KNN))

    print("KNN accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf_KNN.predict(train_img_features))))
else:
    print("KNN training accuracy: "+ str(accuracy_score(split_train_y, clf_KNN.predict(train_img_features))))


Linear Regression accuracy: 0.9233926128590971 | Training accuracy: 0.910958904109589
LDA accuracy: 0.9233926128590971 | Training accuracy: 0.910958904109589
QDA accuracy: 0.8057455540355677 | Training accuracy: 0.8141552511415525
Bayes accuracy: 0.7551299589603283 | Training accuracy: 0.7406392694063927
KNN accuracy: 0.9192886456908345 | Training accuracy: 0.9552511415525115


## PCA (avant SVM)

In [66]:
print("Number of features: " + str(numberOfFeatures)+"\n")
# Without dimension reduction the "reduced" features are simply the scaled features.
train_img_features_reduced = train_img_features

if boolDimensionReduction:
    print("Features size before PCA: "+str(train_img_features.shape))
    # The fitted `pca` is kept as a global: predictEntireTestSplit applies it to new samples.
    # NOTE(review): n_components=25 is hard-coded and must not exceed the number of features.
    pca = PCA(n_components=25, random_state=1)
    train_img_features_reduced=pca.fit_transform(train_img_features)
    print("Features size after PCA: "+str(train_img_features_reduced.shape))

Number of features: 28



## Non linear SVM

In [67]:
# Looking for the best hyperparameters C and Gamma
# 5-fold cross-validated grid search over C and gamma for an RBF-kernel SVM,
# scored with balanced accuracy; refit=True retrains the best setting on the
# whole training set.
print("Fitting Non-linear SVM to the training set")
cList = [1e-3,1e-2,1e-1,1,2,3,4,5,6,7,8,9,1e1]
gammaList = [0.0001, 0.0005, 0.001, 0.01, 0.05, 0.1, 0.15, 0.2, 0.3]
p_grid_nlsvm = {'C': cList,
                'gamma': gammaList}
NLsvm = SVC(kernel='rbf', probability=True) #use probability=True to make boosting with initialisation bestEstimator
grid_nlsvm = GridSearchCV(NLsvm,p_grid_nlsvm,cv=5,scoring=('balanced_accuracy'),return_train_score=True, refit=True, n_jobs=-1) # n_jobs=-1 runs the search in parallel on all available cores

if boolDimensionReduction:
    grid_nlsvm.fit(train_img_features_reduced, split_train_y)
else:
    grid_nlsvm.fit(train_img_features, split_train_y)

print(grid_nlsvm.best_params_)

# Refitted best model, reused by the evaluation and export cells.
bestEstimator = grid_nlsvm.best_estimator_


Fitting Non-linear SVM to the training set
{'C': 7, 'gamma': 0.05}


In [68]:
# Report the accuracy of the best SVM. Training accuracy is computed on the
# same representation (PCA-reduced or not) the model was fitted on.
split_test_y_predicted = []
if boolSplitAndTestLocally:
    split_test_y_predicted = predictEntireTestSplit(split_test_img, split_test_imgSegCyt, split_test_imgSegNuc, bestEstimator)

    if boolDimensionReduction:
        print("NL SVM accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, bestEstimator.predict(train_img_features_reduced))))
    else:
        print("NL SVM accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, bestEstimator.predict(train_img_features))))
else:
    if boolDimensionReduction:
        print("NL SVM training accuracy: "+ str(accuracy_score(split_train_y, bestEstimator.predict(train_img_features_reduced))))
    else:
        print("NL SVM training accuracy: "+ str(accuracy_score(split_train_y, bestEstimator.predict(train_img_features))))


NL SVM accuracy: 0.9548563611491108 | Training accuracy: 0.9926940639269406


## Boost

In [69]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 500 depth-1 trees, fitted on the PCA-reduced or
# the plain scaled features depending on boolDimensionReduction.
if boolDimensionReduction:
    clf = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=1, random_state=0).fit(train_img_features_reduced, split_train_y)
else:
    clf = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=1, random_state=0).fit(train_img_features, split_train_y)

split_test_y_predicted = []
if boolSplitAndTestLocally:
    split_test_y_predicted = predictEntireTestSplit(split_test_img, split_test_imgSegCyt, split_test_imgSegNuc, clf)

    if boolDimensionReduction:
        print("Gradient boosting accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features_reduced))))
    else:
        print("Gradient boosting accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features))))
else:
    if boolDimensionReduction:
        print("Gradient boosting training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features_reduced))))
    else:
        print("Gradient boosting training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features))))

Gradient boosting accuracy: 0.9521203830369357 | Training accuracy: 0.9648401826484019


In [70]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost. The name `clf` is reused and overwrites the previous boosting model.
# NOTE(review): no random_state is set, and the two branches use different
# n_estimators (500 vs 2000) — confirm this is intentional.
if boolDimensionReduction:
    clf = AdaBoostClassifier(n_estimators=500, learning_rate=0.1).fit(train_img_features_reduced, split_train_y)
else:
    clf = AdaBoostClassifier(n_estimators=2000, learning_rate=0.1).fit(train_img_features, split_train_y)

split_test_y_predicted = []
if boolSplitAndTestLocally:
    split_test_y_predicted = predictEntireTestSplit(split_test_img, split_test_imgSegCyt, split_test_imgSegNuc, clf)

    if boolDimensionReduction:
        print("AdaBoost accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features_reduced))))
    else:
        print("AdaBoost accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features))))
else:
    if boolDimensionReduction:
        print("AdaBoost training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features_reduced))))
    else:
        print("AdaBoost training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features))))

AdaBoost accuracy: 0.948016415868673 | Training accuracy: 0.989041095890411


In [71]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# Histogram-based gradient boosting. The name `clf` is reused once more.
# NOTE(review): the two branches use different l2_regularization values
# (1 vs 0.75) — confirm this is intentional.
if boolDimensionReduction:
    clf = HistGradientBoostingClassifier(max_iter=100, l2_regularization=1, learning_rate=0.1).fit(train_img_features_reduced, split_train_y)
else:
    clf = HistGradientBoostingClassifier(max_iter=100, l2_regularization=0.75, learning_rate=0.1).fit(train_img_features, split_train_y)

split_test_y_predicted = []
if boolSplitAndTestLocally:
    split_test_y_predicted = predictEntireTestSplit(split_test_img, split_test_imgSegCyt, split_test_imgSegNuc, clf)

    if boolDimensionReduction:
        print("Histogram-Gradient boosting accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features_reduced))))
    else:
        print("Histogram-Gradient boosting accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features))))
else:
    if boolDimensionReduction:
        print("Histogram-Gradient boosting training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features_reduced))))
    else:
        print("Histogram-Gradient boosting training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features))))

Histogram-Gradient boosting accuracy: 0.9658002735978112 | Training accuracy: 1.0


## Test de MLP

In [72]:
import tensorflow as tf

from tensorflow.keras.utils import to_categorical

from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Input
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras import optimizers

import tensorflow_addons as tfa
from tensorboard.plugins.hparams import api as hp

print(tf.keras.__version__)


2.4.0


In [73]:
# One-hot encode the labels for the MLP.
split_train_y_cat = to_categorical(split_train_y)
# split_test_y only exists in local-test mode; encoding it unconditionally
# raised a NameError when boolSplitAndTestLocally was False.
if boolSplitAndTestLocally:
    split_test_y_cat = to_categorical(split_test_y)

# number of classes
nb_classes = split_train_y_cat.shape[1]
print(nb_classes)

2


In [74]:

# Network Parameters
n_hidden_1 = 128
n_hidden_2 = 256
n_hidden_3 = 256
n_hidden_4 = 256
n_hidden_5 = 128

# The MLP is trained on train_img_features_reduced: it equals the scaled
# features when no PCA is used and the PCA projection otherwise. This is the
# representation predictEntireTestSplit produces at prediction time; training
# on the unreduced features caused a shape mismatch when
# boolDimensionReduction was True.
n_input = train_img_features_reduced.shape[1]
print(n_input)

# Five fully connected ReLU layers, each followed by batch normalisation.
model_mlp_multi_layer = Sequential()
model_mlp_multi_layer.add(Dense(n_hidden_1,input_shape=(n_input,),activation='relu'))
model_mlp_multi_layer.add(BatchNormalization())
model_mlp_multi_layer.add(Dense(n_hidden_2,input_shape=(n_hidden_1,),activation='relu'))
model_mlp_multi_layer.add(BatchNormalization())
model_mlp_multi_layer.add(Dense(n_hidden_3,input_shape=(n_hidden_2,),activation='relu'))
model_mlp_multi_layer.add(BatchNormalization())
model_mlp_multi_layer.add(Dense(n_hidden_4,input_shape=(n_hidden_3,),activation='relu'))
model_mlp_multi_layer.add(BatchNormalization())
model_mlp_multi_layer.add(Dense(n_hidden_5,input_shape=(n_hidden_4,),activation='relu'))
model_mlp_multi_layer.add(BatchNormalization())
# NOTE(review): two sigmoid outputs trained with binary cross-entropy on
# one-hot labels; a softmax output would be the usual alternative.
model_mlp_multi_layer.add(Dense(2,input_shape=(n_hidden_5,),activation='sigmoid')) #softmax for multi layer

learning_rate = 0.001

# `learning_rate=` replaces the deprecated `lr=` keyword of the Adam optimizer.
model_mlp_multi_layer.compile(loss='binary_crossentropy', optimizer=optimizers.Adam(learning_rate=learning_rate), metrics=[matthews_correlation_coefficient])

# Run optimisation algorithm
n_epochs = 300
batch_size = 84

print('Training')
model_mlp_multi_layer.fit(train_img_features_reduced, split_train_y_cat, epochs=n_epochs,batch_size=batch_size, use_multiprocessing=True)

print('Testing')
split_test_y_predicted = []
if boolSplitAndTestLocally:
    split_test_y_predicted = predictEntireTestSplit(split_test_img, split_test_imgSegCyt, split_test_imgSegNuc, model_mlp_multi_layer)
    # The model outputs one score per class; the predicted label is the argmax.
    split_test_y_predicted = np.argmax(split_test_y_predicted, axis=1)
    print("MLP accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, np.argmax(model_mlp_multi_layer.predict(train_img_features_reduced), axis=1))))
else:
    print("MLP training accuracy: "+ str(accuracy_score(split_train_y, np.argmax(model_mlp_multi_layer.predict(train_img_features_reduced), axis=1))))


 - loss: 0.0117 - matthews_correlation_coefficient: 0.9949
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
Epoch 150/300
Epoch 151/300
Epoch 152/300
Epoch 153/300
Epoch 154/300
Epoch 155/300
Epoch 156/300
Epoch 157/300
Epoch 158/300
Epoch 159/300
Epoch 160/300
Epoch 161/300
Epoch 162/300
Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/300
Epoch 167/300
Epoch 168/300
Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epo

# Export CSV

## CSV pour Méthodes Linéaires, Non linear SVM ou Boosting

In [75]:
# Only runs when the models were trained on the full training set (no local test split).
if not boolSplitAndTestLocally: #predict test set and export CSV
    # to modify ___________
    # Output file name and the fitted estimator whose predictions are exported.
    fileName = "NLSVM.csv"
    classifier = bestEstimator
    #______________________

    submissionCSV(fileName, classifier)

## CSV pour MLP

In [76]:
# Only runs when the models were trained on the full training set (no local test split).
if not boolSplitAndTestLocally: #predict test set and export CSV
    # to modify ___________
    # Output file name and the trained MLP whose predictions are exported.
    fileName = "MLP.csv"
    classifier = model_mlp_multi_layer
    #______________________

    submissionCSV_MLP(fileName, classifier)