**Hugo Queinnec - IMA205**
# Pap smear cells multi-classification

# 0. Imports et chargement des données

In [1]:
import numpy as np
import pandas as pd
from skimage.io import imread

# for reading and displaying images
from skimage.io import imread
import matplotlib.pyplot as plt

# for creating validation set
from sklearn.model_selection import train_test_split

# methods
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

#dimension reduction
from sklearn.decomposition import PCA

#features
from skimage import measure

In [2]:
# to compute matthews_correlation_coefficient
from tensorflow.keras import backend as K

def matthews_correlation_coefficient(y_true, y_pred):
    tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    tn = K.sum(K.round(K.clip((1 - y_true) * (1 - y_pred), 0, 1)))
    fp = K.sum(K.round(K.clip((1 - y_true) * y_pred, 0, 1)))
    fn = K.sum(K.round(K.clip(y_true * (1 - y_pred), 0, 1)))

    num = tp * tn - fp * fn
    den = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    return num / K.sqrt(den + K.epsilon())

In [3]:
# Load Train Data

#____ CHANGE WORKING DIRECTORY HERE________________________
Working_directory="./"
#__________________________________________________________

df = pd.read_csv(Working_directory+'metadataTrain.csv') # reading data
train_y = df['GROUP'].values # 1 for Melanoma and 0 for healthy
class_names = ["normal","abnormal"]
N=train_y.shape[0]

occurences = np.bincount(np.array(train_y))
for i in range(occurences.shape[0]):
    print("Group "+str(i)+": "+str(occurences[i])+" occurences")

Group 0: 682 occurences
Group 1: 551 occurences
Group 2: 69 occurences
Group 3: 105 occurences
Group 4: 127 occurences
Group 5: 102 occurences
Group 6: 138 occurences
Group 7: 578 occurences
Group 8: 569 occurences


# 1. Déterminer des features

In [4]:
# loading training images
X_labels = df['ID'].values
train_img = [0]*N
train_imgSegCyt = [0]*N
train_imgSegNuc = [0]*N

i = 0
for id in X_labels:
    # defining the image path
    image_path = Working_directory+'Train/Train/' + str(id) + '.bmp'
    image_pathSegCyt = Working_directory+'Train/Train/' + str(id) + '_segCyt.bmp'
    image_pathSegNuc = Working_directory+'Train/Train/' + str(id) + '_segNuc.bmp'

    img = imread(image_path)
    train_img[i]=img

    imgSegCyt = imread(image_pathSegCyt)
    train_imgSegCyt[i]=imgSegCyt

    imgSegNuc = imread(image_pathSegNuc)
    train_imgSegNuc[i]=imgSegNuc

    i+=1



In [5]:
def computeManualFeaturesNormalized(img, maskCyt, maskNuc, features, colorFeatures, featuresToNormalize, tupleFeatures):
    errorCount = 0

    numberOfColorDescriptors = 6
    numberOfFeatures = 2*len(features) + numberOfColorDescriptors*len(colorFeatures) + len(featuresToNormalize) + 4*len(tupleFeatures)
    

    train_img_features = np.zeros((1, numberOfFeatures))

    labelsCyt = measure.label(np.round(maskCyt), background=0)
    regionsCytR = measure.regionprops(labelsCyt, img[:,:,0])
    regionsCytG = measure.regionprops(labelsCyt, img[:,:,1])
    regionsCytB = measure.regionprops(labelsCyt, img[:,:,2])

    labelsNuc = measure.label(np.round(maskNuc), background=0)
    regionsNucR = measure.regionprops(labelsNuc, img[:,:,0])
    regionsNucG = measure.regionprops(labelsNuc, img[:,:,1])
    regionsNucB = measure.regionprops(labelsNuc, img[:,:,2])

    # maskExt = np.int64(255*np.ones((len(maskCyt), len(maskCyt[0]))) - maskCyt - maskNuc)
    # labelsExt = measure.label(np.round(maskExt), background=0)
    # regionsExtR = measure.regionprops(labelsExt, img[:,:,0])
    # regionsExtG = measure.regionprops(labelsExt, img[:,:,1])
    # regionsExtB = measure.regionprops(labelsExt, img[:,:,2])

    for j in range(len(colorFeatures)):
        feature = colorFeatures[j]
        if(len(regionsCytR)!=0):
            train_img_features[0,j*3] = getattr(regionsCytR[0], feature)
            train_img_features[0,j*3+1] = getattr(regionsCytG[0], feature)
            train_img_features[0,j*3+2] = getattr(regionsCytB[0], feature)
        else:
            train_img_features[0,j*3] = None
            train_img_features[0,j*3+1] = None
            train_img_features[0,j*3+2] = None
            errorCount+=1

    for j in range(len(colorFeatures)):
        feature = colorFeatures[j]
        if(len(regionsNucR)!=0):
            train_img_features[0,j*3+3*len(colorFeatures)] = getattr(regionsNucR[0], feature)
            train_img_features[0,j*3+3*len(colorFeatures)+1] = getattr(regionsNucG[0], feature)
            train_img_features[0,j*3+3*len(colorFeatures)+2] = getattr(regionsNucB[0], feature)
        else:
            train_img_features[0,j*3+3*len(colorFeatures)] = None
            train_img_features[0,j*3+3*len(colorFeatures)+1] = None
            train_img_features[0,j*3+3*len(colorFeatures)+2] = None
            errorCount+=1

    # for j in range(len(colorFeatures)):
    #     feature = colorFeatures[j]
    #     if(len(regionsExtR)!=0):
    #         train_img_features[0,j*3+6*len(colorFeatures)+1] = getattr(regionsExtG[0], feature)
    #         train_img_features[0,j*3+6*len(colorFeatures)+2] = getattr(regionsExtB[0], feature)
    #         train_img_features[0,j*3+6*len(colorFeatures)] = getattr(regionsExtR[0], feature)
    #     else:
    #         train_img_features[0,j*3+6*len(colorFeatures)] = None
    #         train_img_features[0,j*3+6*len(colorFeatures)+1] = None
    #         train_img_features[0,j*3+6*len(colorFeatures)+2] = None
    #         errorCount+=1
    
    for j in range(len(featuresToNormalize)):
        feature = featuresToNormalize[j]
        if(len(regionsCytR)!=0 and len(regionsNucR)!=0 and getattr(regionsCytR[0], feature)!=0):
            train_img_features[0,j+numberOfColorDescriptors*len(colorFeatures)] = getattr(regionsNucR[0], feature)/getattr(regionsCytR[0], feature)
        else:
            train_img_features[0,j+numberOfColorDescriptors*len(colorFeatures)] = None
            #errorCount+=1


    for j in range(len(tupleFeatures)):
        feature = tupleFeatures[j]
        if(len(regionsCytR)!=0 and len(regionsNucR)!=0):
            x1,y1 = getattr(regionsCytR[0], feature)
            x2,y2 = getattr(regionsNucR[0], feature)
            train_img_features[0,j*4+numberOfColorDescriptors*len(colorFeatures)+len(featuresToNormalize)] = x1/y1
            train_img_features[0,j*4+1+numberOfColorDescriptors*len(colorFeatures)+len(featuresToNormalize)] = x2/y2
            train_img_features[0,j*4+2+numberOfColorDescriptors*len(colorFeatures)+len(featuresToNormalize)] = x1/x2
            train_img_features[0,j*4+3+numberOfColorDescriptors*len(colorFeatures)+len(featuresToNormalize)] = y1/y2
        else:
            train_img_features[0,j*4+numberOfColorDescriptors*len(colorFeatures)+len(featuresToNormalize)] = None
            train_img_features[0,j*4+1+numberOfColorDescriptors*len(colorFeatures)+len(featuresToNormalize)] = None
            train_img_features[0,j*4+2+numberOfColorDescriptors*len(colorFeatures)+len(featuresToNormalize)] = None
            train_img_features[0,j*4+3+numberOfColorDescriptors*len(colorFeatures)+len(featuresToNormalize)] = None
            #errorCount+=1
    
    for j in range(len(features)):
        feature = features[j]
        if feature=='symmetry_lr':
            diff_area_h_cyt = np.count_nonzero(maskCyt * ~np.fliplr(maskCyt))
            train_img_features[0,j+numberOfColorDescriptors*len(colorFeatures)+len(featuresToNormalize)+4*len(tupleFeatures)] = diff_area_h_cyt/np.count_nonzero(maskCyt)
        elif feature=='symmetry_ud':
            diff_area_v_cyt = np.count_nonzero(maskCyt * ~np.flipud(maskCyt))
            train_img_features[0,j+numberOfColorDescriptors*len(colorFeatures)+len(featuresToNormalize)+4*len(tupleFeatures)] = diff_area_v_cyt/np.count_nonzero(maskCyt)
        elif feature=='perimeter_norm':
            a = getattr(regionsCytR[0], 'minor_axis_length')
            p = getattr(regionsCytR[0], 'perimeter')
            train_img_features[0,j+numberOfColorDescriptors*len(colorFeatures)+len(featuresToNormalize)+4*len(tupleFeatures)] = p/a
        else:
            if(len(regionsCytR)!=0):
                train_img_features[0,j+numberOfColorDescriptors*len(colorFeatures)+len(featuresToNormalize)+4*len(tupleFeatures)] = getattr(regionsCytR[0], feature)
            else:
                train_img_features[0,j+numberOfColorDescriptors*len(colorFeatures)+len(featuresToNormalize)+4*len(tupleFeatures)] = None
                #errorCount+=1

    for j in range(len(features)):
        feature = features[j]
        if(len(regionsNucR)!=0):
            if feature=='symmetry_lr':
                diff_area_h_nuc = np.count_nonzero(maskNuc * ~np.fliplr(maskNuc))
                train_img_features[0,j+numberOfColorDescriptors*len(colorFeatures)+len(featuresToNormalize)+4*len(tupleFeatures)+len(features)] = diff_area_h_nuc/np.count_nonzero(maskNuc)
            elif feature=='symmetry_ud':
                diff_area_v_cyt = np.count_nonzero(maskCyt * ~np.flipud(maskCyt))
                train_img_features[0,j+numberOfColorDescriptors*len(colorFeatures)+len(featuresToNormalize)+4*len(tupleFeatures)+len(features)] = diff_area_v_cyt/np.count_nonzero(maskCyt)
            elif feature=='perimeter_norm':
                a = getattr(regionsNucR[0], 'minor_axis_length')
                p = getattr(regionsNucR[0], 'perimeter')
                train_img_features[0,j+numberOfColorDescriptors*len(colorFeatures)+len(featuresToNormalize)+4*len(tupleFeatures)+len(features)] = p/a
            else:
               train_img_features[0,j+numberOfColorDescriptors*len(colorFeatures)+len(featuresToNormalize)+4*len(tupleFeatures)+len(features)] = getattr(regionsNucR[0], feature)
        else:
            train_img_features[0,j+numberOfColorDescriptors*len(colorFeatures)+len(featuresToNormalize)+4*len(tupleFeatures)+len(features)] = None
            #errorCount+=1

    

    return train_img_features, errorCount


# 2. Prédictions

In [6]:
def fitLinearRegression(train_img_features,train_y):
    resolution_param = 150  
    regr = LinearRegression()
    regr.fit(train_img_features, train_y)
    return regr

def fitLDA(train_img_features,train_y):
    resolution_param = 150  
    clf_LDA = LinearDiscriminantAnalysis()
    clf_LDA.fit(train_img_features, train_y)
    return clf_LDA

def fitQDA(train_img_features,train_y):
    resolution_param = 150  
    clf_QDA = QuadraticDiscriminantAnalysis()
    clf_QDA.fit(train_img_features, train_y)
    return clf_QDA

def fitBayes(train_img_features,train_y):
    resolution_param = 150  
    clf_GNB = GaussianNB()
    clf_GNB.fit(train_img_features, train_y)
    return clf_GNB

def fitKNN(train_img_features,train_y):
    resolution_param = 150  
    clf_KNN = KNeighborsClassifier()
    clf_KNN.n_neighbors=5
    clf_KNN.fit(train_img_features, train_y)
    return clf_KNN

In [7]:
# Fonction de prédiction adaptée au splits du training set

def predictForTestSplit(one_split_test_img, one_split_test_imgCyt, one_split_test_imgNuc, classifier):
    #compute features
    f = computeManualFeaturesNormalized(one_split_test_img, one_split_test_imgCyt, one_split_test_imgNuc, features, colorFeatures, featuresToNormalize, tupleFeatures)[0]

    for i in range(len(f[0])): #cleaning None values
        if np.isnan(f[0][i]):
            f[0][i] = meanOfTrainingFeatures[i]

    f = scaler.transform(f) #scale

    y_test = classifier.predict(f)

    return y_test[0]

In [8]:
def predictEntireTestSplit(split_test_img, split_test_imgSegCyt, split_test_imgSegNuc, classifier):
    allFeatures = np.zeros((len(split_test_img), numberOfFeatures))

    for i in range(len(split_test_img)):
        f = computeManualFeaturesNormalized(split_test_img[i], split_test_imgSegCyt[i], split_test_imgSegNuc[i], features, colorFeatures, featuresToNormalize, tupleFeatures)[0]
        
        for j in range(len(f[0])): #cleaning None values
            if np.isnan(f[0][j]):
                f[0][j] = meanOfTrainingFeatures[j]

        f = scaler.transform(f) #scale

        allFeatures[i,:] = f[0]

    
    if boolDimensionReduction:
        allFeatures = pca.transform(allFeatures)
    
    y_test = classifier.predict(allFeatures)

    return y_test
    

In [9]:
def class_int_round(z, n_class): #for Linear Regression
    # rounding needed to go from real to integer values 
    output = np.round(z).astype(int)
    if isinstance(z, np.ndarray):
        j = z < 0
        output[j] = 0
        k = z > n_class - 1
        output[k] = n_class - 1
    else:
        if output < 0:
            output = 0
        else:
            if output > n_class - 1:
                output = n_class - 1
    return output

# Export CSV

In [10]:
# create submission csv

def submissionCSV(fileName, classifier):

    sample = pd.read_csv(Working_directory+'SampleSubmission.csv') # reading data
    X_test_values = sample['ID'].values
    N = X_test_values.shape[0]

    test_img = [0]*N
    test_imgSegCyt = [0]*N
    test_imgSegNuc = [0]*N

    i = 0
    for id in X_test_values:
        # defining the image path
        image_path = Working_directory+'Test/Test/' + str(id) + '.bmp'
        image_pathSegCyt = Working_directory+'Test/Test/' + str(id) + '_segCyt.bmp'
        image_pathSegNuc = Working_directory+'Test/Test/' + str(id) + '_segNuc.bmp'

        img = imread(image_path)
        test_img[i]=img

        imgSegCyt = imread(image_pathSegCyt)
        test_imgSegCyt[i]=imgSegCyt

        imgSegNuc = imread(image_pathSegNuc)
        test_imgSegNuc[i]=imgSegNuc

        i+=1

    two_columns = np.zeros((X_test_values.shape[0],2), dtype=int)
    two_columns[:,0] = X_test_values

    y_prediction = predictEntireTestSplit(test_img, test_imgSegCyt, test_imgSegNuc, classifier)
    
    for i in range(X_test_values.shape[0]):
        two_columns[i,1] = y_prediction[i]

    computedValues = pd.DataFrame(two_columns, columns=['ID','GROUP'])
    computedValues.to_csv(Working_directory+fileName, index=False)


# 3. Tests
On tranforme le train initial en deux ensembles train et test, pour obtenir plus facilement des scores, sans devoir passer par Kaggle.

In [12]:
#____to modify______________________
boolSplitAndTestLocally = True #True : the following cells will print Test and Train accuracy | False : the followinf cells will print Train accuracy, and the last cell will export a CSV with the Train predictions of a chosen estimator
boolDimensionReduction = False #compute a PCA before Non linear SVM, Boosting and MLP
#___________________________________


In [13]:
# Partage de l'ensemble de test initial

if boolSplitAndTestLocally:
    split_train_img, split_test_img, split_train_imgSegCyt, split_test_imgSegCyt, split_train_imgSegNuc, split_test_imgSegNuc, split_train_y, split_test_y = train_test_split(train_img, train_imgSegCyt, train_imgSegNuc, train_y, test_size=0.25, random_state=42, stratify=train_y)
else:
    split_train_img, split_train_imgSegCyt, split_train_imgSegNuc, split_train_y = train_img, train_imgSegCyt, train_imgSegNuc, train_y

occurences = np.bincount(np.array(split_train_y))
for i in range(occurences.shape[0]):
    print("Group "+str(i)+": "+str(occurences[i])+" occurences")

print()

if boolSplitAndTestLocally:
    occurences = np.bincount(np.array(split_test_y))
    for i in range(occurences.shape[0]):
        print("Group "+str(i)+": "+str(occurences[i])+" occurences")


Group 0: 511 occurences
Group 1: 413 occurences
Group 2: 52 occurences
Group 3: 79 occurences
Group 4: 95 occurences
Group 5: 77 occurences
Group 6: 103 occurences
Group 7: 433 occurences
Group 8: 427 occurences

Group 0: 171 occurences
Group 1: 138 occurences
Group 2: 17 occurences
Group 3: 26 occurences
Group 4: 32 occurences
Group 5: 25 occurences
Group 6: 35 occurences
Group 7: 145 occurences
Group 8: 142 occurences


## Calcul et pre-processing des features

In [14]:
split_N = split_train_y.shape[0]

## LOT OF FEATURES
# colorFeatures = ['mean_intensity', 'max_intensity', 'min_intensity']
# featuresToNormalize = ['area', 'equivalent_diameter', 'perimeter', 'euler_number', 'convex_area', 'minor_axis_length', 'major_axis_length']
# tupleFeatures = ['centroid']
# features = ['solidity', 'eccentricity', 'extent','symmetry_lr', 'symmetry_ud']#, 'perimeter_norm']


## FEW BEST FEATURES
colorFeatures = ['mean_intensity', 'max_intensity', 'min_intensity']
featuresToNormalize = ['area', 'equivalent_diameter', 'perimeter', 'euler_number']
tupleFeatures = ['centroid']
features = ['solidity']

numberOfFeatures = 2*len(features) + 6*len(colorFeatures) + len(featuresToNormalize) + 4*len(tupleFeatures)
train_img_features_0 = np.zeros((split_N, numberOfFeatures))

errorCount = 0

for i in range(split_N):
    img = split_train_img[i]
    maskCyt = split_train_imgSegCyt[i]
    maskNuc = split_train_imgSegNuc[i]

    f, e = computeManualFeaturesNormalized(img, maskCyt, maskNuc, features, colorFeatures, featuresToNormalize, tupleFeatures)
    train_img_features_0[i] = f
    errorCount+=e

print("Errors in features: " + str(errorCount))


Errors in features: 39


In [15]:
def cleanNoneValues(train_img_features_0, verbose):
    # replace missing values (None) of train_img_features
    if verbose: print("INITIALLY")
    m0 = np.mean(train_img_features_0, axis=0)
    if verbose: print(m0)

    # compute mean of features
    if verbose: print("\nMEAN")
    featuresWithoutNone = train_img_features_0[~np.isnan(train_img_features_0).any(axis=1)]
    if verbose: print("All features: "+str(train_img_features_0.shape)+" | Features without None: "+str(featuresWithoutNone.shape))
    m = np.mean(featuresWithoutNone, axis=0)
    if verbose: print(m)

    #replace missing values
    if verbose: print("\nFINALLY")
    train_img_features = np.array([[line[i] if ~np.isnan(line[i]) else m[i] for i in range(len(line))] for line in train_img_features_0])
    if verbose: print("Cleaned features: "+str(train_img_features.shape))
    m1 = np.mean(train_img_features, axis=0)
    if verbose: print(m1)

    return train_img_features, m

train_img_features, meanOfTrainingFeatures = cleanNoneValues(train_img_features_0, True)

INITIALLY
[139.83393065 125.73210201 149.50317725 187.47990868 177.83972603
 196.42785388  84.57077626  74.23150685 102.22694064          nan
          nan          nan          nan          nan          nan
          nan          nan          nan          nan          nan
          nan          nan          nan          nan          nan
          nan   0.76327158          nan]

MEAN
All features: (2190, 28) | Features without None: (223, 28)
[141.29448314 133.10785472 155.68505198 187.98206278 178.367713
 195.66367713  98.16143498  92.9955157  119.2690583   95.97494679
  84.35509337 121.7404573  139.30044843 127.14349776 158.8161435
  68.81165919  60.0941704   96.46188341   5.91407674   1.35662401
   1.02648326   0.72892377   0.98219704   0.97826114   0.94641675
   1.01868753   0.53386817   0.95341928]

FINALLY
Cleaned features: (2190, 28)
[139.83393065 125.73210201 149.50317725 187.47990868 177.83972603
 196.42785388  84.57077626  74.23150685 102.22694064  93.3690155
  77.6886183  11

In [16]:
# Scaler
scaler = StandardScaler()
scaler.fit(train_img_features)
train_img_features = scaler.transform(train_img_features)

m1 = np.mean(train_img_features, axis=0)
print(m1)

[-4.55982284e-15 -1.20046033e-16 -1.34544836e-16 -2.15150069e-16
 -4.05053971e-17  1.01390231e-19  2.25694653e-16  1.26737788e-17
  1.28410727e-16  2.93788332e-15 -7.47575517e-15  8.40436295e-16
  1.27913915e-15  2.63310429e-16 -1.03534634e-15  1.26134516e-15
 -8.65619093e-16 -4.68321475e-16 -3.94978317e-17 -8.63439203e-16
  3.37722937e-16 -1.46516672e-13 -2.71431786e-15  4.82959689e-16
 -8.15101411e-16  1.78565432e-14  1.18347747e-15 -2.35830381e-14]


## Méthodes Linéaires

In [17]:
# Linear Regression
regr = fitLinearRegression(train_img_features, split_train_y)

split_test_y_predicted = []

if boolSplitAndTestLocally:
    for i in range(split_test_y.shape[0]):
        f = computeManualFeaturesNormalized(split_test_img[i], split_test_imgSegCyt[i], split_test_imgSegNuc[i], features, colorFeatures, featuresToNormalize, tupleFeatures)[0]

        for i in range(len(f[0])): #cleaning None values
            if np.isnan(f[0][i]):
                f[0][i] = meanOfTrainingFeatures[i]

        f = scaler.transform(f) #scale
        split_test_y_predicted.append(class_int_round(regr.predict(f), 2))

    print("Linear Regression accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted)) + " | Training accuracy: "+ str(accuracy_score(split_train_y, class_int_round(regr.predict(train_img_features), 2))))
else:
    print("Linear Regression training accuracy: "+ str(accuracy_score(split_train_y, class_int_round(regr.predict(train_img_features), 2))))


# LDA
clf_LDA = fitLDA(train_img_features, split_train_y)

split_test_y_predicted = []

if boolSplitAndTestLocally:
    for i in range(split_test_y.shape[0]):
        split_test_y_predicted.append(predictForTestSplit(split_test_img[i], split_test_imgSegCyt[i], split_test_imgSegNuc[i],clf_LDA))

    print("LDA accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf_LDA.predict(train_img_features))))
else:
    print("LDA training accuracy: "+ str(accuracy_score(split_train_y, clf_LDA.predict(train_img_features))))


# QDA
clf_QDA = fitQDA(train_img_features, split_train_y)

split_test_y_predicted = []

if boolSplitAndTestLocally:
    for i in range(split_test_y.shape[0]):
        split_test_y_predicted.append(predictForTestSplit(split_test_img[i], split_test_imgSegCyt[i], split_test_imgSegNuc[i],clf_QDA))

    print("QDA accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf_QDA.predict(train_img_features))))
else:
    print("QDA training accuracy: "+ str(accuracy_score(split_train_y, clf_QDA.predict(train_img_features))))


# Bayes
clf_Bayes = fitBayes(train_img_features, split_train_y)

split_test_y_predicted = []

if boolSplitAndTestLocally:
    for i in range(split_test_y.shape[0]):
        split_test_y_predicted.append(predictForTestSplit(split_test_img[i], split_test_imgSegCyt[i], split_test_imgSegNuc[i],clf_Bayes))

    print("Bayes accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf_Bayes.predict(train_img_features))))
else:
    print("Bayes training accuracy: "+ str(accuracy_score(split_train_y, clf_Bayes.predict(train_img_features))))


# QDA
clf_KNN = fitKNN(train_img_features, split_train_y)

split_test_y_predicted = []

if boolSplitAndTestLocally:
    for i in range(split_test_y.shape[0]):
        split_test_y_predicted.append(predictForTestSplit(split_test_img[i], split_test_imgSegCyt[i], split_test_imgSegNuc[i],clf_KNN))

    print("KNN accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf_KNN.predict(train_img_features))))
else:
    print("KNN training accuracy: "+ str(accuracy_score(split_train_y, clf_KNN.predict(train_img_features))))


Linear Regression accuracy: 0.2421340629274966 | Training accuracy: 0.24794520547945206
LDA accuracy: 0.8207934336525308 | Training accuracy: 0.8210045662100457
QDA accuracy: 0.8153214774281806 | Training accuracy: 0.8803652968036529
Bayes accuracy: 0.66484268125855 | Training accuracy: 0.6525114155251142
KNN accuracy: 0.7920656634746922 | Training accuracy: 0.8662100456621005


## PCA (avant SVM)

In [18]:
print("Number of features: " + str(numberOfFeatures)+"\n")
train_img_features_reduced = train_img_features

if boolDimensionReduction:
    print("Features size before PCA: "+str(train_img_features.shape))
    pca = PCA(n_components=25, random_state=1)
    train_img_features_reduced=pca.fit_transform(train_img_features)
    print("Features size after PCA: "+str(train_img_features_reduced.shape))

Number of features: 28



## Non linear SVM

In [19]:
# Looking for the best hyperparameters C and Gamma
print("Fitting Non-linear SVM to the training set")
cList = [1e-3,1e-2,1e-1,1,2,3,4,5,6,7,8,9,1e1]
gammaList = [0.0001, 0.0005, 0.001, 0.01, 0.05, 0.1, 0.15, 0.2, 0.3]
p_grid_nlsvm = {'C': cList,
                'gamma': gammaList}
NLsvm = SVC(kernel='rbf', probability=True) #use probability=True to make boosting with initialisation bestEstimator
grid_nlsvm = GridSearchCV(NLsvm,p_grid_nlsvm,cv=5,scoring=('balanced_accuracy'),return_train_score=True, refit=True, n_jobs=-1) # n_jobs divides computation time by 4 (parallelisation)

if boolDimensionReduction:
    grid_nlsvm.fit(train_img_features_reduced, split_train_y)
else:
    grid_nlsvm.fit(train_img_features, split_train_y)

print(grid_nlsvm.best_params_)

bestEstimator = grid_nlsvm.best_estimator_


Fitting Non-linear SVM to the training set
{'C': 10.0, 'gamma': 0.05}


In [20]:
split_test_y_predicted = []
if boolSplitAndTestLocally:
    split_test_y_predicted = predictEntireTestSplit(split_test_img, split_test_imgSegCyt, split_test_imgSegNuc, bestEstimator)

    if boolDimensionReduction:
        print("NL SVM accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, bestEstimator.predict(train_img_features_reduced))))
    else:
        print("NL SVM accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, bestEstimator.predict(train_img_features))))
else:
    if boolDimensionReduction:
        print("NL SVM training accuracy: "+ str(accuracy_score(split_train_y, bestEstimator.predict(train_img_features_reduced))))
    else:
        print("NL SVM training accuracy: "+ str(accuracy_score(split_train_y, bestEstimator.predict(train_img_features))))


NL SVM accuracy: 0.8645690834473324 | Training accuracy: 0.9817351598173516


## Boost

In [21]:
from sklearn.ensemble import GradientBoostingClassifier

if boolDimensionReduction:
    clf = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=1, random_state=0).fit(train_img_features_reduced, split_train_y)
else:
    clf = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=1, random_state=0).fit(train_img_features, split_train_y)

split_test_y_predicted = []
if boolSplitAndTestLocally:
    split_test_y_predicted = predictEntireTestSplit(split_test_img, split_test_imgSegCyt, split_test_imgSegNuc, clf)

    if boolDimensionReduction:
        print("Gradient boosting accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features_reduced))))
    else:
        print("Gradient boosting accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features))))
else:
    if boolDimensionReduction:
        print("Gradient boosting training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features_reduced))))
    else:
        print("Gradient boosting training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features))))

Gradient boosting accuracy: 0.8344733242134063 | Training accuracy: 0.9662100456621004


In [22]:
from sklearn.ensemble import AdaBoostClassifier

if boolDimensionReduction:
    clf = AdaBoostClassifier(n_estimators=500, learning_rate=0.1).fit(train_img_features_reduced, split_train_y)
else:
    clf = AdaBoostClassifier(n_estimators=2000, learning_rate=0.1).fit(train_img_features, split_train_y)

split_test_y_predicted = []
if boolSplitAndTestLocally:
    split_test_y_predicted = predictEntireTestSplit(split_test_img, split_test_imgSegCyt, split_test_imgSegNuc, clf)

    if boolDimensionReduction:
        print("AdaBoost accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features_reduced))))
    else:
        print("AdaBoost accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features))))
else:
    if boolDimensionReduction:
        print("AdaBoost training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features_reduced))))
    else:
        print("AdaBoost training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features))))

AdaBoost accuracy: 0.6210670314637483 | Training accuracy: 0.6566210045662101


In [23]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

if boolDimensionReduction:
    clf = HistGradientBoostingClassifier(max_iter=100, l2_regularization=1, learning_rate=0.1).fit(train_img_features_reduced, split_train_y)
else:
    clf = HistGradientBoostingClassifier(max_iter=100, l2_regularization=0.75, learning_rate=0.1).fit(train_img_features, split_train_y)

split_test_y_predicted = []
if boolSplitAndTestLocally:
    split_test_y_predicted = predictEntireTestSplit(split_test_img, split_test_imgSegCyt, split_test_imgSegNuc, clf)

    if boolDimensionReduction:
        print("Histogram-Gradient boosting accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features_reduced))))
    else:
        print("Histogram-Gradient boosting accuracy: "+ str(accuracy_score(split_test_y, split_test_y_predicted))+ " | Training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features))))
else:
    if boolDimensionReduction:
        print("Histogram-Gradient boosting training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features_reduced))))
    else:
        print("Histogram-Gradient boosting training accuracy: "+ str(accuracy_score(split_train_y, clf.predict(train_img_features))))

Histogram-Gradient boosting accuracy: 0.8755129958960328 | Training accuracy: 1.0


# Export CSV

## CSV pour Méthodes Linéaires, Non linear SVM ou Boosting

In [27]:
if not boolSplitAndTestLocally: #predict test set and export CSV
    # to modify ___________
    fileName = "NLSVM.csv"
    classifier = bestEstimator
    #______________________

    submissionCSV(fileName, classifier)