# Connect to drive

In [None]:
# Mount Google Drive at /content/drive; every dataset path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# !tar -xvf '/content/drive/MyDrive/CAD_Dataset/Challenge1/test.tgz' -C '/content/drive/MyDrive/CAD_Dataset/Challenge1/'

In [None]:
# !tar -xvf '/content/drive/MyDrive/CAD_Dataset/Challenge2/test.tgz' -C '/content/drive/MyDrive/CAD_Dataset/Challenge2/'

# Import packages

In [None]:
# Install extra packages into the notebook runtime (IPython shell magics).
# NOTE(review): scikit-image is presumably upgraded because the imports below use the
# `graycomatrix` / `graycoprops` spelling — confirm the minimum version required.
!pip install mahotas
!pip install mlxtend
!pip install xgboost
!pip install plantcv
!pip install scikit-image --upgrade

In [None]:
##################################     GENERAL     ###########################################
import math
import time
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import numpy.ma as ma
from tabulate import tabulate
import os
import mahotas  
import pickle

################################## Computer Vision ###########################################
import cv2 as cv
from PIL import Image
#from google.colab.patches import cv2_imshow  
from scipy import ndimage
from scipy.spatial.distance import dice
from scipy.ndimage import gaussian_filter
from skimage import data, morphology
from skimage.measure import label, regionprops
from skimage.color import rgb2lab, lab2rgb
from skimage.filters import threshold_multiotsu, threshold_otsu
from skimage.morphology import skeletonize
from plantcv import plantcv as pcv
from skimage.feature import local_binary_pattern, graycomatrix, graycoprops


################################## MACHINE LEARNING  ###########################################
import sklearn as sk
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, confusion_matrix, cohen_kappa_score, balanced_accuracy_score 
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer, FunctionTransformer
from sklearn.feature_selection import SelectKBest, SelectFromModel

#models 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, StackingClassifier, VotingClassifier
from sklearn import linear_model
import xgboost as xgb

# Helper Functions

### Preprocessing helper functions

#### 0. Reading image

In [None]:
def read_img(img_id: str, challenge: str):
  '''
    Read a raw (not yet preprocessed) test image from the dataset folders.
    Args:
    - img_id (string): image file name (without extension)
    - challenge: Challenge1 (binary) or Challenge2 (multiclass)
    Return:
    - The RGB image stored in a numpy ndarray.
    Raises:
    - FileNotFoundError: if the file does not exist or cannot be decoded.
  '''
  path = f"/content/drive/MyDrive/CAD_Dataset/{challenge}/testX/{img_id}.jpg"
  print(path)
  img = cv.imread(path)
  # cv.imread reports failure by returning None instead of raising, which would
  # otherwise surface as an obscure error inside cvtColor.
  if img is None:
    raise FileNotFoundError(f"Could not read image: {path}")
  # OpenCV loads images as BGR; the rest of the notebook works in RGB.
  img_rgb = cv.cvtColor(img, cv.COLOR_BGR2RGB)
  return img_rgb

In [None]:
def read_img_prepro(img_id: str, challenge: str):
  '''
    Read a preprocessed test image from the dataset folders.
    Args:
    - img_id (string): image file name (without extension)
    - challenge: Challenge1 (binary) or Challenge2 (multiclass)
    Return:
    - The RGB preprocessed image stored in a numpy ndarray.
    Raises:
    - FileNotFoundError: if the file does not exist or cannot be decoded.
  '''
  path = f"/content/drive/MyDrive/CAD_Dataset/{challenge}/test_preprocessed/{img_id}.jpg"
  img = cv.imread(path)
  # cv.imread reports failure by returning None instead of raising, which would
  # otherwise surface as an obscure error inside cvtColor.
  if img is None:
    raise FileNotFoundError(f"Could not read preprocessed image: {path}")
  # OpenCV loads images as BGR; the rest of the notebook works in RGB.
  img_rgb = cv.cvtColor(img, cv.COLOR_BGR2RGB)
  return img_rgb

In [None]:
#### Function Test ####
# Load one raw Challenge1 test image and display it; `img` is reused by the
# function-test cells further down.
img_id = "xxx00033"
img = read_img(img_id, 'Challenge1')

plt.imshow(img)
plt.show()


#### 1. Vignette removal

In [None]:
def crop_img(img: np.ndarray, threshold: int=0):
    '''
    Remove the dark vignette frame by cropping to a rectangle derived from the
    two image diagonals.

    Each diagonal is sampled every h/gcd(h, w) rows and w/gcd(h, w) columns.
    The channel-mean of every sample is compared with ``threshold``; on each
    diagonal the first sample reaching the threshold when walking forwards and
    the first one when walking backwards (the very first sample visited is
    skipped in both directions) define two corners. The crop keeps the
    intersection of the rectangles obtained from both diagonals.

    Args:
    - img (numpy ndarray): image indexed as img[y, x, channel].
    - threshold (int): minimum channel-mean for a sample to count as image
      content rather than frame.
    Return:
    - The cropped image, or the input image itself when the computed crop
      would be empty.
    '''
    h, w = img.shape[:2]
    cd = math.gcd(h, w)  # number of regular steps that land exactly on the diagonal
    step_y, step_x = int(h/cd), int(w/cd)

    columns = list(range(0, w, step_x))
    diagonals = (
        (list(range(0, h, step_y)), columns),                 # (0, 0) -> (h, w)
        (list(range(h - step_y, 0, -step_y)), list(columns)),  # (h, 0) -> (0, w)
    )

    # Fallback corners (no crop) used when the threshold is never reached.
    starts = [(0, 0), (h, 0)]  # (y1, x1) of each diagonal
    ends = [(h, w), (0, w)]    # (y2, x2) of each diagonal

    for k, (rows, cols) in enumerate(diagonals):
        profile = [np.mean(img[y, x, :]) for y, x in zip(rows, cols)]

        # Position 0 is skipped: without a vignette it would already be above the threshold.
        first = next((j for j, v in enumerate(profile) if v >= threshold and j != 0), None)
        if first is not None:
            starts[k] = (rows[first], cols[first])

        # Same search from the far end of the diagonal.
        # NOTE(review): the lookup uses len(list) - j, i.e. the sample one step
        # beyond the matching one, and on the second diagonal the row list is one
        # element shorter than the column list, so the column lands a further
        # step out — confirm this is intended before changing it.
        last = next((j for j, v in enumerate(reversed(profile)) if v >= threshold and j != 0), None)
        if last is not None:
            ends[k] = (rows[len(rows) - last], cols[len(cols) - last])

    # Intersect the rectangles given by the two diagonals.
    y1 = max(starts[0][0], ends[1][0])
    y2 = min(ends[0][0], starts[1][0])
    x1 = max(starts[0][1], starts[1][1])
    x2 = min(ends[0][1], ends[1][1])

    cropped = img[y1:y2, x1:x2, :]

    # An empty crop means the detection failed; hand back the original image.
    if cropped.shape[0] == 0 or cropped.shape[1] == 0:
        return img
    return cropped

In [None]:
#### Function Test ####
# Crop the sample image with the default threshold (0) and display the result.
img_crop= crop_img(img)

plt.imshow(img_crop)
plt.show()

#### 2. Hair Removal

In [None]:
def inpaint(src: np.ndarray, se_size: int = 15):
    '''Remove thin dark structures (hair) from an image by inpainting them.

      param : src --> Color or grayscale image
               se_size --> Side length of the square structuring elements
      return : Inp --> Inpainted image with hair removed '''

    # Convert the original image to grayscale if it has > 1 channel
    if (len(src.shape)==3):
      # NOTE(review): preprocess() in this file passes RGB images, while
      # COLOR_BGR2GRAY weights the channels as BGR. Left unchanged because it
      # alters the hair mask — confirm which order is intended.
      channel = cv.cvtColor(src, cv.COLOR_BGR2GRAY)
    else:
      channel = src

    # Structuring elements for the morphological filtering:
    # a '+' shaped cross and an 'X' shaped one (both diagonals), se_size x se_size each.
    se = cv.getStructuringElement(cv.MORPH_CROSS, (se_size, se_size))
    se2 = np.eye(se_size) + np.flipud(np.eye(se_size))
    se2[int(se_size/2), int(se_size/2)] = 1  # the diagonals overlap at the centre for odd sizes

    # Perform the blackHat filtering on the grayscale image to find the
    # hair (and other objects') contours
    blackhat = cv.morphologyEx(channel, cv.MORPH_BLACKHAT, se)
    blackhat2 = cv.morphologyEx(channel, cv.MORPH_BLACKHAT, se2.astype(np.uint8))
    # cv.add saturates at 255. The plain `+` on uint8 arrays wraps modulo 256,
    # so two strong responses could sum to a small value and miss the threshold.
    bHat = cv.add(blackhat, blackhat2)

    # Binarise the detected contours to build the inpainting mask
    ret, thresh = cv.threshold(bHat, 10, 255, cv.THRESH_BINARY)

    # Inpaint the original image wherever the mask is set
    Inp = cv.inpaint(src, thresh, 1, cv.INPAINT_TELEA)

    return Inp

In [None]:
#### Function Test 1 ####
# Run hair removal on the sample image with the default structuring-element size and display it.
img_inp = inpaint(img)
plt.imshow(img_inp)
plt.show()

#### 3. Color Constancy 

In [None]:
def shade_of_gray_cc(img: np.ndarray, power:int =6, gamma=None):
    """
    Apply Shades-of-Gray colour constancy: estimate the illuminant from the
    per-channel Minkowski (power) mean and rescale every channel by it.

    img (numpy array): the original image with format of (h, w, c)
    power (int): the degree of norm, 6 is used in reference paper
    gamma (float): the value of gamma correction, 2.2 is used in reference paper;
                   None skips the gamma step
    returns: the corrected image, clipped to [0, 255] and cast back to the
             dtype of the input
    """
    img_dtype = img.dtype

    if gamma is not None:
        # Gamma correction through a 256-entry lookup table (needs 8-bit data).
        img = img.astype('uint8')
        look_up_table = np.zeros((256,1), dtype='uint8')
        for i in range(256):
            look_up_table[i][0] = 255 * pow(i/255, 1/gamma)
        img = cv.LUT(img, look_up_table)

    img = img.astype('float32')
    img_power = np.power(img, power)
    rgb_vec = np.power(np.mean(img_power, (0,1)), 1/power)
    rgb_norm = np.sqrt(np.sum(np.power(rgb_vec, 2.0)))
    if rgb_norm == 0:
        # Completely black image: there is no illuminant to estimate and the
        # normalisation below would divide by zero.
        return img.astype(img_dtype)
    rgb_vec = rgb_vec/rgb_norm
    denom = rgb_vec*np.sqrt(3)
    # A channel that is zero everywhere has a zero estimate; give it a gain of 0
    # (its pixels are all 0 anyway) instead of producing inf/NaN.
    gains = np.zeros_like(denom)
    np.divide(1, denom, out=gains, where=denom != 0)
    img = np.multiply(img, gains)

    # Andrew Anikin suggestion
    img = np.clip(img, a_min=0, a_max=255)

    return img.astype(img_dtype)

In [None]:
#### Function Test ####
# Apply colour constancy with the default parameters (power=6, no gamma) and display the result.
img_cc = shade_of_gray_cc(img)
plt.imshow(img_cc)
plt.show()

#### 4. Other

In [None]:
def resize_im(img: np.ndarray, max_size: int):
  '''
    Resize an image with one scale factor for both axes, so the aspect ratio
    is maintained (cubic interpolation).
    args:
      - img: (np.ndarray) image to be resized
      - max_size: the target size of the largest side of the image
    returns: the resized image

    NOTE(review): the scale factor is rounded to two decimals before it is
    applied, so the largest side of the result only approximates max_size.
    Kept as is because it defines the geometry of the saved images — confirm
    before changing.
  '''
  rows, cols = img.shape[0], img.shape[1]
  factor = round((max_size / max(rows, cols)), 2)
  # cv.resize takes the target size as (width, height)
  target = (int(cols * factor), int(rows * factor))
  return cv.resize(img, target, interpolation = cv.INTER_CUBIC)

In [None]:
###### Test function #############
# Resize the sample image towards a largest side of 500 and compare the shapes.
resized = resize_im(img, 500);
print(f"Before: {img.shape}\nAfter: {resized.shape}")

Preprocessing function:

In [None]:
def preprocess(img: np.ndarray, max_size: int, crop_params=None, cc_params=None, remove_hair_params=None):
    '''
    Run the whole preprocessing pipeline over one image, in this order:
    optional vignette crop, resize, optional colour constancy, optional hair removal.
    args:
      - img: (np.ndarray) image to be processed
      - max_size: (int) target size of the largest side, forwarded to resize_im
      - crop_params: (dict) must contain "threshold"; None skips the crop
      - cc_params: (dict) must contain "power" and "gamma"; None skips colour constancy
      - remove_hair_params: (dict) must contain 'se_size'; None skips hair removal
    returns: the preprocessed image stored in an ndarray
    '''
    cropped = img if crop_params is None else crop_img(img, threshold=crop_params["threshold"])

    result = resize_im(cropped, max_size)

    if cc_params is not None:
        result = shade_of_gray_cc(result, power=cc_params["power"], gamma=cc_params["gamma"])

    if remove_hair_params is not None:
        result = inpaint(result, se_size =  remove_hair_params['se_size'])

    return result

In [None]:
#### Function Test ####
# Full pipeline on the sample image: no crop, colour constancy (power 6, no gamma),
# hair removal with 9x9 structuring elements.
img_pp = preprocess(img, 500, crop_params= None, cc_params={"gamma": None, "power":6}, remove_hair_params={'se_size': 9})
plt.imshow(img_pp)
plt.show()

### Feature Extraction helper functions

#### Global

In [None]:
def variegation(Im: np.ndarray):
  '''Colour variegation: per-channel standard deviation divided by the channel maximum.

    param : Im  --> RGB image indexed as Im[y, x, channel]
    return : array of shape (1, 3) holding the measure for channels 0, 1 and 2

    NOTE(review): a channel whose maximum is 0 leads to a division by zero.
  '''
  ratios = []
  for ch in range(3):
    plane = Im[:, :, ch]
    # normalised standard deviation of this channel
    ratios.append(np.std(plane) / np.max(plane))

  return np.array(ratios).reshape(1, -1)

In [None]:
### Test function ###
# Compute the (1, 3) variegation vector of the sample image and print it.
vari = variegation(img)

print(f"Variegation: {vari}")

Variegation: [[0.1049008  0.11762126 0.13037078]]


#### Color

In [None]:
def color_moments(Img: np.ndarray):
    """ Extract per-channel colour statistics of a 3-channel image.
    --> param Img : ndarray, RGB image
    --> return color_moments : ndarray of shape (1, 12), laid out as
        [3 means, 3 standard deviations, 3 third-moment roots, 3 variances].

    The "third moment" entry is the cube root of the mean *absolute* cubed
    deviation from the mean; the fourth entry is the variance (std squared).
    """
    first, second, third = cv.split(Img)

    means, stds, third_roots, variances = [], [], [], []
    for plane in (first, second, third):
        plane_std = np.std(plane)
        abs_third = np.mean(abs(plane - plane.mean())**3)

        means.append(np.mean(plane))
        stds.append(plane_std)
        third_roots.append(abs_third**(1./3))
        variances.append(plane_std**2)

    # Group by statistic, not by channel, to keep the column order of the feature vector.
    color_feature = means + stds + third_roots + variances
    return np.array(color_feature).reshape(1, -1)

In [None]:
### Test function ###
# Compute and print the (1, 12) colour-moment vector of the sample image.
col_mom = color_moments(img)
print(col_mom)

In [None]:
# ------------------------------------------------------------------------------
def extract_color_histogram(Img: np.ndarray, n_bins: int=256):
    """Extract per-channel colour histograms of an image in three colour spaces.
    --> param Img : ndarray, RGB image
              n_bins : number of bins per channel
    --> return rgb_ft, hsv_ft, lab_ft : three ndarrays of shape (1, 3*n_bins), each
        the concatenation of the three channel histograms of that colour space.

    Every histogram covers the value range [1, 256), so zero-valued pixels are
    not counted, and is normalised to sum to 1.
    """
    def _channel_histograms(image):
        # One normalised histogram per channel (first three channels), joined into a single row.
        pieces = []
        for plane in cv.split(image)[:3]:
            counts = cv.calcHist([plane], [0], None, [n_bins], [1, 256])
            pieces.extend(counts/counts.sum())
        return np.array(pieces).reshape(1, -1)

    # --- RGB : ---
    rgb_ft = _channel_histograms(Img)

    # --- HSV : ---
    hsv_ft = _channel_histograms(cv.cvtColor(Img, cv.COLOR_RGB2HSV))

    # --- LAB : ---
    lab_ft = _channel_histograms(cv.cvtColor(Img, cv.COLOR_RGB2LAB))

    return rgb_ft, hsv_ft, lab_ft

In [None]:
### Test function ###
# 64 bins per channel gives 3 * 64 = 192 values per colour space.
rgb, hsv, lab = extract_color_histogram(img, 64)

In [None]:
# Each histogram vector is a single row of shape (1, 3 * n_bins).
print(hsv.shape)
print(rgb.shape)
print(lab.shape)

#### Texture

In [None]:
def extract_texture_fd(Img: np.ndarray, P:int =16, R:int =2):
    """Extract texture descriptors from the blue channel of an image.
    --> param Img : ndarray, RGB image
              P, R: integers, LBP parameters (number of neighbours, radius)
    --> return lbp_fd, haralick_fd, glcm_fd : ndarrays, each of shape (1, k):
        - lbp_fd: density histogram of the uniform LBP codes; its length is the
          largest code found in the image plus one
        - haralick_fd: Haralick features averaged over the directions returned by mahotas
        - glcm_fd: 5 GLCM properties x 4 angles at distance 1 (20 values)
    """
    blue = Img[:,:,2]  # texture is measured on the blue channel only

    # Local Binary Pattern (LBP) :
    codes = local_binary_pattern(blue, P, R, method='uniform')
    n_codes = int(codes.max() + 1)
    lbp_hist, _ = np.histogram(codes.ravel(), bins=n_codes, range=(0, n_codes), density=True)

    # Haralick :
    haralick_mean = mahotas.features.haralick(blue).mean(axis=0)

    # Gray Level Co-occurrence Matrix (GLCM) :
    cooccurrence = graycomatrix(blue,
                                distances=[1],
                                angles=[0, np.pi/4, np.pi/2, 3*np.pi/4],
                                symmetric=True,
                                normed=True)
    glcm_parts = []
    for prop_name in ('correlation', 'homogeneity', 'contrast', 'energy', 'dissimilarity'):
        glcm_parts.append(graycoprops(cooccurrence, prop_name).ravel())
    glcm_vec = np.hstack(glcm_parts)

    # Every descriptor is returned as a single row so callers can stack them.
    return (np.array(lbp_hist).reshape(1, -1),
            np.array(haralick_mean).reshape(1, -1),
            np.array(glcm_vec).reshape(1, -1))

In [None]:
# Function test: texture descriptors of the sample image with P=16, R=2.
lbp_f, haralick_f, glcm_f = extract_texture_fd(img, 16, 2)
print(lbp_f.shape)
print(haralick_f.shape)
print(glcm_f.shape)

#### All

In [None]:
def extract_save_features(df, challenge, lbp_params = None):
  '''
    Extract all features from all preprocessed images listed in a dataframe.
    Despite its name the function does not write anything to disk.
    args:
      - df: (pandas dataframe): dataframe with an 'image_id' column
      - challenge: (string) Challenge1 or Challenge2
      - lbp_params: (dict) LBP parameters with keys 'P' and 'R';
                    None uses P=16, R=2 (the defaults of extract_texture_fd)
    returns a tuple of 8 numpy arrays, one row per image:
      (variegation, color moments, rgb hist, hsv hist, lab hist, lbp, haralick, glcm)
  '''
  # The default used to be subscripted directly, which raised a TypeError.
  if lbp_params is None:
    lbp_params = {'P': 16, 'R': 2}

  names = ('variegation', 'color', 'rgb', 'hsv', 'lab', 'lbp', 'haralick', 'glcm')
  collected = {name: [] for name in names}

  for i, row in df.iterrows():
      print(i)  # progress indicator
      img = read_img_prepro(row['image_id'], challenge)

      # Global / colour features (histograms use 64 bins per channel)
      varieg = variegation(img)
      col_mom = color_moments(img)
      rgb_tr, hsv_tr, lab_tr = extract_color_histogram(img, 64)

      # Texture features
      lbp_f, haralick_f, glcm_f = extract_texture_fd(img, P=lbp_params['P'], R=lbp_params['R'])

      per_image = (varieg, col_mom, rgb_tr, hsv_tr, lab_tr, lbp_f, haralick_f, glcm_f)
      for name, vec in zip(names, per_image):
          collected[name].append(vec)

  # Every per-image vector has shape (1, k); stacking gives (n, 1, k), so drop the middle axis.
  return tuple(np.squeeze(np.array(collected[name]), axis=1) for name in names)

# Save preprocessed images

In [None]:
def save_imgs(df, challenge, img_size, crop_params=None, cc_params=None, remove_hair_params=None):
  '''
    Preprocess the images listed in a dataframe and save them as JPEG files in
    <dataset>/<challenge>/test_preprocessed/, the folder read_img_prepro loads from.
    args:
      - df: (dataframe): dataframe with an 'image_id' column
      - challenge: (string) Challenge1 or Challenge2
      - img_size: (int) target size of the largest image side
      - crop_params: (dict) parameters of cropping function stored in a dict
      - cc_params: (dict) parameters of color normalization function stored in a dict
      - remove_hair_params: (dict) parameters of hair removal function stored in a dict
    returns: nothing
  '''
  # Must match the folder used by read_img_prepro; the previous 'test/preprocessed'
  # target was never read back by the feature extraction.
  out_dir = f'/content/drive/MyDrive/CAD_Dataset/{challenge}/test_preprocessed'
  os.makedirs(out_dir, exist_ok=True)

  for img_id in df['image_id']:
    print(img_id)  # progress indicator
    img = read_img(img_id, challenge)
    img = preprocess(img, img_size, crop_params, cc_params, remove_hair_params)
    Image.fromarray(img).save(f'{out_dir}/{img_id}.jpg')

In [None]:
# Creating CSV files with file names for each challenge
def _list_image_ids(directory):
    """Return the sorted file names (extension removed) of the regular files in ``directory``."""
    # splitext copes with ids and extensions of any length, unlike a fixed
    # character slice of the full path.
    return sorted(
        os.path.splitext(filename)[0]
        for filename in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, filename))
    )


# Challenge 1
img_ids_1 = _list_image_ids("/content/drive/MyDrive/CAD_Dataset/Challenge1/testX")

# Challenge 2
img_ids_2 = _list_image_ids("/content/drive/MyDrive/CAD_Dataset/Challenge2/testX")

test1_df = pd.DataFrame({'image_id': img_ids_1})
test2_df = pd.DataFrame({'image_id': img_ids_2})

# NOTE(review): to_csv also writes the row index; kept so existing CSV files stay compatible.
test1_df.to_csv('/content/drive/MyDrive/CAD_Dataset/Challenge1/test.csv')
test2_df.to_csv('/content/drive/MyDrive/CAD_Dataset/Challenge2/test.csv')

In [None]:
# Reload the id tables from disk and print how many test images each challenge has.
# NOTE(review): the CSVs were written with their row index, so it presumably comes back
# as an extra unnamed column here — only 'image_id' is used downstream.
test_df_challenge1 = pd.read_csv("/content/drive/MyDrive/CAD_Dataset/Challenge1/test.csv")
print(len(test_df_challenge1))
test_df_challenge2 = pd.read_csv("/content/drive/MyDrive/CAD_Dataset/Challenge2/test.csv")
print(len(test_df_challenge2))

In [None]:
# `challenge` is interpolated into the dataset path (CAD_Dataset/<challenge>/testX/...),
# so it must be the challenge folder name, not a split name such as 'train'.
save_imgs(test_df_challenge1, 'Challenge1', 500, crop_params={'threshold':50}, cc_params={'power':6, 'gamma':None}, remove_hair_params={'se_size': 9})

In [None]:
# `challenge` is interpolated into the dataset path (CAD_Dataset/<challenge>/testX/...),
# so it must be the challenge folder name, not a split name such as 'val'.
save_imgs(test_df_challenge2, 'Challenge2', 500, crop_params={'threshold':50}, cc_params={'power':6, 'gamma':None}, remove_hair_params={'se_size': 9})

# Extract Features 

## Challenge 1


In [None]:
# Extract all eight feature blocks for the Challenge1 test images (LBP with P=16, R=2).
variegation_fd1, color_feature_fd1, rgb_fd1, hsv_fd1, lab_fd1, lbp_fd1, haralick_fd1, glcm_fd1 = extract_save_features(test_df_challenge1, 
                                                                                                                'Challenge1', 
                                                                                                                lbp_params={'P':16, 'R':2})

In [None]:
# ----  Save the extracted features in our Save_Folder :  ----
save_path = '/content/drive/MyDrive/CAD_Dataset/Challenge1/'

# --  Save Test Features (one pickle file per feature block) :
if save_path is not None:
  features_dir = os.path.join(save_path, 'features_test')
  os.makedirs(features_dir, exist_ok=True)

  features_to_save = {
      'test_variegation_fd': variegation_fd1,
      'test_color_feature_fd': color_feature_fd1,
      'test_rgb_fd': rgb_fd1,
      'test_hsv_fd': hsv_fd1,
      'test_lab_fd': lab_fd1,
      'test_lbp_fd': lbp_fd1,
      'test_haralick_fd': haralick_fd1,
      'test_glcm_fd': glcm_fd1,
  }
  for feature_name, feature_array in features_to_save.items():
    with open(os.path.join(features_dir, feature_name + '.pkl'), 'wb') as f:
      pickle.dump(feature_array, f)
    print(feature_name + " were saved..\n")

test_variegation_fd were saved..

test_color_feature_fd were saved..

test_rgb_fd were saved..

test_hsv_fd were saved..

test_lab_fd were saved..

test_lbp_fd were saved..

test_haralick_fd were saved..

test_glcm_fd were saved..



## Challenge 2


In [None]:
# Extract all eight feature blocks for the Challenge2 test images (LBP with P=16, R=2).
variegation_fd2, color_feature_fd2, rgb_fd2, hsv_fd2, lab_fd2, lbp_fd2, haralick_fd2, glcm_fd2 = extract_save_features(test_df_challenge2, 
                                                                                                                'Challenge2', 
                                                                                                                lbp_params={'P':16, 'R':2})

In [None]:
# ----  Save the extracted features in our Save_Folder :  ----
save_path = '/content/drive/MyDrive/CAD_Dataset/Challenge2/'

# --  Save Test Features (one pickle file per feature block) :
if save_path is not None:
  features_dir = os.path.join(save_path, 'features_test')
  os.makedirs(features_dir, exist_ok=True)

  features_to_save = {
      'test_variegation_fd': variegation_fd2,
      'test_color_feature_fd': color_feature_fd2,
      'test_rgb_fd': rgb_fd2,
      'test_hsv_fd': hsv_fd2,
      'test_lab_fd': lab_fd2,
      'test_lbp_fd': lbp_fd2,
      'test_haralick_fd': haralick_fd2,
      'test_glcm_fd': glcm_fd2,
  }
  for feature_name, feature_array in features_to_save.items():
    with open(os.path.join(features_dir, feature_name + '.pkl'), 'wb') as f:
      pickle.dump(feature_array, f)
    print(feature_name + " were saved..\n")

test_variegation_fd were saved..

test_color_feature_fd were saved..

test_rgb_fd were saved..

test_hsv_fd were saved..

test_lab_fd were saved..

test_lbp_fd were saved..

test_haralick_fd were saved..

test_glcm_fd were saved..



# Prediction

## Challenge 1

### Data

In [None]:
# Id table of the Challenge1 test images; the predicted labels are attached to it further down.
test_df1 = pd.read_csv('/content/drive/MyDrive/CAD_Dataset/Challenge1/test.csv')

In [None]:
# Open test features
save_path = '/content/drive/MyDrive/CAD_Dataset/Challenge1/'


def _load_test_feature(root, feature_name):
    """Unpickle <root>/features_test/<feature_name>.pkl and return its content."""
    with open(os.path.join(root, 'features_test', feature_name + '.pkl'), 'rb') as file:
        return pickle.load(file)


# ---  Read Saved test Features :  ---------------------------------
test_variegation_fd = _load_test_feature(save_path, 'test_variegation_fd')
test_color_feature_fd = _load_test_feature(save_path, 'test_color_feature_fd')
test_rgb_fd = _load_test_feature(save_path, 'test_rgb_fd')
test_hsv_fd = _load_test_feature(save_path, 'test_hsv_fd')
test_lab_fd = _load_test_feature(save_path, 'test_lab_fd')
test_lbp_fd = _load_test_feature(save_path, 'test_lbp_fd')
test_haralick_fd = _load_test_feature(save_path, 'test_haralick_fd')
test_glcm_fd = _load_test_feature(save_path, 'test_glcm_fd')

In [None]:
# Sanity check: every feature block should have one row per test image.
for feature_block in (test_variegation_fd,
                      test_color_feature_fd,
                      test_rgb_fd,
                      test_hsv_fd,
                      test_lab_fd,
                      test_lbp_fd,
                      test_haralick_fd,
                      test_glcm_fd):
    print(feature_block.shape)

(6340, 3)
(6340, 12)
(6340, 192)
(6340, 192)
(6340, 192)
(6340, 18)
(6340, 13)
(6340, 20)


### Models

In [None]:
# Load the two pickled Challenge1 classifiers used for the averaged prediction below.
# NOTE(review): unpickling presumably needs library versions compatible with the ones
# used when the models were saved — confirm for scikit-learn / xgboost.
save_path = '/content/drive/MyDrive/CAD_Dataset/Challenge1/'

with open(os.path.join(save_path, 'models', 'xgboost_final' + '.pkl'), 'rb') as file:
    xgboost = pickle.load(file)
with open(os.path.join(save_path, 'models', 'svm_final' + '.pkl'), 'rb') as file: 
    svm = pickle.load(file)

### Prediction

In [None]:
# Features
# Only a subset of the extracted columns is fed to the Challenge1 models:
color_without_means = test_color_feature_fd[:, 3:]   # drop the first 3 columns (channel means)
rgb_without_first_channel = test_rgb_fd[:, 64:]      # drop the first 64 columns (first channel's 64-bin histogram)
haralick_selected = test_haralick_fd[:, [0, 1, 2, 3, 4, 6, 9, 10]]  # 8 selected Haralick columns

# The GLCM block is not part of this feature set.
test_features = np.hstack([test_variegation_fd,
                           color_without_means,
                           rgb_without_first_channel,
                           test_hsv_fd,
                           test_lab_fd,
                           test_lbp_fd,
                           haralick_selected])

In [None]:
# Soft voting: average the class probabilities of both models and keep the most
# probable column for every image.
# NOTE(review): argmax yields a column index, which equals the class label only if both
# models order their classes as [0, 1] — confirm against svm.classes_ / xgboost.classes_.
svm_probs = svm.predict_proba(test_features)
xgbt_probs = xgboost.predict_proba(test_features)
y_probs = (svm_probs + xgbt_probs)/2
y_preds = np.argmax(y_probs, axis=1)

In [None]:
# Number of images predicted as class 0 and as class 1.
print(len(y_preds[y_preds==0]))
print(len(y_preds[y_preds==1]))

3266
3074


In [None]:
# Attach the predictions to the id table and write the Challenge1 results CSV.
# to_csv is called without index=False, so the row index is written as well.
test_df1['labels'] = y_preds
test_df1.to_csv(f'{save_path}/test_results.csv')

## Challenge 2 

### Data

In [None]:
# Id table of the Challenge2 test images; the predicted labels are attached to it further down.
test_df2 = pd.read_csv('/content/drive/MyDrive/CAD_Dataset/Challenge2/test.csv')

# Open test features
save_path = '/content/drive/MyDrive/CAD_Dataset/Challenge2/'


def _load_test_feature(root, feature_name):
    """Unpickle <root>/features_test/<feature_name>.pkl and return its content."""
    with open(os.path.join(root, 'features_test', feature_name + '.pkl'), 'rb') as file:
        return pickle.load(file)


# ---  Read Saved test Features :  ---------------------------------
test_variegation_fd = _load_test_feature(save_path, 'test_variegation_fd')
test_color_feature_fd = _load_test_feature(save_path, 'test_color_feature_fd')
test_rgb_fd = _load_test_feature(save_path, 'test_rgb_fd')
test_hsv_fd = _load_test_feature(save_path, 'test_hsv_fd')
test_lab_fd = _load_test_feature(save_path, 'test_lab_fd')
test_lbp_fd = _load_test_feature(save_path, 'test_lbp_fd')
test_haralick_fd = _load_test_feature(save_path, 'test_haralick_fd')
test_glcm_fd = _load_test_feature(save_path, 'test_glcm_fd')

In [None]:
# Sanity check: every feature block should have one row per test image.
for feature_block in (test_variegation_fd,
                      test_color_feature_fd,
                      test_rgb_fd,
                      test_hsv_fd,
                      test_lab_fd,
                      test_lbp_fd,
                      test_haralick_fd,
                      test_glcm_fd):
    print(feature_block.shape)

(226, 3)
(226, 12)
(226, 192)
(226, 192)
(226, 192)
(226, 18)
(226, 13)
(226, 20)


### Models

In [None]:
# Load the pickled Challenge2 classifier.
# NOTE(review): unpickling presumably needs library versions compatible with the ones
# used when the model was saved — confirm.
save_path = '/content/drive/MyDrive/CAD_Dataset/Challenge2/'

with open(os.path.join(save_path, 'models', 'voting_smund1700_yeo_newAll_clf6' + '.pkl'), 'rb') as file:
    eclf = pickle.load(file)

### Prediction

In [None]:
# Features :
# Challenge2 uses every extracted column, stacked side by side in extraction order.
# With the shapes shown in this notebook's output that is
# 3 + 12 + 3*192 + 18 + 13 + 20 = 642 columns per image.
test_features_all = np.hstack([test_variegation_fd, 
                               test_color_feature_fd, 
                               test_rgb_fd, 
                               test_hsv_fd, 
                               test_lab_fd, 
                               test_lbp_fd, 
                               test_haralick_fd, 
                               test_glcm_fd]) 

In [None]:
y_pred = eclf.predict(test_features_all)  # Predictions
# Number of images assigned to each of the labels 0, 1 and 2.
print('Label 0:', np.count_nonzero(y_pred==0))
print('Label 1:', np.count_nonzero(y_pred==1))
print('Label 2:', np.count_nonzero(y_pred==2))

Label 0: 1079
Label 1: 907
Label 2: 135


In [None]:
# Attach the predictions to the id table and write the Challenge2 results CSV.
# to_csv is called without index=False, so the row index is written as well.
test_df2['labels'] = y_pred
test_df2.to_csv(f'{save_path}/test_results_challenge2.csv')