# Project 3: Image recognition (Melanoma)
### Group 7: Emma, Laurits, Malthe, Mads and Jonas


## Loading the Libraries

In [1]:
## only applies when using deepnote
!apt update
!apt install ffmpeg libsm6 libxext6 -y

!pip install opencv-python
!pip install --upgrade pip

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os 
import fyp2021p3_group00_functions as util
import pandas as pd
import cv2
import seaborn as sns
import math

from skimage import morphology
from skimage import filters
from skimage.morphology import opening
from skimage.color import label2rgb
from skimage.segmentation import slic
from skimage.segmentation import mark_boundaries
from skimage.util import img_as_float
from scipy.ndimage import rotate
from skimage.exposure import is_low_contrast

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.decomposition import PCA

## Functions

In [3]:
def check_quality(dictionary_, image_id_jpg):
    """Return the pixel count (height * width) of a single image.

    The file path is looked up in ``dictionary_`` under the key
    ``image_id_jpg`` and loaded with ``plt.imread``. The product of the first
    two dimensions of the loaded array is returned, so the caller can compare
    it against a minimum resolution.
    """
    height, width = plt.imread(dictionary_[image_id_jpg]).shape[:2]
    return height * width

In [4]:
def low_quality_pictures(resolution, dictionary_):
    """Remove every image with fewer than ``resolution`` pixels.

    Each key of ``dictionary_`` is measured with ``check_quality``. Keys whose
    pixel count is below ``resolution`` are popped from ``dictionary_`` in
    place. A summary of kept/removed counts and the removed keys is printed;
    nothing is returned.
    """
    total = len(dictionary_)
    # Collect first, remove afterwards: the dict must not change while iterating.
    popped_image = [
        name for name in dictionary_
        if check_quality(dictionary_, str(name)) < resolution
    ]
    not_fine = len(popped_image)
    fine = total - not_fine

    for name in popped_image:
        dictionary_.pop(name, None)

    print('Fine: {0}, Not fine: {1}'.format(fine, not_fine))
    print('Image popped from the dict: {0}'.format(popped_image))

In [5]:
def bluriness(dictionary_, threshold=10):
    """Remove blurry images (and their segmentation masks) from a dictionary.

    Sharpness is measured as the variance of the Laplacian of the image read
    from each path. An entry counts as blurry when that variance is not
    greater than ``threshold``.

    Parameters
    ----------
    dictionary_ : dict
        Maps file names to file paths. It is modified in place.
    threshold : float, optional
        Laplacian variance at or below which an image is treated as blurry.
        Defaults to 10, the value that was previously hard-coded.

    Notes
    -----
    For a blurry raw image ``<name>.<ext>`` the companion entry
    ``<name>_segmentation.png`` is removed as well, if present. Removal is
    tolerant of missing keys: previously a flagged mask (or a raw image whose
    mask had already been removed) raised ``KeyError`` here.
    A flagged mask is removed on its own; its raw image is left in place.
    """
    mask_suffix = "_segmentation.png"

    blurry_images = []
    not_blurry = 0
    blurry = 0
    for key, value in dictionary_.items():
        image = cv2.imread(str(value))
        sharpness = cv2.Laplacian(image, cv2.CV_64F).var()
        if sharpness > threshold:
            not_blurry += 1
        else:
            blurry += 1
            blurry_images.append(key)

    # Now we delete the blurry images from the dictionary. This is done after
    # the loop because a dict must not change size while being iterated.
    for key in blurry_images:
        dictionary_.pop(key, None)
        name = str(key)
        if not name.endswith(mask_suffix):
            # Only raw images have a companion mask; a mask key must not be
            # turned into "<id>_segmentation_segmentation.png".
            dictionary_.pop(os.path.splitext(name)[0] + mask_suffix, None)

    print('Not Blurry: {0}'.format(not_blurry))
    print('Blurry: {0}'.format(blurry))
    print("Blurry Images: {0}".format(blurry_images))

In [6]:
def contrast(dictionary_):
    """Remove low-contrast images from a dictionary of image paths.

    Every path in ``dictionary_`` is read with ``cv2.imread`` and passed to
    ``is_low_contrast``. Keys flagged as low contrast are popped from
    ``dictionary_`` in place, and a summary is printed. Nothing is returned.
    """
    total = len(dictionary_)
    low_ = [
        name for name, path in dictionary_.items()
        if is_low_contrast(cv2.imread(str(path)))
    ]
    low = len(low_)
    high = total - low

    # Removal happens after the scan so the dict is not resized mid-iteration.
    for name in low_:
        dictionary_.pop(name)

    print('Number of Images with High Contrast: {0}'.format(high))
    print('Number of Images with Low Contrast: {0}'.format(low))
    print("Images with Low Contrast: {0}".format(low_))

In [7]:
def get_mask_size(mask):
    """Return the bounding box of the non-zero region of a 2-D mask.

    The result is ``(x_min, x_max, y_min, y_max)``, where x indexes columns
    and y indexes rows. Both maxima are inclusive: they are the indices of
    the last non-zero column and row.
    """
    rows, cols = np.nonzero(mask)
    return cols.min(), cols.max(), rows.min(), rows.max()
    

In [8]:
def accuracy_matrix(prediction, answer):
    """Return the per-class rates of a binary prediction.

    Parameters
    ----------
    prediction : array-like of 0/1
        Labels predicted by a model.
    answer : array-like of 0/1
        True labels, in the same order as ``prediction``.

    Returns
    -------
    tuple of float
        ``(true_positive, false_positive, true_negative, false_negative)``.
        ``true_positive`` is the share of actual positives predicted as 1, and
        ``false_negative`` is its complement. ``true_negative`` is the share
        of actual negatives predicted as 0, and ``false_positive`` is its
        complement. If ``answer`` has no positives (or no negatives), the
        corresponding pair is ``nan`` instead of raising ZeroDivisionError.
    """
    # Plain arrays: accepts lists too, and compares element by element
    # without depending on a pandas index.
    prediction = np.asarray(prediction)
    answer = np.asarray(answer)

    actual_positive = answer == 1
    actual_negative = answer == 0
    n_positive = int(np.count_nonzero(actual_positive))
    n_negative = int(np.count_nonzero(actual_negative))

    if n_positive:
        true_positive = int(np.count_nonzero(actual_positive & (prediction == 1))) / n_positive
    else:
        true_positive = float("nan")
    false_negative = 1 - true_positive

    if n_negative:
        true_negative = int(np.count_nonzero(actual_negative & (prediction == 0))) / n_negative
    else:
        true_negative = float("nan")
    false_positive = 1 - true_negative

    return true_positive, false_positive, true_negative, false_negative

In [9]:
def make_knn_prediction(k):
    """Train a k-nearest-neighbours model and predict the test data.

    The model is fitted on the 'Asymmetry', 'Border' and 'Colour' columns of
    the notebook-level dataframe ``df``, with ``df['illens']`` as the target.
    It then returns its predictions for the same three columns of the
    notebook-level dataframe ``df_test``.

    ``k`` is the number of neighbours used by the classifier.
    """
    feature_columns = ['Asymmetry', 'Border', 'Colour']
    classifier = KNeighborsClassifier(n_neighbors=k)  # create the model
    classifier.fit(df[feature_columns], df['illens'])  # train the model
    return classifier.predict(df_test[feature_columns])  # predict the test rows

## Loading the data

In [10]:
# First we make the path to the data
PATH = {}
PATH["data"] ="../data"

# Then we map every file name under the data folder to its normalised path.
# The .csv ground-truth files are not images, so they are left out
# (only png and jpg files are wanted).
PICTURES = {}
for path, subdirs, files in os.walk(PATH["data"]):
    for picture in files:
        if not picture.endswith(".csv"):
            PICTURES[picture] = os.path.normpath(os.path.join(path, picture))

print(PICTURES)
# macOS leaves a hidden ".DS_Store" file in folders. Remove it if it was
# picked up; on other systems the key does not exist, so a default is given
# to avoid a KeyError.
PICTURES.pop(".DS_Store", None)

images = pd.read_csv("../data/example_ground_truth.csv")
images['id'] = images['image_id']
image_way = pd.DataFrame.from_dict(PICTURES, orient='index')
# The first 12 characters of a file name are the image id. The column is built
# in one assignment instead of element-wise chained assignment
# (image_way['id'][number] = ...), which relied on positional fallback of an
# integer key on a string-indexed Series.
image_way['id'] = [name[0:12] for name in image_way.index]
image_merge = images.merge(image_way, left_on ='image_id' , right_on = 'id', how = 'outer')
image_merge['healthy'] = 1-image_merge['melanoma']-image_merge['seborrheic_keratosis']
image_merge = image_merge.rename(columns ={0: 'path'})
image_merge['mask'] = image_merge['path'].str.contains('png')
image_merge['raw'] = image_merge['path'].str.contains('jpg')
# Reworking images to contain the name and health of all the images to use later.
images = images.dropna(axis=0)
images = images[images['seborrheic_keratosis']!= 1]


# Task 0: Explore the Data

Go through the data (csv file, images, segmentations) that you have available
to understand what’s available to you, and write a brief description. Decide if
this data is sufficient, or if cleaning is needed. For example, what do you do with
the images that are malignant (cancer), but not of the class you want to focus
on? Are there images of low quality? Etc. You are allowed to search for and add
other public dataset, to this set of images.

##### Description of Data: 
The data includes 150 images of skin lesions and another 150 segmentation images of the same lesions, in which the image is zoomed in on the lesion. Besides this, the data also includes superpixel images for some of the lesions.

In [11]:
# Read the ground truth and split its rows by diagnosis
image_info = pd.read_csv("../data/example_ground_truth.csv")

has_melanoma = image_info['melanoma'] == 1
has_keratosis = image_info['seborrheic_keratosis'] == 1
no_diagnosis = (image_info['melanoma'] == 0) & (image_info['seborrheic_keratosis'] == 0)

cancer_free = image_info[no_diagnosis]
melanoma = image_info[has_melanoma]
keratosis = image_info[has_keratosis]

# Report the shape of the full table and of each subset
summary = "image_info: {0}, cancer_free: {1}, melanoma: {2}, keratosis: {3}"
print(summary.format(image_info.shape, cancer_free.shape, melanoma.shape, keratosis.shape))


In [12]:
# Keep the keratosis image ids in a list to use later
list_keratosis = list(keratosis["image_id"])

# Delete every file belonging to a keratosis image from the dictionary PICTURES
# (the length of the dictionary was reported to change from 357 to 275 images)
for lesion_id in list_keratosis:
    for suffix in ('.jpg', '_segmentation.png', '_superpixels.png'):
        PICTURES.pop(lesion_id + suffix, None)

# Now we remove the remaining superpixel files
super_pixels = [name for name in PICTURES if '_superpixels.png' in name]
for name in super_pixels:
    PICTURES.pop(name)

print(len(PICTURES))

## Checking for duplicates

In [13]:
""" First we check for duplicates"""

# Checks for duplicated image IDs in image_info
image_id = set()
not_unique_id = 0
for i in image_info['image_id']:
    if i not in image_id:
        image_id.add(i)
    else:
        # This used to increment the undefined name `not_unique`, which raised
        # NameError on the first duplicate instead of counting it.
        not_unique_id += 1
print('Number of duplicates image id: {0}'.format(not_unique_id))

# Checks that there are no duplicates in the pictures.
# PICTURES is a dict, so its keys are unique by construction and this count is
# always 0; the check is kept for the printed report.
image_unique = set()
not_unique_image = 0
for i in PICTURES:
    if i not in image_unique:
        image_unique.add(i)
    else:
        not_unique_image += 1
print('Number of duplicates images: {0}'.format(not_unique_image))

# Then we check that all IDs have a corresponding image, and vice versa

# Cancer-free and melanoma image ids are loaded into sets
cf = set(cancer_free['image_id'])
mel = set(melanoma['image_id'])

# Checks that every ID in image_info has a .jpg in the example folders
no_image = set()
for i in cf | mel:
    if (i + '.jpg') not in PICTURES:
        no_image.add(i)

print('No image: {0}'.format(len(no_image)))

# Checks that every picture in the example image folder has an image ID in image_info
no_ID = set()

for i in PICTURES:
    x = i[0:12] # The first 12 characters are the ID
    if x not in image_id:
        no_ID.add(i)
print('No ID in image_info: {0}'.format(len(no_ID)))


## Cleaning the Data

In the following cell the dataset of images is cleaned for the following three criteria:

- Resolution
- Blurriness/Sharpness
- Contrast

The images that do not fit the criteria are removed from the image dictionary.

In [14]:
## Checking RESOLUTION
# sd is the minimum pixel count (600 x 450) handed to low_quality_pictures
# as the resolution cut-off for the images in PICTURES.
sd = 600*450
low_quality_pictures(sd, PICTURES)

In [15]:
## Checking BLURRINESS/SHARPNESS
# Runs the bluriness() check on PICTURES; the dictionary is passed by reference.
bluriness(PICTURES)

Now we will check the colour quality by checking the contrast level.

To check the contrast of an image we use the function is_low_contrast from the library skimage.exposure. 

https://scikit-image.org/docs/dev/api/skimage.exposure.html#re0c68370bb9d-1

In [16]:
## Checking CONTRAST
# Runs the contrast() check on PICTURES; the dictionary is passed by reference.
contrast(PICTURES)

## External Data

First we load the data and clean it with the same criteria used on the previous data in the dictionary named PICTURES.

In [17]:
# First we make the path to the data
PATH = {}
PATH["external data"] ="../external_data/external_data"

# Then we map every file name under the external data folder to its normalised
# path. The .csv files are not images, so they are left out
# (only png and jpg files are wanted).
EXTERNAL = {}
for path, subdirs, files in os.walk(PATH["external data"]):
    for picture in files:
        if not picture.endswith(".csv"):
            EXTERNAL[picture] = os.path.normpath(os.path.join(path, picture))

# macOS leaves a hidden ".DS_Store" file in folders. Remove it if present;
# a default is given so that a missing key does not raise KeyError.
EXTERNAL.pop('.DS_Store', None)
print(len(EXTERNAL))

# Loading the data into a dataframe and classifying based on class.
# Each subset is an independent copy, so columns can be added to it later
# without writing into a slice of external_image_info.
external_image_info = pd.read_csv("../external_data/ISIC-2017_Training_Part3_GroundTruth.csv")
external_cancer_free = external_image_info[(external_image_info['melanoma']== 0) & (external_image_info['seborrheic_keratosis']==0)].copy()
external_melanoma = external_image_info[external_image_info['melanoma'] == 1].copy()
external_keratosis = external_image_info[external_image_info['seborrheic_keratosis']==1].copy()

#image_info
print("image_info: {0}, cancer_free: {1}, melanoma: {2}, keratosis: {3}"
.format(external_image_info.shape, external_cancer_free.shape, external_melanoma.shape, external_keratosis.shape))

The external data is checked for duplicates

In [18]:
# Checks for duplicated image IDs in external_image_info
image_id = set()
not_unique_id = 0
for i in external_image_info['image_id']:
    if i not in image_id:
        image_id.add(i)
    else:
        # This used to increment the undefined name `not_unique`, which raised
        # NameError on the first duplicate instead of counting it.
        not_unique_id += 1
print('Number of duplicates image id: {0}'.format(not_unique_id))

# Checks that there are no duplicates in the pictures.
# EXTERNAL is a dict, so its keys are unique by construction and this count is
# always 0; the check is kept for the printed report.
image_unique = set()
not_unique_image = 0
for i in EXTERNAL:
    if i not in image_unique:
        image_unique.add(i)
    else:
        not_unique_image += 1
print('Number of duplicates images: {0}'.format(not_unique_image))

# Then we check that all IDs have a corresponding image, and vice versa

# All image ids of the external ground truth are loaded into a set
mel = set(external_image_info['image_id'])

# Checks that every ID in the ground truth has a .jpg among the external files
no_image = set()
for i in mel:
    if (i + '.jpg') not in EXTERNAL:
        no_image.add(i)

print('No image: {0}'.format(len(no_image)))

# Checks that every external file has an image ID in the ground truth
no_ID = set()

for i in EXTERNAL:
    x = i[0:12] # The first 12 characters are the ID
    if x not in image_id:
        no_ID.add(i)

print('No ID in image_info: {0}'.format(len(no_ID)))

External data is checked for resolution quality, blurriness, and contrast quality

In [19]:
# sd is the minimum pixel count (720 x 480) handed to low_quality_pictures
# as the resolution cut-off for the images in EXTERNAL.
sd = 720 * 480
low_quality_pictures(sd, EXTERNAL)

In [20]:
# Runs the bluriness() check on EXTERNAL; the dictionary is passed by reference.
bluriness(EXTERNAL)

In [21]:
# Runs the contrast() check on EXTERNAL; the dictionary is passed by reference.
contrast(EXTERNAL)

Now the external data is separated 50/50, so that 50% can be used to test the model again and the other 50% can provide the results needed.

In [22]:
# Split the external files by diagnosis. A file belongs to an image when the
# first 12 characters of its name equal the image id.
x = [i for i in external_melanoma['image_id'] ]
# Membership is tested against a set: a list lookup is repeated for every file
# and scans the whole list each time.
melanoma_ids = set(x)
melanoma_dict = {key: value for key, value in EXTERNAL.items() if key[0:12] in melanoma_ids}

y = [i for i in external_cancer_free['image_id'] ]
cancer_free_ids = set(y)
cancer_free_dict = {key: value for key, value in EXTERNAL.items() if key[0:12] in cancer_free_ids}

# Add the expected mask and raw file name of every melanoma image.
# assign() builds a new dataframe, so nothing is written into a slice of
# external_image_info (which triggers pandas' SettingWithCopyWarning).
external_melanoma = external_melanoma.assign(
    mask=external_melanoma['image_id'] + '_segmentation.png',
    raw=external_melanoma['image_id'] + '.jpg',
)
external_melanoma

# Task 1: Implement Two Features
Choose one of the ABC (Asymmetry, Border or Color) features and implement a function to measure it for one
image. While you are doing this, you might want to create “toy” images where
you already know the results, for example a circle should be less asymmetric
than an ellipse, etc.
Once you are satisfied with your implementations, run them on all your
images, and examine the feature distributions for each class, for example using
scatter plots. Do you see differences between the classes?

## Asymmetry

In [23]:
def test_asymmetry(mask):

    x_min, x_max, y_min, y_max = get_mask_size(mask)

    mask_cropped = mask[y_min:y_max, x_min:x_max]

    x_half = (x_max - x_min) / 2 # Gives us the middle 

    if x_half.is_integer():
        mask_x_half_left = mask_cropped[:, :int(x_half)]
        mask_x_half_right = mask_cropped[:, int(x_half):]
    else:
        mask_x_half_left = mask_cropped[:, :int(x_half)+1]
        mask_x_half_right = mask_cropped[:, int(x_half):]

    mask_x_half_right_flipped = np.fliplr(mask_x_half_right)

    x_diff = mask_x_half_left - mask_x_half_right_flipped

    # Fold in half y-axis
    y_half = (y_max - y_min) / 2 # Gives us the middle 

    if y_half.is_integer():
        mask_y_half_left = mask_cropped[:int(y_half), :]
        mask_y_half_right = mask_cropped[int(y_half):, :]
    else:
        mask_y_half_left = mask_cropped[:int(y_half)+1, :]
        mask_y_half_right = mask_cropped[int(y_half):, :]


    mask_y_half_right_flipped = np.flipud(mask_y_half_right)

    y_diff = mask_y_half_left - mask_y_half_right_flipped

    # Calculing diff

    gray1 = np.count_nonzero(x_diff == 0)
    non_gray1 = np.count_nonzero(x_diff)

    score_x = non_gray1/gray1 * 100

    gray1 = np.count_nonzero(y_diff == 0)
    non_gray1 = np.count_nonzero(y_diff)

    score_y = non_gray1/gray1 * 100

    return score_x if score_x > score_y else score_y

## Border

In [24]:
def find_border(mask):
    """Return a compactness score relating the border of a mask to its area.

    The border is found with a difference-of-Gaussians filter: every pixel
    whose response is stronger than 0.01 in absolute value counts as one
    border pixel. The score is ``border_pixels**2 / area * 12``. Squaring the
    border makes the ratio independent of the size of the shape, and a larger
    border relative to the area gives a higher score.

    Parameters
    ----------
    mask : 2-D array
        1 inside the lesion and 0 outside, so that the sum is the area.

    Raises
    ------
    ValueError
        If the mask is empty (area 0), for which the ratio is undefined.
    """
    # Using the fact that all values within the area of the mask are 1,
    # the area is equal to the sum of the mask.
    ones = np.sum(mask)
    if ones == 0:
        raise ValueError("find_border needs a mask with at least one non-zero pixel")

    # Applying a difference-of-Gaussians filter to find the border
    response = filters.difference_of_gaussians(mask, 1)

    # Reduce the filter response to a binary border. The earlier condition
    # `(border < -0.01) & (border > 0.01)` could never be true, so strong
    # responses were never set to 1 and the raw positive responses were
    # summed instead of counting border pixels.
    border = (np.abs(response) > 0.01).astype(float)

    surcumfrance = np.sum(border)

    # NOTE(review): the factor is applied as (P**2 / A) * 12. If the classic
    # P**2 / (4 * pi * A) was meant it should divide instead; it is a constant
    # scale either way, so it is left unchanged here.
    Compactness = surcumfrance**2/ones*12

    return(Compactness)

# Making Test-cases to test that the function returns higher value for less compacness
# and that the score is independent of size of image.
# test_square = np.pad(np.ones((100,200)), pad_width = 5, constant_values=0)
# test_big_square = np.pad(np.ones((1000,2000)), pad_width = 5, constant_values=0)
# find_border(test_big_square)
# find_border(test_square)

# Idea from https://stackoverflow.com/questions/10031580/how-to-write-simple-geometric-shapes-into-numpy-arrays
# xx, yy = np.mgrid[:200, :200]

# circle = (xx - 100) ** 2 + (yy - 100) ** 2
# circle1 = np.where(circle < 1500, 1, circle)
# circle1 = np.where(circle1 > 1, 0, circle1)
# circle2 = np.where(circle < 1850, 1, circle)
# circle2 = np.where(circle2 > 1, 0, circle2)
# find_border(circle1)
# find_border(circle2)


## Colour

In [25]:
def test_color2(image, mask):
    def sp_idx(s, index = True):
        u = np.unique(s)
        return [np.where(s == i) for i in u]
    
    # Have to do this in order for skimage label2rgb to work properly
    mask = img_as_float(mask[::2, ::2])
    image = img_as_float(image[::2, ::2])

    segments = slic(image, n_segments= 20, compactness=3,sigma= 5, max_iter=10, mask=mask, convert2lab=True)

    blended_colors = label2rgb(segments, image, kind="avg", bg_label=0) # returns RGB floats / RGB Normalized 
    # To convert the normalized RGB back to RGB, we have to multiply the colour channel with 255 and round.

    # The list contains information of the x and y coordinates that correspond with each segment.
    superpixel_list = sp_idx(segments)

    colours = []

    # Distance from this should be max 10.
    general_light_skin_colour = (255,224,189)
    general_light_skin_threshold = 10
    general_light_skin_hsp_max = 240

    light_skin_colour = None
    light_skin_colours_hsp = 0
    not_skin_colour_threshold = 15

    darkest_colour = None
    darkest_colours_hsp = 255

    # Skip the first entry since it contains the size of the image. 
    for segment in range(1,len(superpixel_list)):
        # Since we are getting all the x-coordinates and y-coordinates.
        # We can just pick one of the x and y coordinates. It doesn't matter which one, since they have the same colour.
        #                            x  x_0
        x = superpixel_list[segment][0][0]
        #                            y  y_0
        y = superpixel_list[segment][1][0]

        r = round(blended_colors[x][y][0] * 255)
        g = round(blended_colors[x][y][1] * 255)
        b = round(blended_colors[x][y][2] * 255)
        
        colours.append((r,g,b))

        distance = math.sqrt(((r - general_light_skin_colour[0])) ** 2 + ((g - general_light_skin_colour[1])) ** 2 + ((b - general_light_skin_colour[2])) ** 2)
        # hsp formula http://alienryderflex.com/hsp.html
        hsp = math.sqrt(0.299 * (r ** 2) + 0.587 * (g ** 2) + 0.114 * (b ** 2))
        if hsp > light_skin_colours_hsp and (distance < general_light_skin_threshold or hsp < general_light_skin_hsp_max):
            light_skin_colours_hsp = hsp
            light_skin_colour = (r,g,b)

    for r,g,b in colours:
        # Remove the outer bounds of the lesion - to similate removing the skin colour.
        # Formular to calculate how close the 2 rgb colours are to eachother.
        distance = math.sqrt(((r - light_skin_colour[0])) ** 2 + ((g - light_skin_colour[1])) ** 2 + ((b - light_skin_colour[2])) ** 2)
        if distance > not_skin_colour_threshold:
            # Find darkest colour
            hsp = math.sqrt(0.299 * (r ** 2) + 0.587 * (g ** 2) + 0.114 * (b ** 2))
            if hsp < darkest_colours_hsp:
                darkest_colours_hsp = hsp
                darkest_colour = (r,g,b)

    average_colour = []
    counter = 0
    current_average_colour = 0

    # Remove skin colour
    # There are 3 channels in RGB, and to get the average we square each channels colors and add them to together to get the average.
    for i in range(0,3):
        for index in range(len(colours)):
            current_average_colour += colours[index][i] ** 2
            counter += 1
        average_colour.append(round(math.sqrt(current_average_colour/counter)))

    # Calculate the distance between the average colour and the highest colour
    distance = math.sqrt(((average_colour[0] - darkest_colour[0])) ** 2 + ((average_colour[1] - darkest_colour[1])) ** 2 + ((average_colour[2] - darkest_colour[2])) ** 2)

    # High value is bad
    return distance


# Task 2: Predict the diagnosis
Split your data so that you are have training data and hold-out test data. Use
the training data to train different classifiers and investigate their parameters.
Once you made a choice, evaluate your classifier on the hold-out test data. Think
of different metrics you can use, and different ways to present your results.

## Preparing machine learning
The data is split into a training set, which the model learns from, and a test set; first the features are extracted.

In [26]:
# Collect the relevant photo ids: the first 12 characters of every file name
melonoma_set = {file_name[:12] for file_name in melanoma_dict}

health_set = {file_name[:12] for file_name in cancer_free_dict}


In [27]:
### Extracting the features
freature_extract = False # Set this as True to extract features for all photoes.

# Pixels of context kept around the bounding box of the lesion. The same margin
# is used for both classes; it used to be 3 for melanoma and 5 for healthy
# images, which treated the two classes differently.
CROP_MARGIN = 5


def _crop_around_mask(mask, raw_pict, margin):
    """Crop the mask and picture to the lesion's bounding box plus a margin.

    All four bounds are computed from the original coordinates before
    slicing. Cropping left/top first and then reusing the old maxima left the
    right and bottom edges further out than intended.
    """
    x_min, x_max, y_min, y_max = get_mask_size(mask)
    rows = slice(max(y_min - margin, 0), y_max + margin + 1)
    cols = slice(max(x_min - margin, 0), x_max + margin + 1)
    return mask[rows, cols], raw_pict[rows, cols]


def _extract_features(picture, path_dict, margin=CROP_MARGIN):
    """Return [asymmetry, border, colour] for one image id."""
    test_picture = path_dict[picture + ".jpg"]
    test_mask = path_dict[picture + "_segmentation.png"]
    # Load in the mask and picture
    mask = plt.imread(test_mask)
    raw_pict = plt.imread(test_picture)
    mask, raw_pict = _crop_around_mask(mask, raw_pict, margin)
    # Making the feature-extraction on the 3 features
    return [test_asymmetry(mask), find_border(mask), test_color2(raw_pict, mask)]


def _extract_all(picture_ids, path_dict):
    """Extract features for every id; return (feature rows, skipped ids).

    Ids are processed in sorted order so the rows come out in the same order
    on every run. An image that fails (e.g. its file was removed during
    cleaning, or a feature cannot be computed) is skipped and recorded with
    the error, instead of being silently dropped.
    """
    rows, skipped = [], []
    for picture in sorted(picture_ids):
        try:
            rows.append(_extract_features(picture, path_dict))
        except Exception as error:
            skipped.append((picture, repr(error)))
    return rows, skipped


if freature_extract:
    melanoma_list, melanoma_skipped = _extract_all(melonoma_set, melanoma_dict)
    healthy_list, healthy_skipped = _extract_all(health_set, cancer_free_dict)
    print('Melanoma: {0} extracted, {1} skipped'.format(len(melanoma_list), len(melanoma_skipped)))
    print('Healthy: {0} extracted, {1} skipped'.format(len(healthy_list), len(healthy_skipped)))


In [28]:
# Saving the features in csv files (only when the features were re-extracted).
# Note that the two files get different column names: the melanoma file uses
# "compactness"/"colour", the healthy file uses "Border"/"Colour".
if freature_extract:
    melanoma_df = pd.DataFrame.from_records(melanoma_list, columns = ("Asymmetry" ,"compactness", "colour"))
    melanoma_df.to_csv("../data/melanoma_data2.csv")
    # df_test holds the features of the healthy images here.
    df_test= pd.DataFrame.from_records(healthy_list, columns = ("Asymmetry","Border", "Colour"))
    df_test.to_csv("../data/health_df2.csv")

### Making the test
With the data gathered, we train the model. First the data is loaded and split into two groups.

In [29]:
# Loading the extracted features into dataframes
df_healthy = pd.read_csv("../data/health_df2.csv")
df_ill = pd.read_csv("../data/melanoma_data2.csv")
# Removing the previous index, which to_csv stored as an unnamed column.
df_healthy = df_healthy.drop(labels = "Unnamed: 0", axis= 1)
df_ill = df_ill.drop(labels= "Unnamed: 0", axis = 1)
# Adding whether cancer is or is not present in the photo
df_healthy['melanoma'] = 0
df_ill['melanoma'] = 1
# Renaming the columns so they are the same for both dataframes
df_ill = df_ill.rename(columns={"compactness": "Border", "colour": "Colour"})
# Creating the training dataset from the first 160 healthy and first 160 ill rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_training = pd.concat([df_healthy[0:160], df_ill[0:160]])
# Creating the test set from the remaining rows in the same way.
df_verification = pd.concat([df_healthy[160:], df_ill[160:]])
df_verification

In [30]:
# Standardise the three features
feature_columns = ["Asymmetry","Border", "Colour"]

# The scaler is fitted on the training rows only ...
scaler = preprocessing.StandardScaler()
scaler.fit(df_training[feature_columns])

# ... and then applied to both the training and the verification rows
df_train = scaler.transform(df_training[feature_columns])
df_test = scaler.transform(df_verification[feature_columns])


In [31]:

# Wrap the scaled values in two dataframes and attach the labels
scaled_columns = ["Asymmetry","Border", "Colour"]

df = pd.DataFrame(df_train, columns = scaled_columns).assign(
    illens=df_training['melanoma'].to_list()
)

df_test = pd.DataFrame(df_test, columns = scaled_columns).assign(
    illens=df_verification['melanoma'].to_list()
)
df_test


#### Visualizing the data

In [32]:
# Making scatterplots for all 2-dimensional possibilities.
# `height` is the current name of the facet-size parameter; `size` is the old
# name and only triggers a rename warning in seaborn.
sns.pairplot(df[["Asymmetry","Border", "illens"]], hue="illens", height=3,diag_kind="hist")

In [33]:
# `height` is the current name of the facet-size parameter (formerly `size`).
sns.pairplot(df[["Asymmetry","Colour", "illens"]], hue="illens", height=3,diag_kind="hist")

In [34]:
# `height` is the current name of the facet-size parameter (formerly `size`).
sns.pairplot(df[["Border","Colour", "illens"]], hue="illens", height=3,diag_kind="hist")

#### Predicting via KNN

In [35]:
# Test accuracy of the KNN prediction for every k from 1 to 59
k_values = range(1,60)
acc_dict = {}
acc_list = [
    accuracy_score(df_test['illens'], make_knn_prediction(k))
    for k in k_values
]

# Plot the accuracy of KNN against k
fig = plt.figure(figsize=(4, 3))
axes = fig.add_axes([0, 0, 1.5, 1])
axes.plot(k_values, acc_list)
axes.set(
    xlabel='Value of k',
    ylabel='Test accuracy',
    title='Illnes classification accuracy of KNN for diffirent values of k',
)
axes.grid(True)

### Predicting using a Gaussian Process classifier

In [36]:
# One row of rates per setting of max_iter_predict (1, 2 and 3)
gaus_predict = pd.DataFrame(columns = [ "true_positive", "false_positive", "true_negative", "false_negative"])
for i in (1, 2, 3):
    # Fit the classifier on the training frame ...
    gaus = GaussianProcessClassifier(max_iter_predict = i)
    gaus.fit(X = df[['Asymmetry','Border', 'Colour']], y = df['illens'])
    # ... and use it to predict the test data.
    guess = gaus.predict(df_test[['Asymmetry','Border', 'Colour']])
    rates = accuracy_matrix(guess, df_test['illens'])
    true_positive, false_positive, true_negative, false_negative = rates
    gaus_predict.loc[i] = list(rates)

# Show the rates as a heatmap, one row per setting
ax = sns.heatmap(gaus_predict)
ax.set_title("Heatmap of precicion by Gaussian process classification for diffirent itterations")
ax.set_ylabel("iterations");


# Task 3: open question
Use the data to formulate, motivate, answer, and discuss another research question of your choice. You may use other data or features here, that we did not
cover

### Does decreasing the dimensionality increase the performance of the model?

In [37]:
# First the performance of the model is found when only having two features
two_dimentional_factors = [['Asymmetry','Border'], ['Border', 'Colour'], ['Asymmetry', 'Colour']]
two_factor_predict = pd.DataFrame(columns = [ "true_positive", "false_positive", "true_negative", "false_negative"])
# Getting the predictions for every pair; the rows are numbered 1 to 3.
for i, factors in enumerate(two_dimentional_factors, start=1):
    gaus = GaussianProcessClassifier(max_iter_predict = 5).fit(X = df[factors], y = df['illens'])
    guess = gaus.predict(df_test[factors])
    true_positive, false_positive, true_negative, false_negative = accuracy_matrix(guess, df_test['illens'])
    two_factor_predict.loc[i] = [true_positive, false_positive,  true_negative, false_negative]

# Plotting the performance in a heatmap.
# The row labels are passed to heatmap() so they are centred on their rows;
# ticks set at 0, 1, 2 afterwards fall on the row edges. The title now
# describes this plot instead of repeating the one from the iterations plot.
names = ('Asymetry and Border', 'Border and Colour', 'Asymetry and Colour')
ax = sns.heatmap(two_factor_predict, yticklabels=list(names))
ax.set_title("Heatmap of precision by Gaussian process classification for different combinations of features")
ax.set_ylabel("Combination of features");

In [38]:
two_factor_predict  # Display the two_factor_predict table as the output of this cell

In [39]:
# Subtract the row at position 2 of gaus_predict from every row of
# two_factor_predict (presumably the three-feature model with the most
# iterations; verify against the cell that builds gaus_predict).
diffirence_in_performance = two_factor_predict - gaus_predict.iloc[2]
diffirence_in_performance # Comparing each model with the 3-dimensional model.

### Reducing dimensionality by utilizing PCA on the shape-measuring features

In [40]:
pca = PCA(n_components=1) # Create PCA object

pca_performance = pd.DataFrame(columns = [ "true_positive", "false_positive", "true_negative", "false_negative"])

# The PCA is fitted on the training data only, and the SAME projection is then
# applied to the test data. Calling fit_transform on the test data as well
# learned a second, different axis (possibly with the opposite sign), so the
# 'shape' values of the two sets were not comparable and the test data leaked
# into the transformation.
df['shape'] = pca.fit_transform(df[['Asymmetry','Border']]).ravel()
df_test['shape'] = pca.transform(df_test[['Asymmetry','Border']]).ravel()

# Evaluating the performance, and then comparing it with the three-feature model
gaus = GaussianProcessClassifier(max_iter_predict = 5).fit(X = df[['Colour', 'shape']], y = df['illens'])
guess = gaus.predict(df_test[['Colour', 'shape']])
true_positive, false_positive, true_negative, false_negative = accuracy_matrix(guess, df_test['illens'])
pca_performance.loc[0] = [true_positive, false_positive,  true_negative, false_negative]
pca_performance.loc[1] = pca_performance.loc[0] - gaus_predict.iloc[2]
# Row 0 holds the rates of the PCA model, row 1 the difference to row 2 of
# gaus_predict. An earlier run noted that the negative prediction became
# better while the positive one was 13% worse; re-check this against the
# table below, as that run projected the test data with a re-fitted PCA.
pca_performance


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6c6ff636-1d68-49e2-b04b-ee0d9895f3db' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>