# Project 3 - Group 14

### Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from math import pi
from skimage import morphology
from scipy.spatial.distance import cdist
from scipy.stats.stats import mode

from scipy import ndimage

import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn import decomposition, datasets
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier  
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix


### Setup

In [None]:
# Read the precomputed feature table and the ISIC-2017 ground-truth label
# table from their CSV files.
features = pd.read_csv(filepath_or_buffer=r'../features/features.csv')
truth = pd.read_csv(filepath_or_buffer=r'../data/ISIC-2017_Training_Part3_GroundTruth.csv')

In [None]:
# Build a class-balanced subset of the ground truth: every melanoma row plus
# the same number of non-melanoma rows (taken from the top of the table, in
# file order).
truth["id"] = truth["image_id"]
tr_melanoma = truth[truth["melanoma"] > 0]
nr_rows_melanoma = len(tr_melanoma)
tr_non_melanoma = truth[truth["melanoma"] < 1][:nr_rows_melanoma]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps the original row index of both parts.
melanoma_d = pd.concat([tr_melanoma, tr_non_melanoma])
melanoma_d = melanoma_d.drop(["image_id", "seborrheic_keratosis"], axis=1)
melanoma_d

In [None]:
# Age metadata: load the extra feature table, discard the "sex" column and
# rename the image key to "id" so it can be merged with the other tables.
features2 = pd.read_csv(filepath_or_buffer=r'../features/more_features.csv')
f2 = features2.drop(columns="sex").rename(columns={"image_id": "id"})
f2

### Feature: Compactness

In [None]:
def measure_compactness(mask):
    """Return the compactness ``perimeter**2 / (4 * pi * area)`` of a mask.

    Parameters
    ----------
    mask : array-like
        Segmentation mask; every non-zero pixel is treated as lesion.
        (Assumed to be a 2-D binary image -- confirm against the data.)

    Returns
    -------
    float
        The compactness value, or ``nan`` when the mask has no lesion
        pixels (the ratio is undefined for a zero area).
    """
    # Work on a boolean copy so that float, integer and boolean masks all
    # behave the same; subtracting two boolean arrays raises in NumPy.
    lesion = np.asarray(mask).astype(bool)

    # Area: number of lesion pixels.
    area = np.count_nonzero(lesion)
    if area == 0:
        return float("nan")

    # Perimeter: lesion pixels removed by eroding with a disk of radius 2.
    struct_el = morphology.disk(2)
    mask_eroded = morphology.binary_erosion(lesion, struct_el)
    perimeter = np.count_nonzero(lesion & ~mask_eroded)

    return (perimeter ** 2) / (4 * pi * area)

# Compute the compactness of every selected lesion mask and collect the
# results as one {"id", "compactness"} row per image.
list_x = [
    {
        "id": image_id,
        "compactness": measure_compactness(
            plt.imread("../data/ISIC-2017_Training_Data_mask" + "/" + image_id + "_segmentation.png")
        ),
    }
    for image_id in melanoma_d["id"]
]

df_comp = pd.DataFrame(list_x)

In [None]:
# Display the per-image compactness table.
df_comp

### Feature: Symmetry (If we want to use it)

In [None]:
#Determines how symmetrical the lesion is by seeing how much it overlaps with itself when folded across its center

def _fold_score(box, axis):
    """Return the overlap ratio of ``box`` folded in half along ``axis``.

    The first half is mirrored onto the second half; when the size along
    ``axis`` is odd the middle row/column lies on the fold line and is left
    out of both halves. The score is the number of pixels set in both
    halves divided by the mean number of pixels set per half.
    """
    size = box.shape[axis]
    half = size // 2
    if half == 0:
        # A single row/column lies entirely on the fold line: both halves
        # are empty, so the shape is trivially symmetric about that line.
        return 1.0
    near = np.flip(np.take(box, np.arange(half), axis=axis), axis=axis)
    far = np.take(box, np.arange(size - half, size), axis=axis)
    total = np.count_nonzero(near) + np.count_nonzero(far)
    if total == 0:
        return 0.0
    return float(2.0 * np.count_nonzero(near & far) / total)


def folded_overlap(mask_ID):
    """Measure how well a lesion mask overlaps itself when folded in half.

    The mask is cropped to the lesion's bounding box (inclusive of its last
    row and column) and folded across the box centre, once left-to-right and
    once top-to-bottom.

    Parameters
    ----------
    mask_ID : array-like
        Segmentation mask. Pixels with a value above 0.5 count as lesion, so
        0/1 float masks, 0/255 integer masks and masks whose values were
        slightly perturbed by interpolation are all handled.

    Returns
    -------
    tuple of float
        ``(horizontal_fold_overlap, vertical_fold_overlap)``, each in
        ``[0, 1]`` where 1 means the two halves coincide exactly.

    Raises
    ------
    ValueError
        If the mask contains no lesion pixels.
    """
    lesion = np.asarray(mask_ID) > 0.5
    nonzero = np.nonzero(lesion)
    rows, cols = nonzero[0], nonzero[1]
    if rows.size == 0:
        raise ValueError("mask contains no lesion pixels")

    # +1 because slice ends are exclusive while max() is the last index
    # that actually holds a lesion pixel.
    box = lesion[rows.min():rows.max() + 1, cols.min():cols.max() + 1]

    # horizontal fold (left half onto right half), vertical fold (top onto bottom)
    return _fold_score(box, axis=1), _fold_score(box, axis=0)

# mask_ID = plt.imread("../data/example_segmentation/ISIC_0001769_segmentation.png")
# print(folded_overlap(mask_ID)))

In [None]:
# Runs folded_overlap on every selected mask at three orientations and keeps
# the best score as the image's symmetry feature.

degrees = [30] * 2  # folds across 0 and 90 degrees, then 30 and 120, then 60 and 150
list_y = list()
for i in melanoma_d["id"]:
    mask_ID = plt.imread("../data/ISIC-2017_Training_Data_mask" + "/" + i + "_segmentation.png")
    # Folds of the unrotated mask (0 and 90 degrees).
    overlap_list = list(folded_overlap(mask_ID))
    angle = 0
    for degree in degrees:
        angle += degree
        # Rotate BEFORE folding so each step measures a new orientation
        # (folding first re-measured 0 degrees and never reached 60).
        # The original mask is rotated by the cumulative angle with
        # nearest-neighbour sampling (order=0): this keeps pixel values
        # exactly as they were and avoids compounding interpolation error.
        rotated = ndimage.rotate(mask_ID, angle, order=0)
        overlap_list.extend(folded_overlap(rotated))
    list_y.append({"id": i, "symmetry": max(overlap_list)})
df_sym = pd.DataFrame(list_y)


In [None]:
# Display the per-image symmetry table.
df_sym

### Feature: Colors

In [None]:
# For every selected image, take the ring of mask pixels that a disk-20
# erosion removes and record the variance of the photo's pixel values
# inside that ring as the "lesion_color" feature.
struct_el = morphology.disk(20)
rows_list = []
for image_id in melanoma_d["id"]:
    img = plt.imread("../data/ISIC-2017_Training_Data" + "/" + image_id + ".jpg")
    mask = plt.imread("../data/ISIC-2017_Training_Data_mask" + "/" + image_id + "_segmentation.png")
    border_ring = mask - morphology.binary_erosion(mask, struct_el)
    rows_list.append({"id": image_id, "lesion_color": np.var(img[border_ring == 1])})

df_color = pd.DataFrame(rows_list)

In [None]:
df_color

### Classification (KNN, Random Forest, Logistic Regression, SVM)

Preparing the dataset

In [None]:
# Merging all the datasets on the image id. The first two merges are outer
# joins; the last two use pd.merge's default inner join, so only images
# present in every table remain.

new_df2 = pd.merge(df_comp, melanoma_d, on="id", how='outer')
new_df3 = pd.merge(new_df2, df_color, on="id", how="outer")
new_df4 = pd.merge(new_df3, f2, on="id")
new_df5 = pd.merge(new_df4, df_sym, on="id")

# Replacing missing ages with the median age.
# The age column is read as strings because it contains "unknown"; it must be
# parsed to numbers BEFORE taking the median (median() on an object column of
# strings raises in pandas 2.x). errors="coerce" turns "unknown" and any
# other unparsable entry into NaN, which is then filled with the median.

age_numeric = pd.to_numeric(new_df5["age_approximate"], errors="coerce")
median = age_numeric.median()
new_df5['age'] = age_numeric.fillna(median).astype(np.float64)

# Dropping the columns that are not used as features or target
new_df5 = new_df5.drop(['id', "age_approximate"], axis=1)

In [None]:
# Display the merged feature table.
new_df5

In [None]:
# Task 1 plot: symmetry histograms, one panel per class
# (top: melanoma == 0, bottom: melanoma == 1), both on the range [0.5, 1].

figure, axes = plt.subplots(nrows=2, figsize=(6,6))

panels = [
    (0, "Non-Melanoma Lesion Symmetries"),
    (1, "Melanoma Lesion Symmetries"),
]
for label, title in panels:
    panel = axes[label]
    panel.hist(new_df5["symmetry"][new_df5["melanoma"]==label], range=[0.5,1])
    panel.set_title(title)

figure.tight_layout(pad=1.0)

plt.show()

In [None]:
# Correlation matrix of all columns, drawn as an annotated heatmap, to see
# how strongly each feature correlates with the melanoma label.
corrMatrix = new_df5.corr()
sns.heatmap(data=corrMatrix, annot=True)
plt.show()

In [None]:
# Save the merged feature table as a plain CSV file (no zip archive is
# created); the row index is not written.
new_df5.to_csv("dataframe2.csv", index=False)

In [None]:
# Reload the feature table from the saved CSV, so the cells below can run
# from this file, and display it.
new_df5 =  pd.read_csv("dataframe2.csv")
new_df5

In [None]:
# Correlation heatmap of the reloaded table.
corrMatrix = new_df5.corr()
sns.heatmap(data=corrMatrix, annot=True)
plt.show()

In [None]:
# Standardize the four feature columns on a copy of the table; the melanoma
# label column is left untouched.
from sklearn import preprocessing
scaled_features = new_df5.copy()
col_names = ["compactness","lesion_color","age","symmetry"]
features = scaled_features[col_names]
scaler = StandardScaler()
s_features = scaler.fit_transform(features.values)
scaled_features[col_names] = s_features
scaled_features

In [None]:
# Assemble the design matrix X (the four standardized feature columns) and
# the target vector y (the melanoma label), then print their shapes.
from sklearn.model_selection import train_test_split

feature_columns = ["compactness","lesion_color","age","symmetry"]
X = scaled_features.loc[:, feature_columns]
y = scaled_features.loc[:, 'melanoma']
for part in (X, y):
    print(part.shape)
# # Split dataset to select feature and evaluate the classifier
# X_dev, X_test, y_dev, y_test = train_test_split(
#         X, y, stratify=y, random_state=0)

# X_train, X_val, y_train, y_val = train_test_split(
#         X_dev, y_dev, stratify=y_dev)

In [None]:
#Running GridSearch algorithm to find the best hyperparameters for the classifiers

# Tune on the training portion only. The split uses the same arguments
# (stratify=y, random_state=0) as the evaluation cell, so the rows held out
# here are exactly the rows that cell uses as its test set and they never
# influence the choice of hyperparameters.
X_search, _, y_search, _ = train_test_split(X, y, stratify=y, random_state=0)

clf_svc = svm.SVC()
clf_rf = RandomForestClassifier()
# The grid below tries both 'l1' and 'l2'. The default 'lbfgs' solver does not
# support 'l1', so every l1 candidate would fail to fit; 'liblinear' supports
# both penalties.
lr = LogisticRegression(solver="liblinear")
knn = KNeighborsClassifier()

models = [
          knn,
          clf_rf,
          lr,
          clf_svc,
          ]

# One hyperparameter grid per entry of `models`, in the same order.
param_grids = [
    {'weights': ('uniform', 'distance'),
     'n_neighbors': list(range(1, 31)),
     'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute')},
    {'bootstrap': [True, False],
     'criterion': ['gini', 'entropy'],
     'n_estimators': [3, 10, 30, 100, 300, 1000]},
    {'penalty': ["l1", "l2"],
     'C': [0.001, 0.01, 0.1, 1, 10]},
    {'decision_function_shape': ('ovo', 'ovr'),
     'shrinking': (True, False),
     'kernel': ('linear', 'rbf', 'poly'),
     'C': [0.001, 0.01, 0.1, 1, 10],
     'gamma': [0.001, 0.01, 0.1, 1]},
]

best_models = []
for model, param_grid in zip(models, param_grids):
    # 10-fold cross-validated search scored by accuracy, using all cores.
    grid_search = GridSearchCV(model, param_grid=param_grid, cv=10, scoring='accuracy',
                               return_train_score=True, n_jobs=-1)
    grid_search.fit(X_search, y_search)
    best_params = grid_search.best_params_
    print(best_params)
    # Fresh, unfitted estimator of the same class: the base estimator's
    # settings (e.g. the solver above) overridden by the tuned values.
    best_models.append(type(model)(**{**model.get_params(), **best_params}))

# Keep the individual names available for later cells.
best_knn, best_rf, best_lr, best_svc = best_models
    

In [None]:
# Display the list of estimators configured with the best hyperparameters
# found for each classifier.
best_models

In [None]:
# Evaluate every tuned classifier: 10-fold cross-validation on the training
# split (mean train / validation accuracy), then a final fit on the whole
# training split and metrics on the held-out test split.
kfold = KFold(n_splits=10)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

model_rows = []
metric_rows = []
for model in best_models:
    print(model)
    name = model.__class__.__name__

    cv_result = cross_validate(model, X_train, y_train, cv=kfold, scoring='accuracy',
                               return_train_score=True)
    model_rows.append({
        'Classifier_name': name,
        'train_score': cv_result['train_score'].mean(),
        # cross_validate reports the validation folds under 'test_score'
        'vald_score': cv_result['test_score'].mean(),
    })

    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)

    # ROC AUC needs a continuous score per sample; passing hard 0/1
    # predictions collapses the curve to a single operating point.
    # Use the decision function where the model has one, otherwise the
    # predicted probability of the positive class.
    if hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = model.predict_proba(X_test)[:, 1]

    print(name)
    print(confusion_matrix(y_test, y_pred_test))
    print()
    metric_rows.append({
        'Classifier_name': name,
        'test_score': accuracy_score(y_test, y_pred_test),
        'auc_score': roc_auc_score(y_test, y_score),
        'test_precision': precision_score(y_test, y_pred_test),
        'test_recall': recall_score(y_test, y_pred_test),
    })

models_table = pd.DataFrame(model_rows, columns=['Classifier_name', 'train_score', 'vald_score'])
metrics_table = pd.DataFrame(metric_rows, columns=['Classifier_name', 'test_score', 'auc_score',
                                                   'test_precision', 'test_recall'])
models_table

In [None]:
# Display the test-set metrics of each classifier.
metrics_table