In [1]:
import os
from os.path import isfile, join
import pandas as pd
import numpy as np
from prep_image import prep_im_and_mask
import matplotlib.pyplot as plt

from asymmetry import mean_asymmetry
from color import slic_segmentation, rgb_var
from compactness import compactness_score
from convexity import convexity_score

## Extract features

In [2]:
def extract_features(im, im_mask):
    
    # Assymmetry
    asymmetry = mean_asymmetry(im_mask,4)
    
    # Color
    segments = slic_segmentation(im, im_mask)
    col_r, col_g, col_b = rgb_var(im, segments)
    
    # Compactness
    compactness = compactness_score(im_mask)
    
    # Convexity
    convexity = convexity_score(im_mask)
    
    return np.array([asymmetry, col_r, col_g, col_b, compactness, convexity], dtype=np.float16)

## Process Images

In [8]:
file_data = 'metadata_withmasks.csv'
image_folder = 'test_images' + os.sep
mask_folder = 'test_images_mask' + os.sep
file_features = 'feature_data.csv'

In [9]:
# Extract image ids and labels
df = pd.read_csv(file_data)

# Remove images without masks
df_mask = df['mask'] == 1
df = df.loc[df_mask]

labels = list(df['diagnostic'])

In [10]:
# Extract feature
feature_names = ['assymmetry', 'color_r', 'color_g', 'color_b', 'compactness', 'convexity']
features_n = len(feature_names)
features = np.zeros(shape = [len(df), features_n], dtype = np.float16)

In [11]:
# Extract features
images = []
for i, id in enumerate(list(df['img_id'])):
    
    im, mask = prep_im_and_mask(id, image_folder, mask_folder)
    images.append(im)

    # Extract features
    x = extract_features(im, mask)
    features[i,:] = x

In [38]:
# Save image_ids and features in a file
#df_features = pd.DataFrame(features, columns = feature_names)
#df_features.to_csv(file_features, index = False)

## Feature selection

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

selector = SelectKBest(mutual_info_classif, k=k)
selector.fit(X_train, y_train)



## Feature extraction with PCA

## Train classifier

In [1]:
import os
import pandas as pd
import numpy as np

# Default packages for the minimum example
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GroupKFold, StratifiedShuffleSplit, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score #example for measuring performance


import pickle #for saving/loading trained classifiers

In [2]:
file_data = 'metadata_withmasks.csv'
file_features = 'feature_data.csv'

In [3]:
# Extract metadata for images
df = pd.read_csv(file_data)
df_mask = df['mask'] == 1
df = df.loc[df_mask]

# Extract labels
labels = np.array(df['diagnostic'])

In [6]:
# Extract features
feature_names = ['assymmetry', 'red_var', 'green_var', 'blue_var', \
		'hue_var', 'sat_var', 'val_var', 'dom_hue', 'dom_sat', 'dom_val', \
		'compactness', 'convexity']
df_features = pd.read_csv(file_features)

In [9]:
# Make dataset
X = np.array(df_features[feature_names])
y =  (labels == 'BCC') | (labels == 'SCC') | (labels == 'MEL')   #now True means healthy nevus, False means something else
patient_id = df['patient_id']

In [68]:
# Train-test split
num_folds = 5
group_kfold = GroupKFold(n_splits=num_folds)
group_kfold.get_n_splits(X, y, patient_id)

sss = StratifiedShuffleSplit(n_splits = num_folds)
sss.get_n_splits(X,y)

skf = StratifiedKFold(n_splits=num_folds)

In [47]:
#Different classifiers to test out
classifiers = [
    KNeighborsClassifier(1),
    KNeighborsClassifier(5)
]

num_classifiers = len(classifiers)

In [48]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score

# Metrics with result from cross-validation
# We wish to measure accuracy, sensitivity, specificity, auc (area under the ROC curve)
scoring =['accuracy', 'recall', 'precision', 'roc_auc']

In [97]:
for i in enumerate(classifiers, n_splits):
    
    skf = StratifiedKFold(n_splits)
    scores = cross_validate(clf, X, y, cv=skf, scoring = scoring)

In [7]:
scores['test_accuracy']

NameError: name 'scores' is not defined

## Evaluate classifier

In [39]:
#Average over all folds
average_acc = np.mean(acc_val,axis=0) 
   
print('Classifier 1 average accuracy={:.3f} '.format(average_acc[0]))
print('Classifier 2 average accuracy={:.3f} '.format(average_acc[1]))

Classifier 1 average accuracy=0.444 
Classifier 2 average accuracy=0.523 


In [None]:
from sklean

## Save final classifier

In [None]:
#Let's say you now decided to use the 5-NN 
classifier = KNeighborsClassifier(n_neighbors = 5)

#It will be tested on external data, so we can try to maximize the use of our available data by training on 
#ALL of x and y
classifier = classifier.fit(X,y)

#This is the classifier you need to save using pickle, add this to your zip file submission
filename = 'groupXY_classifier.sav'
pickle.dump(classifier, open(filename, 'wb'))

## Evaluate classifier

In [None]:
import pickle #for loading your trained classifier

from extract_features import extract_features #our feature extraction

# The function that should classify new images. 
# The image and mask are the same size, and are already loaded using plt.imread
def classify(img, mask):
    
    
     #Resize the image etc, if you did that during training
    
     #Extract features (the same ones that you used for training)
     x = extract_features(img, mask)
         
     
     #Load the trained classifier
     classifier = pickle.load(open('groupXY_classifier.sav', 'rb'))
    
    
     #Use it on this example to predict the label AND posterior probability
     pred_label = classifier.predict(x)
     pred_prob = classifier.predict_proba(x)
     
     
     #print('predicted label is ', pred_label)
     #print('predicted probability is ', pred_prob)
     return pred_label, pred_prob

In [None]:
# Call in a loop