ML Project Part 3: Classification

In [36]:
#Imports
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display
import os

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

In [37]:
# Detect the runtime: the string form of the active IPython shell contains
# 'google.colab' when the notebook is executed on Colab.
running_on_colab = 'google.colab' in str(get_ipython())
print('Running on CoLab' if running_on_colab else 'Not running on CoLab')

Running on CoLab


In [38]:
# On Colab, mount Google Drive at /content/drive so files stored there can be read.
if running_on_colab:
  # Imported inside the branch: presumably google.colab is only importable on Colab.
  from google.colab import drive
  drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [39]:
# Directory that holds the .npy files loaded by the data-import cell.
if running_on_colab:
  # Relative path: resolves against the notebook's working directory
  # (NOTE(review): assumes the working directory is the parent of the "drive" mount point).
  FILES_PATH = "drive/MyDrive/ml_project_data/part3/"
else:
  # A None placeholder here makes every os.path.join(FILES_PATH, ...) raise TypeError.
  # Take the location from the ML_PROJECT_DATA_PATH environment variable instead,
  # falling back to the current directory when it is not set.
  FILES_PATH = os.environ.get("ML_PROJECT_DATA_PATH", ".")

In [41]:
# Load the labelled handout set and the unlabelled delivery set, then derive
# the dataset-wide constants used by the rest of the notebook.
def _load_array(file_name):
    """Load one .npy file located in FILES_PATH."""
    return np.load(os.path.join(FILES_PATH, file_name))

X_handout = _load_array("Xtrain_Classification1.npy")
Y_handout = _load_array("ytrain_Classification1.npy")
X_test_delivery = _load_array("Xtest_Classification1.npy")

NUM_HANDOUT_SAMPLES = len(X_handout)
NUM_FEATURES = X_handout.shape[1]
IMAGE_WIDTH = IMAGE_HEIGHT = 28

# Shapes of everything that was loaded.
print(f"X_handout shape: {X_handout.shape}")
print(f"Y_handout shape: {Y_handout.shape}")
print(f"X_test_delivery shape: {X_test_delivery.shape}")

# Dataset size and per-sample feature count.
print(f"Number of handout examples: {NUM_HANDOUT_SAMPLES}")
print(f"Number of features per sample: {NUM_FEATURES}")

# Class balance: label 1 is counted as melanoma, label 0 as nevus.
print(f"Number of melanoma:{np.count_nonzero(Y_handout == 1)}")
print(f"Number of nevu:{np.count_nonzero(Y_handout == 0)}")

print(f"Image Size: {IMAGE_WIDTH}x{IMAGE_HEIGHT}")

X_handout shape: (6254, 2352)
Y_handout shape: (6254,)
X_test_delivery shape: (1764, 2352)
Number of handout examples: 6254
Number of features per sample: 2352
Number of melanoma:896
Number of nevu:5358
Image Size: 28x28


In [42]:
# Displaying the images
def show_image(index):
    """
    Draw handout sample `index` as an RGB image and publish its label.

    The flat feature vector X_handout[index] is reshaped to
    (IMAGE_HEIGHT, IMAGE_WIDTH, 3) and rendered without axes. The label
    Y_handout[index] is then written into the module-level
    `class_label_widget`, which is looked up when the function is called.

    Args:
        index: position of the sample in X_handout / Y_handout.
    """
    # Use the shared image-size constants instead of repeating the literal 28.
    # NOTE(review): the raw (unscaled) handout array is plotted here; confirm its
    # dtype/value range is one that plt.imshow renders as intended.
    rgb_image = np.reshape(X_handout[index], (IMAGE_HEIGHT, IMAGE_WIDTH, 3))
    plt.imshow(rgb_image)
    plt.axis("off")
    plt.show()

    class_label_widget.value = f"Class (0 or 1): {Y_handout[index]}"

# Label that show_image overwrites on every slider move; -1 is the placeholder
# shown before the first sample is drawn.
class_label_widget = widgets.Label(value=f"Class (0 or 1): {-1}")

# One slider position per handout sample, wired to show_image.
image_widget = widgets.interactive(
    show_image,
    index=widgets.IntSlider(value=0, min=0, max=X_handout.shape[0] - 1, step=1),
)

display(image_widget, class_label_widget)

interactive(children=(IntSlider(value=0, description='index', max=6253), Output()), _dom_classes=('widget-inte…

Label(value='Class (0 or 1): 1.0')

In [43]:
def upsample(X, Y, class_id, upsample_factor, shuffle=False):
  """
  Oversample one class by prepending repeated copies of its samples.

  Every row of X whose label equals class_id is repeated upsample_factor
  times (np.repeat along axis 0). The repeats are placed in front of the
  untouched original data, so each such sample appears
  upsample_factor + 1 times in the result.

  Args:
      X: samples, indexed along axis 0.
      Y: labels aligned with X.
      class_id: label of the class to replicate.
      upsample_factor: number of extra copies made of each matching sample.
      shuffle: when True, apply one random permutation to samples and labels.

  Returns:
      Tuple (X_new, Y_new) with the enlarged samples and their labels.
  """
  extra_samples = np.repeat(X[Y == class_id], upsample_factor, axis=0)
  extra_labels = np.ones(extra_samples.shape[0]) * class_id

  X_new = np.concatenate((extra_samples, X), axis=0)
  Y_new = np.concatenate((extra_labels, Y), axis=0)

  if not shuffle:
    return X_new, Y_new

  # The same permutation is applied to both arrays so pairs stay aligned.
  order = np.random.permutation(X_new.shape[0])
  return X_new[order], Y_new[order]

def downsample(X, Y, class_id, shuffle=False):
    """
    Balance a dataset by truncating the samples that belong to class_id.

    Only the first n samples labelled class_id are kept, where n is the
    number of samples carrying any other label. The result is those n
    samples followed by all remaining samples.

    The labels of the remaining samples are copied from Y rather than being
    rebuilt from an assumed 0/1 encoding, so class_id may be any label value
    and datasets with more than two classes keep their labels intact.

    Args:
        X: samples, indexed along axis 0.
        Y: labels aligned with X.
        class_id: label of the class to shrink.
        shuffle: when True, apply one random permutation to samples and labels.

    Returns:
        Tuple (X_new, Y_new) with the reduced samples and their labels.

    Raises:
        AssertionError: if class_id has fewer samples than all other
            classes together, so it cannot be truncated to a balanced size.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)

    # Compute the selection of "other" samples once instead of four times.
    other_mask = Y != class_id
    X_other = X[other_mask]
    Y_other = Y[other_mask]

    # Keep as many class_id samples as there are samples of other classes.
    X_downsampled = X[Y == class_id][0:len(X_other)]
    assert X_downsampled.shape == X_other.shape, (
        "class_id has fewer samples than the remaining classes; nothing to downsample"
    )

    X_new = np.concatenate((X_downsampled, X_other), axis=0)
    # np.ones(...) * class_id keeps the float label array of the original code.
    Y_new = np.concatenate((np.ones(X_downsampled.shape[0]) * class_id, Y_other), axis=0)

    if shuffle:
      # The same permutation is applied to both arrays so pairs stay aligned.
      shuffle_mask = np.random.permutation(X_new.shape[0])
      return X_new[shuffle_mask], Y_new[shuffle_mask]
    else:
      return X_new, Y_new




In [44]:
# Scale the features by 1/255 (presumably 8-bit pixel intensities, giving values
# in [0, 1]) and turn each flat feature vector into a height x width x RGB image.
X_handout_rescaled = X_handout / 255
# The target shape is passed positionally: the `newshape` keyword of np.reshape
# is deprecated in recent NumPy releases.
X_handout_reshaped = np.reshape(
    X_handout_rescaled, (NUM_HANDOUT_SAMPLES, IMAGE_HEIGHT, IMAGE_WIDTH, 3)
)

# Shapes of the image tensor and label vector before splitting.
print("X_handout_reshaped.shape: ", X_handout_reshaped.shape)
print("Y_handout.shape: ", Y_handout.shape)

# Note: resampling must not leave copies of the same image in both the train and
# the test set, so it is applied after train_test_split and followed by a shuffle.
# (The old pre-split upsampling code, kept here as a no-op string, was removed.)

# Number of extra copies per minority sample when upsample() is used.
UPSAMPLE_FACTOR = 5

# Hold out 20% of the handout data; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_handout_reshaped, Y_handout, test_size=0.2, random_state=42)

# Labelled "resampling" because the active balancing step below is downsampling;
# upsampling is only the commented-out alternative.
print("Before resampling:")
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

#Downsample: truncate class 0 in each split until both classes have equal counts.
# NOTE(review): the held-out split is balanced as well, so metrics computed on it
# describe a 50/50 class mix rather than the original class distribution.
X_train, y_train = downsample(X_train, y_train, class_id=0, shuffle=True)
X_test, y_test = downsample(X_test, y_test, class_id=0, shuffle=True)

#Upsample (alternative to downsampling, currently disabled)
#X_train, y_train = upsample(X_train, y_train, class_id=1, upsample_factor=UPSAMPLE_FACTOR, shuffle=True)
#X_test, y_test = upsample(X_test, y_test, class_id=1, upsample_factor=UPSAMPLE_FACTOR, shuffle=True)

print("After resampling:")
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
print(f"Number of melanoma in train set:{np.count_nonzero(y_train == 1)}, test set: {np.count_nonzero(y_test == 1)}")
print(f"Number of nevu in train set:{np.count_nonzero(y_train == 0)}, test set: {np.count_nonzero(y_test == 0)}")

X_handout_reshaped.shape:  (6254, 28, 28, 3)
Y_handout.shape:  (6254,)
Before upsampling:
X_train shape: (5003, 28, 28, 3)
y_train shape: (5003,)
X_test shape: (1251, 28, 28, 3)
y_test shape: (1251,)
After upsampling:
X_train shape: (1438, 28, 28, 3)
y_train shape: (1438,)
X_test shape: (354, 28, 28, 3)
y_test shape: (354,)
Number of melanoma in train set:719, test set: 177
Number of nevu in train set:719, test set: 177


In [45]:
# Small CNN for binary classification: one conv/pool stage, dropout, then a
# dense head that ends in a single sigmoid unit.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(IMAGE_HEIGHT, IMAGE_WIDTH, 3)))
model.add(tf.keras.layers.MaxPooling2D((2, 2)))
# Deeper variants are left disabled:
#model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu'))
#model.add(tf.keras.layers.MaxPooling2D((2, 2)))
#model.add(tf.keras.layers.Conv2D(128, (3, 3), activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(100, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))  # single probability output

# Binary cross-entropy loss; precision, recall and accuracy are tracked per epoch.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        tf.keras.metrics.Precision(name = 'precision'),
        tf.keras.metrics.Recall(name = 'recall'),
        'accuracy',
    ],
)

In [48]:
# Train for 20 epochs in mini-batches of 50; the held-out split is passed as
# validation data so its metrics are reported after every epoch.
# NOTE(review): the saved log under this cell reads "Epoch n/200", so it comes
# from an earlier run with a different epoch count. Re-run to refresh it.
history = model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=20,
    validation_data=(X_test, y_test),
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [49]:
# Model outputs (sigmoid activations) for the training split and the held-out split.
train_predictions = model.predict(x=X_train)
test_predictions = model.predict(x=X_test)



In [50]:
def _print_precision_recall(split_name, y_true, y_prob):
    """Print precision and recall for one split, thresholding outputs at 0.5."""
    y_pred = (y_prob > 0.5).astype(int)
    print(split_name)
    print("precision score: ", precision_score(y_true, y_pred))
    print("recall score: ", recall_score(y_true, y_pred))

_print_precision_recall("train set", y_train, train_predictions)
_print_precision_recall("test set", y_test, test_predictions)

train set
precision score:  1.0
recall score:  1.0
test set
precision score:  0.7634408602150538
recall score:  0.8022598870056498


In [26]:
# Displaying the images with predictions


def show_image(index):
    """
    Draw held-out sample `index` and publish its label and the model output.

    X_test[index] is rendered as an RGB image without axes. Three module-level
    label widgets, looked up when the function is called, are then updated:
    the true label (y_test), the raw model output (test_predictions) and the
    class obtained by thresholding that output.

    This definition rebinds the name `show_image` that an earlier cell uses
    for the handout viewer.

    Args:
        index: position of the sample in X_test / y_test / test_predictions.
    """
    rgb_image = np.reshape(X_test[index], (IMAGE_HEIGHT, IMAGE_WIDTH, 3))
    plt.imshow(rgb_image)
    plt.axis("off")
    plt.show()

    predicted_value = test_predictions[index]
    # Same "Actual Class" prefix as the widget's initial placeholder text.
    class_label_widget.value = f"Actual Class (0 or 1): {y_test[index]}"
    predicted_value_widget.value = f"Predicted Value: {predicted_value}"
    # Same decision rule as the precision/recall cell: class 1 only when the
    # output is strictly greater than 0.5.
    predicted_class_widget.value = f"Predicted Class: {1 if predicted_value > 0.5 else 0}"


# Labels that show_image overwrites on every slider move; -1 is the placeholder
# shown before the first sample is drawn.
class_label_widget = widgets.Label(value=f"Actual Class (0 or 1): {-1}")
predicted_value_widget = widgets.Label(value=f"Predicted Value: {-1}")
predicted_class_widget = widgets.Label(value=f'Predicted Class: {-1}')

# One slider position per held-out sample, wired to show_image.
image_widget = widgets.interactive(
    show_image,
    index=widgets.IntSlider(value=0, min=0, max=X_test.shape[0] - 1, step=1),
)

display(image_widget, class_label_widget, predicted_value_widget, predicted_class_widget)

interactive(children=(IntSlider(value=0, description='index', max=2146), Output()), _dom_classes=('widget-inte…

Label(value='Class (0 or 1): 1.0')

Label(value='Predicted Value: [0.9999641]')

Label(value='Predicted Class: 1')