#### Machine Learning via scikit-learn

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

# Pre processing
from sklearn.model_selection import train_test_split

# Feature Engineering
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# Modeling
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
import keras

# Metrics
from sklearn.metrics._classification import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Model Saving
from joblib import dump
from joblib import load

#### Functions

In [None]:
def _specificity_from_confusion_matrix(cm):
    """Return specificity for a square confusion matrix (rows = true, columns = predicted)."""
    if cm.shape == (2, 2):
        # Binary case: the first label is the negative class.
        tn, fp = cm[0, 0], cm[0, 1]
        return tn / (tn + fp) if (tn + fp) > 0 else 0

    total = cm.sum()
    if total == 0:
        return 0

    # One-vs-rest counts for every class at once.
    support = cm.sum(axis=1)            # true instances per class
    predicted = cm.sum(axis=0)          # predicted instances per class
    true_pos = np.diag(cm)
    false_pos = predicted - true_pos
    true_neg = total - support - predicted + true_pos
    negatives = true_neg + false_pos    # everything that is not this class

    per_class = np.divide(
        true_neg,
        negatives,
        out=np.zeros(len(cm), dtype=float),
        where=negatives > 0,            # a class with no negatives scores 0
    )
    # Weight by support so the result matches the 'weighted' averaging
    # used for precision / recall / F1.
    return float(np.average(per_class, weights=support))


def evaluate_models(true_labels, *model_predictions):
    """
    Score one or more sets of predictions against the same true labels.

    Precision, recall and F1 are support-weighted averages over the classes
    (``average='weighted'``); undefined ratios are reported as 0
    (``zero_division=0``).

    Specificity is TN / (TN + FP) when the confusion matrix is 2x2. For any
    other shape (more than two classes, or only a single class present) it is
    the support-weighted average of the one-vs-rest specificity of each class,
    so the function no longer fails when the labels are not strictly binary.

    :param true_labels: Ground-truth labels.
    :param model_predictions: One array of predicted labels per model.
    :return: DataFrame with one row per prediction set (in call order) and the
             columns Accuracy, Precision, Recall, F1 Score and Specificity,
             rounded to 4 decimals.
    """
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'Specificity']

    evaluation_results = []

    for predictions in model_predictions:
        accuracy = accuracy_score(true_labels, predictions)
        precision = precision_score(true_labels, predictions, average='weighted', zero_division=0)
        recall = recall_score(true_labels, predictions, average='weighted', zero_division=0)
        f1 = f1_score(true_labels, predictions, average='weighted', zero_division=0)

        specificity = _specificity_from_confusion_matrix(
            confusion_matrix(true_labels, predictions)
        )

        evaluation_results.append([accuracy, precision, recall, f1, specificity])

    evaluation_df = pd.DataFrame(evaluation_results, columns=metrics)

    return evaluation_df.round(4)

def plot_confusion_matrices(y_true,*model_predictions):
    """
    Draw one confusion-matrix heatmap per set of predictions.

    The heatmaps are laid out two per row. The grid always has at least three
    rows (the original 3x2 layout) and grows with the number of prediction
    sets, so more than six models can be plotted. Grid cells without a model
    are hidden.

    :param y_true: Ground-truth labels.
    :param model_predictions: One array of predicted labels per model; panel
                              titles number the models in call order.
    """
    n_models = len(model_predictions)
    n_cols = 2
    n_rows = max(3, (n_models + n_cols - 1) // n_cols)

    # Keep the original 18x20 size for three rows and scale the height beyond that.
    _, axes = plt.subplots(n_rows, n_cols, figsize=(18, 20 * n_rows / 3))
    axes = axes.flatten()

    for i, predictions in enumerate(model_predictions):
        cm = confusion_matrix(y_true, predictions)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[i])
        axes[i].set_title(f"Model {i + 1} Confusion Matrix")
        axes[i].set_xlabel("Predicted")
        axes[i].set_ylabel("True")

    # Hide the panels that have no model assigned to them.
    for unused_ax in axes[n_models:]:
        unused_ax.axis('off')

    plt.tight_layout()
    plt.show()

def correct_distribution(images, labels, target_counts, random_state=None):
    """
    Down-sample the dataset so that no class exceeds its target count.

    For every label in ``target_counts`` (in the dictionary's iteration order):
    if the class has more instances than its target, ``count`` of them are drawn
    at random without replacement; otherwise all of its instances are kept
    (classes are never over-sampled). Instances whose label is not a key of
    ``target_counts`` are dropped.

    :param images: Array of images, indexed along the first axis.
    :param labels: Array-like of labels, one per image.
    :param target_counts: A dictionary where keys are labels and values are the
                          maximum number of instances to keep for each label.
    :param random_state: Optional source of randomness. ``None`` (default) uses
                         numpy's global random state, exactly as before; an
                         int seeds a fresh ``numpy.random.default_rng``; an
                         existing ``Generator`` / ``RandomState`` is used as is.
    :return: Resized images and labels arrays, grouped by label.
    """
    images = np.asarray(images)
    # A plain list compared with ``==`` would yield a single bool, not a mask.
    labels = np.asarray(labels)

    if not target_counts:
        # np.concatenate rejects an empty list; return empty, shape-consistent arrays.
        return images[:0], labels[:0]

    if random_state is None:
        choose = np.random.choice
    elif isinstance(random_state, (np.random.Generator, np.random.RandomState)):
        choose = random_state.choice
    else:
        choose = np.random.default_rng(random_state).choice

    resized_images = []
    resized_labels = []

    for label, count in target_counts.items():
        # Find indices where the label matches
        indices = np.where(labels == label)[0]

        # If there are more instances than needed, randomly select 'count' instances
        if len(indices) > count:
            indices = choose(indices, count, replace=False)

        # Append selected images and labels to the lists
        resized_images.append(images[indices])
        resized_labels.append(labels[indices])

    # Concatenate all the selected images and labels
    resized_images = np.concatenate(resized_images, axis=0)
    resized_labels = np.concatenate(resized_labels, axis=0)

    return resized_images, resized_labels

#### Import Data

#### Dataset: CBIS-DDSM

In [None]:
# CBIS-DDSM cross-validation and test splits, stored as .npy arrays
cv10_data_filepath = '/Users/dylanhayashi/Desktop/Northwestern/498 - Capstone/CBIS-DDSM tf/cv10_data.npy'
cv10_labels_filepath = '/Users/dylanhayashi/Desktop/Northwestern/498 - Capstone/CBIS-DDSM tf/cv10_labels.npy'
test10_data_filepath = '/Users/dylanhayashi/Desktop/Northwestern/498 - Capstone/CBIS-DDSM tf/test10_data.npy'
test10_labels_filepath = '/Users/dylanhayashi/Desktop/Northwestern/498 - Capstone/CBIS-DDSM tf/test10_labels.npy'

# Load the four arrays in the same order as the paths are listed
cv10_data, cv10_labels, test10_data, test10_labels = map(
    np.load,
    (
        cv10_data_filepath,
        cv10_labels_filepath,
        test10_data_filepath,
        test10_labels_filepath,
    ),
)

# CBIS-DDSM training shards, stored as TFRecord files
training_zero_filepath = '/Users/dylanhayashi/Desktop/Northwestern/498 - Capstone/CBIS-DDSM tf/training10_0.tfrecords'
training_one_filepath = '/Users/dylanhayashi/Desktop/Northwestern/498 - Capstone/CBIS-DDSM tf/training10_1.tfrecords'
training_two_filepath = '/Users/dylanhayashi/Desktop/Northwestern/498 - Capstone/CBIS-DDSM tf/training10_2.tfrecords'
training_three_filepath = '/Users/dylanhayashi/Desktop/Northwestern/498 - Capstone/CBIS-DDSM tf/training10_3.tfrecords'
training_four_filepath = '/Users/dylanhayashi/Desktop/Northwestern/498 - Capstone/CBIS-DDSM tf/training10_4.tfrecords'

# One TFRecordDataset per shard; records are parsed later, when the shards are read
training_zero, training_one, training_two, training_three, training_four = map(
    tf.data.TFRecordDataset,
    (
        training_zero_filepath,
        training_one_filepath,
        training_two_filepath,
        training_three_filepath,
        training_four_filepath,
    ),
)

#### Dataset: VinDr Mammo

In [None]:
def rescale_array(arr):
    """
    Linearly rescale an array to the 0-255 range and return it as uint8.

    The minimum value maps to 0 and the maximum to 255; values in between are
    scaled linearly and truncated by the cast to ``uint8``.

    A constant array (max == min) has no range to stretch, so it is returned
    as all zeros instead of dividing by zero. An empty array is returned as an
    empty uint8 array of the same shape.

    :param arr: Array-like of numeric values.
    :return: ``numpy.uint8`` array with the same shape as ``arr``.
    """
    arr = np.asarray(arr)
    if arr.size == 0:
        # min()/max() are undefined on an empty array.
        return arr.astype(np.uint8)

    arr_min = arr.min()
    arr_max = arr.max()
    value_range = arr_max - arr_min
    if value_range == 0:
        # Avoid 0/0 -> NaN, which has no meaningful uint8 representation.
        return np.zeros(arr.shape, dtype=np.uint8)

    # Normalize the array to 0-1
    normalized_arr = (arr - arr_min) / value_range
    # Scale to 0-255
    scaled_arr = normalized_arr * 255
    return scaled_arr.astype(np.uint8)  # Convert to unsigned integer for image representation

In [None]:
# VinDr-Mammo calcification and mass subsets, stored as .npy arrays
b_cal_images_filepath = '/Users/dylanhayashi/Desktop/Northwestern/498 - Capstone/VinDr-Mammo/images_unique_calcifications.npy'
b_cal_labels_filepath = '/Users/dylanhayashi/Desktop/Northwestern/498 - Capstone/VinDr-Mammo/labels_unique_calcifications.npy'
b_mass_images_filepath = '/Users/dylanhayashi/Desktop/Northwestern/498 - Capstone/VinDr-Mammo/images_unique_masses.npy'
b_mass_labels_filepath = '/Users/dylanhayashi/Desktop/Northwestern/498 - Capstone/VinDr-Mammo/labels_unique_masses.npy'

b_cal_images, b_cal_labels, b_mass_images, b_mass_labels = map(
    np.load,
    (
        b_cal_images_filepath,
        b_cal_labels_filepath,
        b_mass_images_filepath,
        b_mass_labels_filepath,
    ),
)

# Bring pixel values into the 0-255 uint8 range
b_cal_images, b_mass_images = rescale_array(b_cal_images), rescale_array(b_mass_images)

# Remap the 0/1 labels of each subset onto the shared five-class scheme:
# calcifications 0 -> 1 and 1 -> 3, masses 0 -> 2 and 1 -> 4.
# Masks are taken before any write so a remapped value is never remapped again.
_cal_was_zero, _cal_was_one = b_cal_labels == 0, b_cal_labels == 1
_mass_was_zero, _mass_was_one = b_mass_labels == 0, b_mass_labels == 1

b_cal_labels[_cal_was_one] = 3
b_cal_labels[_cal_was_zero] = 1
b_mass_labels[_mass_was_zero] = 2
b_mass_labels[_mass_was_one] = 4

#### Combine datasets into single dataset

In [None]:
# Accumulators for the decoded training images and their labels
training_images, training_labels = [], []

# Schema of one serialized TFRecord example: two int64 scalars and the raw image bytes
feature_dictionary = dict(
    label=tf.io.FixedLenFeature([], tf.int64),
    label_normal=tf.io.FixedLenFeature([], tf.int64),
    image=tf.io.FixedLenFeature([], tf.string),
)

def _parse_function(example, feature_dictionary=feature_dictionary):
    """Parse a serialized example into a dict of tensors using ``feature_dictionary``."""
    return tf.io.parse_example(example, feature_dictionary)

def read_data(dataset):
    """
    Decode every record of ``dataset`` and append it to the module-level lists.

    Each record's ``image`` bytes are decoded as uint8, reshaped to
    (299, 299, 1) and appended to ``training_images`` as a numpy array; the
    record's ``label`` is appended to ``training_labels``.

    :param dataset: Dataset of serialized examples readable by ``_parse_function``.
    """
    for record in dataset.map(_parse_function):
        pixels = tf.reshape(tf.io.decode_raw(record['image'], tf.uint8), [299, 299, 1])
        training_images.append(pixels.numpy())
        training_labels.append(record['label'].numpy())

# Decode the five training shards in order, filling training_images / training_labels
for training_dataset in (
    training_zero,
    training_one,
    training_two,
    training_three,
    training_four,
):
    read_data(training_dataset)

#### Convert exactly one of the following three cells from Raw to Python (code) to choose which dataset(s) to use.

In [None]:
# CBIS-DDSM: stack the decoded training shards with the cv10 and test10 arrays
# along the first (sample) axis, keeping images and labels in the same order
images = np.concatenate([training_images, cv10_data, test10_data])
labels = np.concatenate([training_labels, cv10_labels, test10_labels])

#### Set dataset distribution according to dataset(s) choice and classification task

In [None]:
# Maximum number of instances to keep per label
target_counts = {
    0: 9985,
    1: 2768,
    2: 2542,
    3: 1849,
    4: 2240,
}

In [None]:
# Cap every class at its target count (classes above the cap are randomly down-sampled)
images, labels = correct_distribution(
    images=images,
    labels=labels,
    target_counts=target_counts,
)

#### Relabel data according to classification task

In [None]:
# Task 1 - Negative vs Abnormality
# Labels 2, 3 and 4 collapse into 1; labels 0 and 1 are left unchanged
labels[np.isin(labels, (2, 3, 4))] = 1

# Task 2 - Calcification vs Mass
# labels[(labels==1) | (labels==3)] = 0
# labels[(labels==2) | (labels==4)] = 1

# Task 3 - Benign vs Malignant Calcification
# labels[labels==1] = 0
# labels[labels==3] = 1

# Task 4 - Benign vs Malignant Mass
# labels[labels==2] = 0
# labels[labels==4] = 1

#### Create train and test splits, flatten, normalize

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    images,
    labels,
    test_size=0.2,
    random_state=42,
)

# Collapse each image into a single feature vector for the classical ML models
train_n, height, width, channels = X_train.shape
X_train_flat = X_train.reshape(train_n, height * width * channels)
test_n, height, width, channels = X_test.shape
X_test_flat = X_test.reshape(test_n, height * width * channels)

# Min-max scaled copies; the scaler is fitted on the training split only
scaler = MinMaxScaler()
X_train_norm = scaler.fit_transform(X_train_flat)
X_test_norm = scaler.transform(X_test_flat)

#### Modeling

In [None]:
# Model One: Gaussian Naive Bayes, fitted and evaluated on the unscaled flattened pixels
model_one = GaussianNB().fit(X_train_flat, y_train)
pred_one = model_one.predict(X_test_flat)

In [None]:
# Model Two: logistic regression with default settings, on the min-max scaled features
model_two = LogisticRegression().fit(X_train_norm, y_train)
pred_two = model_two.predict(X_test_norm)

In [None]:
# Model Three: support vector classifier with default settings, on the min-max scaled features
model_three = SVC().fit(X_train_norm, y_train)
pred_three = model_three.predict(X_test_norm)

In [None]:
# Model Five: random forest with default settings, on the min-max scaled features.
model_five = RandomForestClassifier()
model_five.fit(X_train_norm,y_train)
# Predict on the scaled test set: the split thresholds were learned on
# X_train_norm, so the unscaled X_test_flat would be on the wrong scale.
pred_five = model_five.predict(X_test_norm)

In [None]:
# Model Six: decision tree with default settings, on the min-max scaled features.
model_six = DecisionTreeClassifier()
model_six.fit(X_train_norm,y_train)
# Predict on the scaled test set: the split thresholds were learned on
# X_train_norm, so the unscaled X_test_flat would be on the wrong scale.
pred_six = model_six.predict(X_test_norm)

In [None]:
# Model Seven: XGBoost with default settings, on the min-max scaled features.
# This cell previously fitted a second RandomForestClassifier, duplicating
# model five, while the imported XGBClassifier was never used.
# NOTE(review): confirm XGBoost is the intended seventh model.
model_seven = XGBClassifier()
model_seven.fit(X_train_norm,y_train)
# Predict on the same scaling the model was trained on (not X_test_flat).
pred_seven = model_seven.predict(X_test_norm)

#### Results

In [None]:
# Human-readable algorithm name for each model slot.
# Model Five and Model Six are labelled after the estimators the notebook
# actually fits for them (model_five is a RandomForestClassifier, model_six a
# DecisionTreeClassifier); the two labels were previously swapped.
# NOTE(review): no cell in the visible notebook fits Model Four (K-Nearest Neighbors).
model_dict = {
    'Model One':'Naive Bayes',
    'Model Two':'Logistic Regression',
    'Model Three':'Support Vector Machine',
    'Model Four':'K-Nearest Neighbors',
    'Model Five':'Random Forest',
    'Model Six':'Decision Tree',
    'Model Seven':'XGBoost'
}

In [None]:
# Predictions that actually exist, keyed by their model_dict name. Model Four
# is omitted because no prediction is produced for it in the visible notebook.
# Keeping names and predictions in one mapping guarantees the label columns
# have exactly one entry per row of the results table.
evaluated_predictions = {
    'Model One': pred_one,
    'Model Two': pred_two,
    'Model Three': pred_three,
    'Model Five': pred_five,
    'Model Six': pred_six,
    'Model Seven': pred_seven,
}

# NOTE(review): `eval` shadows the Python builtin; the name is kept so any
# later cells that reference it keep working -- consider renaming.
eval = evaluate_models(y_test, *evaluated_predictions.values())
eval['Model'] = list(evaluated_predictions)
# Use the full algorithm name (indexing the string with [0] kept only its first letter).
eval['Model Type'] = [model_dict[name] for name in evaluated_predictions]
eval.round(2)