In [1]:
import numpy as np
from modules import zip2array_filenames, extract_pixels, PersistentHomologyClassifier, plot_metrics
from sklearn.model_selection import train_test_split, StratifiedKFold
from skimage.feature import hog
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import f1_score, classification_report
from itertools import product

In [2]:
# global controls
extract_data = False
extract_features = True
ppc = [(4,4), (8,8), (16,16)]
cpb = [(2,2), (3,3)]
scalers = [StandardScaler(), MinMaxScaler(), RobustScaler()]
cv_folds = 5

In [4]:
if extract_data:
    # ------- FILE EXTRACTION ------ 
    # Extract File Names
    breast_cancer = {'target_names': ['benign', 'malignant'], 'image_dim': (224, 224, 3)}
    classes = breast_cancer['target_names']
    zip_paths = ['BreastCancer_Benign.zip', 'BreastCancer_Malignant.zip']

    for idx, clss in enumerate(classes):
        breast_cancer[clss+'_paths'] = zip2array_filenames(zip_paths[idx], 'png')

    num_benign, num_malignant = len(breast_cancer['benign_paths']), len(breast_cancer['malignant_paths'])

    # Extract Pixels using Image File Names
    for c in classes:
        breast_cancer[c+'_pixels'] = [extract_pixels(img_path, grayscaled=True, flattened=True) for img_path in breast_cancer[c+'_paths']]

    # Organizing Data according the Pixels and Targets
    breast_cancer['data'], breast_cancer['target'] = [], []
    for idx, clss in enumerate(classes):
        new_data = [data for data in breast_cancer[clss+'_pixels']]
        new_target = [idx for _ in breast_cancer[clss+'_pixels']]

        breast_cancer['data'].extend(new_data)
        breast_cancer['target'].extend(new_target)
    
    np.save('breast_cancer_dataset.npy', breast_cancer)

else:
    # load breast_cancer dictionary into npy file
    breast_cancer = np.load('breast_cancer_dataset.npy', allow_pickle=True).item()
    classes = breast_cancer['target_names']

    for c in classes:
        breast_cancer[c+'_paths'] = breast_cancer.pop(c+'Paths')
        breast_cancer[c+'_pixels'] = breast_cancer.pop(c+'Pixels')
    
    for k in ['data', 'target']:
        breast_cancer[k] = np.array(breast_cancer[k])

In [6]:
if extract_features:
    # ---------- DATA PREPARATION
    print("Preparing dataset ...")
    print()

    # reshaping data
    (HEIGHT, WIDTH) = breast_cancer['image_dim'][:2]
    X = breast_cancer['data'].reshape(-1, HEIGHT, WIDTH)
    y = breast_cancer['target']
    print(f"Data is reshaped. \n Dimension of each instance: {HEIGHT*WIDTH}")


    # data splitting    
    test_size = 0.2
    indices = np.arange(len(y))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify=y)
    print(f"Data is split. \n Train/Validation size: {y_train.shape[0]} images \n Test size: {y_test.shape[0]} images \n")

    # parameters and folds
    hog_params = [{'pixels_per_cell': i[0],
                   'cells_per_block': i[1]} for i in list(product(ppc, cpb))]
    params = list(product(hog_params, scalers))
    k_folds = StratifiedKFold(n_splits=cv_folds, shuffle=True,
                                  random_state=42)
    
    # iterates every parameter combination
    param_scores = []
    for i, param in enumerate(params):
        print(f"======= Running parameter combination {i+1} =======")
        print(f'HOG: pixels_per_cell = {param[0]["pixels_per_cell"]}, cells_per_block: {param[0]["cells_per_block"]}')
        print(f'Scaler: {param[1]}')

        # 5-fold cross validation
        # divides the training set into 5 folds with stratification
        # 4 folds for training and 1 fold for validation
        val_scores, j = [], 0
        for train_idx, val_idx in k_folds.split(X_train, y_train):
            X_tr, X_v = X_train[train_idx], X_train[val_idx]
            y_tr, y_v = y_train[train_idx], y_train[val_idx]

            # feature extraction (HOG)
            print(f"\n Fold {j+1} ------------")
            print(f"\n Extracting features using HOG ...")
            X_tr = np.array([hog(img, orientations=9, 
                                 **param[0]).flatten()
                            for img in X_tr])
            X_v = np.array([hog(img, orientations=9, 
                                 **param[0]).flatten()
                            for img in X_v])
            print(f" Features extracted. \n Dimension of each instance: {X_v[0].shape[0]}")
        
            # dimension reduction
            print(f"\n Reducing dimensions using PCA ...")
            pca_model = PCA(0.95).fit(X_tr)
            X_tr, X_v = pca_model.transform(X_tr), pca_model.transform(X_v)
            print(f" Dimension reduced. \n Dimension of each instance: {X_v[0].shape[0]}")

            # data scaling
            print(f"\n Scaling the data ...")
            X_tr, X_v = param[1].transform(X_tr), param[1].transform(X_v)
            print(f" Data normalized. \n Final dimension of each instance: {X_v[0].shape[0]}")

            # validation
            print(f"\n Validating using PHCA ...")
            phca_model = PersistentHomologyClassifier()
            phca_model.fit(X_tr, y_tr)

            score = f1_score(y_v, phca_model.predict(X_v))
            # score = np.random.rand()
            val_scores.append(score)
            print(f" Validation complete. \n Validation score: {score:0.4f} \n\n")

            if j == 5:
                j = 0
            else:
                j += 1
        param_scores.append(val_scores)
    param_scores = np.array(param_scores)

Preparing dataset ...

Data is reshaped. 
 Dimension of each instance: 50176
Data is split. 
 Train/Validation size: 6105 images 
 Test size: 1527 images 

HOG: pixels_per_cell = (4, 4), cells_per_block: (2, 2)
Scaler: StandardScaler()

 Fold 1 ------------

 Extracting features using HOG ...
 Features extracted. 
 Dimension of each instance: 108900

 Reducing dimensions using PCA ...


MemoryError: Unable to allocate 3.96 GiB for an array with shape (4884, 108900) and data type float64

In [None]:
np.save('param_f1scores.npy', param_scores)

In [None]:
# pulling best parameters
ave_scores = np.mean(param_scores, axis=1)
best_idx = int(np.argmax(ave_scores))

# feature extraction
print("Extracting features using HOG ...")
X_train = np.array([hog(img, orientations=9, 
                     **params[best_idx][0]).flatten()
                     for img in X_train])
X_test = np.array([hog(img, orientations=9, 
                    **param[best_idx][0]).flatten()
                    for img in X_test])
print(f"Features extracted. \n Dimension of each instance: {X_test[0].shape[0]}")

# dimension reduction
print("Reducing dimensions using PCA ...")
pca_model = PCA(0.95).fit(X_train)
X_train, X_test = pca_model.transform(X_train), pca_model.transform(X_test)
print(f"Dimension reduced. \n Dimension of each instance: {X_test[0].shape[0]}")

# feature scaling
scaler = params[best_idx][1]
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
print(f"Data normalized. \n Final dimension of each instance: {X_test[0].shape[0]}")

# classification
print('Starting classification --------------')
print('The PHCA model is learning from the data ...')
labels = {'true_labels': y_test, 'phca': []}
phca_model = PersistentHomologyClassifier()
phca_model.fit(X_train, y_train)

print("Model finished learning. \n")
print("The model is now predicting new data ...")
labels['phca'].extend(phca_model.predict(X_test))

# --- classification report
metrics = ['precision', 'recall', 'f1-score', 'specificity', 'support', 'accuracy']
report = classification_report(labels['true_labels'], labels['phca'])
print(report)
plot_metrics(predicted_labels=labels['phca'],
             true_labels=labels['true_labels'],
             measurements=metrics, save=True)

({'pixels_per_cell': (16, 16), 'cells_per_block': (2, 2)}, MinMaxScaler())

In [None]:
np.save('report.npy', report)