In [7]:
# This class is used for Task 1 and 2.

In [1]:
import pandas as pd
from imageio import imread
import numpy as np
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
from skimage.exposure import exposure
from skimage.feature import hog
from skimage.io import imshow
from skimage.transform import resize
from sklearn.decomposition import PCA
import os
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from collections import Counter
from matplotlib import pyplot as plt
from skimage.filters import prewitt_h, prewitt_v
from sklearn.preprocessing import MinMaxScaler
import shutil
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

le = preprocessing.LabelEncoder()
pca = PCA(100)
scaler = MinMaxScaler()

In [2]:
# This method reads through the 3000 images provided and applies two transformations:
#     1. HOG transformation
#     2. PCA dimensionality reduction
# Finall the result is saved to a file to avoid repeating this step
def pre_process_data_hog_pca():
    if os.path.isfile('../dataset/X_HOG_PCA.pickle'):
        print('Started reading from files')
        X = pd.read_pickle('../dataset/X_HOG_PCA.pickle')
        print('Finished reading from files')
        return X

    df = pd.read_csv('../dataset/label.csv')

    X = pd.DataFrame()

    for index, row in df.iterrows():
        img_gray = imread('../dataset/image/' + row['file_name'], as_gray=True)

        fd, hog_image = hog(img_gray, orientations=9, pixels_per_cell=(16, 16),
                            cells_per_block=(2, 2), visualize=True, multichannel=False)

        data_rescaled = scaler.fit_transform(hog_image)

        img_transformed = pca.fit_transform(data_rescaled)

#         Show transformation example
        if(index == 0):
            # Start plot section
            hog_image_rescaled = exposure.rescale_intensity(hog_image, in_range=(0, 10))
            _, axs = plt.subplots(1, 2, figsize=(12, 12))
            axs = axs.flatten()
            imgs = [img_gray, hog_image_rescaled]
            for img, ax in zip(imgs, axs):
                ax.imshow(img)
            plt.show()
            # End plot section

        features = np.reshape(img_transformed, (512 * 100))
        if np.any(np.isnan(features)):
            print('features creating nans')

        X = X.append(pd.Series(features).T, ignore_index=True)
        print("\rCompleted {:.2f}".format((index / df.shape[0]) * 100), end="")

    X.to_pickle('../dataset/X_HOG_PCA.pickle')
    return X

# Reads the input and output to determine which classes are underrepresented, then augments and creates new
# data points to balance the datasets using SMOTE
def resolve_imbalances_smote(X, Y):
    print(Counter(Y))
    oversample = SMOTE()
    X, Y = oversample.fit_resample(X, Y)
    print(Counter(Y))
    return X, Y

# Extracts labels for binary classification by making all non-tumor labels = tumor then converts to binary
def y_binary():
    df = pd.read_csv('../dataset/label.csv')
    return (df['label'] != 'no_tumor').astype(int)

# Extracts all labels and transforms to numeric representation
def y_multiclass():
    df = pd.read_csv('../dataset/label.csv')
    le.fit(df['label'])
    return le.transform(df['label'])

# Converts numeric represnetation of labels back to original form
def invert_multiclass(Y):
    return le.inverse_transform(Y)

In [3]:
def svm_tuned_predict(x_train, y_train, x_test, y_test):
#     Grid Search tuning, results already selected as linear, C=0.05
#     param_grid = {'C': [0.025, 0.05, 0.1, 0.25], 'kernel': ['linear', 'rbf']}
#     grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)
#     grid.fit(x_train, y_train)

    svc = SVC(C=0.05, kernel='linear')
    svc.fit(x_train, y_train)

    y_pred = svc.predict(x_test)
    y_train_pred = svc.predict(x_train)

    print('Accuracy on SVM training set: ' + str(accuracy_score(y_train, y_train_pred)))
    print(classification_report(y_train, y_train_pred))

    print('Accuracy on SVM test set: ' + str(accuracy_score(y_test, y_pred)))
    print(classification_report(y_test, y_pred))

In [4]:
X = pre_process_data_hog_pca()
Y = y_binary()

X, Y = resolve_imbalances_smote(X, Y)
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0)

Started reading from files
Finished reading from files
Counter({1: 2546, 0: 454})
Counter({1: 2546, 0: 2546})


In [5]:
print('binary results')
svm_tuned_predict(x_train, y_train, x_test, y_test)

binary results
Accuracy on SVM training set: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1918
           1       1.00      1.00      1.00      1901

    accuracy                           1.00      3819
   macro avg       1.00      1.00      1.00      3819
weighted avg       1.00      1.00      1.00      3819

Accuracy on SVM test set: 0.9874312647289867
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       628
           1       1.00      0.98      0.99       645

    accuracy                           0.99      1273
   macro avg       0.99      0.99      0.99      1273
weighted avg       0.99      0.99      0.99      1273



In [6]:
X = pre_process_data_hog_pca()
Y = y_multiclass()

X, Y = resolve_imbalances_smote(X, Y)
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0)

print('multiclass results')
svm_tuned_predict(x_train, y_train, x_test, y_test)

Started reading from files
Finished reading from files
Counter({0: 860, 1: 855, 3: 831, 2: 454})
Counter({1: 860, 2: 860, 0: 860, 3: 860})
multiclass results
Accuracy on SVM training set: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       652
           1       1.00      1.00      1.00       623
           2       1.00      1.00      1.00       645
           3       1.00      1.00      1.00       660

    accuracy                           1.00      2580
   macro avg       1.00      1.00      1.00      2580
weighted avg       1.00      1.00      1.00      2580

Accuracy on SVM test set: 0.75
              precision    recall  f1-score   support

           0       0.60      0.67      0.63       208
           1       0.68      0.55      0.61       237
           2       0.90      0.93      0.92       215
           3       0.82      0.87      0.84       200

    accuracy                           0.75       860
   macro avg    