# Tumor Classification Assignment Part 1

In [9]:
import pandas as pd
from imageio import imread
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
from matplotlib.colors import ListedColormap
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

## Data extraction

1. Map images into vectors
    - convert to grey scale 2d vector
    - reduce dimensionality
    - store result to avoid repeating
2. Map labels to 1 for tumor, 0 for no_tumor
3. Create xTrain, yTrain, xTest, yTest from training data (test again once we have new test data)

In [39]:
def rgb2gray(rgb):
    return np.dot(rgb[...,:3], [0.2989, 0.5870, 0.1140])

df = pd.read_csv('./dataset/label.csv')

X = pd.DataFrame()
Y = (df['label'] != 'no_tumor').astype(int)

img = imread('./dataset/image/image_0000.jpg')
# plt.imshow(img)

img_gray = rgb2gray(img)

# features = np.reshape(img_gray, (512*512))
# print(features.shape)


# print(img_gray.shape)
# img_transformed = pca.transform(img_gray) 
# print(img_transformed.shape)
# img_transformed_features = np.reshape(img_transformed, (512*32))
# print(img_transformed_features.shape)
# print(np.sum(pca.explained_variance_ratio_) )

# Retrieving the results of the image after Dimension reduction.
# temp = pca.inverse_transform(img_transformed) 
# print(temp.shape)
# temp = np.reshape(temp, (512,512)) 
# print(temp.shape) 
# plt.imshow(temp)

for index, row in df.iterrows():
    img = imread('./dataset/image/' + row['file_name'])
    img_gray = rgb2gray(img)
    pca = PCA(32).fit(img_gray) 
    img_transformed = pca.transform(img_gray) 
    features = np.reshape(img_transformed, (512*32))
    X = X.append(pd.Series(features).T, ignore_index=True)
    print("\rCompleted {:.2f}".format((index/df.shape[0]) * 100), end="")

print(X.shape)

[[1 1 1]
 [2 2 2]
 [3 3 3]
 ...
 [2 2 2]
 [2 2 2]
 [2 2 2]]
(786432,)


In [4]:
# X = pd.read_csv('./dataset/ImageFeatures.csv')

### Logistic Regression

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [6]:
# x_train, x_test, y_train, y_test = train_test_split(X, Y,random_state=0)
def logRegrPredict(x_train, y_train,xtest ):
    # Build Logistic Regression Model
    logreg = LogisticRegression(solver='lbfgs')
    # Train the model using the training sets
    logreg.fit(x_train, y_train)
    y_pred= logreg.predict(xtest)
    print('Accuracy on test set: {:.2f}'.format(logreg.score(x_test, y_test)))
    return y_pred

x_train, x_test, y_train, y_test = train_test_split(X, Y,random_state=0)
y_pred = logRegrPredict(x_train, y_train,x_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy on test set: '+str(accuracy_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))

Accuracy on test set: 0.86
[[ 82  41]
 [ 65 562]]
Accuracy on test set: 0.8586666666666667
              precision    recall  f1-score   support

           0       0.56      0.67      0.61       123
           1       0.93      0.90      0.91       627

    accuracy                           0.86       750
   macro avg       0.74      0.78      0.76       750
weighted avg       0.87      0.86      0.86       750



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
from sklearn import tree

tree_params={
    'criterion':'entropy'
}
clf = tree.DecisionTreeClassifier( **tree_params )

clf.fit(x_train,y_train)

y_pred =  clf.predict(x_test)
print('Accuracy on test set: '+str(accuracy_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))

Accuracy on test set: 0.868
              precision    recall  f1-score   support

           0       0.60      0.58      0.59       123
           1       0.92      0.93      0.92       627

    accuracy                           0.87       750
   macro avg       0.76      0.75      0.76       750
weighted avg       0.87      0.87      0.87       750



In [12]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=50, random_state=0, learning_rate=0.7)
clf.fit(x_train, y_train)
adaBoostY = clf.predict(x_test)

print('Accuracy on test set: '+str(accuracy_score(y_test,adaBoostY)))
print(classification_report(y_test,y_pred))

In [None]:
print(clf.estimator_errors_)

Apply ensemble but train weak classifiers on equal split of class 0 and 1
https://machinelearningmastery.com/cross-validation-for-imbalanced-classification/
Test Smote
https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/

In [7]:
from imblearn.over_sampling import SMOTE
from collections import Counter
oversample = SMOTE()

print(Counter(Y))

X, Y = oversample.fit_resample(X, Y)
print(Counter(Y))

Counter({1: 2546, 0: 454})
Counter({1: 2546, 0: 2546})


In [16]:
x_train, x_test, y_train, y_test = train_test_split(X, Y,random_state=0)

clf = AdaBoostClassifier(n_estimators=20, random_state=0, learning_rate=1.0)
clf.fit(x_train, y_train)
adaBoostY = clf.predict(x_test)

In [15]:
print('Accuracy on test set: '+str(accuracy_score(y_test,adaBoostY)))
print(classification_report(y_test,adaBoostY))

Accuracy on test set: 0.7965435978004713
              precision    recall  f1-score   support

           0       0.79      0.80      0.80       628
           1       0.80      0.79      0.80       645

    accuracy                           0.80      1273
   macro avg       0.80      0.80      0.80      1273
weighted avg       0.80      0.80      0.80      1273



In [18]:
adaBoostYtrain = clf.predict(x_train)
print('Accuracy on test set: '+str(accuracy_score(y_train,adaBoostYtrain)))
print(classification_report(y_train,adaBoostYtrain))

Accuracy on test set: 0.866980885048442
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1918
           1       0.88      0.85      0.86      1901

    accuracy                           0.87      3819
   macro avg       0.87      0.87      0.87      3819
weighted avg       0.87      0.87      0.87      3819

