In [1]:
import os
import numpy as np
from PIL import Image

 #define folder paths for smaller images,for each class
TUMOR_path = 'Kather_texture_2016_image_tiles_5000/01_TUMOR/'
STROMA_path = 'Kather_texture_2016_image_tiles_5000/02_STROMA/'
COMPLEX_path = 'Kather_texture_2016_image_tiles_5000/03_COMPLEX/'
LYMPHO_path = 'Kather_texture_2016_image_tiles_5000/04_LYMPHO/'
DEBRIS_path = 'Kather_texture_2016_image_tiles_5000/05_DEBRIS/'
MUCOSA_path= 'Kather_texture_2016_image_tiles_5000/06_MUCOSA/'
ADIPOSE_path = 'Kather_texture_2016_image_tiles_5000/07_ADIPOSE/'
EMPTY_path = 'Kather_texture_2016_image_tiles_5000/08_EMPTY/'

#Load the images from each folder path
TUMOR = np.array([np.array(Image.open(TUMOR_path + fname)) for fname in os.listdir(TUMOR_path)])
STROMA = np.array([np.array(Image.open(STROMA_path + fname)) for fname in os.listdir(STROMA_path)])
COMPLEX = np.array([np.array(Image.open(COMPLEX_path + fname)) for fname in os.listdir(COMPLEX_path)])
LYMPHO = np.array([np.array(Image.open(LYMPHO_path + fname)) for fname in os.listdir(LYMPHO_path)])
DEBRIS = np.array([np.array(Image.open(DEBRIS_path + fname)) for fname in os.listdir(DEBRIS_path)])
MUCOSA = np.array([np.array(Image.open(MUCOSA_path + fname)) for fname in os.listdir(MUCOSA_path)])
ADIPOSE = np.array([np.array(Image.open(ADIPOSE_path + fname)) for fname in os.listdir(ADIPOSE_path)])
EMPTY = np.array([np.array(Image.open(EMPTY_path + fname)) for fname in os.listdir(EMPTY_path)])

#Create the labels for each image so we know which image belongs to which class
TUMOR_labels = np.ones((len(TUMOR),1))
STROMA_labels = np.ones((len(STROMA),1))*2
COMPLEX_labels = np.ones((len(COMPLEX),1))*3
LYMPHO_labels = np.ones((len(LYMPHO),1))*4
DEBRIS_labels = np.ones((len(DEBRIS),1))*5
MUCOSA_labels = np.ones((len(MUCOSA),1))*6
ADIPOSE_labels = np.ones((len(ADIPOSE),1))*7
EMPTY_labels = np.ones((len(EMPTY),1))*8

In [8]:
#Combine all the images and labels into one X and y array
X = np.concatenate((TUMOR,STROMA,COMPLEX,LYMPHO,DEBRIS,MUCOSA,ADIPOSE,EMPTY),axis=0)
y = np.concatenate((TUMOR_labels,STROMA_labels,COMPLEX_labels,LYMPHO_labels,DEBRIS_labels,MUCOSA_labels,ADIPOSE_labels,EMPTY_labels),axis=0)

#Shuffle the data (prevents non random assignment to training and testing)
from sklearn.utils import shuffle
X, y = shuffle(X, y, random_state=0)

#Split the data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#resize images to 50x50 and convert to grayscale as random forest classifier only accepts 2D data
from skimage.transform import resize
X_train = np.array([resize(image, (50, 50)) for image in X_train])
X_test = np.array([resize(image, (50, 50)) for image in X_test])
#Convert to grayscale
X_train = np.array([np.mean(image, axis=2) for image in X_train])
X_test = np.array([np.mean(image, axis=2) for image in X_test])

#Normalize the data to be between 0 and 1 for same scale
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255

In [9]:
#One hot encode the labels
from keras.utils import to_categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [5]:
X_train.shape
y_train.shape

(4000, 9)

In [12]:
X_train.shape
#convert to 2D array for random forest classifier
X_train = X_train.reshape(4000,2500)
X_test = X_test.reshape(1000,2500)


In [13]:
#perform the random forest classification using cross validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
#define a basic classifier
clf = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)
#perform the cross validation
scores = cross_val_score(clf, X_train, y_train, cv=5)
#print the scores
print(scores)




[0.14625 0.1225  0.12625 0.11    0.1075 ]


In [None]:

#create gridsearch to find the best parameters
from sklearn.model_selection import GridSearchCV
#create a dictionary of parameters to search
param_grid = {
    'bootstrap': [True],
    'max_depth': [10, 20, 30, 40, 50],
    'max_features': [2, 3],
    'n_estimators': [100, 200, 300, 1000]
}
#instantiate the grid search model
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid,
                            cv = 5, n_jobs = -1, verbose = 2)                               
#fit the grid search to the data
grid_search.fit(X_train, y_train)
#print the best parameters
print(grid_search.best_params_)
#print the best score
print(grid_search.best_score_)
#save the best model
best_grid = grid_search.best_estimator_


Fitting 3 folds for each of 100 candidates, totalling 300 fits
