In [0]:
# NOTE: The following code was run on a Google Cloud VM because we ran into
# out-of-memory errors on Colab.
# For the results, please refer to the report.
#
##### Loading data #####
from google.colab import drive
# Mount Google Drive under /gdrive; the dataset directories used by this
# notebook live below "/gdrive/My Drive/". force_remount=True requests a fresh
# mount (presumably even when /gdrive is already mounted -- confirm against the
# google.colab documentation).
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [0]:
# --- Dataset locations (on the mounted Google Drive) ---
dataset_root = "/gdrive/My Drive/kaggle_dataset/"
train_dir = dataset_root + "train/"
val_dir = dataset_root + "val/"
test_dir = dataset_root + "test/"

# --- Image geometry ---
# Target size handed to the data generators. An alternative setting is kept
# below for reference:
# img_width = 512
# img_height = 496
img_width, img_height = 224, 224
channels = 3

# --- Run settings ---
batch_size = 128
epochs = 50
num_classes = 4

# --- Sample counts per split ---
nb_train_samples = 8000
nb_valid_samples = 32
nb_test_samples = 968

from keras.preprocessing.image import ImageDataGenerator

# One independent generator per split; each only rescales pixel values by
# 1/255 (no augmentation is configured).
train_datagen, valid_datagen, test_datagen = (
    ImageDataGenerator(rescale=1./255) for _ in range(3)
)

# Keyword arguments shared by every directory iterator below.
_flow_options = dict(
    target_size=(img_height, img_width),
    color_mode="rgb",
    batch_size=batch_size,
    class_mode='categorical',
)

# Training and validation batches are shuffled; test batches keep the
# directory order (shuffle=False).
train_generator = train_datagen.flow_from_directory(
    train_dir, shuffle=True, **_flow_options)

valid_generator = valid_datagen.flow_from_directory(
    val_dir, shuffle=True, **_flow_options)

test_generator = test_datagen.flow_from_directory(
    test_dir, shuffle=False, **_flow_options)

Found 8000 images belonging to 4 classes.
Found 32 images belonging to 4 classes.
Found 968 images belonging to 4 classes.


In [0]:
import numpy as np

def stacking_samples(dataset_type, sample_count):
    """Collect batches from one of the data generators into dense arrays.

    Relies on the module-level ``train_generator``, ``valid_generator`` and
    ``test_generator`` iterators and on the ``img_height``, ``img_width`` and
    ``channels`` constants.

    Args:
        dataset_type: ``"train"`` or ``"valid"`` selects the matching
            generator; any other value selects the test generator.
        sample_count: Number of samples to collect.

    Returns:
        A ``(features, labels)`` tuple. ``features`` has shape
        ``(sample_count, img_height, img_width, channels)`` and dtype float32;
        ``labels`` has shape ``(sample_count,)`` and holds, for every sample,
        the index of the largest entry of its one-hot label row. Rows that the
        generator never supplied (it ran out early) stay zero.
    """
    if dataset_type == "train":
        generator = train_generator
    elif dataset_type == "valid":
        generator = valid_generator
    else:
        generator = test_generator

    # float32 halves the memory of this buffer compared with NumPy's float64
    # default; Keras image iterators yield float32 batches by default, so no
    # precision is lost.
    features = np.zeros(
        shape=(sample_count, img_height, img_width, channels), dtype=np.float32)
    labels = np.zeros(shape=(sample_count))
    if sample_count == 0:
        # Nothing to collect; avoid pulling (and decoding) a batch needlessly.
        return features, labels

    filled = 0
    for inputs_batch, labels_batch in generator:
        # Track the real number of rows written so far: the last batch of an
        # epoch may be shorter than batch_size, and sample_count may end in the
        # middle of a batch.
        take = min(len(inputs_batch), sample_count - filled)
        features[filled:filled + take] = inputs_batch[:take]
        # axis=1 yields one class index per sample. Without it np.argmax
        # flattens the whole batch and returns a single index, which would be
        # broadcast to every row as a bogus label.
        labels[filled:filled + take] = np.argmax(labels_batch[:take], axis=1)
        filled += take
        if filled >= sample_count:
            # Keras generators loop forever, so stop explicitly.
            break
    return features, labels

# Materialise every split as in-memory (features, labels) arrays.
train_features, train_labels = stacking_samples("train", nb_train_samples)
valid_features, valid_labels = stacking_samples("valid", nb_valid_samples)
test_features, test_labels = stacking_samples("test", nb_test_samples)

# Sanity check: print the feature and label array shapes of each split.
for split_features, split_labels in (
    (train_features, train_labels),
    (valid_features, valid_labels),
    (test_features, test_labels),
):
    print(split_features.shape, split_labels.shape)

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Baseline: a random forest trained directly on the flattened pixel values.
clf = RandomForestClassifier(n_estimators=100, max_depth=12, random_state=0, bootstrap=True)

# Every image becomes one row of img_height * img_width * channels features.
flat_dim = img_height*img_width*channels
train_flat = np.reshape(train_features, (nb_train_samples, flat_dim))
valid_flat = np.reshape(valid_features, (nb_valid_samples, flat_dim))
test_flat = np.reshape(test_features, (nb_test_samples, flat_dim))

clf.fit(train_flat, train_labels)

# Per-sample class predictions for the held-out splits.
valid_prediction = clf.predict(valid_flat)
test_prediction = clf.predict(test_flat)

# Scores reported by the classifier on the same splits.
valid_score = clf.score(valid_flat, valid_labels)
test_score = clf.score(test_flat, test_labels)

print("valid accuracy:", valid_score)
print("test accuracy:", test_score)