In [106]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc

import keras as k
from keras.models import Sequential
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.layers import Convolution2D, MaxPooling2D
from keras.preprocessing.image import ImageDataGenerator

import cv2
from tqdm import tqdm

#from keras import backend as K
#K.set_image_dim_ordering('th')

In [107]:
x_train = []
x_test = []
y_train = []

data_dir = 'D:/Downloads/amazon/'

df_train = pd.read_csv(data_dir + 'train.csv')

flatten = lambda l: [item for sublist in l for item in sublist]
labels = list(set(flatten([l.split(' ') for l in df_train['tags'].values])))

print(labels)
print(len(labels))

['slash_burn', 'clear', 'blooming', 'primary', 'cloudy', 'conventional_mine', 'water', 'haze', 'cultivation', 'partly_cloudy', 'artisinal_mine', 'habitation', 'bare_ground', 'blow_down', 'agriculture', 'road', 'selective_logging']
17


In [108]:
label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}

In [109]:
number_of_samples = 1000
split = 800

for f, tags in tqdm(df_train.values[:number_of_samples], miniters=1000):
    img = cv2.imread(data_dir + 'train-jpg/{}.jpg'.format(f))
    targets = np.zeros(17)
    for t in tags.split(' '):
        targets[label_map[t]] = 1 
    x_train.append(cv2.resize(img, (32, 32)))
    y_train.append(targets)

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 683.06it/s]


In [110]:
y_train = np.array(y_train, np.uint8)
x_train = np.array(x_train, np.uint8)

print(x_train.shape)
print(y_train.shape)

(1000L, 32L, 32L, 3L)
(1000L, 17L)


In [111]:
x_train = x_train.transpose(0,3,1,2)
print(x_train.shape)

(1000L, 3L, 32L, 32L)


In [112]:
x_train, x_valid, y_train, y_valid = x_train[:split], x_train[split:], y_train[:split], y_train[split:]

In [113]:
# this is the augmentation configuration we will use for training
train_datagen = ImageDataGenerator(
        rescale=1./255,
        shear_range=0.0,
        zoom_range=0.0,
        horizontal_flip=True)  # TODO try vertical_flip=True


In [114]:
train_generator = train_datagen.flow(
        x_train, 
        y_train, 
        batch_size=32,
        shuffle=True) 

In [116]:
model = Sequential()

model.add(Convolution2D(32, 3, 3, input_shape=(3, 32, 32)))  # https://github.com/fchollet/keras/issues/2681
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Convolution2D(64, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(17, activation='sigmoid'))

model.compile(loss='binary_crossentropy', # We NEED binary here, since categorical_crossentropy l1 norms the output before calculating loss.
              optimizer='adam',
              metrics=['accuracy'])

In [122]:
# fits the model on batches with real-time data augmentation:
model.fit_generator(train_generator,
                    steps_per_epoch=len(x_train) / 32, 
                    samples_per_epoch=1000, #  not using all training samples at every epoch.
                    epochs=5,
                    verbose=1)

TypeError: fit_generator() takes at least 4 arguments (4 given)

In [None]:
from sklearn.metrics import fbeta_score

p_valid = model.predict(x_valid, batch_size=128)
print(y_valid)
print(p_valid)
print(fbeta_score(y_valid, np.array(p_valid) > 0.5, beta=2, average='macro'))