In [31]:
# Mount Google Drive at /content/gdrive; later cells read MLB.py and data.zip
# from the "My Drive" folder beneath this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# Copy the MLB.py helper module from Drive into the working directory so that
# the `from MLB import ImageDataGenerator` import further down can resolve it.
from shutil import copyfile
copyfile('/content/gdrive/My Drive/MLB.py', 'MLB.py')

'MLB.py'

In [3]:
from zipfile import ZipFile

# Unpack the dataset archive from Drive into the current working directory.
# The handle is deliberately not named `zip`: at notebook top level that name
# stays bound after the `with` block and would shadow the builtin zip() for
# every later cell.
with ZipFile('/content/gdrive/My Drive/data.zip', 'r') as archive:
    print('Extracting all the files now...')
    archive.extractall()
    print('Done!')

Extracting all the files now...
Done!


In [4]:
!pip install scikit-multilearn

import os
import shutil
import numpy as np
import pandas as pd

import keras
import keras.backend as K
from keras.models import Model
from keras.optimizers import Adam
from keras.models import Sequential
from keras.models import model_from_json
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import ImageDataGenerator as ImageDataGen
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback, ReduceLROnPlateau
from keras.layers import Dense, Dropout, BatchNormalization, Conv2D, MaxPooling2D, Flatten, GlobalAveragePooling2D, Input

from sklearn.metrics import fbeta_score
from sklearn.preprocessing import MultiLabelBinarizer

from MLB import ImageDataGenerator

from skmultilearn.model_selection import iterative_train_test_split

# Notebook-wide settings.
# (xx, yy) is used as the generators' target_size and the model's input size.
xx = 64
yy = 64
# Colour mode label; not referenced by any cell visible in this notebook.
channel = 'rgb'
# Maximum number of training epochs passed to fit_generator.
epochs = 10

Collecting scikit-multilearn
[?25l  Downloading https://files.pythonhosted.org/packages/bb/1f/e6ff649c72a1cdf2c7a1d31eb21705110ce1c5d3e7e26b2cc300e1637272/scikit_multilearn-0.2.0-py3-none-any.whl (89kB)
[K     |███▊                            | 10kB 27.9MB/s eta 0:00:01[K     |███████▍                        | 20kB 2.1MB/s eta 0:00:01[K     |███████████                     | 30kB 2.9MB/s eta 0:00:01[K     |██████████████▊                 | 40kB 2.1MB/s eta 0:00:01[K     |██████████████████▍             | 51kB 2.3MB/s eta 0:00:01[K     |██████████████████████          | 61kB 2.8MB/s eta 0:00:01[K     |█████████████████████████▊      | 71kB 3.1MB/s eta 0:00:01[K     |█████████████████████████████▍  | 81kB 3.4MB/s eta 0:00:01[K     |████████████████████████████████| 92kB 3.1MB/s 
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


Using TensorFlow backend.


In [0]:
# Root folder of the dataset; later cells read labels/, train/ and test/ beneath it.
data_dir = '/content/data'

In [0]:
def df_create(X, y, mlb):
    """Build an ('Image', 'Label') frame from a split's file names and label matrix.

    X is reshaped to one dimension of length ``X.shape[0]`` and becomes the
    'Image' column; ``mlb.inverse_transform(y)`` supplies the 'Label' column.
    """
    image_names = X.reshape(X.shape[0])
    label_sets = mlb.inverse_transform(y)
    columns = {'Image': image_names, 'Label': label_sets}
    return pd.DataFrame(columns)

def multi_split(df, img_format='png', test_size=0.06):
    """Split a two-column label frame into train and test frames.

    The split is made with ``iterative_train_test_split`` on the binarised tag
    matrix. The input frame is only read, never modified, so the calling cell
    can be re-run without the extension being appended twice or the tag column
    being split a second time.

    Parameters
    ----------
    df : pandas.DataFrame
        First column: image names without extension. Second column: tags as a
        single string separated by single spaces.
    img_format : str
        Extension appended to every image name (without the leading dot).
    test_size : float
        Forwarded to ``iterative_train_test_split``. Defaults to 0.06, the
        value that was previously hard-coded.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        Frames with 'Image' and 'Label' columns, as built by ``df_create``.
    """
    name_col, tag_col = df.columns[0], df.columns[1]

    # Work on derived Series instead of assigning back into the caller's frame.
    tag_lists = df[tag_col].str.split(' ')
    file_names = df[name_col].apply(lambda name: name + '.{}'.format(img_format))

    # iterative_train_test_split is given X as a 2-D column vector.
    X = file_names.values
    X = X.reshape(X.shape[0], 1)
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(list(tag_lists))

    X_train, y_train, X_test, y_test = iterative_train_test_split(X, y, test_size=test_size)

    df_train = df_create(X_train, y_train, mlb)
    df_test = df_create(X_test, y_test, mlb)

    return df_train, df_test

In [0]:
# Read the training label table, then split it into training and validation frames.
labels_path = os.path.join(data_dir, 'labels/train_v2.csv')
df = pd.read_csv(labels_path)

df_train, df_val = multi_split(df, img_format='jpg')

In [8]:
# Train/validation images both live under <data_dir>/train and are selected by
# the df_train / df_val frames; test images are read from <data_dir>/test.
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')
image_size = (xx, yy)

# Only the training generator augments (random flips); all three apply VGG16's
# preprocess_input. Train/val use the ImageDataGenerator imported from MLB.py
# (its flow_from_directory is given a `dataframe`); test uses the Keras one.
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    horizontal_flip=True,
    vertical_flip=True,
)
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
test_datagen = ImageDataGen(preprocessing_function=preprocess_input)

train_iter = train_datagen.flow_from_directory(
    train_dir, target_size=image_size, batch_size=28, shuffle=True, dataframe=df_train)
val_iter = val_datagen.flow_from_directory(
    train_dir, target_size=image_size, batch_size=19, shuffle=False, dataframe=df_val)
test_iter = test_datagen.flow_from_directory(
    test_dir, target_size=image_size, batch_size=13, shuffle=False)

# Floor division: a trailing partial batch is not counted in the step totals.
train_steps, val_steps, test_steps = (
    it.n // it.batch_size for it in (train_iter, val_iter, test_iter)
)

Found 38017 images belonging to 17 classes.
Found 2462 images belonging to 17 classes.
Found 61191 images belonging to 1 classes.


In [0]:
# Model: BatchNormalization on the raw input, then the VGG16 convolutional base
# (ImageNet weights, no top classifier), flattened into a 17-unit sigmoid layer
# so that each output is an independent per-label probability.
input_tensor = Input(shape=(xx, yy, 3))

base_model = VGG16(include_top=False,
                   weights='imagenet',
                   input_shape=(xx, yy, 3))

# The whole VGG16 base is applied as a single layer on the normalised input.
bn = BatchNormalization()(input_tensor)
x = base_model(bn)
x = Flatten()(x)
output = Dense(17, activation='sigmoid')(x)

model = Model(input_tensor, output)

In [0]:
# Training callbacks: early stopping, best-weights checkpointing, and a
# learning-rate reducer driven by val_loss.
es = EarlyStopping(patience=3)
mc = ModelCheckpoint(
    'planet_weights(vgg16).h5',
    save_best_only=True,
    save_weights_only=True,
)
lrop = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=2,
    cooldown=0,
    min_lr=1e-6,
    verbose=1,
)

def fbeta(y_true, y_pred, threshold_shift=0.2):
    """F-beta (beta = 2) over a batch of thresholded predictions, as a Keras metric.

    Predictions are clipped to [0, 1] and turned into hard 0/1 decisions by
    comparing them with ``threshold_shift``. True positives, false positives
    and false negatives are then summed over the whole tensor; K.epsilon() is
    added to the true-positive count and to the final denominator.
    """
    beta_squared = 2 ** 2

    clipped = K.clip(y_pred, 0, 1)
    hard_pred = K.cast(K.greater(clipped, threshold_shift), K.floatx())

    true_pos = K.sum(K.round(y_true * hard_pred)) + K.epsilon()
    false_pos = K.sum(K.round(K.clip(hard_pred - y_true, 0, 1)))
    false_neg = K.sum(K.round(K.clip(y_true - hard_pred, 0, 1)))

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)

    numerator = (beta_squared + 1) * (precision * recall)
    denominator = beta_squared * precision + recall + K.epsilon()
    return numerator / denominator
    
# Binary cross-entropy over the sigmoid outputs; accuracy and the custom fbeta
# function defined above are reported as metrics during training.
model.compile(optimizer=Adam(lr=0.001), metrics=['accuracy', fbeta], loss='binary_crossentropy')

In [11]:
# Train from the generators, validating on val_iter after each epoch.
# NOTE(review): the ReduceLROnPlateau instance `lrop` is created earlier but is
# not in this callbacks list, so it has no effect on training - confirm whether
# that omission is intentional.
hist = model.fit_generator(train_iter, steps_per_epoch=train_steps, epochs=epochs, verbose=1, callbacks=[es, mc],
                           validation_data=val_iter, validation_steps=val_steps)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
# Persist the network architecture as JSON (the weights file is written
# separately by the ModelCheckpoint callback).
with open('amazon_arch.json', 'w') as arch_file:
    arch_json = model.to_json()
    arch_file.write(arch_json)

In [0]:
# Rebuild the network from the saved JSON, then load the checkpointed weights
# into it.
with open('amazon_arch.json', 'r') as arch_file:
    arch_json = arch_file.read()
model = model_from_json(arch_json)

model.load_weights('planet_weights(vgg16).h5')

In [29]:
# Rewind the test iterator so that row i of y_pred always corresponds to the
# i-th test file, even when this cell is re-run after the iterator has been
# partially consumed.
test_iter.reset()
y_pred = model.predict_generator(test_iter, steps=test_steps, verbose=1)
# Validation predictions cover val_steps batches; they are used to tune the
# decision threshold.
y_pred_val = model.predict_generator(val_iter, steps=val_steps, verbose=1)



In [0]:
def find_f2score_threshold(y_true, pred, try_all=False, verbose=False):
    """Return the single global threshold that maximises the samples-averaged F2 score.

    Parameters
    ----------
    y_true : array-like
        Binary indicator matrix of the true labels.
    pred : numpy.ndarray
        Predicted scores, same shape as ``y_true``; a label counts as
        predicted when its score is strictly greater than the threshold.
    try_all : bool
        If true, every distinct value in ``pred`` is tried as a threshold;
        otherwise a fixed grid from 0 up to (not including) 1 in steps of 0.005.
    verbose : bool
        If true, print the best score and its threshold.

    Returns
    -------
    The first candidate threshold that achieved the highest score.
    """
    # The try_all branch previously referenced an undefined name (`p_valid`)
    # and raised NameError; the candidates are the distinct values of `pred`.
    candidates = np.unique(pred) if try_all else np.arange(0, 1, 0.005)

    best, best_score = 0, -1
    for t in candidates:
        score = fbeta_score(y_true, pred > t, beta=2, average='samples')
        # Strict comparison keeps the earliest threshold on ties.
        if score > best_score:
            best, best_score = t, score

    if verbose:
        print('Best score: ', round(best_score, 5), ' @ threshold =', best)

    return best

In [20]:
# Ground-truth indicator matrix for the validation split, binarised from the
# label column of df_val.
# NOTE(review): this relies on val_iter yielding df_val's rows in order and on
# the binarizer's column order matching the iterator's class order - confirm
# against MLB.py.
classes = list(df_val[df_val.columns[1]])
mlb = MultiLabelBinarizer()
y_true = mlb.fit_transform(classes)

# y_pred_val only covers the batches that were actually predicted, so it can
# be shorter than y_true; trim to its length instead of hard-coding the count.
sco = find_f2score_threshold(y_true[:len(y_pred_val)], y_pred_val, False, True)
sco

Best score:  0.89607  @ threshold = 0.20500000000000002


0.20500000000000002

In [0]:
# Class index -> tag name, as assigned by the training iterator.
inv = {v: k for k, v in train_iter.class_indices.items()}

# One space-separated tag string per prediction row: every class whose score
# exceeds the tuned threshold. join() avoids the trailing space that manual
# string concatenation left on each row.
label = [
    ' '.join(inv[idx] for idx in np.flatnonzero(row > sco))
    for row in y_pred
]

# Image names are taken from the iterator that produced y_pred, so name i and
# prediction i refer to the same file. os.listdir() returns entries in
# arbitrary order and is not guaranteed to match the iterator's file order.
img = [os.path.basename(path).split('.')[0] for path in test_iter.filenames]

In [0]:
# Assemble the prediction table (one row per test image) and save it as-is.
prediction_columns = {'image_name': img, 'tags': label}
df = pd.DataFrame(prediction_columns)

df.to_csv('multi.csv', index=False)

In [0]:
# Split the predictions by image-name family ('file' vs 'test'). Explicit
# copies make each part an independent frame, so the sorting/renaming done in
# later cells does not operate on a slice of `df` (SettingWithCopyWarning).
is_file = df['image_name'].str.contains('file')
is_test = df['image_name'].str.contains('test')
df_file = df.loc[is_file].copy()
df_test = df.loc[is_test].copy()

In [0]:
# Rebind to the sorted frame instead of sorting in place: in-place sorting of a
# frame derived from a slice triggers pandas' SettingWithCopyWarning.
df_file = df_file.sort_values('image_name')

In [0]:
# Order the 'test' rows by the integer after the last underscore (so test_2
# precedes test_10). The names themselves are left untouched: the rows are
# reordered by index rather than converting the column to int and back, which
# also avoids assigning into a slice of `df`.
test_number = df_test['image_name'].str.split('_').str[-1].astype(int)
df_test = df_test.loc[test_number.sort_values().index]

In [0]:
# Final submission file: the 'test' rows first, then the 'file' rows.
submission = pd.concat([df_test, df_file])
submission.to_csv('submission.csv', index=False)