In [1]:
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

ModuleNotFoundError: No module named 'google.colab'

In [None]:
!unzip "gdrive/My Drive/shopee-product-detection-student.zip"

Archive:  gdrive/My Drive/shopee-product-detection-student.zip
file #1:  bad zipfile offset (lseek):  0


In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore') # filter warnings
import os
import cv2
import tensorflow as tf
from tensorflow import keras
from keras.applications import imagenet_utils
from keras.utils import np_utils
from keras import regularizers
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils.np_utils import to_categorical # convert to one-hot-encoding
from keras.models import Sequential
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import image

In [None]:
# Set imgaug Sequential
import imgaug as ia
from imgaug import augmenters as iaa
seq = iaa.Sequential(
    [
        iaa.Affine(rotate=(-10, 10)),
        iaa.Fliplr(0.5),
        iaa.GaussianBlur((0, 2.0)),
        iaa.AdditiveGaussianNoise(scale=(0, 0.05), per_channel=True),
        iaa.ChannelShuffle(p=0.5),
    ],
    random_order=False,
)

# Cool cutmix self implementation from some pro dude
class CutMixImageDataGenerator():
    def __init__(self, generator1, generator2, img_size, batch_size):
        self.batch_index = 0
        self.samples = generator1.samples
        self.class_indices = generator1.class_indices
        self.generator1 = generator1
        self.generator2 = generator2
        self.img_size = img_size
        self.batch_size = batch_size

    def reset_index(self):  # Ordering Reset (If Shuffle is True, Shuffle Again)
        self.generator1._set_index_array()
        self.generator2._set_index_array()

    def reset(self):
        self.batch_index = 0
        self.generator1.reset()
        self.generator2.reset()
        self.reset_index()

    def get_steps_per_epoch(self):
        quotient, remainder = divmod(self.samples, self.batch_size)
        return (quotient + 1) if remainder else quotient
    
    def __len__(self):
        self.get_steps_per_epoch()

    def __next__(self):
        if self.batch_index == 0: self.reset()

        crt_idx = self.batch_index * self.batch_size
        if self.samples > crt_idx + self.batch_size:
            self.batch_index += 1
        else:  # If current index over number of samples
            self.batch_index = 0

        reshape_size = self.batch_size
        last_step_start_idx = (self.get_steps_per_epoch()-1) * self.batch_size
        if crt_idx == last_step_start_idx:
            reshape_size = self.samples - last_step_start_idx
            
        X_1, y_1 = self.generator1.next()
        X_2, y_2 = self.generator2.next()
        
        cut_ratio = np.random.beta(a=1, b=1, size=reshape_size)
        cut_ratio = np.clip(cut_ratio, 0.2, 0.8)
        label_ratio = cut_ratio.reshape(reshape_size, 1)
        cut_img = X_2

        X = X_1
        for i in range(reshape_size):
            cut_size = int((self.img_size-1) * cut_ratio[i])
            y1 = np.random.randint(0, (self.img_size-1) - cut_size)
            x1 = np.random.randint(0, (self.img_size-1) - cut_size)
            y2 = y1 + cut_size
            x2 = x1 + cut_size
            cut_arr = cut_img[i][y1:y2, x1:x2]
            cutmix_img = X_1[i]
            cutmix_img[y1:y2, x1:x2] = cut_arr
            X[i] = cutmix_img
            
        X = seq.augment_images(X)  # Sequential of imgaug
        y = y_1 * (1 - (label_ratio ** 2)) + y_2 * (label_ratio ** 2)
        return X, y

    def __iter__(self):
        while True:
            yield next(self)

In [None]:
img_size = 224

# Data augmentation only on training set not val set
datagen_aug = ImageDataGenerator(
         rescale=1./255,
         validation_split=0.06,   # 100k image dataset... 6k val should be enough?
         horizontal_flip=True, 
         vertical_flip=False
         )  

batchsize = 16

train_gen = datagen_aug.flow_from_directory(
    directory="./train/train/train/",
    subset = 'training',
    target_size=(img_size, img_size),
    color_mode="rgb",
    batch_size=batchsize,
    class_mode="categorical",
    shuffle=True,
)

train_gen2 = datagen_aug.flow_from_directory(    # 2 of these because it said in documentation for cutmix
    directory="./train/train/train/",
    subset = 'training',
    target_size=(img_size, img_size),
    color_mode="rgb",
    batch_size=batchsize,
    class_mode="categorical",
    shuffle=True,
)

valid_gen = datagen_aug.flow_from_directory(
    directory="./train/train/train/",
    subset = 'validation',
    target_size=(img_size, img_size),
    color_mode="rgb",
    batch_size=batchsize,
    class_mode="categorical",
    shuffle=True,
    seed=42
)

train_generator = CutMixImageDataGenerator(
    generator1=train_gen,
    generator2=train_gen2,
    img_size=img_size,
    batch_size=batchsize,
)

model = tf.keras.Sequential()

base = tf.keras.applications.MobileNetV2(weights='imagenet', include_top=False,input_shape=(img_size, img_size,3))
for layer in base.layers[:135]:
    layer.trainable = False
model.add(base)
model.add(GlobalAveragePooling2D())
#model.add(Dense(1024,
#        activation='relu',
#        kernel_initializer='he_normal',
#        kernel_regularizer=regularizers.l2(1e-3),
#    ))
#model.add(Dropout(0.6))
model.add(Dense(42, activation='softmax'))

optim = tf.keras.optimizers.SGD(lr=0.05, nesterov=True) # Start to lower it when it plateaus

# Compile model with optimizer
model.compile(
    optimizer=optim,
    loss='categorical_crossentropy',
    metrics=['accuracy']

  )


In [None]:
# Training with regular checkpointing because colab times out after awhile lmao
epochs = 5

cb_early_stopper = EarlyStopping(monitor = 'val_loss', patience = 3)
cb_checkpointer = ModelCheckpoint(filepath = './gdrive/My Drive/checkpt_latest.hdf5', 
                                  monitor = 'val_loss',
                                  save_best_only = True, mode = 'auto')

VALID_STEP = valid_gen.n//valid_gen.batch_size
TRAIN_STEP = train_gen.n//train_gen.batch_size

model.load_weights('./gdrive/My Drive/checkpt_promising.hdf5') 
history = model.fit(train_generator,
                              epochs = epochs, 
                              validation_data = valid_gen,
                              validation_steps = VALID_STEP,
                              steps_per_epoch = TRAIN_STEP, # depends on amount of training data
                              callbacks=[cb_checkpointer, cb_early_stopper])
model.load_weights('./gdrive/My Drive/checkpt_latest.hdf5')         

In [None]:
# Evaluating on test set to produce submission file
test_datagen = ImageDataGenerator(rescale=1./255)
testdf = pd.read_csv('../input/shopee-product-detection-student/test.csv') #only predict those in test csv 
test_gen = test_datagen.flow_from_dataframe(
    testdf,
    directory="../input/shopee-product-detection-student/test/test/test/",
    target_size=(img_size, img_size),
    color_mode="rgb",
    batch_size=1,
    class_mode=None,
    shuffle=False
)

STEP_SIZE_TEST = test_gen.n//test_gen.batch_size
test_gen.reset()

pred = model.predict_generator(test_gen, steps=STEP_SIZE_TEST, verbose=1)
predicted_class_indices = np.argmax(pred,axis=1)

filenames=testdf['filename']
submission_df=pd.DataFrame({"filename":filenames,
                      "category":predicted_class_indices})

submission_df["category"] = submission_df["category"].apply(lambda x: "{:02}".format(x))  # pad zeroes
submission_df.to_csv("submission.csv", index=False)

In [None]:
# CTrl SHift I and input this into the console
function ConnectButton(){
    console.log("Connect pushed"); 
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() 
}
setInterval(ConnectButton,300000); # useful thing to keep Colab running I guess