In [None]:
import os, shutil
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from random import shuffle
from zipfile import ZipFile
from datetime import datetime
from google.colab import drive, files
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator

## Data Understanding

In [None]:
# Mount Google Drive so the dataset archive stored there can be read
drive.mount('/content/gdrive')

# Where the extracted files go, and where the zipped dataset lives on Drive.
# NOTE: both paths are relative; the archive path only resolves to the mount
# point above when the working directory is /content.
destination = 'dataset/'
zipfile = 'gdrive/My Drive/shopee/shopee-product-detection-dataset-002.zip'

In [None]:
# Extract the whole archive into `destination` and report the elapsed time
before = datetime.now()
with ZipFile(zipfile, mode='r') as archive:
    print("Unzipping")
    archive.extractall(path=destination)
after = datetime.now()
print('Done in', after - before)

In [None]:
# Root folders of the three data splits (each path keeps its trailing slash,
# which later cells rely on when concatenating sub-paths)
train_dir, validation_dir, test_dir = (
    'dataset/train/train/',
    'dataset/validation/validation/',
    'dataset/test/',
)

In [None]:
# Create directory for validation data.
# makedirs builds the parent ('dataset/validation') and the leaf in one call,
# and exist_ok=True keeps this cell re-runnable instead of raising
# FileExistsError when the folders are already there.
os.makedirs(validation_dir, exist_ok=True)

In [None]:
# Split training and validation set: for every category folder, move a random
# 10% of its images from the training tree into the matching validation folder.
for category in os.listdir(train_dir):
    category_train_dir = os.path.join(train_dir, category)

    # Ignore stray files sitting next to the category folders
    if not os.path.isdir(category_train_dir):
        continue

    # Create the validation folder for this category (no error if it exists)
    category_validation_dir = os.path.join(validation_dir, category)
    os.makedirs(category_validation_dir, exist_ok=True)

    # A populated validation folder means this category was already split;
    # skip it so re-running the cell does not move yet another 10% out of
    # the training set.
    if os.listdir(category_validation_dir):
        continue

    # Count number of images per category
    filenames = os.listdir(category_train_dir)
    total = len(filenames)
    fraction = int(total * 0.1)

    # Move files randomly to validation set
    shuffle(filenames)
    for filename in filenames[:fraction]:
        shutil.move(
            os.path.join(category_train_dir, filename),
            os.path.join(category_validation_dir, filename)
        )

In [None]:
# Function to count number of images
def count_images(location, extensions=(".jpg",)):
    """Count image files anywhere below ``location``.

    Args:
        location: Directory to walk recursively.
        extensions: File-name suffix (or iterable of suffixes) that marks a
            file as an image. Matching is case-insensitive, so ``.JPG`` files
            are counted as well. Defaults to ``(".jpg",)``.

    Returns:
        int: Number of matching files; 0 when ``location`` does not exist or
        contains no matching file.
    """
    # Accept a bare string as a single suffix instead of iterating its chars
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    # `filenames` (not `files`) so the imported google.colab `files` helper
    # is not shadowed inside this function.
    return sum(
        name.lower().endswith(suffixes)
        for _, _, filenames in os.walk(location)
        for name in filenames
    )

In [None]:
# Tally the images on each side of the split, then report both totals
total_train, total_validation = map(count_images, (train_dir, validation_dir))

print("Total training images:", total_train)
print("Total validation images:", total_validation)

## Data Preparation

In [None]:
# Training hyperparameters: images per batch and passes over the training set
batch_size, epochs = 128, 5

# Resolution (in pixels) every image is resized to before entering the network
IMG_HEIGHT = IMG_WIDTH = 224

In [None]:
# Define image generator
# The network used below is MobileNetV2 with ImageNet weights, which expects
# pixels scaled to [-1, 1]. Its own preprocess_input does exactly that, whereas
# rescale=1./255 would feed [0, 1] inputs the pre-trained filters were not
# trained on.
mobilenet_v2_preprocess = tf.keras.applications.mobilenet_v2.preprocess_input
train_image_generator = ImageDataGenerator(preprocessing_function=mobilenet_v2_preprocess)
validation_image_generator = ImageDataGenerator(preprocessing_function=mobilenet_v2_preprocess)
test_image_generator = ImageDataGenerator(preprocessing_function=mobilenet_v2_preprocess)

In [None]:
# Stream training batches from the per-category sub-folders of train_dir:
# images are resized to the network resolution, labels are one-hot
# ('categorical') and the order is reshuffled
train_generator = train_image_generator.flow_from_directory(
    train_dir,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode='categorical',
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# Stream validation batches from validation_dir with the same resizing and
# one-hot labels as training, but in a fixed (unshuffled) order
validation_generator = validation_image_generator.flow_from_directory(
    validation_dir,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode='categorical',
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Stream test batches from test_dir in a fixed (unshuffled) order so that
# predictions can be matched back to test_generator.filenames afterwards
test_generator = test_image_generator.flow_from_directory(
    test_dir,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode='categorical',
    batch_size=batch_size,
    shuffle=False,
)

## Modeling

In [None]:
# Create the base model from the pre-trained model MobileNet V2
# Keras image tensors are laid out (height, width, channels); this must agree
# with target_size=(IMG_HEIGHT, IMG_WIDTH) used by the data generators, which
# matters as soon as the two dimensions differ.
IMG_SHAPE = (IMG_HEIGHT, IMG_WIDTH, 3)
base_model = tf.keras.applications.MobileNetV2(
    input_shape=IMG_SHAPE,
    include_top=False,  # drop the ImageNet classifier; a new head is added later
    weights='imagenet'
)

In [None]:
# Freeze the base model: with trainable=False the pre-trained weights are not
# updated during training, so only the layers stacked on top of it are learned
base_model.trainable = False

In [None]:
# Add a classification head
# One output unit (a raw logit, no activation) per category folder discovered
# by the training generator, rather than a hard-coded class count.
num_classes = train_generator.num_classes
model = tf.keras.Sequential([
  base_model,
  tf.keras.layers.GlobalAveragePooling2D(),
  tf.keras.layers.Dense(num_classes)
])

In [None]:
# Configure training: Adam optimizer, accuracy as the reported metric, and a
# cross-entropy loss over one-hot labels that takes raw logits
# (from_logits=True), since the final Dense layer has no activation
logit_cross_entropy = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=logit_cross_entropy, metrics=['accuracy'])

In [None]:
# Show summary of the model (its layers, output shapes and parameter counts)
model.summary()

In [None]:
# Train the model
# len(generator) is the number of batches needed to cover every image the
# generator found, including the final partial batch. Floor-dividing a separate
# file count by batch_size would skip that last batch (for the unshuffled
# validation data always the same tail images) and yields 0 steps when a split
# holds fewer than batch_size images.
history = model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=epochs,
    validation_data=validation_generator,
    validation_steps=len(validation_generator)
)

In [None]:
# Score every test image, then keep the index of the highest-scoring class.
# The model outputs raw logits, so argmax over the last axis picks the
# predicted class index for each image.
class_scores = model.predict(test_generator)
predictions = class_scores.argmax(axis=-1)

In [None]:
# Get filenames of test data as listed by the generator. test_generator was
# created with shuffle=False, so this order lines up with `predictions`.
filenames = test_generator.filenames

In [None]:
# Get list of test images for submission; only its 'filename' column is used
# later, to select and order the rows of the submission file
test = pd.read_csv('dataset/test.csv')

In [None]:
# Generate submission file
# Translate each predicted class index back to the category folder name the
# training generator assigned it to, instead of assuming index == label.
index_to_category = {
    index: name for name, index in train_generator.class_indices.items()
}
submission = pd.DataFrame({
    # Generator filenames carry the sub-folder prefix (e.g. 'test/x.jpg');
    # keep only the bare file name so it matches test.csv
    "filename": [os.path.basename(path) for path in filenames],
    # Category as a zero-padded, two-character string
    "category": [index_to_category[int(index)].zfill(2) for index in predictions],
})
# Keep exactly the rows (and order) listed in test.csv
submission = pd.merge(test[['filename']], submission, on='filename', how='left')
submission.to_csv('submission.csv', index=False, header=True)

In [None]:
# Download submission file: hand 'submission.csv' to the Colab `files` helper,
# which triggers a browser download of the file
files.download('submission.csv') 