# Set up

In [None]:
%%capture
# install Hugging Face's transformers
! pip install transformers

In [None]:
# import library

import tensorflow as tf
tf.config.run_functions_eagerly(True)

from keras import layers, Model
from keras.layers import Input, Dense, Flatten, Conv2D, MaxPooling2D, LSTM, Embedding, Concatenate, Dropout
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, TFBertModel

import os

import pandas as pd
import numpy as np
import re, string

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import matplotlib.pyplot as plt

from PIL import Image

pd.set_option('display.max_colwidth', 1000)

%matplotlib inline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Load data

## Train dataset

In [None]:
# load the full, translated train dataset
train = pd.read_csv('final_train.csv', index_col = 0)

train.head(3)

Unnamed: 0,posting_id,image,image_phash,title,label_group,title_translate
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,Victoria's Secret Paper Bag
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DOUBLE FOAM TAPE",2937985045,Double Tape 3M VHB 12 mm x 4.5 m ORIGINAL / DOUBLE FOAM TAPE
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,Maling TTS Canned Pork Luncheon Meat 397 gr


In [None]:
# if using the 500-observation sample, run this instead
'''train = pd.read_csv('train_sample.csv', index_col = 0)
train.set_index(pd.RangeIndex(start = 0, stop = len(train)), inplace = True) # reset index
train.head(3)'''

"train = pd.read_csv('data/train_sample.csv', index_col = 0)\ntrain.set_index(pd.RangeIndex(start = 0, stop = len(train)), inplace = True) # reset index\ntrain.head(3)"

## Validation dataset

The validation dataset is a 20% sample of the rows with a duplicated `label_group` in the full translated train dataset.

The validation dataset only needs to be loaded when running the model on the full dataset. When using the 500-observation sample, skip this step: the sample is split into train and validation sets later.

In [None]:
# read the translated validation set (first CSV column is the index)
# and renumber the rows 0..n-1
validation = (
    pd.read_csv('final_validation.csv', index_col = 0)
    .reset_index(drop = True)
)
validation.head(3)

Unnamed: 0,posting_id,image,image_phash,title,label_group,title_translate
0,train_1003554842,560a5c3577fb22be2ac82c0e97558158.jpg,f3c78fce8c3050f0,Mustika Ratu Minyak Cem-Ceman 175 ml,3044373336,Mustika Ratu Oil Cem-Ceman 175 ml
1,train_523363809,dd1f14c7a734ff28b67062ae4f8529c6.jpg,af919a66c49d688b,Snobby Kelambu Box Bayi Snobby 1 Tiang KBX 1201,873493898,Snobby Baby Mosquito Net Snobby 1 Pole KBX 1201
2,train_1036373061,34b4aa697f4606fcf52ec74f53c9f246.jpg,dc132bece40552db,stopper mini,578575602,mini stoppers


## Test dataset

In [None]:
# load the test dataset for prediction
test = pd.read_csv('test_translated.csv', index_col = 0)
test.head(3)

Unnamed: 0,posting_id,image,image_phash,title,title_translate
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan musik/ mainan telepon,Edufuntoys - CHARACTER PHONE has lights and music/ toy phone
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackheads Mask 10gr by Flawless Go Surabaya | Flawless.Go,(Buy 1 Free Spatula) Blackhead Mask | Blackheads Mask 10gr by Flawless Go Surabaya | Flawless. Go
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng,READY Lemonilo Healthy instant noodle soup and fried


## Images

Images in the train_images folder have been resized to (64, 64, 3) and flattened into a dataframe with one row per image and 12,288 (64 × 64 × 3) pixel columns.

In [None]:
# train: flattened pixel table, rows renumbered 0..n-1,
# with the two non-pixel columns removed
train_img = (
    pd.read_csv('images_train.csv', index_col = 0)
    .reset_index(drop = True)
    .drop(columns = ['label_group', 'image'])
)
train_img.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12278,12279,12280,12281,12282,12283,12284,12285,12286,12287
0,143,73,83,150,85,95,153,88,98,158,...,142,194,116,134,191,111,129,193,122,143
1,255,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255
2,206,201,195,205,200,194,206,201,195,208,...,245,242,243,245,244,245,247,244,245,247
3,254,254,254,254,254,254,254,254,254,254,...,248,251,250,249,251,251,249,252,251,250
4,255,255,255,255,255,255,255,255,255,255,...,141,22,72,141,22,72,141,22,72,141


In [None]:
# validation: flattened pixel table, rows renumbered 0..n-1,
# with the two non-pixel columns removed
validation_img = (
    pd.read_csv('images_validation.csv', index_col = 0)
    .reset_index(drop = True)
    .drop(columns = ['label_group', 'image'])
)
validation_img.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12278,12279,12280,12281,12282,12283,12284,12285,12286,12287
0,255,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255
1,255,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255
2,12,14,13,12,14,13,28,30,29,35,...,18,9,9,9,9,9,9,11,11,11
3,255,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255
4,249,247,246,251,249,248,253,251,250,252,...,173,184,179,173,184,180,174,178,173,169


In [None]:
# if using the 500-observation sample, run this instead of the previous 2 cells
'''# images are (64, 64, 3)
train_img = pd.read_csv('images_sample.csv', index_col = 0)
train_img.set_index(pd.RangeIndex(start = 0, stop = len(train_img)), inplace = True)
train_img.drop(['label_group', 'image'], axis = 1, inplace = True)
train_img.head()'''

"# images are (64, 64, 3)\ntrain_img = pd.read_csv('data/images_sample.csv', index_col = 0)\ntrain_img.set_index(pd.RangeIndex(start = 0, stop = len(train_img)), inplace = True)\ntrain_img.drop(['label_group', 'image'], axis = 1, inplace = True)\ntrain_img.head()"

# Image processing

## Test Images
Images (.jpg) in the test_images folder for prediction

In [None]:
# default directory that holds the .jpg files of the test postings
path = 'kaggle/test_images/'

def load_image(image_name, size, image_dir = None):
    """Load one image file as a normalized float32 batch holding a single RGB image.

    Parameters
    ----------
    image_name : str
        File name of the .jpg image.
    size : tuple of int
        Target size in the format (width, height), i.e. (64, 64).
    image_dir : str, optional
        Directory that contains the image. Defaults to the module-level `path`.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (1, height, width, 3) with values scaled to [0, 1].
    """
    if image_dir is None:
        image_dir = path

    # the context manager closes the underlying file once the pixels are read
    with Image.open(os.path.join(image_dir, image_name)) as img_file:
        # force exactly 3 channels: grayscale, palette, RGBA or CMYK files
        # would otherwise not fit the (..., 3) layout expected downstream
        img = img_file.convert('RGB').resize(size)

        # convert the image to a numpy array of dtype float32
        img_array = np.asarray(img, dtype = np.float32)

    # PIL arrays are laid out (height, width, channels); prepend the batch axis
    # rather than reshaping with (width, height), which would scramble the
    # pixels of non-square images
    img_array = img_array[np.newaxis, ...]

    # normalize the pixel values from [0, 255] to [0, 1]
    return img_array / 255.0

In [None]:
# run load_image on every test image, then stack the single-image
# batches into one array
test_image_batches = [load_image(image_name = name, size = (64, 64))
                      for name in test['image']]
X_img_test = np.vstack(test_image_batches)
X_img_test.shape

(3, 64, 64, 3)

## Train/Validate images
Images that were already resized to (64, 64, 3) and stored as flattened rows of a dataframe

In [None]:
# images for training
# flattened pixel table -> float32 array -> (n, 64, 64, 3) images scaled to [0, 1]
train_pixels = np.asarray(train_img, dtype = np.float32)
X_img_train = train_pixels.reshape(train_pixels.shape[0], 64, 64, 3) / 255.0

X_img_train.shape

(29603, 64, 64, 3)

In [None]:
# images for validation
# flattened pixel table -> float32 array -> (n, 64, 64, 3) images scaled to [0, 1]
validation_pixels = np.asarray(validation_img, dtype = np.float32)
X_img_val = validation_pixels.reshape(validation_pixels.shape[0], 64, 64, 3) / 255.0

X_img_val.shape

(4647, 64, 64, 3)

In [None]:
# if using the 500-observation sample, run this instead of the previous 2 cells
'''X_img = np.asarray(train_img, dtype = np.float32)

# reshape
X_img = X_img.reshape(X_img.shape[0], 64, 64, 3)

# normalize
X_img /= 255.0

X_img.shape'''

'X_img = np.asarray(train_img, dtype = np.float32)\n\n# reshape\nX_img = X_img.reshape(X_img.shape[0], 64, 64, 3)\n\n# normalize\nX_img /= 255.0\n\nX_img.shape'

# Label group processing

In [None]:
# label pre-processing: convert to group then perform onehot encoding

# Convert labels to integers using LabelEncoder.
# The encoder is fitted on the train labels ONLY; the validation labels are
# mapped with the same fitted encoder so that a given label_group gets the same
# integer in both sets (re-fitting on validation would renumber the classes and
# make le.inverse_transform decode predictions with the wrong mapping).
le = LabelEncoder()
train_labels_int = le.fit_transform(train['label_group']) # train
validation_labels_int = le.transform(validation['label_group']) # validation

# One-hot encode the labels using to_categorical.
# num_classes is pinned to the number of train classes so y_train and y_val
# have the same width as the model's output layer.
num_classes = len(le.classes_)
y_train = to_categorical(train_labels_int, num_classes = num_classes) # train
y_val = to_categorical(validation_labels_int, num_classes = num_classes) # validation

# Print the shape of the one-hot encoded labels
print(y_train.shape)
print(y_val.shape)

(29603, 11014)
(4647, 3429)


In [None]:
# if using the 500-observation sample, run this instead of the previous cell
'''# label pre-processing: convert to group then perform onehot encoding

# convert labels to integers using LabelEncoder
le = LabelEncoder()
labels_int = le.fit_transform(train['label_group'])

# one-hot encode the labels using to_categorical
y = to_categorical(labels_int)

# Print the shape of the one-hot encoded labels
print(y.shape)'''

"# label pre-processing: convert to group then perform onehot encoding\n\n# convert labels to integers using LabelEncoder\nle = LabelEncoder()\nlabels_int = le.fit_transform(train['label_group'])\n\n# one-hot encode the labels using to_categorical\ny = to_categorical(labels_int)\n\n# Print the shape of the one-hot encoded labels\nprint(y.shape)"

# Title processing

## Pre-processing

In [None]:
# function to clean the translated titles before embedding

# cache so the NLTK stopword list is read once instead of once per word
_STOPWORDS_CACHE = {}

def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']

def clean_title(title, stop_words = None):
    """Normalize a product title for embedding.

    Steps: drop every character that is not a letter, digit or whitespace,
    lowercase, split into words, remove stopwords and words of 2 characters
    or fewer, and join the remaining words with single spaces.

    Parameters
    ----------
    title : str
        Raw (translated) title. None and NaN are treated as an empty title.
    stop_words : collection of str, optional
        Words to remove. Defaults to NLTK's English stopword list.

    Returns
    -------
    str
        The cleaned title ('' when nothing is left).
    """
    # missing titles (None / NaN from pandas) clean to an empty string
    if title is None or (isinstance(title, float) and title != title):
        return ''

    if stop_words is None:
        stop_words = _english_stopwords()

    # Remove all non-alphanumeric characters and convert to lowercase
    normalized = re.sub(r'[^a-zA-Z0-9\s]', '', str(title)).lower()
    # Split the cleaned string into words
    words = re.split(r'\W+', normalized)
    # Remove stopwords and short words (this also drops the empty strings
    # produced by leading/trailing whitespace)
    kept = [word for word in words if len(word) > 2 and word not in stop_words]
    # Join the cleaned words using a space separator
    return ' '.join(kept)

In [None]:
# clean the title in the test dataset
test['title_cleaned'] = test['title_translate'].apply(clean_title)

test[['title_translate', 'title_cleaned']].head(3)

Unnamed: 0,title_translate,title_cleaned
0,Edufuntoys - CHARACTER PHONE has lights and music/ toy phone,edufuntoys character phone lights music toy phone
1,(Buy 1 Free Spatula) Blackhead Mask | Blackheads Mask 10gr by Flawless Go Surabaya | Flawless. Go,buy free spatula blackhead mask blackheads mask 10gr flawless surabaya flawless
2,READY Lemonilo Healthy instant noodle soup and fried,ready lemonilo healthy instant noodle soup fried


In [None]:
# clean the title in the train dataset
train['title_cleaned'] = train['title_translate'].apply(clean_title)

train[['title_translate', 'title_cleaned']].head(3)

Unnamed: 0,title_translate,title_cleaned
0,Victoria's Secret Paper Bag,victorias secret paper bag
1,Double Tape 3M VHB 12 mm x 4.5 m ORIGINAL / DOUBLE FOAM TAPE,double tape vhb original double foam tape
2,Maling TTS Canned Pork Luncheon Meat 397 gr,maling tts canned pork luncheon meat 397


In [None]:
# if using the 500-observation sample, no need to run this cell
# as there will be train/validation split later
# clean the title in the validation dataset
validation['title_cleaned'] = validation['title_translate'].apply(clean_title)

validation[['title_translate', 'title_cleaned']].head(3)

Unnamed: 0,title_translate,title_cleaned
0,Mustika Ratu Oil Cem-Ceman 175 ml,mustika ratu oil cemceman 175
1,Snobby Baby Mosquito Net Snobby 1 Pole KBX 1201,snobby baby mosquito net snobby pole kbx 1201
2,mini stoppers,mini stoppers


## Bert embedding

In [None]:
# define the Bert tokenizer
Btokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# define the Bert model
Bmodel = TFBertModel.from_pretrained("bert-base-uncased")

Downloading tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# BERT token ids for each cleaned test title, padded/truncated to 128 ids
test_encode_options = dict(add_special_tokens = True,
                           truncation = True,
                           padding = 'max_length',
                           max_length = 128)
X_title_test = test['title_cleaned'].apply(
    lambda title: Btokenizer.encode(title, **test_encode_options))

In [None]:
# BERT token ids for each cleaned train title, padded/truncated to 128 ids
train_encode_options = dict(add_special_tokens = True,
                            truncation = True,
                            padding = 'max_length',
                            max_length = 128)
X_title_train = train['title_cleaned'].apply(
    lambda title: Btokenizer.encode(title, **train_encode_options))

In [None]:
# tokenize the cleaned title in the validation dataset
# (must read from `validation`, not `train`, so the rows line up one-to-one
# with the validation images and validation labels)
X_title_val = validation['title_cleaned'].apply(lambda x: Btokenizer.encode(x, 
                                                                   padding = 'max_length',
                                                                   truncation = True, 
                                                                   max_length = 128, 
                                                                   add_special_tokens = True))

In [None]:
# if using the 500-observation sample, run this instead of the previous 2 cells
'''# tokenize the cleaned title in the train dataset
X_title = train['title_cleaned'].apply(lambda x: Btokenizer.encode(x, 
                                                                   padding = 'max_length',
                                                                   truncation = True, 
                                                                   max_length = 128, 
                                                                   add_special_tokens = True))'''

"# tokenize the cleaned title in the train dataset\nX_title = train['title_cleaned'].apply(lambda x: Btokenizer.encode(x, \n                                                                   padding = 'max_length',\n                                                                   truncation = True, \n                                                                   max_length = 128, \n                                                                   add_special_tokens = True))"

# Train/Validation split

In [None]:
# only run this cell if using the 500-observation sample
# split the data into training and validation sets
'''X_img_train, X_img_val, X_title_train, X_title_val, y_train, y_val = train_test_split(X_img, X_title, 
                                                                                      y, 
                                                                                      test_size = 0.2, 
                                                                                      random_state = 42)'''

'X_img_train, X_img_val, X_title_train, X_title_val, y_train, y_val = train_test_split(X_img, X_title, \n                                                                                      y, \n                                                                                      test_size = 0.2, \n                                                                                      random_state = 42)'

In [None]:
# bring every title token sequence to the same length
max_len = 128  # maximum sequence length

# identical padding settings for all three splits: pad/truncate at the end with 0
pad_options = dict(maxlen = max_len, dtype = "long", value = 0,
                   truncating = "post", padding = "post")

X_title_train = pad_sequences(X_title_train, **pad_options)
X_title_val = pad_sequences(X_title_val, **pad_options)
X_title_test = pad_sequences(X_title_test, **pad_options)

In [None]:
# convert tokenized title into tensor object
X_title_train = tf.constant(X_title_train, dtype = tf.int32)
X_title_val = tf.constant(X_title_val, dtype = tf.int32)
X_title_test = tf.constant(X_title_test, dtype = tf.int32)

In [None]:
X_img_train.shape

(29603, 64, 64, 3)

In [None]:
X_img_val.shape

(4647, 64, 64, 3)

In [None]:
X_title_train.shape

TensorShape([29603, 128])

In [None]:
X_title_val.shape

TensorShape([29603, 128])

In [None]:
X_title_test.shape

TensorShape([3, 128])

In [None]:
y_train.shape

(29603, 11014)

In [None]:
y_val.shape

(4647, 3429)

# Modeling

In [None]:
# input shapes: 64x64 RGB images and token-id sequences of length max_len
img_shape = (64, 64, 3)
title_shape = (max_len,)

# named input layers; the names are the keys used when feeding data as a dict
img_input = Input(shape = img_shape, dtype = tf.float32, name = 'img_input')
title_input = Input(shape = title_shape, dtype = tf.int32, name = 'title_input')

In [None]:
# image branch: three conv layers with pooling and dropout, then a dense embedding
cnn_stack = [
    Conv2D(32, (3, 3), activation = 'relu'),
    MaxPooling2D((2, 2)),
    Dropout(0.25),
    Conv2D(64, (3, 3), activation = 'relu'),
    MaxPooling2D((2, 2)),
    Dropout(0.25),
    Conv2D(64, (3, 3), activation = 'relu'),
    Flatten(),
    Dropout(0.5),
    Dense(64, activation = 'relu'),
]

# apply the layers in order; x ends up as the 64-unit image embedding
x = img_input
for cnn_layer in cnn_stack:
    x = cnn_layer(x)

In [None]:
# title branch: run the token ids through the BERT model
# NOTE(review): only the token ids are passed (no attention mask), so the
# padding positions are presumably treated like real tokens - confirm
bert_outputs = Bmodel(title_input)

# keep only the second output (the pooled output)
title_output = bert_outputs[1]

# flatten
flatten_layer = Flatten()(title_output)

In [None]:
# concatenate the title and image embeddings
combined = Concatenate()([x, flatten_layer])

In [None]:
y_train.shape[1]

11014

In [None]:
# define the output layer
output = Dense(y_train.shape[1], 
               activation = 'softmax', 
               name = 'output')(combined)

In [None]:
# define the model of two inputs (title and images) and one output (label group)
BertCNNModel = Model(inputs = [img_input, title_input], outputs = output)
BertCNNModel.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 img_input (InputLayer)         [(None, 64, 64, 3)]  0           []                               
                                                                                                  
 conv2d (Conv2D)                (None, 62, 62, 32)   896         ['img_input[0][0]']              
                                                                                                  
 max_pooling2d (MaxPooling2D)   (None, 31, 31, 32)   0           ['conv2d[0][0]']                 
                                                                                                  
 dropout_37 (Dropout)           (None, 31, 31, 32)   0           ['max_pooling2d[0][0]']          
                                                                                              

In [None]:
# compile the model
BertCNNModel.compile(optimizer = 'adam', 
                     loss = 'categorical_crossentropy', 
                     metrics = ['accuracy'])

## Training

In [None]:
# training inputs, keyed by the names of the model's input layers
train_inputs = {'img_input': X_img_train,
                'title_input': X_title_train}

# train for 10 epochs with mini-batches of 32
BertCNNModel.fit(train_inputs, y_train, epochs = 10, batch_size = 32)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7ef2f135b0>

In [None]:
# save trained model
BertCNNModel.save('BertCNNModel.h5')

In [None]:
# to load the saved model
# NOTE(review): `keras` is never imported by name in this notebook, so the call
# below raises NameError as written; use `tf.keras.models.load_model` (or add
# `import keras`). The model embeds a TFBertModel layer, so loading presumably
# also needs custom_objects = {'TFBertModel': TFBertModel} - confirm.
# BertCNNModel = keras.models.load_model('BertCNNModel.h5')

## Evaluation

In [None]:
# validation inputs, keyed by the names of the model's input layers
val_inputs = {'img_input': X_img_val,
              'title_input': X_title_val}

# score is [loss, accuracy], following the metrics given to compile()
score = BertCNNModel.evaluate(val_inputs, y_val, verbose = 0)
val_accuracy = score[1]

# print the validation accuracy
print('Validation accuracy:', val_accuracy)

# Prediction

In [None]:
predictions = BertCNNModel.predict({'img_input': X_img_test, 
                                    'title_input': X_title_test})

In [None]:
# turn the predicted probabilities back into label groups:
# index of the most probable class for each posting ...
y_pred = predictions.argmax(axis = 1)
# ... mapped back to the original label_group values by the label encoder
test['label_group_pred'] = le.inverse_transform(y_pred)

In [None]:
# max predicted probability for each posting
test['conf_level'] = np.max(predictions, axis = 1)

In [None]:
test.head()