# Set up

In [None]:
#import library

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Conv2D, MaxPooling2D, LSTM, Embedding, concatenate
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from PIL import Image

# Load data

## Load the datasets with translated titles

### Train text data

In [None]:
# Training split: read only the identifiers, the label and the translated title
train = pd.read_csv(
    'final_train.csv',
    usecols=['posting_id', 'image', 'image_phash', 'label_group', 'title_translate'],
)

In [None]:
# Preview the first two training rows to confirm the selected columns loaded
train.head(2)

Unnamed: 0,posting_id,image,image_phash,label_group,title_translate
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,249114794,Victoria's Secret Paper Bag
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,2937985045,Double Tape 3M VHB 12 mm x 4.5 m ORIGINAL / DO...


### Validation text data

In [None]:
# Validation split: same column selection as the training file
validation = pd.read_csv(
    'final_validation.csv',
    usecols=['posting_id', 'image', 'image_phash', 'label_group', 'title_translate'],
)

In [None]:
# Preview the first two validation rows
validation.head(2)

Unnamed: 0,posting_id,image,image_phash,label_group,title_translate
0,train_1003554842,560a5c3577fb22be2ac82c0e97558158.jpg,f3c78fce8c3050f0,3044373336,Mustika Ratu Oil Cem-Ceman 175 ml
1,train_523363809,dd1f14c7a734ff28b67062ae4f8529c6.jpg,af919a66c49d688b,873493898,Snobby Baby Mosquito Net Snobby 1 Pole KBX 1201


### Test text data

In [None]:
# Test split: loaded as-is; there is no label_group column to read here
test = pd.read_csv(
    'test_translated.csv',
    usecols=['posting_id', 'image', 'image_phash', 'title_translate'],
)

In [None]:
# Display the test frame (the recorded output shows three rows)
test

Unnamed: 0,posting_id,image,image_phash,title_translate
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE has lights and mu...
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Buy 1 Free Spatula) Blackhead Mask | Blackhea...
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Healthy instant noodle soup and...


## Load the images into dataframes as pixel arrays

### Load full training image data

In [None]:
# load the pre-extracted training image pixels: one row per image, with the
# flattened pixel values stored in numbered columns (see the preview below)
# na_filter=False turns off missing-value detection while parsing
img_df = pd.read_csv("images_train.csv", delimiter=",", engine='c', na_filter=False)

In [None]:
# Preview two rows to check the metadata columns and the pixel column range
img_df.head(2)

Unnamed: 0.1,Unnamed: 0,label_group,image,0,1,2,3,4,5,6,...,12278,12279,12280,12281,12282,12283,12284,12285,12286,12287
0,0,249114794,0000a68812bc7e98c42888dfb1c07da0.jpg,143,73,83,150,85,95,153,...,142,194,116,134,191,111,129,193,122,143
1,1,2937985045,00039780dfc94d01db8676fe789ecd05.jpg,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255


In [None]:
# Keep only the pixel columns by skipping the first three columns
# (presumably the saved row index, label_group and image file name — confirm against the preview)
X = img_df.iloc[:,3:]

In [None]:
# Convert the pixel columns to a 2-D NumPy array (one flattened image per row)
X_ar = X.to_numpy()

In [None]:
# Reshape the flat pixel rows into (n_samples, 64, 64, 3) image tensors in a
# single vectorized step instead of a per-row Python loop.
n_samples = X_ar.shape[0]
# float32 is sufficient for pixel data and halves the memory of the float64
# buffer that np.zeros would allocate by default.
X_image_train = X_ar.reshape(n_samples, 64, 64, 3).astype(np.float32)
# Scale the 0-255 pixel values to [0, 1]: the test images in this notebook are
# divided by 255 before prediction, so the training inputs must use the same scale.
X_image_train /= 255.0

X_image_train.shape

(29603, 64, 64, 3)

### Load validation image data

In [None]:
# load the pre-extracted validation image pixels: one row per image, with the
# flattened pixel values stored in numbered columns (see the preview below)
# na_filter=False turns off missing-value detection while parsing
img_df_val = pd.read_csv("images_validation.csv", delimiter=",", engine='c', na_filter=False)

In [None]:
# Preview two validation rows to check the metadata and pixel columns
img_df_val.head(2)

Unnamed: 0.1,Unnamed: 0,label_group,image,0,1,2,3,4,5,6,...,12278,12279,12280,12281,12282,12283,12284,12285,12286,12287
0,11443,3044373336,560a5c3577fb22be2ac82c0e97558158.jpg,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255
1,29516,873493898,dd1f14c7a734ff28b67062ae4f8529c6.jpg,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255


In [None]:
# Keep only the pixel columns by skipping the first three columns
# (presumably the saved row index, label_group and image file name — confirm against the preview)
X_val = img_df_val.iloc[:,3:]

In [None]:
# Convert the validation pixel columns to a 2-D NumPy array (one flattened image per row)
X_ar_val = X_val.to_numpy()

In [None]:
# Reshape the flat pixel rows into (n_samples_val, 64, 64, 3) image tensors in a
# single vectorized step instead of a per-row Python loop.
n_samples_val = X_ar_val.shape[0]
# float32 is sufficient for pixel data and halves the memory of the float64
# buffer that np.zeros would allocate by default.
X_image_val = X_ar_val.reshape(n_samples_val, 64, 64, 3).astype(np.float32)
# Scale the 0-255 pixel values to [0, 1]: the test images in this notebook are
# divided by 255 before prediction, so the validation inputs must use the same scale.
X_image_val /= 255.0

X_image_val.shape

(4647, 64, 64, 3)

In [None]:
# Sanity check: each flattened row should hold 64 * 64 * 3 = 12288 values
X_ar_val.shape

(4647, 12288)

### Load test image data

In [None]:
# Directory holding the test images; file names from the dataframe are appended to it
path = 'kaggle/test_images/'

def load_image(image_path, size=(64, 64)):
    """Load one image from ``path`` as a fixed-size RGB pixel array.

    Parameters
    ----------
    image_path : str
        File name relative to the module-level ``path`` directory.
    size : tuple of int, optional
        Target (width, height) passed to ``Image.resize``; defaults to (64, 64).

    Returns
    -------
    numpy.ndarray
        Array of shape (height, width, 3) holding the resized pixel values.
    """
    # The context manager closes the underlying file once the pixels are read.
    with Image.open(path + image_path) as img:
        # Force three channels: grayscale, palette, RGBA or CMYK files would
        # otherwise yield arrays of a different shape and break later stacking.
        resized = img.convert('RGB').resize(size)
    return np.array(resized)

In [None]:
# apply the load_image function to the image file names in the test dataframe
# (each cell of the new column holds one image array)
test['image_array'] = test['image'].apply(lambda x: load_image(x))

In [None]:
# Shape of the column itself (number of rows), not of the images inside it
test['image_array'].shape

(3,)

Normalize the test images by scaling pixel values from 0–255 to the [0, 1] range.

In [None]:
# Divide every image array in the column by 255 so pixel values fall in [0, 1]
test['normalized_image'] = test['image_array'] / 255.0

In [None]:
# Shape of the column itself (number of rows), not of the images inside it
test['normalized_image'].shape

(3,)

In [None]:
# Stack the per-row image arrays into one array; this requires every image to have the same shape
X_image_test = np.array(test['normalized_image'].tolist())

In [None]:
# Confirm the stacked test images form an (n_images, 64, 64, 3) array
X_image_test.shape

(3, 64, 64, 3)

# Title processing

In [None]:
# Inspect the raw translated titles before cleaning
train['title_translate']

0                              Victoria's Secret Paper Bag
1        Double Tape 3M VHB 12 mm x 4.5 m ORIGINAL / DO...
2              Maling TTS Canned Pork Luncheon Meat 397 gr
3        Short sleeve Batik negligee - Random / Mixed P...
4                        Nescafe \xc3\x89clair Latte 220ml
                               ...                        
29598    Battery Battery Xiaomi Redmi Note 3 BM46 BM-46...
29599    Washable 75 gsm Non-Woven Spunbond Fabric Mask...
29600    KHANZAACC Robot RE101S 1.2mm Subwoofer Bass Me...
29601    Broth NON MSG HALAL Mama Kamu Free-range Chick...
29602    LEAK COATING FLEX TAPE / MAGIC ISOLATION / LEA...
Name: title_translate, Length: 29603, dtype: object

In [None]:
import re, string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Stopword sets are loaded once and reused, so the corpus is not re-read for every word
_STOPWORDS_CACHE = {}

def clean_title(title, stop_words=None):
    """Clean a translated title before tokenization.

    Strips every character that is not an ASCII letter, digit or whitespace,
    lowercases the text, then drops stopwords and words of two characters or fewer.

    Parameters
    ----------
    title : str
        Raw title text.
    stop_words : collection of str, optional
        Words to remove. Defaults to the NLTK English stopword list, which is
        loaded on first use and cached.

    Returns
    -------
    str
        The remaining words joined by single spaces (may be empty).
    """
    if stop_words is None:
        if 'english' not in _STOPWORDS_CACHE:
            # frozenset gives O(1) membership tests instead of scanning a list per word
            _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORDS_CACHE['english']
    # Remove all non-alphanumeric characters and convert to lowercase
    lowered = re.sub(r'[^a-zA-Z0-9\s]', '', title).lower()
    # Split the cleaned string into words (raw string avoids the invalid '\W' escape)
    words = re.split(r'\W+', lowered)
    # Remove stopwords and short words; this also discards the empty strings
    # re.split yields for leading/trailing whitespace
    kept = [word for word in words if word not in stop_words and len(word) > 2]
    # Join the cleaned words using a space separator
    return ' '.join(kept)

## Title in train data

In [None]:
# Clean every training title; the result is a plain Python list of strings
texts = [clean_title(raw_title) for raw_title in train['title_translate'].values.tolist()]

In [None]:
# Check whether any title became empty after cleaning:
# build a one-column frame of cleaned-title lengths and show the zero-length rows
string_lengths = pd.DataFrame([len(cleaned) for cleaned in texts])
is_blank = string_lengths.iloc[:, 0] == 0
string_lengths[is_blank]

Unnamed: 0,0


In [None]:
# Store the cleaned titles next to the originals
train['title_clean'] = texts

In [None]:
# Compare raw and cleaned titles side by side on the first 30 rows
train.head(30)

Unnamed: 0,posting_id,image,image_phash,label_group,title_translate,title_clean
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,249114794,Victoria's Secret Paper Bag,victorias secret paper bag
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,2937985045,Double Tape 3M VHB 12 mm x 4.5 m ORIGINAL / DO...,double tape vhb original double foam tape
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,2395904891,Maling TTS Canned Pork Luncheon Meat 397 gr,maling tts canned pork luncheon meat 397
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,4093212188,Short sleeve Batik negligee - Random / Mixed P...,short sleeve batik negligee random mixed patte...
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,3648931069,Nescafe \xc3\x89clair Latte 220ml,nescafe xc3x89clair latte 220ml
5,train_2464356923,0013e7355ffc5ff8fb1ccad3e42d92fe.jpg,bbd097a7870f4a50,2660605217,WOMEN'S PANTS (BB 45-84 KG) Harem women (cod a...,womens pants 4584 harem women cod available
6,train_1802986387,00144a49c56599d45354a1c28104c039.jpg,f815c9bb833ab4c8,1835033137,Children's robe size 1-12 yrs,childrens robe size 112 yrs
7,train_1806152124,0014f61389cbaa687a58e38a97b6383d.jpg,eea7e1c0c04da33d,1565741687,SALUR PLISKET CULOT / CANDY PLISKET / WISH KUL...,salur plisket culot candy plisket wish kulot p...
8,train_86570404,0019a3c6755a194cb2e2c12bfc63972e.jpg,ea9af4f483249972,2359912463,"[LOGU] Magnetic number fridge sticker, magneti...",logu magnetic number fridge sticker magnetic n...
9,train_831680791,001be52b2beec40ddc1d2d7fc7a68f08.jpg,e1ce953d1a70618f,2630990665,BIG SALE SHOES COOL LEATHER LOOSE TO WORK OFFI...,big sale shoes cool leather loose work office ...


## Title in validation data

In [None]:
# Clean every validation title; the result is a plain Python list of strings
texts_val = [clean_title(raw_title) for raw_title in validation['title_translate'].values.tolist()]

In [None]:
# Store the cleaned validation titles and compare them with the originals
validation['title_clean'] = texts_val
validation.head(30)

Unnamed: 0,posting_id,image,image_phash,label_group,title_translate,title_clean
0,train_1003554842,560a5c3577fb22be2ac82c0e97558158.jpg,f3c78fce8c3050f0,3044373336,Mustika Ratu Oil Cem-Ceman 175 ml,mustika ratu oil cemceman 175
1,train_523363809,dd1f14c7a734ff28b67062ae4f8529c6.jpg,af919a66c49d688b,873493898,Snobby Baby Mosquito Net Snobby 1 Pole KBX 1201,snobby baby mosquito net snobby pole kbx 1201
2,train_1036373061,34b4aa697f4606fcf52ec74f53c9f246.jpg,dc132bece40552db,578575602,mini stoppers,mini stoppers
3,train_3107279377,8255718baadd70981be5f49de9ff270b.jpg,e691986c9586cdec,1376270839,F916/304 Jelly Slides Wedges Shoes,f916304 jelly slides wedges shoes
4,train_2073151758,4e8d8a4d98f98dd41be729476d498701.jpg,f7b9c25c64433926,994676122,\xe3\x80\x90CELEB\xe3\x80\x91100 Pcs Korean St...,xe3x80x90celebxe3x80x91100 pcs korean style el...
5,train_953854502,c941fd0e10d0a58b39825e32edf0ef9d.jpg,b3111172e5bdfa30,2299022995,U disk pen digital audio recording USB flash d...,disk pen digital audio recording usb flash drive
6,train_329988377,bc957fd54730829bd15d75f39eccce5b.jpg,836164e79938f8f1,2121822798,Something Niacinamide + Moisture Beet Serum,something niacinamide moisture beet serum
7,train_2310491569,c2e3ca6729aa9975b351b52fa5e6d477.jpg,ec88c6b793d0ac66,2803363809,Cosmetic Bag,cosmetic bag
8,train_4223409596,f2486add3fa87aa40b652af9bd925e90.jpg,bb70859fe0ce4847,1256204969,3 Pairs of MACHU PICHU Junior Velvet Suits - S...,pairs machu pichu junior velvet suits sml trou...
9,train_38697839,a910d82d906ce755c1bcffcc22e5dd9e.jpg,819eff698642c13d,3576030179,BOLA VOLLY / VOLLY / VOLLY/ VOLLEYBALL MIKASA ...,bola volly volly volly volleyball mikasa 210


## Title in test data

In [None]:
# Clean every test title; the result is a plain Python list of strings
texts_test = [clean_title(raw_title) for raw_title in test['title_translate'].values.tolist()]

In [None]:
# Store the cleaned test titles and compare them with the originals
test['title_clean'] = texts_test
test.head(30)

Unnamed: 0,posting_id,image,image_phash,title_translate,image_array,normalized_image,title_clean
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE has lights and mu...,"[[[255, 255, 255], [255, 255, 255], [255, 255,...","[[[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0,...",edufuntoys character phone lights music toy phone
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Buy 1 Free Spatula) Blackhead Mask | Blackhea...,"[[[255, 245, 244], [255, 245, 244], [255, 245,...","[[[1.0, 0.9607843137254902, 0.9568627450980393...",buy free spatula blackhead mask blackheads mas...
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Healthy instant noodle soup and...,"[[[254, 254, 254], [254, 254, 254], [254, 254,...","[[[0.996078431372549, 0.996078431372549, 0.996...",ready lemonilo healthy instant noodle soup fried


# Embedding / Modeling

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m96.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [None]:
from transformers import GPT2Tokenizer, TFGPT2Model

In [None]:
import tensorflow as tf

In [None]:
# Load the tokenizer and GPT-2 model
# Both come from the pretrained 'gpt2' checkpoint (downloaded on first use, see the log below)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2 = TFGPT2Model.from_pretrained('gpt2')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/498M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


## Title preprocessing

In [None]:
# Encode each cleaned training title into a variable-length list of GPT-2 token ids
X_text_train = [tokenizer.encode(text, add_special_tokens = True) for text in train['title_clean']]

In [None]:
# Encode each cleaned validation title into a variable-length list of GPT-2 token ids
X_text_val = [tokenizer.encode(text, add_special_tokens = True) for text in validation['title_clean']]

In [None]:
# Encode each cleaned test title into a variable-length list of GPT-2 token ids
X_text_test = [tokenizer.encode(text, add_special_tokens = True) for text in test['title_clean']]

In [None]:
# Bring every token sequence to the same length and convert it to an int32 tensor
max_len = 128  # Fixed sequence length: longer titles are cut, shorter ones are padded


def _pad_to_tensor(sequences, length):
    """Pad/truncate token id lists at the end to `length` and return a tf.int32 constant."""
    # NOTE(review): padding uses id 0 and no attention mask is built; id 0 is
    # presumably a real GPT-2 vocabulary token rather than a dedicated pad id — confirm.
    padded = pad_sequences(
        sequences,
        maxlen = length,
        dtype = "long",
        value = 0,
        truncating = "post",
        padding = "post",
    )
    return tf.constant(padded, dtype = tf.int32)


X_text_train = _pad_to_tensor(X_text_train, max_len)
X_text_val = _pad_to_tensor(X_text_val, max_len)
X_text_test = _pad_to_tensor(X_text_test, max_len)

# Report the resulting (n_samples, max_len) shapes
for padded_split in (X_text_train, X_text_val, X_text_test):
    print(padded_split.shape)

(29603, 128)
(4647, 128)
(3, 128)


## Label preprocessing

In [None]:
# Number of distinct label groups in the training data; used as the size of the output layer
class_num = train["label_group"].nunique()
class_num

11014

In [None]:
# label pre-processing: map label_group ids to class indices, then one-hot encode

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Fit the encoder on the training labels only; it defines the class index
# used by the model's output layer and by inverse_transform at prediction time.
le = LabelEncoder()
labels_int = le.fit_transform(train['label_group'])
# Use transform (not fit_transform) for validation: refitting would renumber the
# classes from the validation labels alone, so the same label_group would get a
# different index than in training and the encoder would no longer decode
# training-space predictions. transform raises ValueError on labels that were
# never seen in training instead of silently mis-mapping them.
labels_int_val = le.transform(validation['label_group'])

# One-hot encode both splits with the same width, derived from the fitted encoder
n_classes = len(le.classes_)
y_train = to_categorical(labels_int, num_classes = n_classes)
y_val = to_categorical(labels_int_val, num_classes = n_classes)

# Print the shape of the one-hot encoded labels
print(y_train.shape)
print(y_val.shape)

(29603, 11014)
(4647, 11014)


## Define the model

In [None]:
from tensorflow.keras import layers

In [None]:
# Define the neural network model
# Use a convolutional neural network (CNN) to process the image data.
# Use GPT-2 to process the text data.
# Combine the outputs of the two networks using a concatenation layer.

# define the image input: 64x64 images with 3 channels
image_input = layers.Input(shape=(64, 64, 3), name='image_input')

# define the convolutional layers for the image input:
# two Conv2D -> MaxPooling2D -> Dropout stages (32 then 64 filters)
x = layers.Conv2D(32, (3, 3), activation='relu')(image_input)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Dropout(0.25)(x)
x = layers.Conv2D(64, (3, 3), activation='relu')(x)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Dropout(0.25)(x)  
# flatten the feature maps and reduce them to a 64-dimensional image feature vector
x = layers.Flatten()(x)
x = layers.Dropout(0.25)(x)  
x = layers.Dense(64, activation='relu')(x)

In [None]:
# Define the text input: one row of max_len int32 token ids per sample
text_input = layers.Input(shape=(max_len,), dtype=tf.int32, name='text_input')

In [None]:
# Get the output of the GPT-2 model using the input layer
# [0] keeps the first element of the model output — presumably the per-token
# hidden states; confirm against the transformers documentation
gpt2_output = gpt2(text_input)[0]

In [None]:
# Flatten the GPT-2 output into a single feature vector per sample
flatten_layer = Flatten()(gpt2_output)

In [None]:
# Concatenate the image and text features
# (the 64-dimensional CNN features followed by the flattened GPT-2 features)
concatenated = layers.concatenate([x, flatten_layer])

In [None]:
# Define the output layer: one softmax probability per label group (class_num classes)
output_layer = layers.Dense(class_num, activation='softmax', name='output')(concatenated)

In [None]:
# Define the model with the two inputs and one output
# Input order is [image, text]; list-style inputs passed later must follow it
model = Model(inputs=[image_input, text_input], outputs=output_layer)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 image_input (InputLayer)       [(None, 64, 64, 3)]  0           []                               
                                                                                                  
 conv2d (Conv2D)                (None, 62, 62, 32)   896         ['image_input[0][0]']            
                                                                                                  
 max_pooling2d (MaxPooling2D)   (None, 31, 31, 32)   0           ['conv2d[0][0]']                 
                                                                                                  
 dropout_37 (Dropout)           (None, 31, 31, 32)   0           ['max_pooling2d[0][0]']          
                                                                                              

In [None]:
# Compile the model
# categorical_crossentropy matches the one-hot encoded labels; accuracy is tracked as the metric
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

## Train the model

In [None]:
# Fit on the training split and monitor the validation split after every epoch
history = model.fit(
    x={'image_input': X_image_train, 'text_input': X_text_train},
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=([X_image_val, X_text_val], y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Validate the model

In [None]:
# Evaluate the trained model on the validation split
score = model.evaluate(
    x={'image_input': X_image_val, 'text_input': X_text_val},
    y=y_val,
    verbose=0,
)

# score[0] is the loss; score[1] is the accuracy metric set at compile time
print('Validation accuracy:', score[1])

Validation accuracy: 0.00021519260189961642


## Save the model

In [None]:
# Save the trained model to an HDF5 file in the working directory
# NOTE(review): the model wraps a transformers GPT-2 layer — confirm the saved file can be reloaded
model.save("GPT2_model.h5")

## Prediction on test titles and images

In [None]:
# Predict on test data: returns one row of class probabilities (softmax output) per test sample
prediction = model.predict({"image_input": X_image_test, "text_input": X_text_test})

In [None]:
# Convert the predicted class probabilities back to the original label_group ids.
# argmax picks the most probable class index for each sample
y_pred_int = np.argmax(prediction, axis=1)
# The mapping is only valid if `le` holds the class order the model was trained
# with, i.e. the encoder as fitted on the training labels
pre_label = le.inverse_transform(y_pred_int)
pre_label

In [None]:
# Get the maximum predicted probability for each sample
# (the softmax score of the class chosen by argmax)
confidence_level = np.max(prediction, axis=1)

In [None]:
# Add the predicted labels and confidence levels into the test dataset
# (both arrays are in the same row order as the test frame)
test["predicted label"] = pre_label
test["confidence level"] = confidence_level
test