## Library

In [1]:
import platform
import os
import random
import scipy

import pandas as pd
from sklearn.metrics import f1_score, classification_report
import tensorflow as tf
import tensorflow_addons as tfa
import numpy as np
import sklearn
import gensim
import datetime


In [2]:
# Single seed shared by every random number generator below (and passed to FastText later).
SEED = 42

# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it from inside a running kernel probably only affects child processes — confirm.
os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)  # Python's built-in `random` module
np.random.seed(SEED)  # NumPy's global generator
tf.random.set_seed(SEED)  # TensorFlow's global seed


In [3]:
# Length every word-index sequence is padded/truncated to; also the model input length.
MAX_WORD = 200
# Size of each FastText word vector and of the Keras embedding layer's output.
EMBEDDING_DIMENSION = 30


In [4]:
# Record the interpreter and library versions this notebook was run with.
print('Python version:', platform.python_version())
print('Tensorflow Version:', tf.__version__)
print('Tensorflow Addons Version:', tfa.__version__)
print('Pandas Version:', pd.__version__)
print('Numpy Version:', np.__version__)
# The trailing `=` in the f-string prints the expression text together with its value.
print(f'{gensim.__version__=}')


Python version: 3.8.3
Tensorflow Version: 2.2.0
Tensorflow Addons Version: 0.10.0
Pandas Version: 1.0.3
Numpy Version: 1.18.5
gensim.__version__='3.8.3'


In [5]:
# NOTE(review): presumably reports whether gensim's compiled FastText training
# routines are available — confirm the meaning of the value against the gensim docs.
gensim.models.fasttext.FAST_VERSION


1

## Dataset

In [6]:
def to_list(words):
    '''Parse the text form of a word list, e.g. "['anti', 'club']", into a list of words.

    Parameters
    ----------
    words : str, None or NaN
        Serialised list as stored in the CSV ``words`` column. ``None`` and NaN
        (e.g. an empty CSV cell loaded by pandas) are treated as an empty list
        instead of raising ``AttributeError``.

    Returns
    -------
    list of str
        The words with surrounding double quotes, square brackets, leading spaces
        and single quotes removed; empty entries are dropped.
    '''
    # A missing value is a float NaN (NaN != NaN) or None, neither of which has .strip().
    if words is None or (isinstance(words, float) and words != words):
        return []
    if words == '[]':
        return []

    # Peel off the outer quoting and brackets, then split on the item separator.
    inner = words.strip('"').strip('[').strip(']')
    tokens = (piece.lstrip(' ').strip("'") for piece in inner.split(','))

    return [token for token in tokens if token != '']


In [7]:
# Load the training split and turn the serialised 'words' column into real Python lists.
df_train = (
    pd.read_csv('./_csv_with_clean_text/train.min.csv')
    .assign(words=lambda frame: frame['words'].apply(to_list))
)
df_train


Unnamed: 0,filename,category,words
0,45e2d0c97f7bdf8cbf3594beb6fdcda0.jpg,3,[]
1,f74d1a5fc2498bbbfa045c74e3cc333e.jpg,3,"[anti, club]"
2,f6c172096818c5fab10ecae722840798.jpg,3,"[door, hello]"
3,251ffd610399ac00fea7709c642676ee.jpg,3,[]
4,73c7328b8eda399199fdedec6e4badaf.jpg,3,[]
...,...,...,...
105385,047a60001de0331608ba64092cc7ae2b.jpg,25,[]
105386,ea39ac66ccdc4b4d4c6443f6c54d8ae3.jpg,25,"[dunia, fashion]"
105387,6215f8c52c5bbcfe3e63e0f3ac6265f8.jpg,25,[]
105388,1733d8286f6658149c7b7cdeb40d6461.jpg,25,[]


In [8]:
# Load the test split and turn the serialised 'words' column into real Python lists.
df_test = (
    pd.read_csv('./_csv_with_clean_text/test.min.csv')
    .assign(words=lambda frame: frame['words'].apply(to_list))
)
df_test


Unnamed: 0,filename,category,words
0,fd663cf2b6e1d7b02938c6aaae0a32d2.jpg,43,"[kafe, murah, kiss, meja]"
1,c7fd77508a8c355eaab0d4e10efd6b15.jpg,43,"[come, pusat]"
2,127f3e6d6e3491b2459812353f33a913.jpg,43,[girl]
3,5ca4f2da11eda083064e6c36f37eeb81.jpg,43,"[sniper, helmets, nails]"
4,46d681a542f2c71be017eef6aae23313.jpg,43,[]
...,...,...,...
12181,5ba958eacb23cd7d1673bad4dae55784.jpg,43,[]
12182,efbe41a1c2b666b70e337e438559808b.jpg,43,[]
12183,79fdaa5ac5ba10dbe8004cabd8c35eb3.jpg,43,"[happy, pumping]"
12184,ac3d136124617637a05ba66694e381ef.jpg,43,"[money, back, free, boas, balaga, single, pota..."


In [9]:
# Features are the per-row word lists, labels are the category column; each is copied
# out of its DataFrame and converted to a NumPy array.
X_train = df_train['words'].copy().to_numpy()
y_train = df_train['category'].copy().to_numpy()

X_test = df_test['words'].copy().to_numpy()
y_test = df_test['category'].copy().to_numpy()


# FastText Embedding

In [10]:
class EpochLogger(gensim.models.callbacks.CallbackAny2Vec):
    '''Training callback that prints a timestamp when each epoch starts and ends.'''

    def __init__(self):
        # Counter used to label the log lines; incremented after every finished epoch.
        self.epoch = 0

    def on_epoch_begin(self, model):
        '''Print the start time of the current epoch (``model`` is not inspected).'''
        started_at = datetime.datetime.now()
        print(f"Epoch #{self.epoch} start at {started_at}")

    def on_epoch_end(self, model):
        '''Print the end time of the current epoch, then advance the epoch counter.'''
        finished_at = datetime.datetime.now()
        print(f"Epoch #{self.epoch} end at {finished_at}")
        self.epoch += 1


In [11]:
# Fit FastText word vectors of width EMBEDDING_DIMENSION on the training word lists;
# min_count=1 keeps every word, and EpochLogger prints per-epoch timestamps.
# NOTE(review): the recorded log shows epochs 0-4, a short pause, then further epochs,
# which suggests the constructor already trains on X_train before the explicit train()
# call adds 15 more epochs — confirm that this double training is intended.
# NOTE(review): workers=12 may make the vectors non-reproducible despite seed=SEED — confirm.
model_ft = gensim.models.FastText(X_train, size=EMBEDDING_DIMENSION, window=5, min_count=1, sg=0, hs=0, ns_exponent=0.75, seed=SEED, workers=12, callbacks=[EpochLogger()])
model_ft.train(X_train, total_examples=len(X_train), epochs=15)
# Saving is disabled; the path below looks left over from another experiment — verify before enabling.
# model_ft.save('./dataset/w2v_cbow_ns_min_20.model')


Epoch #0 start at 2020-06-30 18:48:09.228866
Epoch #0 end at 2020-06-30 18:48:09.619440
Epoch #1 start at 2020-06-30 18:48:09.619574
Epoch #1 end at 2020-06-30 18:48:10.014141
Epoch #2 start at 2020-06-30 18:48:10.014294
Epoch #2 end at 2020-06-30 18:48:10.371777
Epoch #3 start at 2020-06-30 18:48:10.371868
Epoch #3 end at 2020-06-30 18:48:10.741905
Epoch #4 start at 2020-06-30 18:48:10.742033
Epoch #4 end at 2020-06-30 18:48:11.097716
Epoch #5 start at 2020-06-30 18:48:11.391304
Epoch #5 end at 2020-06-30 18:48:11.807712
Epoch #6 start at 2020-06-30 18:48:11.807835
Epoch #6 end at 2020-06-30 18:48:12.186200
Epoch #7 start at 2020-06-30 18:48:12.186357
Epoch #7 end at 2020-06-30 18:48:12.557992
Epoch #8 start at 2020-06-30 18:48:12.558125
Epoch #8 end at 2020-06-30 18:48:12.932313
Epoch #9 start at 2020-06-30 18:48:12.932439
Epoch #9 end at 2020-06-30 18:48:13.289876
Epoch #10 start at 2020-06-30 18:48:13.289999
Epoch #10 end at 2020-06-30 18:48:13.704182
Epoch #11 start at 2020-06-30 

In [12]:
# Inspect the learned vocabulary: a mapping from each word to its gensim Vocab entry.
model_ft.wv.vocab


{'anti': <gensim.models.keyedvectors.Vocab at 0x7f52056e7b50>,
 'club': <gensim.models.keyedvectors.Vocab at 0x7f520855fcd0>,
 'door': <gensim.models.keyedvectors.Vocab at 0x7f5202fe0910>,
 'hello': <gensim.models.keyedvectors.Vocab at 0x7f5202fe08b0>,
 'miss': <gensim.models.keyedvectors.Vocab at 0x7f5202fe09d0>,
 'sled': <gensim.models.keyedvectors.Vocab at 0x7f5202fe0a30>,
 'dogs': <gensim.models.keyedvectors.Vocab at 0x7f5202fe07c0>,
 'dong': <gensim.models.keyedvectors.Vocab at 0x7f5202fe0ac0>,
 'plan': <gensim.models.keyedvectors.Vocab at 0x7f5202fe0b50>,
 'ones': <gensim.models.keyedvectors.Vocab at 0x7f5202fe0bb0>,
 'plain': <gensim.models.keyedvectors.Vocab at 0x7f5202fe0be0>,
 'need': <gensim.models.keyedvectors.Vocab at 0x7f5202fe0c40>,
 'ten': <gensim.models.keyedvectors.Vocab at 0x7f5202fe0ca0>,
 'staff': <gensim.models.keyedvectors.Vocab at 0x7f5202fe0d00>,
 'pose': <gensim.models.keyedvectors.Vocab at 0x7f5202fe0d90>,
 'gain': <gensim.models.keyedvectors.Vocab at 0x7f520

# Convert Dataset

In [13]:
def create_embedding_vectors(model_ft):
    '''Build the weight matrix for a Keras ``Embedding`` layer from a FastText model.

    Row ``i`` holds the vector of the vocabulary word whose gensim index is ``i``.
    Two extra all-zero rows follow the vocabulary: row ``vocab_size - 2`` is used
    for unseen words and row ``vocab_size - 1`` for padding.

    Parameters
    ----------
    model_ft : trained model exposing ``wv.vocab``, ``wv.vector_size`` and ``wv[word]``.

    Returns
    -------
    (vocab_size, embedding_vectors)
        ``vocab_size`` is the number of rows (vocabulary size + 2) and
        ``embedding_vectors`` is an array of shape ``(vocab_size, vector_size)``.
    '''
    keyed_vectors = model_ft.wv
    vocab_size = len(keyed_vectors.vocab) + 2
    # Take the width from the model itself so the matrix cannot drift out of sync
    # with a separately maintained constant.
    embedding_vectors = np.zeros((vocab_size, keyed_vectors.vector_size))

    # Each vocabulary entry carries the row index its vector belongs to.
    for word, entry in keyed_vectors.vocab.items():
        embedding_vectors[entry.index] = keyed_vectors[word]

    return vocab_size, embedding_vectors

from tensorflow.keras.preprocessing.sequence import pad_sequences

def sentence_to_index(sentence, model_ft):
    '''Map every word of a sentence to its integer index in the model vocabulary.

    Words that are not in ``model_ft.wv.vocab`` are mapped to ``len(vocab)``, the
    "unseen word" slot. This is the same value as ``vocab_size - 2`` when
    ``vocab_size`` is ``len(vocab) + 2``, but it is derived from the model so the
    function does not depend on a notebook-level variable being defined first.

    Parameters
    ----------
    sentence : iterable of str
        The words of one sentence.
    model_ft : model exposing a dict-like ``wv.vocab`` whose values have ``.index``.

    Returns
    -------
    list of int
        One index per input word, in order.
    '''
    vocab = model_ft.wv.vocab
    unseen_index = len(vocab)  # first row after the real vocabulary: unseen words

    indices = []
    for word in sentence:
        # Only a missing word falls back to the unseen slot; any other error
        # (e.g. an unhashable token) is allowed to surface instead of being swallowed.
        entry = vocab.get(word)
        indices.append(unseen_index if entry is None else entry.index)

    return indices

def sentences_to_sequences(model_ft, X):
    '''Convert tokenised sentences into a padded 2-D array of word indices.

    Each sentence is converted with ``sentence_to_index`` and then padded at the end
    ('post') to ``MAX_WORD`` entries with the padding index ``len(vocab) + 1``
    (the same value as ``vocab_size - 1`` when ``vocab_size`` is ``len(vocab) + 2``).

    Parameters
    ----------
    model_ft : model exposing ``wv.vocab``.
    X : sequence of word lists. It is left unmodified.

    Returns
    -------
    Array with one row per sentence and ``MAX_WORD`` columns, as produced by
    ``pad_sequences``.
    '''
    # Build a new list instead of overwriting X[i]: mutating the caller's array meant
    # that a second call on the same data looked up integer indices as if they were
    # words and silently turned every token into the "unseen" index.
    sequences = [sentence_to_index(sentence, model_ft) for sentence in X]
    padding_index = len(model_ft.wv.vocab) + 1  # last embedding row : empty / padding
    X_seq = pad_sequences(sequences, value=padding_index, maxlen=MAX_WORD, padding='post')

    return X_seq


In [14]:
# Build the embedding matrix first: vocab_size is also the Embedding layer's input_dim later on.
vocab_size, embedding_vectors = create_embedding_vectors(model_ft)
# Turn the word lists into fixed-length (MAX_WORD) index sequences for the Keras models.
X_train_seq = sentences_to_sequences(model_ft, X_train)
X_test_seq = sentences_to_sequences(model_ft, X_test)


# Model functions

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Activation, BatchNormalization, Embedding, Bidirectional, LSTM, GlobalAveragePooling1D
from attention import attention_3d_block

def test_model(model, X_test, y_test, filename):
    '''Predict a category for every row of ``X_test`` and save the result as CSV.

    The output file starts with a ``filename,category`` header. File names are read
    from the notebook-level ``df_test``, so ``X_test`` is assumed to be in the same
    row order as that frame. Each predicted class is written as a zero-padded
    two-digit number. ``y_test`` is accepted but unused while the metric code below
    stays commented out.
    '''
    class_scores = model.predict(X_test)
    # The highest-scoring class per row is the prediction.
    y_pred = np.argmax(class_scores, axis=-1)

    # f1 = f1_score(y_test, y_pred, average='weighted')
    # print('Weighted F1 Score:', f1)

    # print('Classification Report:')
    # print(classification_report(y_test, y_pred))

    filenames = df_test['filename'].to_numpy()
    records = np.rec.fromarrays([filenames, y_pred])

    np.savetxt(filename, records, fmt=['%s', '%02d'], delimiter=',', header='filename,category', comments='')

def compile_model(model, learning_rate=0.1, total_steps=50, warmup_proportion=0.1, min_lr=0.02):
    '''Compile ``model`` in place with a RectifiedAdam optimizer and return it.

    The loss is sparse categorical cross-entropy and the reported metric is sparse
    categorical accuracy, so labels are expected as integer class ids.

    Parameters
    ----------
    model : Keras model to compile.
    learning_rate, total_steps, warmup_proportion, min_lr :
        Forwarded to ``tfa.optimizers.RectifiedAdam``. The defaults reproduce the
        values that used to be hard-coded here.

    Returns
    -------
    The same ``model`` object, now compiled.
    '''
    # NOTE(review): total_steps is documented as a number of training steps; the
    # default of 50 equals the epoch count used by the fit() calls in this notebook —
    # confirm it was not meant to be epochs * steps_per_epoch.
    optimizer = tfa.optimizers.RectifiedAdam(
        learning_rate=learning_rate,
        total_steps=total_steps,
        warmup_proportion=warmup_proportion,
        min_lr=min_lr,
    )
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy'],
    )
    return model


# FastText + Bi-LSTM + Many-to-one attention mechanism

In [16]:
# Frozen FastText embeddings -> bidirectional LSTM (full sequence) -> attention block
# -> 42-way softmax.
m_input = Input(shape=(MAX_WORD, ), name='input')
x = Embedding(
    input_dim=vocab_size,
    output_dim=EMBEDDING_DIMENSION,
    input_length=MAX_WORD,
    weights=[embedding_vectors],
    trainable=False,
    name='embedding',
)(m_input)
x = Bidirectional(
    LSTM(units=32, return_sequences=True, name='lstm'),
    name='bi-directional',
)(x)
# x = GlobalAveragePooling1D(name='global-average-pooling-1d')(x)
x = attention_3d_block(x)
m_output = Dense(42, name='output', activation='softmax')(x)
modelft2 = tf.keras.Model(inputs=m_input, outputs=m_output)

compile_model(modelft2)
modelft2.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 200, 30)      199050      input[0][0]                      
__________________________________________________________________________________________________
bi-directional (Bidirectional)  (None, 200, 64)      16128       embedding[0][0]                  
__________________________________________________________________________________________________
attention_score_vec (Dense)     (None, 200, 64)      4096        bi-directional[0][0]             
______________________________________________________________________________________________

In [17]:
# Train on the whole training set for 50 epochs; no validation data is passed to fit().
modelft2.fit(X_train_seq, y_train, batch_size=1000, epochs=50, verbose=1)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f5200bb4850>

In [18]:
# Write the attention model's predictions for the test set to 'modelft2.csv'.
test_model(modelft2, X_test_seq, y_test, 'modelft2.csv')


# FastText + Bi-LSTM

In [19]:
# Frozen FastText embeddings -> bidirectional LSTM (final state only) -> 42-way softmax.
m_input = Input(shape=(MAX_WORD, ), name='input')
x = Embedding(
    input_dim=vocab_size,
    output_dim=EMBEDDING_DIMENSION,
    input_length=MAX_WORD,
    weights=[embedding_vectors],
    trainable=False,
    name='embedding',
)(m_input)
x = Bidirectional(
    LSTM(units=32, return_sequences=False, name='lstm'),
    name='bi-directional',
)(x)
# x = GlobalAveragePooling1D(name='global-average-pooling-1d')(x)
# x = attention_3d_block(x)
m_output = Dense(42, name='output', activation='softmax')(x)
modelft3 = tf.keras.Model(inputs=m_input, outputs=m_output)

compile_model(modelft3)
modelft3.summary()


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 200)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 200, 30)           199050    
_________________________________________________________________
bi-directional (Bidirectiona (None, 64)                16128     
_________________________________________________________________
output (Dense)               (None, 42)                2730      
Total params: 217,908
Trainable params: 18,858
Non-trainable params: 199,050
_________________________________________________________________


In [20]:
# Train on the whole training set for 50 epochs; no validation data is passed to fit().
modelft3.fit(X_train_seq, y_train, batch_size=1000, epochs=50, verbose=1)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f51e434ae80>

In [21]:
# Write the plain Bi-LSTM model's predictions for the test set to 'modelft3.csv'.
test_model(modelft3, X_test_seq, y_test, 'modelft3.csv')
