## Library

In [1]:
import platform
import os
import random
import scipy

import pandas as pd
from sklearn.metrics import f1_score, classification_report
import tensorflow as tf
import tensorflow_addons as tfa
import numpy as np
import sklearn


In [2]:
# Single seed shared by every random number generator seeded below.
SEED = 42

# Export the seed as PYTHONHASHSEED.
# NOTE(review): set from inside an already-running kernel, this presumably only
# reaches child processes, not this interpreter's own hash randomization — confirm.
os.environ['PYTHONHASHSEED']=str(SEED)
# Seed the global generators of Python's `random`, NumPy and TensorFlow.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)


In [3]:
# Report the interpreter and library versions this run was produced with.
for label, version in (
    ('Python version:', platform.python_version()),
    ('Tensorflow Version:', tf.__version__),
    ('Tensorflow Addons Version:', tfa.__version__),
    ('Pandas Version:', pd.__version__),
    ('Numpy Version:', np.__version__),
):
    print(label, version)


Python version: 3.8.3
Tensorflow Version: 2.2.0
Tensorflow Addons Version: 0.10.0
Pandas Version: 1.0.3
Numpy Version: 1.18.5


## Dataset

In [4]:
def to_list(words):
    """Parse a stringified list such as "['anti', 'club']" into a list of tokens.

    The literal '[]' short-circuits to an empty list. Otherwise surrounding
    double quotes and square brackets are stripped, the remainder is split on
    commas, and each piece loses its leading spaces and surrounding single
    quotes. Pieces that end up empty are dropped.

    Args:
        words: String representation of a list of words.

    Returns:
        List of the non-empty tokens, in their original order.
    """
    if words == '[]':
        return []

    inner = words.strip('"').strip('[').strip(']')

    tokens = []
    for piece in inner.split(','):
        # Only leading spaces are removed, then quotes on both ends.
        token = piece.lstrip(' ').strip("'")
        if token != '':
            tokens.append(token)
    return tokens


In [5]:
# Read the cleaned training CSV and turn the serialized `words` column into real lists.
df_train = (
    pd.read_csv('./_csv_with_clean_text/train.min.csv')
    .assign(words=lambda frame: frame['words'].apply(to_list))
)
df_train


Unnamed: 0,filename,category,words
0,45e2d0c97f7bdf8cbf3594beb6fdcda0.jpg,3,[]
1,f74d1a5fc2498bbbfa045c74e3cc333e.jpg,3,"[anti, club]"
2,f6c172096818c5fab10ecae722840798.jpg,3,"[door, hello]"
3,251ffd610399ac00fea7709c642676ee.jpg,3,[]
4,73c7328b8eda399199fdedec6e4badaf.jpg,3,[]
...,...,...,...
105385,047a60001de0331608ba64092cc7ae2b.jpg,25,[]
105386,ea39ac66ccdc4b4d4c6443f6c54d8ae3.jpg,25,"[dunia, fashion]"
105387,6215f8c52c5bbcfe3e63e0f3ac6265f8.jpg,25,[]
105388,1733d8286f6658149c7b7cdeb40d6461.jpg,25,[]


In [6]:
# Read the cleaned test CSV and turn the serialized `words` column into real lists.
df_test = (
    pd.read_csv('./_csv_with_clean_text/test.min.csv')
    .assign(words=lambda frame: frame['words'].apply(to_list))
)
df_test


Unnamed: 0,filename,category,words
0,fd663cf2b6e1d7b02938c6aaae0a32d2.jpg,43,"[kafe, murah, kiss, meja]"
1,c7fd77508a8c355eaab0d4e10efd6b15.jpg,43,"[come, pusat]"
2,127f3e6d6e3491b2459812353f33a913.jpg,43,[girl]
3,5ca4f2da11eda083064e6c36f37eeb81.jpg,43,"[sniper, helmets, nails]"
4,46d681a542f2c71be017eef6aae23313.jpg,43,[]
...,...,...,...
12181,5ba958eacb23cd7d1673bad4dae55784.jpg,43,[]
12182,efbe41a1c2b666b70e337e438559808b.jpg,43,[]
12183,79fdaa5ac5ba10dbe8004cabd8c35eb3.jpg,43,"[happy, pumping]"
12184,ac3d136124617637a05ba66694e381ef.jpg,43,"[money, back, free, boas, balaga, single, pota..."


In [7]:
# Inputs: every token list is joined into one space-separated string per row,
# which is the document form handed to the text vectorizers.
X_train = df_train['words'].copy().apply(' '.join).to_numpy()
X_test = df_test['words'].copy().apply(' '.join).to_numpy()

# Targets: the category column of each frame as a NumPy array.
y_train = df_train['category'].copy().to_numpy()
y_test = df_test['category'].copy().to_numpy()


## Preprocess words

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Each vectorizer is fitted on the training documents only and then applied
# unchanged to the test documents. Lowercasing is disabled for all of them.

# Binary bag-of-words: 1 if a term occurs in the document, 0 otherwise.
bow_vectorizer = CountVectorizer(binary=True, lowercase=False)
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# TF-IDF with sublinear term frequency, rows normalised to unit L1 norm.
tfidf_l1_vectorizer = TfidfVectorizer(norm='l1', sublinear_tf=True, lowercase=False)
X_train_tfidf_l1 = tfidf_l1_vectorizer.fit_transform(X_train)
X_test_tfidf_l1 = tfidf_l1_vectorizer.transform(X_test)

# TF-IDF with sublinear term frequency, rows normalised to unit L2 norm.
tfidf_l2_vectorizer = TfidfVectorizer(norm='l2', sublinear_tf=True, lowercase=False)
X_train_tfidf_l2 = tfidf_l2_vectorizer.fit_transform(X_train)
X_test_tfidf_l2 = tfidf_l2_vectorizer.transform(X_test)


In [9]:
# Sort the column indices of every sparse feature matrix in place.
# Calling the bound method avoids going through `scipy.sparse`, which is only
# reachable after a bare `import scipy` if some other package already imported
# that submodule.
# NOTE(review): presumably needed so Keras can turn the matrices into sparse
# tensors, which expect ordered indices — confirm.
X_train_bow.sort_indices()
X_train_tfidf_l1.sort_indices()
X_train_tfidf_l2.sort_indices()

X_test_bow.sort_indices()
X_test_tfidf_l1.sort_indices()
X_test_tfidf_l2.sort_indices()


# Helper functions

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Activation, BatchNormalization

def test_model(model, X_test, y_test, filename):
    """Predict a category for every test row and save the predictions as CSV.

    The file starts with the header line `filename,category`, followed by one
    row per prediction: the image filename taken from the module-level
    `df_test` and the predicted class index, zero-padded to two digits.
    `X_test` must therefore be row-aligned with `df_test`.

    Args:
        model: Model whose `predict` returns one score per class for each row.
        X_test: Feature matrix passed straight to `model.predict`.
        y_test: Currently unused; no F1 score or classification report is
            computed here.
        filename: Path of the CSV file to write.
    """
    class_scores = model.predict(X_test)
    # Index of the highest-scoring class along the last axis.
    predicted_labels = np.argmax(class_scores, axis=-1)

    image_names = df_test['filename'].to_numpy()
    table = np.rec.fromarrays([image_names, predicted_labels])

    np.savetxt(filename, table, fmt=['%s', '%02d'], delimiter=',', header='filename,category', comments='')

def compile_model(model, learning_rate=0.005, total_steps=50, warmup_proportion=0.1, min_lr=0.001):
    """Compile `model` with a RectifiedAdam optimizer for sparse integer labels.

    The defaults reproduce the previously hard-coded configuration, so
    `compile_model(model)` behaves exactly as before.

    Args:
        model: Keras model to compile (compiled in place).
        learning_rate: Peak learning rate handed to RectifiedAdam.
        total_steps: Length of RectifiedAdam's warmup/decay schedule.
            NOTE(review): the optimizer counts steps in weight updates
            (batches), not epochs, so with the default of 50 the schedule is
            over after the first 50 batches. Pass `epochs * steps_per_epoch`
            to spread it over a whole training run.
        warmup_proportion: Fraction of `total_steps` spent warming up.
        min_lr: Learning rate the schedule decays to.

    Returns:
        The same `model` instance, compiled.
    """
    # `learning_rate` is the canonical keyword; `lr` is only a legacy alias.
    optimizer = tfa.optimizers.RectifiedAdam(
        learning_rate=learning_rate,
        total_steps=total_steps,
        warmup_proportion=warmup_proportion,
        min_lr=min_lr,
    )
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy'],
    )
    return model


# BOW + NN

In [11]:
# A single softmax layer applied directly to the sparse bag-of-words features.
model1 = Sequential([
    # Input width follows the fitted vocabulary size instead of a hard-coded 6633.
    Input((X_train_bow.shape[1], ), sparse=True),
    # 42 output units. NOTE(review): assumes category ids lie in 0..41 — confirm.
    Dense(42, activation='softmax')
])
compile_model(model1)
model1.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 42)                278628    
Total params: 278,628
Trainable params: 278,628
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Train model1 on the binary bag-of-words features: 50 epochs, batches of 100 rows.
model1.fit(X_train_bow, y_train, batch_size=100, epochs=50, verbose=1)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7fe4779eb760>

In [13]:
# Write model1's predictions for the test rows to model1.csv
# (y_test is passed along, but test_model does not use it).
test_model(model1, X_test_bow, y_test, 'model1.csv')


# BOW + MLP

In [14]:
# Two hidden blocks (Dense -> BatchNormalization -> ReLU) followed by a softmax
# output, applied to the sparse bag-of-words features.
model2 = Sequential([
    # Input width follows the fitted vocabulary size instead of a hard-coded 6633.
    Input((X_train_bow.shape[1], ), sparse=True),

    Dense(331),
    BatchNormalization(),
    Activation('relu'),

    Dense(110),
    BatchNormalization(),
    Activation('relu'),

    # 42 output units. NOTE(review): assumes category ids lie in 0..41 — confirm.
    Dense(42, activation='softmax')
])
compile_model(model2)
model2.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 331)               2195854   
_________________________________________________________________
batch_normalization (BatchNo (None, 331)               1324      
_________________________________________________________________
activation (Activation)      (None, 331)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 110)               36520     
_________________________________________________________________
batch_normalization_1 (Batch (None, 110)               440       
_________________________________________________________________
activation_1 (Activation)    (None, 110)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 42)               

In [15]:
# Train model2 on the binary bag-of-words features: 50 epochs, batches of 100 rows.
model2.fit(X_train_bow, y_train, batch_size=100, epochs=50, verbose=1)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7fe47c3b4100>

In [16]:
# Write model2's predictions for the test rows to model2.csv
# (y_test is passed along, but test_model does not use it).
test_model(model2, X_test_bow, y_test, 'model2.csv')

# TF-IDF + MLP

In [17]:
# Two hidden blocks (Dense -> BatchNormalization -> ReLU) followed by a softmax
# output, applied to the sparse L2-normalised TF-IDF features.
model3 = Sequential([
    # Input width follows the fitted TF-IDF vocabulary size instead of a hard-coded 6633.
    Input((X_train_tfidf_l2.shape[1], ), sparse=True),

    Dense(331),
    BatchNormalization(),
    Activation('relu'),

    Dense(110),
    BatchNormalization(),
    Activation('relu'),

    # 42 output units. NOTE(review): assumes category ids lie in 0..41 — confirm.
    Dense(42, activation='softmax')
])
compile_model(model3)
model3.summary()


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 331)               2195854   
_________________________________________________________________
batch_normalization_2 (Batch (None, 331)               1324      
_________________________________________________________________
activation_2 (Activation)    (None, 331)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 110)               36520     
_________________________________________________________________
batch_normalization_3 (Batch (None, 110)               440       
_________________________________________________________________
activation_3 (Activation)    (None, 110)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 42)               

In [18]:
# Train model3 on the L2-normalised TF-IDF features: 50 epochs, batches of 100 rows.
model3.fit(X_train_tfidf_l2, y_train, batch_size=100, epochs=50, verbose=1)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7fe51c1c0c10>

In [19]:
# Write model3's predictions for the test rows to model3.csv
# (y_test is passed along, but test_model does not use it).
test_model(model3, X_test_tfidf_l2, y_test, 'model3.csv')