# UKARA: Training Bi-LSTM with Word2Vec for Data B

This notebook produced the result for Data B in my Ukara NLP Challenge submission. For more information, check the repository.  

Repository: [https://github.com/ilhamfp/ukara-1.0-challenge](https://github.com/ilhamfp/ukara-1.0-challenge)

## Initialization
Importing libraries and setting constant variables.

In [1]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
import pandas as pd
import warnings
import random
import torch
import os
warnings.filterwarnings('ignore')
import sys, re, csv, codecs

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

RANDOM_STATE = 1
def get_cv():
    """Return the cross-validation splitter used for training.

    The splitter is a 10-fold stratified split repeated 10 times (100 folds
    in total), seeded with ``RANDOM_STATE`` so the folds are reproducible.
    """
    cv_params = dict(n_splits=10, n_repeats=10, random_state=RANDOM_STATE)
    return RepeatedStratifiedKFold(**cv_params)


def set_seed(seed=1492):
    """Seed the Python, NumPy and PyTorch random number generators.

    Parameters
    ----------
    seed : int, optional
        Seed value applied to every generator. Defaults to 1492, the value
        this notebook was originally run with.

    Notes
    -----
    * ``PYTHONHASHSEED`` is exported for child processes; changing it here
      does not alter string hashing in the already-running interpreter.
    * NOTE(review): the model in this notebook is built with Keras, and this
      function does not seed the Keras backend, so training is presumably
      still not fully deterministic - confirm and seed the backend if
      exact reproducibility is required.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # no-op when CUDA is unavailable
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Ask cuDNN to pick deterministic kernels.
    torch.backends.cudnn.deterministic = True
    
set_seed()
embed_size = 100 # Word2Vec embedding dimension
DIR_DATA_A = "../data/data_A"
DIR_DATA_B = "../data/data_B"
DIR_DATA_MISC = "../input/word2vec-100-indonesian" # directory to word2vec. Change this to run locally.
DIR_DATA_FINAL = "../data/data_final"

Using TensorFlow backend.


## Preprocess Text

In [2]:
import nltk
import itertools
import re
def normalizing_words(review):
    """Collapse every run of identical consecutive characters to one.

    For example ``"haaallooo"`` becomes ``"halo"``. Used to normalise
    elongated spellings before looking a word up in the embedding vocabulary.
    """
    collapsed = []
    for key, _run in itertools.groupby(review):
        # Keep a single representative character per run.
        collapsed.append(key[:1])
    return ''.join(collapsed)

def preprocess(text):
    """Lower-case ``text`` and reduce it to space-separated alphanumeric tokens.

    Every run of characters outside ``[0-9a-zA-Z]`` is replaced by a single
    space, repeated spaces are collapsed, and leading/trailing spaces are
    removed.
    """
    lowered = text.strip().lower()
    alnum_only = re.sub('[^0-9a-zA-Z]+', ' ', lowered)
    single_spaced = re.sub(' +', ' ', alnum_only)
    return single_spaced.strip()

In [3]:
# Load the train / dev / test splits for both tasks.
data_A_train = pd.read_csv("{}/data_train_A.csv".format(DIR_DATA_A))
data_A_dev = pd.read_csv("{}/data_dev_A.csv".format(DIR_DATA_A))
data_A_test = pd.read_csv("{}/data_test_A.csv".format(DIR_DATA_A))

data_B_train = pd.read_csv("{}/data_train_B.csv".format(DIR_DATA_B))
data_B_dev = pd.read_csv("{}/data_dev_B.csv".format(DIR_DATA_B))
data_B_test = pd.read_csv("{}/data_test_B.csv".format(DIR_DATA_B))

# Normalise the free-text answers of the train and dev splits.
for frame in (data_A_train, data_A_dev, data_B_train, data_B_dev):
    frame['RESPONSE'] = frame['RESPONSE'].apply(preprocess)

# Test responses are passed through str() first, so non-string cells
# are converted to text before being cleaned.
for frame in (data_A_test, data_B_test):
    frame['RESPONSE'] = frame['RESPONSE'].apply(lambda x: preprocess(str(x)))

# Sentences of the stimulus text for data A - presumably the question prompt
# shown to respondents (TODO confirm). The strings are kept verbatim and are
# only used below as extra text for fitting the tokenizer vocabulary.
stimulus_a = ["Pemanasan global terjadi karena peningkatan produksi karbon dioksida yang dihasilkan oleh pembakaran fosil dan konsumsi bahan bakar yang tinggi.",
"Salah satu akibat adalah mencairnya es abadi di kutub utara dan selatan yang menimbulkan naiknya ketinggian air laut.",
"kenaikan air laut akan terjadi terus menerus meskipun dalam hitungan centimeter akan mengakibatkan perubahan yang signifikan.",
"Film “Waterworld”, adalah film fiksi ilmiah yang menunjukkan akibat adanya pemanasan global yang sangat besar sehingga menyebabkan bumi menjadi tertutup oleh lautan.",
"Negara-negara dan daratan yang dulunya kering menjadi tengelamn karena terjadi kenaikan permukaan air laut.",
"Penduduk yang dulunya bisa berkehidupan bebas menjadi terpaksa mengungsi ke daratan yang lebih tinggi atau tinggal diatas air.",
"Apa yang akan menjadi tantangan bagi suatu penduduk ketika terjadi situasi daratan tidak dapat ditinggali kembali karena tengelam oleh naiknya air laut."]

# Sentences of the stimulus text for data B (same role as stimulus_a).
stimulus_b = ["Sebuah toko baju berkonsep self-service menawarkan promosi dua buah baju bertema tahun baru seharga Rp50.000,00. sebelum baju bertema tahun baru dibagikan kepada pembeli, sebuah layar akan menampilkan tampilan gambar yang menampilkan kondisi kerja di dalam sebuah pabrik konveksi/pembuatan baju. ",
"Kemudian pembeli diberi program pilihan untuk menyelesaikan pembeliannya atau menyumpangkan Rp50.000,00 untuk dijadikan donasi pembagian baju musim dingin di suatu daerah yang membutuhkan.",
"Delapan dari sepuluh pembeli memilih untuk memberikan donasi.",
"Menurut anda mengapa banyak dari pembeli yang memilih berdonasi?"]

# Corpus used to fit the tokenizer: the cleaned stimulus sentences followed
# by every (already cleaned) response from all six data splits.
data_stimulus = []

for text in itertools.chain(stimulus_a, stimulus_b):
    data_stimulus.append(preprocess(text))

response_frames = (
    data_A_train, data_A_dev, data_A_test,
    data_B_train, data_B_dev, data_B_test,
)
for response_frame in response_frames:
    data_stimulus.extend(response_frame['RESPONSE'].values)

In [4]:
# Sanity check: corpus size and the first three cleaned stimulus sentences.
print(len(data_stimulus))
data_stimulus[0:3]

2872


['pemanasan global terjadi karena peningkatan produksi karbon dioksida yang dihasilkan oleh pembakaran fosil dan konsumsi bahan bakar yang tinggi',
 'salah satu akibat adalah mencairnya es abadi di kutub utara dan selatan yang menimbulkan naiknya ketinggian air laut',
 'kenaikan air laut akan terjadi terus menerus meskipun dalam hitungan centimeter akan mengakibatkan perubahan yang signifikan']

## Find max feature

In [5]:
# Count the distinct whitespace-separated tokens in the corpus; this bounds
# the vocabulary size the tokenizer needs.
unique_string = set()
for sentence in data_stimulus:
    unique_string.update(sentence.split())

print(len(unique_string))

2816


## Find max len

In [6]:
# Token count per corpus entry, followed by summary statistics
# (mean, median, std, min, max) and the 98th percentile.
len_data = list(map(lambda sentence: len(sentence.split()), data_stimulus))
for summarize in (np.mean, np.median, np.std, np.min, np.max):
    print(summarize(len_data))
print(np.percentile(len_data, 98))

12.858635097493035
10.0
11.541505061477743
1
176
43.0


## Tokenizing and Padding

In [7]:
max_features = 3000 # how many unique words to use (the corpus has only 2816 unique words, so nothing is dropped)
maxlen = 43 # max number of words in a text to use (the 98th percentile of token counts computed above)

# The vocabulary is fitted on the whole corpus (stimulus text plus all splits
# of data A and B); only the data B train/test responses are encoded here.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(data_stimulus)
list_tokenized_train = tokenizer.texts_to_sequences(data_B_train["RESPONSE"].values)
list_tokenized_test = tokenizer.texts_to_sequences(data_B_test["RESPONSE"].values)
# Bring every sequence to exactly `maxlen` ids; shorter sequences are
# zero-padded at the front (see the X_t[0] output below).
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [8]:
X_t[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   2,  13, 332, 121,  61, 131,   6,  28,  21,
         5,  20,   7,  16], dtype=int32)

## Building Embedding

In [9]:
import gensim
# Load the pre-trained 100-dimensional Indonesian Word2Vec model from DIR_DATA_MISC.
path = '{}/idwiki_word2vec_100.model'.format(DIR_DATA_MISC)
id_w2v = gensim.models.word2vec.Word2Vec.load(path)
# Sanity check: nearest neighbours of "makan" should be related words.
print(id_w2v.most_similar('makan'))

[('sarapan', 0.8138196468353271), ('bersantap', 0.7104067802429199), ('minum', 0.7069040536880493), ('menyantap', 0.6977203488349915), ('tidur', 0.6971700191497803), ('santap', 0.6952369213104248), ('dimakan', 0.6562244892120361), ('memasak', 0.6530824303627014), ('menghidangkan', 0.6491507291793823), ('memanggang', 0.6459312438964844)]


In [10]:
!pip install PySastrawi

Collecting PySastrawi
[?25l  Downloading https://files.pythonhosted.org/packages/61/84/b0a5454a040f81e81e6a95a5d5635f20ad43cc0c288f8b4966b339084962/PySastrawi-1.2.0-py2.py3-none-any.whl (210kB)
[K     |████████████████████████████████| 215kB 2.8MB/s 
[?25hInstalling collected packages: PySastrawi
Successfully installed PySastrawi-1.2.0


In [11]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Stemmer used below as a fallback when a word is not in the Word2Vec vocabulary.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [12]:
# Word2Vec vocabulary as a set, so the membership tests below are constant-time.
index2word_set = set(id_w2v.wv.index2word)

In [13]:
# Build the embedding matrix: row i holds the Word2Vec vector of the tokenizer
# word with index i. Row 0 (the padding index) stays all-zero.
total_known_word = 0
total_unknown_word = 0
dict_known_word = {}    # matched vocabulary form -> number of tokenizer words mapped to it
dict_unknown_word = {}  # last form tried -> number of tokenizer words that stayed unmatched

word_index = tokenizer.word_index
nb_words = max_features
embedding_matrix = np.zeros((nb_words, embed_size), dtype=np.float32)
# Every out-of-vocabulary word shares one constant vector of -1s.
unknown_vector = np.full((embed_size,), -1., dtype=np.float32)


def _candidate_forms(word):
    """Lazily yield the spellings to try against the Word2Vec vocabulary.

    Order: the raw word, its stem, the word with repeated characters
    collapsed, and the stem of that collapsed form. Being a generator,
    the (slow) stemmer only runs when the earlier forms have missed.
    """
    yield word
    yield stemmer.stem(word)
    normalized = normalizing_words(word)
    yield normalized
    yield stemmer.stem(normalized)


for word, i in word_index.items():
    if i >= nb_words:
        # The tokenizer never emits ids >= num_words, and the matrix has no
        # row for them; skipping avoids an IndexError on larger vocabularies.
        continue

    cur = word
    for cur in _candidate_forms(word):
        if cur in index2word_set:
            embedding_matrix[i] = id_w2v.wv[cur]
            dict_known_word[cur] = dict_known_word.get(cur, 0) + 1
            total_known_word += 1
            break
    else:
        # No form matched: `cur` is the last candidate tried.
        embedding_matrix[i] = unknown_vector
        dict_unknown_word[cur] = dict_unknown_word.get(cur, 0) + 1
        total_unknown_word += 1

In [14]:
# Report embedding coverage and dump the per-form counts, most frequent first.
print(total_unknown_word)
print(total_known_word)
import operator
by_count = operator.itemgetter(1)

sorted_known = sorted(dict_known_word.items(), key=by_count, reverse=True)
print(sorted_known[0:3])

with open('word_frequency_known.txt', 'w') as f:
    f.writelines('{} {}\n'.format(*item) for item in sorted_known)

sorted_unknown = sorted(dict_unknown_word.items(), key=by_count, reverse=True)
print(sorted_unknown[0:3])

with open('word_frequency_unknown.txt', 'w') as f:
    f.writelines('{} {}\n'.format(*item) for item in sorted_unknown)

390
2426
[('lingkungan', 4), ('sumbang', 4), ('pakaian', 3)]
[('2euro', 1), ('bretika', 1), ('pngungsi', 1)]


## Training

In [15]:
from keras import callbacks

from keras import backend as K

def f1(y_true, y_pred):
    """Batch-wise F1 score built from Keras backend ops.

    Values are clipped to [0, 1] and rounded, so predictions are effectively
    thresholded at 0.5. Precision and recall are computed over the current
    batch only, and ``K.epsilon()`` guards every division against a zero
    denominator.
    """
    # Items that are both labelled positive and predicted positive.
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_count = K.sum(K.round(K.clip(y_pred, 0, 1)))
    actual_count = K.sum(K.round(K.clip(y_true, 0, 1)))

    precision_value = hit_count / (predicted_count + K.epsilon())
    recall_value = hit_count / (actual_count + K.epsilon())
    return 2*((precision_value*recall_value)/(precision_value+recall_value+K.epsilon()))

def get_model():
    """Build and compile a fresh Bi-LSTM binary classifier.

    Architecture: embedding initialised from ``embedding_matrix`` ->
    bidirectional LSTM (50 units per direction, returning the full sequence)
    -> global max pooling -> Dense(50, relu) -> Dropout(0.1) ->
    Dense(1, sigmoid). Compiled with binary cross-entropy, Adam, and the
    custom ``f1`` metric.
    """
    token_ids = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(token_ids)
    recurrent = LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
    encoded = Bidirectional(recurrent)(embedded)
    pooled = GlobalMaxPool1D()(encoded)
    hidden = Dense(50, activation="relu")(pooled)
    hidden = Dropout(0.1)(hidden)
    probability = Dense(1, activation="sigmoid")(hidden)

    classifier = Model(inputs=token_ids, outputs=probability)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1])
    return classifier

In [16]:
X = X_t
y = data_B_train["LABEL"].values

# Accumulators for SUMS of predicted probabilities across folds:
#   pred_cv   - each training row is predicted once per repeat (10 times);
#   pred_test - every test row is predicted by every fold model (100 times).
# Later cells divide by those counts to obtain averages.
pred_cv = np.zeros(len(y))
pred_test = np.zeros(len(X_te))
count = 0

for train_index, test_index in get_cv().split(X, y):
    count += 1
    print(count, end='')
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Release the graph/session state of the previous fold's model; without
    # this, state from all 100 models accumulates in memory.
    K.clear_session()

    # NOTE(review): the held-out fold is also the validation set that drives
    # early stopping and LR scheduling, so the out-of-fold predictions (and
    # the CV F1 computed from them) are optimistically biased.
    es = callbacks.EarlyStopping(monitor='val_f1', min_delta=0.0001, patience=8,
                                 verbose=1, mode='max', baseline=None, restore_best_weights=True)

    rlr = callbacks.ReduceLROnPlateau(monitor='val_f1', factor=0.5,
                                      patience=3, min_lr=1e-6, mode='max', verbose=1)

    model = get_model()
    model.fit(X_train,
              y_train, batch_size=16, epochs=50,
              validation_data=(X_test, y_test),
              callbacks=[es, rlr],
              verbose=0)

    # Index with the array itself: wrapping it in a list relied on deprecated
    # NumPy behaviour (a list index being silently treated as a tuple).
    pred_cv[test_index] += model.predict(X_test)[:, 0]
    pred_test += model.predict(X_te)[:, 0]

1
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Restoring model weights from the end of the best epoch
Epoch 00010: early stopping
2
Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Restoring model weights from the end of the best epoch
Epoch 00012: early stopping
3
Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Restoring model weights from the end of the best epoch
Epoch 00013: early stopping
4
Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Restoring model weights from the end of the best epoch
Epoch 00014: early stopping
5
Epoch 00005: R

In [17]:
# Average out-of-fold probability per training row. The divisor 10 is the
# number of CV repeats (n_repeats in get_cv): each row lands in exactly one
# held-out fold per repeat, so pred_cv holds the sum of 10 predictions.
prediksi_CV = {'RES_ID': data_B_train['RES_ID'],
                'LABEL': np.array(pred_cv)/10
               }
df_final_CV = pd.DataFrame(prediksi_CV, columns= ['RES_ID', 'LABEL'])
df_final_CV.head()

Unnamed: 0,RES_ID,LABEL
0,TRB1,0.790597
1,TRB2,0.55096
2,TRB3,0.327951
3,TRB4,0.311809
4,TRB5,0.259474


In [18]:
# Persist the averaged out-of-fold predictions.
df_final_CV.to_csv('{}/df_final_cv_lstm_B.csv'.format(DIR_DATA_FINAL), index=False)

In [19]:
# Turn the accumulated sums into averages (10 = number of CV repeats).
# NOTE: this overwrites pred_cv, so the cell is not idempotent - running it
# a second time divides by 10 again. Re-run the training cell first if needed.
pred_cv = np.array(pred_cv)/10
pred_cv[0:5]

array([0.79059727, 0.55096014, 0.32795082, 0.31180948, 0.25947438])

In [20]:
# Threshold the averaged probabilities at 0.5 to get hard 0/1 labels.
bin_pred_cv = (pred_cv >= 0.50).astype(int).tolist()
bin_pred_cv[0:5]

[1, 1, 0, 0, 0]

In [21]:
# F1 of the thresholded out-of-fold predictions against the training labels.
f1_score(y, bin_pred_cv)

0.7745358090185677

In [22]:
# Average test probability per row. The divisor 100 is the total number of
# fold models (10 splits x 10 repeats in get_cv), each of which predicted
# every test row.
prediksi_data_B = {'RES_ID': data_B_test['RES_ID'],
                   'LABEL': np.array(pred_test)/100
                  }

In [23]:
# Assemble the test predictions with RES_ID and LABEL columns.
df_final_B = pd.DataFrame(prediksi_data_B, columns= ['RES_ID', 'LABEL'])
df_final_B.head()

Unnamed: 0,RES_ID,LABEL
0,TSB1,0.585877
1,TSB2,0.775607
2,TSB3,0.592825
3,TSB4,0.611297
4,TSB5,0.796766


In [24]:
# Write the averaged test-set probabilities for data B.
df_final_B.to_csv('{}/df_final_B.csv'.format(DIR_DATA_FINAL), index=False)