In [1]:
import pandas as pd
import numpy as np
import tensorflow_addons as tfa
import gensim
from gensim.models import Word2Vec
from keras.preprocessing.text import one_hot, Tokenizer
import tqdm
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical

import tensorflow as tf
import os

In [2]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_'+metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, 'val_'+metric])

In [3]:
model = Word2Vec.load("../w2v/emoji_30_epochs_week5.w2v")

In [4]:
test = pd.read_csv('test_emoji.csv')
df_test = pd.DataFrame(test)

train = pd.read_csv('train_emoji.csv')
df_train = pd.DataFrame(train)

val = pd.read_csv('val_emoji.csv')
df_val = pd.DataFrame(val)

all = pd.read_csv('../3 classes/with emoji.csv')
df = pd.DataFrame(all)

In [5]:
# remove possible empty text cell
df = df[['check_stop', 'sentiment']]

print("before cleaned: ", df.shape)
df['check_stop'].replace('', np.nan, inplace=True)
df = df.dropna()
df = df.reset_index(drop=True)
print("After: ", df.shape)

before cleaned:  (7907, 2)
After:  (7905, 2)


In [6]:
tokenize_text = df['check_stop'].apply(lambda x: x.split())
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenize_text)

In [7]:
# 转化成词向量矩阵，利用新的word2vec模型
vocab_size = len(tokenizer.word_index)
error_count=0
embedding_matrix = np.zeros((vocab_size + 1, 128))
for word, i in tokenizer.word_index.items():
    if word in model.wv:
        embedding_matrix[i] = model.wv[word]
    else:
        error_count += 1

In [72]:
tokenize_text_train = df_train['check_stop'].apply(lambda x: x.split())
tokenizer_train = Tokenizer()
tokenizer_train.fit_on_texts(tokenize_text)

error_count=0
train_embedding = np.zeros((len(tokenizer_train.word_index) + 1, 128))
for samples in tokenize_train:
    for i, word in enumerate(samples):
        if word in model.wv:
            train_embedding[i] = model.wv[word]
    else:
        error_count += 1

In [76]:
df_train.iloc[-1]

check_stop    lagi ribu peoples not infected ??
sentiment                                     0
Name: 7114, dtype: object

In [64]:
train_embedding

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.07664385, -0.07250506,  1.23824787, ..., -0.10372932,
         0.02242264,  0.42636442]])

In [60]:
df_train

Unnamed: 0,check_stop,sentiment
0,next bulan turun banyak tidak salah,1
1,kerajaan pkp selamat baru tidak rakyat meningg...,0
2,mangsa salah sorg suami positif covid diward i...,0
3,alhamdulillah patuh arahan kerajaan bantu fron...,2
4,slow gembira anak boleh sekolah,2
...,...,...
7110,ministers ramai really help rakyat ones start ...,0
7111,bila anti vaccine mnum madu asli sampai tidak ...,0
7112,rakyat penat tengok bila kes banyak frontliner...,0
7113,herannya positiv diam tak bagitahu majikan ker...,0


In [57]:
error_count

7115

In [10]:
tokenizer.word_index.items()

dict_items([('kes', 1), ('tidak', 2), ('pkp', 3), ('covid', 4), ('boleh', 5), ('tak', 6), ('turun', 7), ('alhamdulillah', 8), ('rakyat', 9), ('tapi', 10), ('sop', 11), ('negeri', 12), ('baru', 13), ('semoga', 14), ('kena', 15), ('jangan', 16), ('bila', 17), ('banyak', 18), ('naik', 19), ('sembuh', 20), ('malaysia', 21), ('buka', 22), ('kerajaan', 23), ('rentas', 24), ('ramai', 25), ('menurun', 26), ('balik', 27), ('kkm', 28), ('ribu', 29), ('angka', 30), ('sampai', 31), ('kerja', 32), ('duduk', 33), ('harap', 34), ('jaga', 35), ('lockdown', 36), ('tinggi', 37), ('amin', 38), ('sekolah', 39), ('bukan', 40), ('tengok', 41), ('anak', 42), ('rumah', 43), ('zero', 44), ('jadi', 45), ('raya', 46), ('keluar', 47), ('minggu', 48), ('vaksin', 49), ('tutup', 50), ('pergi', 51), ('baik', 52), ('kilang', 53), ('diri', 54), ('tiada', 55), ('negara', 56), ('ikut', 57), ('masa', 58), ('kurang', 59), ('bulan', 60), ('ekonomi', 61), ('jalan', 62), ('esok', 63), ('moga', 64), ('sektor', 65), ('habis', 6

In [11]:
tokenizer.texts_to_sequences(["high"])

[[402]]

In [12]:
df_test['sentiment'] = pd.Categorical(df_test['sentiment'])
df_test['sentiment'] = df_test.sentiment.cat.codes

df_train['sentiment'] = pd.Categorical(df_train['sentiment'])
df_train['sentiment'] = df_train.sentiment.cat.codes

df_val['sentiment'] = pd.Categorical(df_val['sentiment'])
df_val['sentiment'] = df_val.sentiment.cat.codes


y_test = df_test['sentiment']
y_train = df_train['sentiment']
y_val = df_val['sentiment']

y_train = to_categorical(y_train)
y_val = to_categorical(y_val)
y_test = to_categorical(y_test)

In [13]:
tokenize_train = df_train['check_stop'].apply(lambda x: x.split())
tokenize_test = df_test['check_stop'].apply(lambda x: x.split())
tokenize_val = df_val['check_stop'].apply(lambda x: x.split())

sequence = tokenizer.texts_to_sequences(tokenize_train)
traintitle = pad_sequences(sequence, maxlen=100)
sequence = tokenizer.texts_to_sequences(tokenize_val)
valtitle = pad_sequences(sequence, maxlen=100)
sequence = tokenizer.texts_to_sequences(tokenize_test)
testtitle = pad_sequences(sequence, maxlen=100)

In [47]:
testtitle

array([[   0,    0,    0, ...,  118,  376,  320],
       [   0,    0,    0, ..., 1422,   55, 1053],
       [   0,    0,    0, ...,   88, 3052, 3900],
       ...,
       [   0,    0,    0, ...,  380,   21,   38],
       [   0,    0,    0, ...,   29,   46,    2],
       [   0,    0,    0, ...,  134,  260,  363]])

## Simple RNN

In [14]:
rnn = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=128,
        input_length=100,
        weights=[embedding_matrix]),
        #embeddings_regularizer = tf.keras.regularizers.L2(0.01),
        # Use masking to handle the variable sequence lengths
        #mask_zero=True),
    tf.keras.layers.SimpleRNN(64),
    tf.keras.layers.Dense(64, activation='softmax'),
    tf.keras.layers.Dense(3)
])

In [15]:
checkpoint_path = "training_1/w2v_rnn_emoji.ckpt"
rnn.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2dfb1bb2430>

## Basic LSTM

In [16]:
lstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=128,
        input_length=100,
        weights=[embedding_matrix]),
        #embeddings_regularizer = tf.keras.regularizers.L2(0.01),
        # Use masking to handle the variable sequence lengths
        #mask_zero=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='softmax'),
    tf.keras.layers.Dense(3)
])

In [17]:
checkpoint_path = "training_1/w2v_basic_lstm_emoji.ckpt"
lstm.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2dfb1bbb0d0>

## One Layer biLSTM

In [18]:
REGULARIZER = tf.keras.regularizers.L2(0.01)

bilstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=128,
        input_length=100,
        weights=[embedding_matrix],
        embeddings_regularizer=REGULARIZER),
        #embeddings_regularizer = tf.keras.regularizers.L2(0.01),
        # Use masking to handle the variable sequence lengths
        #mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='softmax'),
    tf.keras.layers.Dense(3)
])

In [19]:
checkpoint_path = "training_1/w2v_one_layer_bilstm_emoji.ckpt"
bilstm.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2dfb19fbee0>

## Two Layer biLSTM

In [20]:
REGULARIZER = tf.keras.regularizers.L2(0.01)

two_bilstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=128,
        input_length=100,
        weights=[embedding_matrix],
        embeddings_regularizer=REGULARIZER),
        #embeddings_regularizer = tf.keras.regularizers.L2(0.01),
        # Use masking to handle the variable sequence lengths
        #mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='softmax'),
    tf.keras.layers.Dense(3)
])

In [21]:
checkpoint_path = "training_1/w2v_two_layer_bilstm_emoji.ckpt"
two_bilstm.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2dfb1462880>

## Two Layer biLSTM + dropout 0.2

In [22]:
REGULARIZER = tf.keras.regularizers.L2(0.01)

two_bilstm_02 = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=128,
        input_length=100,
        weights=[embedding_matrix],
        embeddings_regularizer=REGULARIZER),
        #embeddings_regularizer = tf.keras.regularizers.L2(0.01),
        # Use masking to handle the variable sequence lengths
        #mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='softmax'),
    tf.keras.layers.Dense(3)
])

In [23]:
checkpoint_path = "training_1/w2v_two_layer_bilstm_02_emoji.ckpt"
two_bilstm_02.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2dfb5f049d0>

## Batch Normalization

In [24]:
REGULARIZER = tf.keras.regularizers.L2(0.01)

two_bilstm_02_batch = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=128,
        input_length=100,
        weights=[embedding_matrix],
        embeddings_regularizer=REGULARIZER),
        #embeddings_regularizer = tf.keras.regularizers.L2(0.01),
        # Use masking to handle the variable sequence lengths
        #mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(32, activation='softmax'),
    tf.keras.layers.Dense(3)
])

In [25]:
checkpoint_path = "training_1/w2v_two_layer_bilstm_02_batch_emoji.ckpt"
two_bilstm_02_batch.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2dfb5a5fac0>

In [26]:
sample_text = 'relax malaysia lagi ribu peoples not infected 😂'
sequence_sample = tokenizer.texts_to_sequences(sample_text.split())
sampletitle = pad_sequences(sequence_sample, maxlen=100)

predictions = rnn.predict(sampletitle)
print("rnn: ")
print(predictions[0])

predictions = lstm.predict(sampletitle)
print("lstm: ")
print(predictions[0])

predictions = bilstm.predict(sampletitle)
print("one layer bilstm: ")
print(predictions[0])

predictions = two_bilstm.predict(sampletitle)
print("two layer bilstm: ")
print(predictions[0])

predictions = two_bilstm_02.predict(sampletitle)
print("two layer bilstm with dropout 0.2: ")
print(predictions[0])

predictions = two_bilstm_02_batch.predict(sampletitle)
print("two layer bilstm with dropout 0.2 and batch normalization: ")
print(predictions[0])

rnn: 
[ 0.47566348  0.3869     -0.8826785 ]
lstm: 
[ 0.87411785 -0.5450326  -0.9456332 ]
one layer bilstm: 
[ 0.04367313  0.3583495  -0.8551684 ]
two layer bilstm: 
[ 0.5064722  -0.08338718 -0.8908354 ]
two layer bilstm with dropout 0.2: 
[-0.29552808  0.9070826  -1.1110216 ]
two layer bilstm with dropout 0.2 and batch normalization: 
[ 0.77500546  0.5346924  -1.7100892 ]


In [27]:
sample_text = 'relax malaysia lagi ribu peoples not infected'
sequence_sample = tokenizer.texts_to_sequences(sample_text)
sampletitle = pad_sequences(sequence_sample, maxlen=100)

predictions = rnn.predict([sampletitle])
print("rnn: ")
print(predictions[0])

predictions = lstm.predict([sampletitle])
print("lstm: ")
print(predictions[0])

predictions = bilstm.predict([sampletitle])
print("one layer bilstm: ")
print(predictions[0])

predictions = two_bilstm.predict([sampletitle])
print("two layer bilstm: ")
print(predictions[0])


predictions = two_bilstm_02.predict([sampletitle])
print("two layer bilstm with dropout 0.2: ")
print(predictions[0])

predictions = two_bilstm_02_batch.predict([sampletitle])
print("two layer bilstm with dropout 0.2 and batch normalization: ")
print(predictions[0])

rnn: 
[ 0.40022454  0.3815778  -0.7915553 ]
lstm: 
[ 0.09695607  0.01887295 -0.540655  ]
one layer bilstm: 
[ 0.06305781  0.34132117 -0.8631671 ]
two layer bilstm: 
[ 0.41761106 -0.08941838 -0.79969746]
two layer bilstm with dropout 0.2: 
[-0.07939219  0.8171775  -1.2661232 ]
two layer bilstm with dropout 0.2 and batch normalization: 
[ 0.06245443  1.1072996  -1.0782496 ]


In [28]:
# serialize model to JSON
model_json = two_bilstm_02_batch.to_json()
with open("w2v_two_layer_bilstm_02_batch_emoji.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
two_bilstm_02_batch.save_weights("w2v_two_layer_bilstm_02_batch_emoji.h5")
print("Saved model to disk")

Saved model to disk


In [29]:
from keras.models import load_model
from keras.models import model_from_json

In [30]:
json_file = "w2v_two_layer_bilstm_02_batch_emoji.json"
model_file = "w2v_two_layer_bilstm_02_batch_emoji.h5"

In [31]:
# load json and create model
json_file = open("w2v_two_layer_bilstm_02_batch_emoji.json", 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights(model_file)
print("Loaded model from disk")

Loaded model from disk


In [32]:
sample_text = 'relax malaysia lagi ribu peoples not infected 😂'
sequence_sample = tokenizer.texts_to_sequences(sample_text.split())
sampletitle = pad_sequences(sequence_sample, maxlen=100)

predictions = loaded_model.predict([sampletitle])
print("two layer bilstm with dropout 0.2 and batch normalization: ")
print(predictions[0])

two layer bilstm with dropout 0.2 and batch normalization: 
[ 0.77500546  0.5346924  -1.7100892 ]


In [33]:
import pickle

In [34]:
# saving
with open('word2vec_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [35]:
# loading
with open('word2vec_tokenizer.pickle', 'rb') as handle:
    tokenizer_loaded = pickle.load(handle)

In [89]:
sample_text = 'relax malaysia lagi ribu peoples not infected 😂'
sequence_sample = tokenizer_loaded.texts_to_sequences(sample_text.split())
sampletitle = pad_sequences(sequence_sample, maxlen=100)

predictions = loaded_model.predict([sampletitle])
print("two layer bilstm with dropout 0.2 and batch normalization: ")
print(predictions[0])

two layer bilstm with dropout 0.2 and batch normalization: 
[ 0.77500546  0.5346924  -1.7100892 ]


In [37]:
predictions[0].argmax()

0

In [42]:
import lime
import lime.lime_tabular
from lime.lime_text import LimeTextExplainer

In [43]:
classes = np.unique(df.sentiment)
explainer = LimeTextExplainer(class_names=classes)

In [101]:
vector_store = model
def word2vec_pipeline(examples):
    global vector_store
    #tokenizer = RegexpTokenizer(r'\w+')
    #tokenized_list = []
    sequence_sample = tokenizer.texts_to_sequences(examples[0].split())
    sampletitle = pad_sequences(sequence_sample, maxlen=100)
    #return loaded_model.predict(sampletitle)

In [102]:
exp = explainer.explain_instance("hi", word2vec_pipeline, num_features=6)

TypeError: 'NoneType' object is not subscriptable