In [1]:
import pandas as pd
import numpy as np
import tensorflow_addons as tfa
import gensim
from gensim.models import Word2Vec
from keras.preprocessing.text import one_hot, Tokenizer
import tqdm
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical

import tensorflow as tf
import os

In [2]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    """Plot a training metric per epoch, plus its validation curve when recorded.

    Args:
        history: Object exposing a ``history`` mapping from metric name to a
            per-epoch sequence of values (presumably the ``History`` object
            returned by Keras ``fit`` -- confirm against the caller).
        metric: Key of the training metric to plot, e.g. ``"accuracy"``.

    Raises:
        KeyError: If ``metric`` itself is not a key of ``history.history``.
    """
    curves = history.history
    plt.plot(curves[metric])
    legend_labels = [metric]

    # The 'val_'-prefixed series can be absent (e.g. a run without validation
    # data); skip that curve and its legend entry instead of raising KeyError.
    val_key = 'val_' + metric
    if val_key in curves:
        plt.plot(curves[val_key], '')
        legend_labels.append(val_key)

    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend(legend_labels)

In [3]:
# Load the saved gensim Word2Vec model from disk; its word vectors (model.wv)
# are read later to fill the embedding matrix.
# NOTE(review): relative path -- resolves against the notebook's working directory.
model = Word2Vec.load("../w2v/no_emoji_30_epochs_week5.w2v")

In [4]:
# Test, train and validation CSVs.
# NOTE(review): read_csv already returns a DataFrame, so the pd.DataFrame(...)
# wrappers look redundant -- kept as-is to leave the existing names untouched.
test = pd.read_csv('test.csv')
df_test = pd.DataFrame(test)

train = pd.read_csv('train.csv')
df_train = pd.DataFrame(train)

val = pd.read_csv('val.csv')
df_val = pd.DataFrame(val)

# Complete dataset; `df` is used later to fit the tokenizer vocabulary.
# Bound to `all_cases` rather than `all` so that Python's built-in all() is
# not shadowed for the rest of the kernel session.
all_cases = pd.read_csv('../3 classes/facebook_health_cases (all).csv')
df = pd.DataFrame(all_cases)

In [5]:
# Keep only the text and label columns, then drop rows whose text is an empty
# string or whose text/label is missing.
df = df[['check_stop', 'sentiment']]

print("before cleaned: ", df.shape)
# Build the cleaned frame explicitly instead of calling
# df['check_stop'].replace(..., inplace=True): that form mutates a temporary
# Series, which triggers chained-assignment warnings and leaves `df` unchanged
# when pandas Copy-on-Write is active.
df = (
    df.assign(check_stop=df['check_stop'].replace('', np.nan))
      .dropna()
      .reset_index(drop=True)
)
print("After: ", df.shape)

before cleaned:  (7907, 2)
After:  (7905, 2)


In [6]:
# Split every cleaned text on whitespace, giving one token list per row.
tokenize_text = df['check_stop'].apply(lambda text: text.split())

In [7]:
# Fit the Keras Tokenizer on the pre-split token lists; tokenizer.word_index is
# used afterwards to size and fill the embedding matrix.
# NOTE(review): the vocabulary is fitted on df['check_stop'], while the
# train/val/test texts encoded later come from 'check_stop_no_emoji' columns --
# confirm both share the same vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenize_text)

In [8]:
# Build the word-vector (embedding) matrix from the new word2vec model.
# Row i holds the vector of the word whose tokenizer index is i; rows of words
# the word2vec model does not know stay all-zero and are counted in error_count.
vocab_size = len(tokenizer.word_index)
error_count = 0
embedding_matrix = np.zeros((vocab_size + 1, 128))
for word, i in tokenizer.word_index.items():
    if word not in model.wv:
        error_count += 1
        continue
    embedding_matrix[i] = model.wv[word]

In [9]:
# Number of tokenizer vocabulary words that had no vector in the word2vec model.
error_count

0

In [10]:
# Display the assembled embedding matrix as a sanity check.
embedding_matrix

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.62872031e-01, -4.18042764e-04, -4.52394694e-01, ...,
         8.39686692e-02,  2.78790563e-01,  5.80404282e-01],
       [-6.51707351e-01,  3.76401782e-01, -4.18396175e-01, ...,
         1.41281709e-01,  3.50653052e-01,  9.47910190e-01],
       ...,
       [-1.18422797e-02,  3.49949636e-02,  1.94209423e-02, ...,
         8.95177014e-03,  7.48859392e-03,  2.72510685e-02],
       [-5.66299306e-03,  1.42519949e-02,  1.16926571e-02, ...,
         1.17878104e-02,  8.84990022e-03,  1.76386032e-02],
       [-3.11119203e-03,  1.40225505e-02,  1.50357643e-02, ...,
         7.57448794e-03,  3.70327896e-03,  1.97595917e-02]])

In [11]:
# Encode the sentiment labels as integer codes, then one-hot encode them.
#
# The category list is inferred ONCE from all three splits together and reused
# for each split. Inferring it per split (as before) gives a class a different
# code -- and the one-hot arrays different widths -- whenever a split happens
# to lack one of the classes.
# NOTE(review): missing labels get code -1 from pandas; verify none are present.
sentiment_classes = pd.Categorical(
    pd.concat(
        [df_train['sentiment'], df_val['sentiment'], df_test['sentiment']],
        ignore_index=True,
    )
).categories

df_test['sentiment'] = pd.Categorical(df_test['sentiment'], categories=sentiment_classes).codes

df_train['sentiment'] = pd.Categorical(df_train['sentiment'], categories=sentiment_classes).codes

df_val['sentiment'] = pd.Categorical(df_val['sentiment'], categories=sentiment_classes).codes


y_test = df_test['sentiment']
y_train = df_train['sentiment']
y_val = df_val['sentiment']

# Fix the one-hot width explicitly so every split yields the same number of columns.
num_classes = len(sentiment_classes)
y_train = to_categorical(y_train, num_classes=num_classes)
y_val = to_categorical(y_val, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

In [12]:
# Whitespace-tokenise the text column of each split (train, test, val order).
tokenize_train, tokenize_test, tokenize_val = (
    frame['check_stop_no_emoji'].apply(lambda text: text.split())
    for frame in (df_train, df_test, df_val)
)

# Map tokens to tokenizer indices and bring every sequence to length 100 (maxlen).
traintitle = pad_sequences(tokenizer.texts_to_sequences(tokenize_train), maxlen=100)
valtitle = pad_sequences(tokenizer.texts_to_sequences(tokenize_val), maxlen=100)
# `sequence` is left holding the test-split sequences, as before.
sequence = tokenizer.texts_to_sequences(tokenize_test)
testtitle = pad_sequences(sequence, maxlen=100)

## Simple RNN

In [13]:
# Baseline classifier: Embedding -> SimpleRNN(64) -> Dense(64) -> Dense(3).
rnn = tf.keras.Sequential()

# One embedding row per tokenizer index plus row 0, initialised from the
# word2vec-derived `embedding_matrix`. `mask_zero` is not set on this layer.
rnn.add(tf.keras.layers.Embedding(
    input_dim=len(tokenizer.word_index) + 1,
    output_dim=128,
    input_length=100,
    weights=[embedding_matrix],
))
rnn.add(tf.keras.layers.SimpleRNN(64))
# NOTE(review): softmax on a hidden layer is unusual -- confirm it is intended.
# Left unchanged because the saved checkpoint loaded for this model presumably
# was trained with it.
rnn.add(tf.keras.layers.Dense(64, activation='softmax'))
# Three output units, no activation specified.
rnn.add(tf.keras.layers.Dense(3))

In [19]:
# Restore the SimpleRNN model's weights from its saved checkpoint.
# NOTE(review): relative path -- resolves against the notebook's working directory.
checkpoint_path = "training_1/w2v_rnn_no_emoji.ckpt"
rnn.load_weights(checkpoint_path)

Test Loss: 0.7317108511924744
Test Accuracy: 0.7205387353897095
Micro F1-Score: 0.7205387353897095
Macro F1-Score: 0.6495874524116516


## Simple LSTM

In [20]:
# Single-direction LSTM classifier: Embedding -> LSTM(64) -> Dense(64) -> Dense(3).
lstm = tf.keras.Sequential()

# One embedding row per tokenizer index plus row 0, initialised from the
# word2vec-derived `embedding_matrix`. `mask_zero` is not set on this layer.
lstm.add(tf.keras.layers.Embedding(
    input_dim=len(tokenizer.word_index) + 1,
    output_dim=128,
    input_length=100,
    weights=[embedding_matrix],
))
lstm.add(tf.keras.layers.LSTM(64))
# NOTE(review): softmax on a hidden layer is unusual -- confirm it is intended.
# Left unchanged because the saved checkpoint loaded for this model presumably
# was trained with it.
lstm.add(tf.keras.layers.Dense(64, activation='softmax'))
# Three output units, no activation specified.
lstm.add(tf.keras.layers.Dense(3))

In [27]:
# Restore the simple LSTM model's weights from its saved checkpoint.
# NOTE(review): relative path -- resolves against the notebook's working directory.
checkpoint_path = "training_1/w2v_simple_lstm_no_emoji.ckpt"
lstm.load_weights(checkpoint_path)

Test Loss: 0.7836313247680664
Test Accuracy: 0.7188552021980286
Micro F1-Score: 0.7188551425933838
Macro F1-Score: 0.6511731147766113


## One Layer biLSTM

In [28]:
# L2 penalty (factor 0.01) applied to the embedding weights of this model.
REGULARIZER = tf.keras.regularizers.L2(0.01)

# One bidirectional LSTM layer:
# Embedding -> Bidirectional(LSTM(64)) -> Dense(64) -> Dense(3).
bilstm = tf.keras.Sequential()

# One embedding row per tokenizer index plus row 0, initialised from the
# word2vec-derived `embedding_matrix`. `mask_zero` is not set on this layer.
bilstm.add(tf.keras.layers.Embedding(
    input_dim=len(tokenizer.word_index) + 1,
    output_dim=128,
    input_length=100,
    weights=[embedding_matrix],
    embeddings_regularizer=REGULARIZER,
))
bilstm.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
# NOTE(review): softmax on a hidden layer is unusual -- confirm it is intended.
# Left unchanged because the saved checkpoint loaded for this model presumably
# was trained with it.
bilstm.add(tf.keras.layers.Dense(64, activation='softmax'))
# Three output units, no activation specified.
bilstm.add(tf.keras.layers.Dense(3))

In [35]:
# Restore the one-layer biLSTM model's weights from its saved checkpoint.
# NOTE(review): relative path -- resolves against the notebook's working directory.
checkpoint_path = "training_1/w2v_one_layer_bilstm_no_emoji.ckpt"
bilstm.load_weights(checkpoint_path)

Test Loss: 0.8160718679428101
Test Accuracy: 0.7491582632064819
Micro F1-Score: 0.7491582632064819
Macro F1-Score: 0.6462236046791077


## Two Layer biLSTM 

In [36]:
# L2 penalty (factor 0.01) applied to the embedding weights of this model.
REGULARIZER = tf.keras.regularizers.L2(0.01)

# Two stacked bidirectional LSTM layers:
# Embedding -> Bidirectional(LSTM(64)) -> Bidirectional(LSTM(32)) -> Dense(64) -> Dense(3).
two_bilstm = tf.keras.Sequential()

# One embedding row per tokenizer index plus row 0, initialised from the
# word2vec-derived `embedding_matrix`. `mask_zero` is not set on this layer.
two_bilstm.add(tf.keras.layers.Embedding(
    input_dim=len(tokenizer.word_index) + 1,
    output_dim=128,
    input_length=100,
    weights=[embedding_matrix],
    embeddings_regularizer=REGULARIZER,
))
# return_sequences=True on the first recurrent layer, which feeds the second one.
two_bilstm.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
two_bilstm.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
# NOTE(review): softmax on a hidden layer is unusual -- confirm it is intended.
# Left unchanged because the saved checkpoint loaded for this model presumably
# was trained with it.
two_bilstm.add(tf.keras.layers.Dense(64, activation='softmax'))
# Three output units, no activation specified.
two_bilstm.add(tf.keras.layers.Dense(3))

In [44]:
# Restore the two-layer biLSTM model's weights from its saved checkpoint.
# NOTE(review): relative path -- resolves against the notebook's working directory.
checkpoint_path = "training_1/w2v_two_layer_bilstm_no_emoji.ckpt"
two_bilstm.load_weights(checkpoint_path)

Test Loss: 0.801200270652771
Test Accuracy: 0.747474730014801
Micro F1-Score: 0.7474746704101562
Macro F1-Score: 0.6628547310829163


## Two Layer biLSTM + dropout 0.2

In [13]:
# L2 penalty (factor 0.01) applied to the embedding weights of this model.
REGULARIZER = tf.keras.regularizers.L2(0.01)

# Two stacked bidirectional LSTM layers followed by Dropout(0.2):
# Embedding -> Bidirectional(LSTM(64)) -> Bidirectional(LSTM(32)) -> Dropout -> Dense(64) -> Dense(3).
two_bilstm_02 = tf.keras.Sequential()

# One embedding row per tokenizer index plus row 0, initialised from the
# word2vec-derived `embedding_matrix`. `mask_zero` is not set on this layer.
two_bilstm_02.add(tf.keras.layers.Embedding(
    input_dim=len(tokenizer.word_index) + 1,
    output_dim=128,
    input_length=100,
    weights=[embedding_matrix],
    embeddings_regularizer=REGULARIZER,
))
# return_sequences=True on the first recurrent layer, which feeds the second one.
two_bilstm_02.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
two_bilstm_02.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
two_bilstm_02.add(tf.keras.layers.Dropout(0.2))
# NOTE(review): softmax on a hidden layer is unusual -- confirm it is intended.
# Left unchanged because the saved checkpoint loaded for this model presumably
# was trained with it.
two_bilstm_02.add(tf.keras.layers.Dense(64, activation='softmax'))
# Three output units, no activation specified.
two_bilstm_02.add(tf.keras.layers.Dense(3))

In [20]:
# Restore the two-layer biLSTM + dropout model's weights from its saved checkpoint.
# NOTE(review): relative path -- resolves against the notebook's working directory.
checkpoint_path = "training_1/w2v_two_layer_bilstm_02_no_emoji.ckpt"
two_bilstm_02.load_weights(checkpoint_path)

Test Loss: 0.7869081497192383
Test Accuracy: 0.752525269985199
Micro F1-Score: 0.7525252103805542
Macro F1-Score: 0.665414571762085


## Two Layer biLSTM + dropout 0.2 + Batch Normalization

In [21]:
# L2 penalty (factor 0.01) applied to the embedding weights of this model.
REGULARIZER = tf.keras.regularizers.L2(0.01)

# Two stacked bidirectional LSTM layers, then Dropout(0.2) and BatchNormalization:
# Embedding -> Bidirectional(LSTM(64)) -> Bidirectional(LSTM(32)) -> Dropout
#           -> BatchNormalization -> Dense(64) -> Dense(3).
two_bilstm_02_batch = tf.keras.Sequential()

# One embedding row per tokenizer index plus row 0, initialised from the
# word2vec-derived `embedding_matrix`. `mask_zero` is not set on this layer.
two_bilstm_02_batch.add(tf.keras.layers.Embedding(
    input_dim=len(tokenizer.word_index) + 1,
    output_dim=128,
    input_length=100,
    weights=[embedding_matrix],
    embeddings_regularizer=REGULARIZER,
))
# return_sequences=True on the first recurrent layer, which feeds the second one.
two_bilstm_02_batch.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
two_bilstm_02_batch.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
two_bilstm_02_batch.add(tf.keras.layers.Dropout(0.2))
two_bilstm_02_batch.add(tf.keras.layers.BatchNormalization())
# NOTE(review): softmax on a hidden layer is unusual -- confirm it is intended.
# Left unchanged because the saved checkpoint loaded for this model presumably
# was trained with it.
two_bilstm_02_batch.add(tf.keras.layers.Dense(64, activation='softmax'))
# Three output units, no activation specified.
two_bilstm_02_batch.add(tf.keras.layers.Dense(3))

In [28]:
# Restore the two-layer biLSTM + dropout + batch-norm model's weights from its saved checkpoint.
# NOTE(review): relative path -- resolves against the notebook's working directory.
checkpoint_path = "training_1/w2v_two_layer_bilstm_02_batch_no_emoji.ckpt"
two_bilstm_02_batch.load_weights(checkpoint_path)

Test Loss: 0.8832367658615112
Test Accuracy: 0.7356902360916138
Micro F1-Score: 0.735690176486969
Macro F1-Score: 0.670479953289032
