# Mount Google Drive to Access the Dataset
## Skip this section when running locally

In [1]:
# Colab only: mount Google Drive at /content/gdrive so the files stored there
# can be read. Running this cell asks for an interactive authorization code.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [2]:
cd /content/gdrive/My Drive/Project/Media

/content/gdrive/My Drive/Project/Media


In [3]:
ls ./Model

content_seg.txt         Sntlst.txt      word_vector_Bi_LSTM.h5
med250.model.bin        Sntlst_v2.txt   word_vector.h5
sgns.weibo.bigram-char  Sntlst_v3.txt   word_vector_LSTM.h5
Sntlst_Rumor_v4.txt     textdata.npz
Sntlst_Truth_v4.txt     wordMatrix.npz


# LSTM Text Classifier with Keras

In [4]:
# -*- coding: utf-8 -*-
from tensorboardcolab import TensorBoardColab, TensorBoardColabCallback
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction import text
from sklearn import decomposition, ensemble
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

from keras import backend as K
from keras import layers
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import LSTM, Bidirectional, Embedding, GlobalMaxPool1D
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import TensorBoard
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

import pandas as pd
import numpy as np

Using TensorFlow backend.


In [0]:
# Show the GPU devices visible to the Keras TensorFlow backend.
# NOTE(review): _get_available_gpus is an underscore-prefixed (private) helper,
# so it may not exist in other Keras versions — confirm before relying on it.
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [0]:
# Print every device TensorFlow reports for this runtime, to check that a GPU is present.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 3597723958571837644
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 12724338663039714448
physical_device_desc: "device: XLA_CPU device"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 17800070503189039235
physical_device_desc: "device: XLA_GPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14800692839
locality {
  bus_id: 1
  links {
  }
}
incarnation: 14899410019309818766
physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"
]


In [0]:

!nvidia-smi

Sun May 26 07:39:20 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 410.79       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P0    31W /  70W |    221MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

### Hyperparameters



In [0]:
MAX_SEQUENCE_LENGTH = 128 # length every tokenized weibo is padded/truncated to (maxlen of pad_sequences)
EMBEDDING_DIM = 300 # dimensions of word embedding; must match the vectors used for the embedding matrix
# dim of the self-trained w2v model = 100
# dim of the weibo-trained w2v model (out_model) = 300

VAL_SPLIT = 0.36 # fraction of the data held out for validation (passed as test_size to train_test_split)

In [0]:
# Load Word2Vec Model & Word Embedding Matrix

# gensim Word2Vec model stored in gensim's own save format.
# NOTE(review): presumably the self-trained 100-dim model mentioned in the
# hyperparameter comments — confirm; it is not used by the cells shown here.
w2v_model = Word2Vec.load("Model/med250.model.bin")

# Pretrained vectors in word2vec format (load_word2vec_format defaults to the
# text variant); these are the vectors used to fill the embedding matrix.
out_model=KeyedVectors.load_word2vec_format('Model/sgns.weibo.bigram-char') 

# Keras tokenizer; it is fitted on the corpus in a later cell.
tokenizer = Tokenizer()

# # Get texts of aimed file
# def getTexts():
#     Texts = []
#     with open("Model/Sntlst_v3.txt", "r", encoding='utf8') as f:
#         lines = f.readlines()
#         for line in lines:
#             Texts.append(line.strip())
#     return Texts
# all_texts = getTexts()

# # Make texts to sequences
# tokenizer.fit_on_texts(all_texts)
# sequences = tokenizer.texts_to_sequences(all_texts)
# word_index = tokenizer.word_index
# print('Found %s unique tokens.' % len(word_index))
# data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# print('Shape of data tensor:', data.shape)

In [0]:
def getTexts(filename):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Leading and trailing whitespace (including the newline) is stripped from
    every line. Blank lines are kept as empty strings, so the returned list has
    exactly one entry per line of the file.
    """
    with open(filename, "r", encoding='utf8') as handle:
        return [raw_line.strip() for raw_line in handle]
# One text per line: the "Truth" file and the "Rumor" file.
T_texts = getTexts("./Model/Sntlst_Truth_v4.txt")
R_texts = getTexts("./Model/Sntlst_Rumor_v4.txt")

# Single corpus with the truth texts first and the rumor texts after them.
# The label vector built later relies on exactly this order.
all_texts = np.append(T_texts, R_texts)

In [0]:
# Fit the tokenizer on the combined corpus (truth + rumor) and build the vocabulary
tokenizer.fit_on_texts(all_texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode every text as a sequence of word ids, then pad/truncate to a fixed length
sequences = tokenizer.texts_to_sequences(all_texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)


Found 70502 unique tokens.
Shape of data tensor: (15360, 128)


In [0]:
# Build the labels and split into train / validation data.
# Label 0 = truth, label 1 = rumor. The counts come from the loaded corpora
# instead of hard-coded numbers, so the labels stay aligned with `data` when
# the text files change. all_texts was assembled as truth texts followed by
# rumor texts, and the label vector follows the same order.
n_truth, n_rumor = len(T_texts), len(R_texts)
y = np.concatenate([np.zeros(n_truth, dtype=int), np.ones(n_rumor, dtype=int)])
assert len(y) == len(data), "number of labels must match number of padded sequences"

# num_classes is fixed so the one-hot matrix always has two columns,
# matching the 2-unit softmax output of the models.
labels = to_categorical(y, num_classes=2)

X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=VAL_SPLIT, random_state=40)
print(X_train.shape, X_val.shape)

(9830, 128) (5530, 128)


In [0]:
# Build the embedding matrix from the pretrained vectors.
# Row i holds the vector of the word whose tokenizer index is i; row 0 and the
# rows of words missing from the pretrained model stay all-zero.
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))
for token, row in word_index.items():
    if token not in out_model:
        continue
    embedding_matrix[row] = np.asarray(out_model[token], dtype='float32')
print(embedding_matrix.shape)

(70985, 300)


In [0]:
# Persist the padded sequences (key "x") and the one-hot labels (key "y") to an
# .npz archive; the relative name resolves against the current working directory.
datafile = "textdata.npz"
np.savez(datafile, x=data, y=labels)

In [0]:
# class LSTM_MODEL(layers):

#     def __init__(self, output_dim, **kwargs):
#         self.output_dim = output_dim
#         super(LSTM_MODEL, self).__init__(**kwargs)

#     def build(self, input_shape):
#         # Create a trainable weight variable for this layer.
#         self.embedding_layer = Embedding(input_dim = len(word_index) + 1,
#                                 output_dim = EMBEDDING_DIM,
#                                 weights=[embedding_matrix],
#                                 input_length=MAX_SEQUENCE_LENGTH,
#                                 trainable=False)
        
#         super(LSTM_MODEL, self).build(input_shape)  # Be sure to call this at the end

#     def call(self, x):
#         return K.dot(x, self.kernel)

#     def compute_output_shape(self, input_shape):
#         return (input_shape[0], self.output_dim)

# LSTM Model

In [0]:
# LSTM Model

def LSTM_RNN(data, labels, epochs=50, batch_size=128, val_split=None, random_state=40):
    """Train a single-layer LSTM classifier on top of frozen pretrained embeddings.

    The function reads the notebook-level names ``word_index``,
    ``embedding_matrix``, ``EMBEDDING_DIM``, ``MAX_SEQUENCE_LENGTH`` and
    ``VAL_SPLIT``; they must be defined before it is called.

    Args:
        data: padded token-id sequences (the notebook passes the output of
            ``pad_sequences``).
        labels: one-hot targets aligned row-by-row with ``data``; two columns,
            matching the 2-unit softmax output layer.
        epochs: number of training epochs.
        batch_size: mini-batch size used by ``fit``.
        val_split: fraction of the samples held out for validation. ``None``
            (default) uses the notebook-level ``VAL_SPLIT`` so this split stays
            identical to the one made at notebook level.
        random_state: seed forwarded to ``train_test_split``.

    Returns:
        The fitted Keras ``Sequential`` model. It is not saved to disk.
    """
    if val_split is None:
        # Resolved at call time so a changed VAL_SPLIT is picked up without
        # re-running this definition.
        val_split = VAL_SPLIT

    X_train, X_val, y_train, y_val = train_test_split(
        data, labels, test_size=val_split, random_state=random_state)

    # Embedding initialised from the pretrained matrix and kept frozen.
    embedding_layer = Embedding(input_dim=len(word_index) + 1,
                                output_dim=EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    # Starts TensorBoard for Colab; the callback below streams the training logs to it.
    tbc = TensorBoardColab()

    model = Sequential()
    model.add(embedding_layer)
    model.add(LSTM(128, dropout=0.5, recurrent_dropout=0.2))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.summary()

    # NOTE(review): with a 2-way softmax and one-hot targets, binary_crossentropy
    # gives the same value as categorical_crossentropy, so it is kept as is.
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
              validation_data=(X_val, y_val),
              shuffle=True, callbacks=[TensorBoardColabCallback(tbc)])

    return model
    
LSTM_model = LSTM_RNN(data, labels)
# X_val is the notebook-level validation split. It equals the split made inside
# LSTM_RNN only because both use the same test_size and random_state.
y_pred = LSTM_model.predict(X_val)


# The predictions are already stored in y_pred, so the backend session can be reset.
K.clear_session()

Wait for 8 seconds...
TensorBoard link:
http://b04ef2cb.ngrok.io
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 128, 300)          21295500  
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 130       
Total params: 21,523,534
Trainable params: 228,034
Non-trainable params: 21,295,500
_________________________________________________________________
Train on 9830 samples, validate on 5530 samples
Epoch 1/50


In [0]:
K.clear_session()

In [0]:
# argmax turns the one-hot labels / softmax outputs into class ids.
# In the printed matrix, rows are the true classes and columns the predicted ones.
matrix = metrics.confusion_matrix(y_val.argmax(axis=1), y_pred.argmax(axis=1))
print(matrix) 

[[2535  236]
 [ 511 2248]]


In [0]:
# f1_score with default arguments reports the F1 of class 1 (the label given to the rumor texts);
# accuracy is computed over both classes.
f_1 = metrics.f1_score(y_val.argmax(axis=1), y_pred.argmax(axis=1))
acc = metrics.accuracy_score(y_val.argmax(axis=1), y_pred.argmax(axis=1))
print(f_1, acc)

0.8575243181384704 0.8649186256781194


In [0]:
del LSTM_model

# Bi-LSTM Model

In [0]:
# Bi-LSTM Model

def BiLSTM_RNN(data, labels, epochs=30, batch_size=128, val_split=None, random_state=40,
               save_path='Model/word_vector_Bi_LSTM.h5'):
    """Train a bidirectional-LSTM classifier on frozen pretrained embeddings and save it.

    The function reads the notebook-level names ``word_index``,
    ``embedding_matrix``, ``EMBEDDING_DIM``, ``MAX_SEQUENCE_LENGTH`` and
    ``VAL_SPLIT``; they must be defined before it is called.

    Args:
        data: padded token-id sequences (the notebook passes the output of
            ``pad_sequences``).
        labels: one-hot targets aligned row-by-row with ``data``; two columns,
            matching the 2-unit softmax output layer.
        epochs: number of training epochs.
        batch_size: mini-batch size used by ``fit``.
        val_split: fraction of the samples held out for validation. ``None``
            (default) uses the notebook-level ``VAL_SPLIT`` so this split stays
            identical to the one made at notebook level.
        random_state: seed forwarded to ``train_test_split``.
        save_path: file the trained model is written to with ``model.save``.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    if val_split is None:
        # Resolved at call time so a changed VAL_SPLIT is picked up without
        # re-running this definition.
        val_split = VAL_SPLIT

    X_train, X_val, y_train, y_val = train_test_split(
        data, labels, test_size=val_split, random_state=random_state)

    # Embedding initialised from the pretrained matrix and kept frozen.
    embedding_layer = Embedding(input_dim=len(word_index) + 1,
                                output_dim=EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    # Starts TensorBoard for Colab; the callback below streams the training logs to it.
    tbc = TensorBoardColab()

    model = Sequential()
    model.add(embedding_layer)
    # return_sequences=True keeps the per-timestep outputs so the pooling layer
    # can take the maximum over the sequence axis.
    model.add(Bidirectional(LSTM(64, return_sequences=True, dropout=0.25, recurrent_dropout=0.1)))
    model.add(GlobalMaxPool1D())
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.summary()

    # NOTE(review): with a 2-way softmax and one-hot targets, binary_crossentropy
    # gives the same value as categorical_crossentropy, so it is kept as is.
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
              validation_data=(X_val, y_val),
              shuffle=True, callbacks=[TensorBoardColabCallback(tbc)])

    model.save(save_path)

    return model
    
BiLSTM_model = BiLSTM_RNN(data, labels)
# X_val is the notebook-level validation split. It equals the split made inside
# BiLSTM_RNN only because both use the same test_size and random_state.
# y_pred is overwritten here: from now on it holds the Bi-LSTM predictions.
y_pred = BiLSTM_model.predict(X_val)


# The predictions are already stored in y_pred, so the backend session can be reset.
K.clear_session()

Wait for 8 seconds...
TensorBoard link:
https://b04ef2cb.ngrok.io
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 128, 300)          21295500  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128, 128)          186880    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 130       
Total params: 21,490,766
Trainable params: 195,266
Non-trainable params: 21,

In [0]:
# Confusion matrix for the predictions currently stored in y_pred.
# Rows are the true classes, columns the predicted ones.
matrix = metrics.confusion_matrix(y_val.argmax(axis=1), y_pred.argmax(axis=1))
print(matrix)

[[2558  213]
 [ 526 2233]]


In [0]:
# f1_score with default arguments reports the F1 of class 1 (the label given to the rumor texts);
# accuracy is computed over both classes.
f_1 = metrics.f1_score(y_val.argmax(axis=1), y_pred.argmax(axis=1))
acc = metrics.accuracy_score(y_val.argmax(axis=1), y_pred.argmax(axis=1))
print(f_1, acc)

0.8580211335254562 0.8663652802893309


## Extract Layer

In [52]:
from keras.models import load_model
# Reload the Bi-LSTM model from the .h5 file written during training.
model = load_model('Model/word_vector_Bi_LSTM.h5')
# Output tensor of the layer at index 4: per model.summary() this is the
# 64-unit ReLU Dense layer that sits just before the softmax layer.
model.layers[4].output

<tf.Tensor 'dense_1_1/Relu:0' shape=(?, 64) dtype=float32>

In [51]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 128, 300)          21295500  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128, 128)          186880    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 130       
Total params: 21,490,766
Trainable params: 195,266
Non-trainable params: 21,295,500
__________________________________________________________

In [0]:
# Keep a handle on the output tensor of layer index 4 (the 64-unit Dense layer of the loaded model).
BiLSTM_dense = model.layers[4].output