In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

### Dataset

Download the Kaggle dataset: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset/download?datasetVersionNumber=1  
Download the LIAR dataset: https://www.cs.ucsb.edu/~william/data/liar_dataset.zip

In [2]:
# Loaders for the two supported corpora come from the project-local
# `load_dataset` module.
from load_dataset import kaggle_dataset, liar_dataset

kaggle_path = "./dataset/Fake-News/"
liar_path = "./dataset/Liar/"

# Select a corpus by leaving exactly one `dataset = ...` line active.
# Kaggle fake/real news:
# dataset = kaggle_dataset(kaggle_path)
# LIAR:
dataset = liar_dataset(liar_path)

# Expose the train/validation splits under the names the later cells use.
train_sentences, val_sentences = dataset.train_sentences, dataset.val_sentences
train_labels, val_labels = dataset.train_labels, dataset.val_labels


### Model

#### Embedding: nnlm

In [14]:
# NNLM sentence embedding: text embeddings based on feed-forward
# Neural-Net Language Models, loaded pre-trained from TensorFlow Hub.
# Other text-embedding modules: https://tfhub.dev/s?module-type=text-embedding
import tensorflow_hub as hub

embedding = "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2"

# NOTE(review): the layer is only called directly below (it is not placed
# inside a model in this cell), so `trainable=True` looks like it has no
# effect here — confirm before relying on fine-tuning.
hub_layer = hub.KerasLayer(
    embedding,
    input_shape=[],
    dtype=tf.string,
    trainable=True,
)

# Embed each split in one call; the results are the `train_x` / `val_x`
# inputs consumed by the following cells.
train_x = hub_layer(train_sentences)
val_x = hub_layer(val_sentences)

In [26]:
# Run this cell ONLY when the network is the CNN or the LSTM
# (presumably because those models expect a 3-D input — confirm in model.py).
# Each flat embedding row of length dim1*dim2 is folded into a grid:
#   (len, dim1*dim2) -> (len, dim1, dim2)
dim1, dim2 = 32, 4
print(train_x.shape)


def _fold_rows(flat):
    """Reshape `flat` from (rows, dim1*dim2) to (rows, dim1, dim2)."""
    return np.reshape(flat, (flat.shape[0], dim1, dim2))


train_x = _fold_rows(train_x)
val_x = _fold_rows(val_x)
print(train_x.shape)

(10240, 128)
(10240, 32, 4)


#### Embedding: bert

In [None]:
# BERT: Bidirectional Encoder Representations from Transformers.
# Warning: this cell pushes every training sentence through the encoder in a
# single call and can easily run out of memory.
import tensorflow_hub as hub
# Not referenced by name below.
# NOTE(review): presumably imported for the custom ops the preprocessing
# model needs — confirm against the TF Hub model page.
import tensorflow_text

BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

text_input = train_sentences

# Step 1: turn raw text into the inputs the encoder consumes.
preprocessor = hub.KerasLayer(BERT_PREPROCESS_URL)
encoder_inputs = preprocessor(text_input)

# Step 2: run the encoder on the preprocessed inputs.
encoder = hub.KerasLayer(BERT_ENCODER_URL, trainable=True)
outputs = encoder(encoder_inputs)

# One vector per sentence: [batch_size, 768].
pooled_output = outputs["pooled_output"]
# One vector per token: [batch_size, seq_length, 768].
sequence_output = outputs["sequence_output"]

In [2]:
# Continue with only a small, pre-computed batch of BERT embeddings.
# You can get that data from our drive.
bert_batch_path = "./embedding model/bert/"


def _load_batch(filename):
    """Load one saved array from the `bert_batch_path` directory."""
    return np.load(bert_batch_path + filename)


# Which arrays to load depends on the network:
#   pooled_output   -> dense model
#   sequence_output -> CNN & LSTM models
# train_x = np.load(bert_batch_path+"bert_train_pooled_0.npy")
# val_x = np.load(bert_batch_path+"bert_train_pooled_1.npy")

# Batch 0 is used for training and batch 1 for validation; note that both
# files are named bert_train_*.
train_x = _load_batch("bert_train_seq_0.npy")
val_x = _load_batch("bert_train_seq_1.npy")
train_labels = _load_batch("bert_train_label_0.npy")
val_labels = _load_batch("bert_train_label_1.npy")

#### Network

In [3]:
# Hyper-parameters for the candidate networks. Only LSTM_units is used in this
# cell; cov_filters, cov_kernel and pool_size are not referenced here.
# NOTE(review): presumably they belong to a CNN variant in `model` — confirm.
cov_filters = 5
cov_kernel = 64
pool_size = 4
LSTM_units = 100

# Pick a model (imported from the project-local `model` module).
from model import lstm

# Build the network with the chosen number of LSTM units.
model = lstm(LSTM_units)
# compile() and fit() receive no optimizer/loss/epoch arguments here, so those
# settings come from whatever defaults `lstm` defines — TODO confirm in model.py.
model.compile()
# Train on the current train_x/train_labels and validate on val_x/val_labels;
# the return value of fit() is kept in model_history.
model_history = model.fit(train_x, train_labels, validation_data=(val_x, val_labels))
model.summary()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dropout (Dropout)           (32, 128, 768)            0         
                                                                 
 lstm (LSTM)                 (32, 100)                 347600    
                                                                 
 dropout_1 (Dropout)         (32, 100)                 0         
                                                                 
 dense (Dense)               (32, 1)                   101       
                                                                 
Total params: 347,701
Trainable params: 347,701
Non-trainable params: 0
_________________________________________________________________


### Attack (TBC)