In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np

In [3]:
# Hardware sanity check: show every compute device TensorFlow has registered
# so it is obvious whether a GPU is available before training starts.
from tensorflow.python.client import device_lib

available_devices = device_lib.list_local_devices()
print(available_devices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 3768376842654507496
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 1549271040
locality {
  bus_id: 1
  links {
  }
}
incarnation: 15813315496928167029
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6"
]


In [4]:
# Shared preprocessing / training settings used by the cells that follow.
# MAX_VNUM = 20000  # disabled; presumably a vocabulary-size cap — TODO confirm or delete
SEQUENCE_LENGTH = 300  # number of tokens every sample is padded / truncated to
EMBED_DIMENSION = 100  # width of one embedding vector (the GloVe file loaded later is the 100d one)
BATCH_SIZE = 64  # mini-batch size handed to model.fit

In [5]:
# Disabled alternative data source: reads the FA-KES CSV, prepends each article's
# title to its content, and makes an 80/20 split. The ISOT cell that follows is the
# loader currently in use.
# column_names = ["article_content", "labels", "article_title"]
# data = pd.read_csv("../FAKES_Dataset/FA-KES-Dataset.csv", encoding='utf-8', encoding_errors='ignore')[column_names]
# data["article_content"] = data["article_title"] + " " + data["article_content"]
# train, test = train_test_split(data, test_size=0.2)

In [6]:
# Load the ISOT dataset, keeping only the three columns named below.
# Ordering matters: later cells use column_names[0] as the model input text
# and column_names[1] as the target label.
column_names = ["text", "labels", "title"]
data = pd.read_csv("../ISOT_Dataset/ISOT_Dataset.csv", encoding='utf-8', encoding_errors='ignore')[column_names]
# data["text"] = data["title"] + " " + data["text"]
# Hold out 20% of the rows for testing. The fixed random_state makes the shuffle
# deterministic, so the same rows land in the test set after every kernel restart
# and the reported test metrics stay comparable between runs.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [7]:
# Build the word -> integer index from the input-text column of the full dataset.
token = keras.preprocessing.text.Tokenizer()
text_column = column_names[0]
token.fit_on_texts(data[text_column])
# Number of rows to allocate in the embedding table: distinct word count plus 2.
vocab_size = 2 + len(token.word_index)

In [8]:
# Parse the pre-trained GloVe vectors into a {word: vector} lookup.
embedding_vector = {}
# The context manager guarantees the file handle is closed once parsing ends
# (or fails), and the explicit UTF-8 encoding removes the dependency on the
# platform's default text encoding.
with open('../data/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        # Each line is "<word> <coef> <coef> ..."; split off the word once and
        # let NumPy parse the remaining space-separated floats.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embedding_vector[word] = coefs

# Row i receives the GloVe vector of the word the tokenizer mapped to index i.
# Words that GloVe does not contain keep their all-zero row.
embedding_matrix = np.zeros((vocab_size, EMBED_DIMENSION))
for word, i in tqdm(token.word_index.items()):
    embedding_value = embedding_vector.get(word)
    if embedding_value is not None:
        embedding_matrix[i] = embedding_value

400000it [00:08, 44455.27it/s]
100%|██████████| 138021/138021 [00:00<00:00, 850607.25it/s]


### Model

In [9]:
# Disabled functional-API variant of the network. Unlike the Sequential model that
# follows, its Conv1D and LSTM layers are created with use_bias=False.
# inputs = keras.Input(shape=(None,), name="input")
# x = layers.Embedding(input_dim=vocab_size, output_dim=EMBED_DIMENSION, 
#                      embeddings_initializer=keras.initializers.Constant(embedding_matrix), 
#                      input_length=SEQUENCE_LENGTH, trainable = False)(inputs)
# x = layers.Conv1D(128, 5, activation='relu', use_bias=False, name="Conv1D")(x)
# x = layers.MaxPooling1D(pool_size=2, name="MaxPooling1D")(x)
# x = layers.LSTM(32, activation=None, use_bias=False, name="LSTM")(x)
# outputs = layers.Dense(1, activation="sigmoid")(x)
# model = keras.Model(inputs, outputs)
# model.summary()
# model.compile(loss="binary_crossentropy", optimizer="adam", metrics=[
#     "accuracy",
# ])
# Binary text classifier: frozen pre-trained embeddings -> Conv1D -> max-pool -> LSTM -> sigmoid.
model = keras.models.Sequential([
    # Embedding weights are initialised from embedding_matrix and excluded from training.
    layers.Embedding(
        input_dim=vocab_size,
        output_dim=EMBED_DIMENSION,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        input_length=SEQUENCE_LENGTH,
        trainable=False,
    ),
    # layers.Permute((2, 1)),
    layers.Conv1D(128, 5, activation='relu', name="Conv1D"),
    layers.MaxPooling1D(pool_size=2, name="MaxPooling1D"),
    # NOTE(review): activation=None turns off the LSTM's output activation — confirm this is intended.
    layers.LSTM(32, activation=None, name="LSTM"),
    # One sigmoid unit, paired with the binary cross-entropy loss below.
    layers.Dense(1, activation="sigmoid"),
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 100)          13802300  
_________________________________________________________________
Conv1D (Conv1D)              (None, 296, 128)          64128     
_________________________________________________________________
MaxPooling1D (MaxPooling1D)  (None, 148, 128)          0         
_________________________________________________________________
LSTM (LSTM)                  (None, 32)                20608     
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 13,887,069
Trainable params: 84,769
Non-trainable params: 13,802,300
_________________________________________________________________


In [10]:
# Encode the training texts as integer sequences and force each one to exactly
# SEQUENCE_LENGTH entries; both padding and truncation act on the end of a sequence.
train_x = keras.preprocessing.sequence.pad_sequences(
    token.texts_to_sequences(train[column_names[0]]),
    maxlen=SEQUENCE_LENGTH,
    padding='post',
    truncating="post",
)


In [11]:
# Train the classifier; validation_split asks Keras to hold back 20% of the
# training samples for validation instead of training on them.
epochs = 10
model.fit(
    train_x,
    train[column_names[1]],
    batch_size=BATCH_SIZE,
    epochs=epochs,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1802f63ce50>

In [12]:
# Encode the held-out texts with the same tokenizer and padding settings used for
# training, then report the compiled loss and accuracy on them.
test_x = keras.preprocessing.sequence.pad_sequences(
    token.texts_to_sequences(test[column_names[0]]),
    maxlen=SEQUENCE_LENGTH,
    padding='post',
    truncating="post",
)
model.evaluate(test_x, test[column_names[1]])



[0.5351209044456482, 0.7097995281219482]