In [1]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"
os.environ['PYTHONIOENCODING'] = 'utf-8'

import keras
import tensorflow as tf
from keras import layers
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# df_fake = pd.read_csv('../data/DataSet_Misinfo_FAKE.csv')
# df_fake = df_fake.dropna()
# df_real = pd.read_csv('../data/DataSet_Misinfo_TRUE.csv')
# df_real = df_real.dropna()
# df_fake['truth'] = 0
# df_real['truth'] = 1
# df = pd.concat([df_real, df_fake])
# df = df.drop('Unnamed: 0', axis=1)
# df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df = pd.read_csv("../data/WELFake_Dataset.csv")
df['text'] = df['title'] + " " + df['text']
df = df.drop(columns=['title', 'Unnamed: 0'])
df = df.dropna()
df.head()

Unnamed: 0,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",0
4,SATAN 2: Russia unvelis an image of its terrif...,1
5,About Time! Christian Group Sues Amazon and SP...,1


In [3]:
X = df['text'].tolist()
y = df['label'].tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
# Model Constants
max_features = 20000
embedding_dim = 128
sequence_length = 500


batch_size = 32
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)
val_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)

vectorize_layer = keras.layers.TextVectorization(
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

text_ds = train_ds.map(lambda x, y: x)

vectorize_layer.adapt(text_ds)


In [5]:
# vocab = vectorize_layer.get_vocabulary()
# with open('vocab.txt', 'w', encoding='utf-8') as f:
#     for word in vocab:
#         f.write(f"{word}\n")

In [6]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

train_ds = train_ds.map(vectorize_text)
val_ds = val_ds.map(vectorize_text)

train_ds = train_ds.cache().prefetch(buffer_size=10)
val_ds = val_ds.cache().prefetch(buffer_size=10)

In [7]:
# The Model
inputs = keras.Input(shape=(None,), dtype="int64")

x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = keras.Model(inputs, predictions)

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])


In [8]:
epochs = 3

model.fit(train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/3
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 83ms/step - accuracy: 0.8654 - loss: 0.2482 - val_accuracy: 0.9797 - val_loss: 0.0566
Epoch 2/3
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m492s[0m 275ms/step - accuracy: 0.9836 - loss: 0.0457 - val_accuracy: 0.9815 - val_loss: 0.0584
Epoch 3/3
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 96ms/step - accuracy: 0.9929 - loss: 0.0220 - val_accuracy: 0.9732 - val_loss: 0.0995


<keras.src.callbacks.history.History at 0x1de62de5ed0>

In [10]:
def vectorize_text_test(text):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text)

In [14]:
text = ["In the Donbass region Georgian and Polish mercenaries opened fire on Ukrainian soldiers.  Reportedly, the incident is related to the low morale of foreign fighters as well as the fact that the Ukrainian leadership delayed mercenaries' payments."]
vectorized_article = vectorize_text_test(text)
prediction = model.predict(vectorized_article)
prediction

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step


0.9989874

In [None]:
df_test = pd.read_csv("../data/FakeNewsNet.csv")
df_test = df_test.drop(columns=['news_url', 'tweet_num', 'source_domain'])
df_test = df_test.dropna()
df_test.head()

In [None]:
# test_text = df_test['title'].tolist()
# test_label = df_test['real'].tolist()

# test_ds = tf.data.Dataset.from_tensor_slices((test_text, test_label)).batch(batch_size)
# test_ds = test_ds.map(vectorize_text)

# test_ds = test_ds.cache().prefetch(buffer_size=10)


In [None]:
# model.evaluate(test_ds);

In [None]:
# A string input
inputs = keras.Input(shape=(1,), dtype="string")
# Turn strings into vocab indices
indices = vectorize_layer(inputs)
# Turn vocab indices into predictions
outputs = model(indices)

# Our end to end model
end_to_end_model = keras.Model(inputs, outputs)
end_to_end_model.compile(
    loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]
)

In [None]:
test_text2 = df_test['title'].tolist()
test_label2 = df_test['real'].tolist()
test_ds2 = tf.data.Dataset.from_tensor_slices((test_text2, test_label2)).batch(batch_size)
results = end_to_end_model.evaluate(test_ds2)


In [None]:
single_string_array = tf.convert_to_tensor(["Video: Why CNN correspondent believes Putin agreed to prisoner swap | CNN PoliticsClose icon CNN's Matthew Chance explains why he believes President Vladimir Putin agreed to a massive prisoner swap that included Wall Street Journal reporter Evan Gershkovich. "])
result = end_to_end_model.predict(single_string_array)
result[0]

In [None]:
# end_to_end_model.save('../models/neural_network.h5')