In [1]:
import os

# Select the Keras backend. This is assigned before `import keras` below so the
# setting is already in the environment when Keras is first imported.
os.environ["KERAS_BACKEND"] = "tensorflow"
# NOTE(review): PYTHONIOENCODING is normally read at interpreter start-up, so
# setting it here likely only affects child processes — confirm it is needed.
os.environ['PYTHONIOENCODING'] = 'utf-8'

import keras
import tensorflow as tf
from keras import layers
import pandas as pd
from sklearn.model_selection import train_test_split
import string
import re
# Fix TensorFlow's global random seed. The train/validation split further down
# passes its own random_state separately.
tf.random.set_seed(42)


In [2]:
# df_fake = pd.read_csv('../data/DataSet_Misinfo_FAKE.csv')
# df_fake = df_fake.dropna()
# df_real = pd.read_csv('../data/DataSet_Misinfo_TRUE.csv')
# df_real = df_real.dropna()
# df_fake['truth'] = 0
# df_real['truth'] = 1
# df = pd.concat([df_real, df_fake])
# df = df.drop('Unnamed: 0', axis=1)
# df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# Load WELFake and fold each headline into its article body so that every row
# carries a single `text` field next to its `label`.
# NOTE(review): the concatenation is NaN whenever `title` or `text` is missing,
# so the dropna() below also discards rows that only lack a title — confirm
# that this is intended.
df = (
    pd.read_csv("../data/WELFake_Dataset.csv")
    .assign(text=lambda frame: frame['title'] + " " + frame['text'])
    .drop(columns=['title', 'Unnamed: 0'])
    .dropna()
)
df.head()

Unnamed: 0,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",0
4,SATAN 2: Russia unvelis an image of its terrif...,1
5,About Time! Christian Group Sues Amazon and SP...,1


In [3]:
# External evaluation set: FakeNewsNet with the URL, tweet-count and
# source-domain columns removed and incomplete rows dropped.
df_test = (
    pd.read_csv("../data/FakeNewsNet.csv")
    .drop(columns=['news_url', 'tweet_num', 'source_domain'])
    .dropna()
)
df_test.head()

Unnamed: 0,title,real
0,Kandi Burruss Explodes Over Rape Accusation on...,1
1,People's Choice Awards 2018: The best red carp...,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,1
3,Colombian singer Maluma sparks rumours of inap...,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,1


In [4]:
# Headlines and their `real` flags from the FakeNewsNet frame, as plain lists.
# NOTE(review): these flags are later scored against a model trained on the
# WELFake `label` column. The WELFake sample rows suggest 1 may mark *fake*
# stories there, whereas here 1 is the `real` flag — verify both datasets use
# the same label polarity before trusting the external-test accuracy.
test_text, test_label = (df_test[column].tolist() for column in ('title', 'real'))


In [5]:
# Hold out 20% of the WELFake rows for validation; the fixed random_state keeps
# the split the same on every run.
X, y = df['text'].tolist(), df['label'].tolist()

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [58]:
def custom_standardization(input_data):
    """Normalise raw text before tokenisation.

    Steps, in order:
      1. lowercase the input,
      2. turn non-breaking spaces (U+00A0) and thin spaces (U+2009) into a
         regular space,
      3. strip leading/trailing whitespace,
      4. delete every character listed in ``string.punctuation``.

    Args:
        input_data: string tensor (or value convertible to one) of raw text.

    Returns:
        A string tensor holding the cleaned text.

    NOTE(review): ``tf.strings.lower`` is called without an ``encoding``
    argument, which per the TensorFlow docs lowercases ASCII letters only —
    confirm that is acceptable for this corpus. Likewise only ASCII
    punctuation is removed (typographic quotes such as ’ are kept).
    """
    # Character class built from the escaped ASCII punctuation set.
    punctuation_class = f"[{re.escape(string.punctuation)}]"

    text = tf.strings.regex_replace(tf.strings.lower(input_data), "[\xa0\u2009]", " ")
    text = tf.strings.strip(text)
    return tf.strings.regex_replace(text, punctuation_class, "")

In [59]:
# Model constants
max_features = 20000    # vocabulary cap handed to TextVectorization (max_tokens)
embedding_dim = 128
sequence_length = 500   # length of every vectorised example (output_sequence_length)
batch_size = 32


def _batched(texts, labels):
    """Wrap parallel text/label lists in a tf.data pipeline batched by `batch_size`."""
    return tf.data.Dataset.from_tensor_slices((texts, labels)).batch(batch_size)


train_ds = _batched(X_train, y_train)
val_ds = _batched(X_test, y_test)
raw_test_ds = _batched(test_text, test_label)

vectorize_layer = keras.layers.TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode="int",
)

# Build the vocabulary from the training texts only; labels are dropped first.
text_ds = train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(text_ds)


In [63]:
# Dump the learned vocabulary, one token per line, in index order.
vocab = vectorize_layer.get_vocabulary()
with open('vocab.txt', 'w', encoding='utf-8') as vocab_file:
    vocab_file.writelines(f"{token}\n" for token in vocab)

In [64]:
def vectorize_text(text, label):
    """Map a (text, label) pair to (token ids, label).

    A trailing axis is appended to `text` before it is passed through the
    module-level `vectorize_layer`; `label` is returned unchanged.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, axis=-1))
    return token_ids, label

# Vectorise each split, then cache the result and prefetch up to 10 batches.
# Note: train_ds / val_ds are rebound from themselves here, so this cell must
# not be re-run without first re-running the cell that builds the raw datasets
# (otherwise already-vectorised batches would be vectorised again).
train_ds = train_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
val_ds = val_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=10)

In [65]:
# The model: a small 1-D convolutional text classifier over integer token ids.
inputs = keras.Input(shape=(None,), dtype="int64")

# Token ids -> dense vectors, followed by dropout.
x = layers.Embedding(input_dim=max_features, output_dim=embedding_dim)(inputs)
x = layers.Dropout(rate=0.5)(x)

# Two identical strided convolutions, then a global max over the sequence axis.
x = layers.Conv1D(filters=128, kernel_size=7, strides=3, padding="valid", activation="relu")(x)
x = layers.Conv1D(filters=128, kernel_size=7, strides=3, padding="valid", activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)

# Fully connected head with dropout.
x = layers.Dense(units=128, activation="relu")(x)
x = layers.Dropout(rate=0.5)(x)

# Single sigmoid unit: one score in [0, 1] per example.
predictions = layers.Dense(units=1, activation="sigmoid", name="predictions")(x)

model = keras.Model(inputs=inputs, outputs=predictions)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])


In [66]:
# Train for a fixed number of passes, validating on the held-out WELFake split
# after every epoch.
epochs = 3

model.fit(train_ds, epochs=epochs, validation_data=val_ds)

Epoch 1/3


[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 133ms/step - accuracy: 0.8640 - loss: 0.2503 - val_accuracy: 0.9808 - val_loss: 0.0540
Epoch 2/3
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 174ms/step - accuracy: 0.9842 - loss: 0.0429 - val_accuracy: 0.9818 - val_loss: 0.0542
Epoch 3/3
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 148ms/step - accuracy: 0.9933 - loss: 0.0198 - val_accuracy: 0.9800 - val_loss: 0.0743


<keras.src.callbacks.history.History at 0x1ed47c1e910>

In [67]:
# Score the trained model on the vectorised FakeNewsNet headlines; the trailing
# semicolon suppresses the returned metrics list in the cell output.
model.evaluate(x=test_ds);

[1m725/725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 21ms/step - accuracy: 0.7441 - loss: 1.4269


In [14]:
def vectorize_text_test(text):
    """Vectorise unlabeled text for inference.

    Appends a trailing axis to `text` and returns the token ids produced by the
    module-level `vectorize_layer`.
    """
    return vectorize_layer(tf.expand_dims(text, axis=-1))

In [68]:
# A single hand-picked article used as a smoke test for the inference path.
text = (
    "Video: Why CNN correspondent believes Putin agreed to prisoner swap | CNN PoliticsClose icon "
    "CNN's Matthew Chance explains why he believes President Vladimir Putin agreed to a massive "
    "prisoner swap that included Wall Street Journal reporter Evan Gershkovich. "
)
vectorized_article = vectorize_text_test(text)
print(f"Vectorized article: {vectorized_article.numpy()}")

Vectorized article: [[  175   206   524  5199  1382   535   983     3  6593 12011   524     1
   6576  9708  4390  1034  2673   206    12  1382    51  1554   535   983
      3     6  1181  6593 12011     8   939   454   539  1836  1171  7711
      1     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     

In [69]:
# Run the example through the network and show its single sigmoid score.
# The score is interpreted on the same scale as the training `label` column.
prediction = model.predict(vectorized_article)
prediction[0, 0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 328ms/step


2.1856604e-06

In [70]:
# Persist the trained network in the native Keras format.
# The target directory is created first so a fresh checkout without a
# ../models folder does not fail at save time.
# Note: the saved model takes integer token ids as input; the text
# vectorisation layer is separate and is not part of this file.
model_path = "../models/nn.keras"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

In [18]:
# # A string input
# inputs = keras.Input(shape=(1,), dtype="string")
# # Turn strings into vocab indices
# indices = vectorize_layer(inputs)
# # Turn vocab indices into predictions
# outputs = model(indices)

# # Our end to end model
# end_to_end_model = keras.Model(inputs, outputs)
# end_to_end_model.compile(
#     loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]
# )

In [19]:
# single_string_array = tf.convert_to_tensor(["Video: Why CNN correspondent believes Putin agreed to prisoner swap | CNN PoliticsClose icon CNN's Matthew Chance explains why he believes President Vladimir Putin agreed to a massive prisoner swap that included Wall Street Journal reporter Evan Gershkovich. "])
# result = end_to_end_model.predict(single_string_array)
# result[0]

In [20]:
# end_to_end_model.save('../models/neural_network.h5')