In [80]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical

from datasets import load_dataset, ClassLabel
import pandas as pd
import re
import string
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [81]:

print(f"TensorFlow Version: {tf.__version__}")
print(f"Num GPUs Available: {len(tf.config.experimental.list_physical_devices('GPU'))}")


TensorFlow Version: 2.19.0
Num GPUs Available: 0


In [82]:

# --- Text Preprocessing Functions ---
# Load the dataset
try:
    dataset = load_dataset('csv', data_files='WELFake_Dataset.csv')
except FileNotFoundError:
    print("WELFake_Dataset.csv not found. Attempting to load 'davanstrien/WELFake' from Hugging Face Hub.")
    dataset = load_dataset("davanstrien/WELFake")


WELFake_Dataset.csv not found. Attempting to load 'davanstrien/WELFake' from Hugging Face Hub.


In [83]:

if 'train' in dataset:
    data_split = dataset['train']
else:
    data_split = dataset


In [84]:

def combine_text(example):
    combined_text = ""
    if 'title' in example and example['title'] is not None:
        combined_text += str(example['title'])
    if 'text' in example and example['text'] is not None:
        if combined_text:
            combined_text += " "
        combined_text += str(example['text'])
    
    if not combined_text.strip():
        return {"full_text": ""}
    return {"full_text": combined_text}


In [85]:

data_split = data_split.map(combine_text)
data_split = data_split.remove_columns(['title', 'text']) if 'title' in data_split.column_names and 'text' in data_split.column_names else data_split


In [86]:

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'<.*?>', '', text)
    # Keep alphanumeric characters and spaces only, then remove extra spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text) 
    text = text.strip()
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [90]:

data_split = data_split.map(lambda example: {"full_text": clean_text(example["full_text"])})


Map:   0%|          | 0/72134 [00:00<?, ? examples/s]

In [91]:

# Rename 'label' to 'labels' and cast to ClassLabel if not already
if 'label' in data_split.column_names:
    data_split = data_split.rename_column("label", "labels")


In [92]:

print("Sample preprocessed text:", data_split['full_text'][0])


Sample preprocessed text: law enforcement on high alert following threats against cops and whites on 911by blacklivesmatter and fyf911 terrorists video no comment is expected from barack obama members of the fyf911 or fukyoflag and blacklivesmatter movements called for the lynching and hanging of white people and cops they encouraged others on a radio show tuesday night to turn the tide and kill white people and cops to send a message about the killing of black people in americaone of the fyoflag organizers is called sunshine she has a radio blog show hosted from texas called sunshine s fing opinion radio show a snapshot of her fyf911 lolatwhitefear twitter page at 953 pm shows that she was urging supporters to call now fyf911 tonight we continue to dismantle the illusion of white below is a snapshot twitter radio call invite fyf911the radio show aired at 1000 pm eastern standard timeduring the show callers clearly call for lynching and killing of white peoplea 239 minute clip from the

In [93]:

# --- Tokenization and Padding for Keras LSTM ---
VOCAB_SIZE = 10000 # Max number of words to keep in vocabulary
MAX_SEQUENCE_LENGTH = 256 # Max length of text sequences


In [94]:

print("\nFitting Keras Tokenizer and converting text to sequences...")
# Initialize Keras Tokenizer
# num_words: the maximum number of words to keep, based on word frequency. Only the most common num_words-1 words will be kept.
# oov_token: A token to represent out-of-vocabulary words.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<unk>")



Fitting Keras Tokenizer and converting text to sequences...


In [95]:

# Fit the tokenizer on the full_text column of the dataset
# This builds the vocabulary
tokenizer.fit_on_texts(data_split['full_text'])


In [96]:

# Convert texts to sequences of integers
sequences = tokenizer.texts_to_sequences(data_split['full_text'])


In [97]:

# Pad sequences to ensure uniform length
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')


In [98]:

# Prepare labels (one-hot encode for Keras categorical_crossentropy)
labels = np.array(data_split['labels'])
num_classes = len(np.unique(labels))
labels_one_hot = to_categorical(labels, num_classes=num_classes)


In [99]:

# --- Split the Data ---
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels_one_hot, test_size=0.2, stratify=labels, random_state=42
)

print(f"\nTraining data shape: {X_train.shape}")
print(f"Training labels shape: {y_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Test labels shape: {y_test.shape}")
print(f"Vocabulary size after tokenization: {len(tokenizer.word_index) + 1}") # +1 for padding/Oov



Training data shape: (57707, 256)
Training labels shape: (57707, 2)
Test data shape: (14427, 256)
Test labels shape: (14427, 2)
Vocabulary size after tokenization: 352710


In [100]:

# --- Keras LSTM Model Definition ---
EMBEDDING_DIM = 128 # Dimension of the word embeddings
LSTM_UNITS = 128    # Number of units in the LSTM layer
DROPOUT_RATE = 0.3  # Dropout rate for regularization

model = Sequential([
    # Embedding layer: vocab_size + 1 because Keras tokenizer reserves index 0 for padding.
    Embedding(len(tokenizer.word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
    Dropout(DROPOUT_RATE),
    # Bidirectional LSTM processes sequence in both directions
    Bidirectional(LSTM(LSTM_UNITS, return_sequences=False)), # return_sequences=False for sequence-to-one classification
    Dropout(DROPOUT_RATE),
    Dense(num_classes, activation='softmax') # Output layer with softmax for classification
])




In [101]:

# Compile the model
# Using 'adam' optimizer and 'categorical_crossentropy' for one-hot encoded labels
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()


In [102]:

# --- Model Training ---
EPOCHS = 10
BATCH_SIZE = 32


In [103]:

# Callbacks for early stopping and saving the best model
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True),
    ModelCheckpoint('best_lstm_model_tf.keras', monitor='val_loss', save_best_only=True, verbose=1)
]

print(f"\nStarting training for {EPOCHS} epochs with batch size {BATCH_SIZE}...")
history = model.fit(
    X_train, y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_test, y_test), # Using test set as validation for simplicity
    callbacks=callbacks,
    verbose=1
)
print("\n--- Training Finished ---")



Starting training for 10 epochs with batch size 32...
Epoch 1/10
[1m1804/1804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 243ms/step - accuracy: 0.9064 - loss: 0.2109
Epoch 1: val_loss improved from inf to 0.08090, saving model to best_lstm_model_tf.keras
[1m1804/1804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m461s[0m 254ms/step - accuracy: 0.9064 - loss: 0.2109 - val_accuracy: 0.9702 - val_loss: 0.0809
Epoch 2/10
[1m1804/1804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 251ms/step - accuracy: 0.9803 - loss: 0.0593
Epoch 2: val_loss improved from 0.08090 to 0.07644, saving model to best_lstm_model_tf.keras
[1m1804/1804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m475s[0m 263ms/step - accuracy: 0.9803 - loss: 0.0593 - val_accuracy: 0.9710 - val_loss: 0.0764
Epoch 3/10
[1m1804/1804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 263ms/step - accuracy: 0.9920 - loss: 0.0251
Epoch 3: val_loss did not improve from 0.07644
[1m1804/1804[0m [32m━━━━━━━

In [104]:

# --- Model Evaluation ---
print("\n--- Evaluating Model on Test Set ---")
# Load the best saved model for final evaluation
best_model = tf.keras.models.load_model('best_lstm_model_tf.keras')
loss, accuracy = best_model.evaluate(X_test, y_test, verbose=0)

print(f"\nFinal Test Loss: {loss:.4f}")
print(f"Final Test Accuracy: {accuracy:.4f}")



--- Evaluating Model on Test Set ---

Final Test Loss: 0.0764
Final Test Accuracy: 0.9710


In [105]:

# Detailed Classification Report
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=['real', 'fake']))

print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred))

[1m451/451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 40ms/step

Classification Report:
              precision    recall  f1-score   support

        real       0.99      0.95      0.97      7006
        fake       0.96      0.99      0.97      7421

    accuracy                           0.97     14427
   macro avg       0.97      0.97      0.97     14427
weighted avg       0.97      0.97      0.97     14427


Confusion Matrix:
[[6670  336]
 [  83 7338]]


In [106]:
model.save('final_lstm_tf_model.keras')
