In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout # Changed LSTM to GRU
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from datasets import load_dataset
import pandas as pd
import re
import string
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# --- Text Preprocessing Functions ---
# Load the dataset
try:
    dataset = load_dataset('csv', data_files='WELFake_Dataset.csv')
except FileNotFoundError:
    print("WELFake_Dataset.csv not found. Attempting to load 'davanstrien/WELFake' from Hugging Face Hub.")
    dataset = load_dataset("davanstrien/WELFake")

if 'train' in dataset:
    data_split = dataset['train']
else:
    data_split = dataset

def combine_text(example):
    combined_text = ""
    if 'title' in example and example['title'] is not None:
        combined_text += str(example['title'])
    if 'text' in example and example['text'] is not None:
        if combined_text:
            combined_text += " "
        combined_text += str(example['text'])
    
    if not combined_text.strip():
        return {"full_text": ""}
    return {"full_text": combined_text}


WELFake_Dataset.csv not found. Attempting to load 'davanstrien/WELFake' from Hugging Face Hub.


In [3]:

data_split = data_split.map(combine_text)
data_split = data_split.remove_columns(['title', 'text']) if 'title' in data_split.column_names and 'text' in data_split.column_names else data_split


In [4]:

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text) 
    text = text.strip()
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [5]:

data_split = data_split.map(lambda example: {"full_text": clean_text(example["full_text"])})


In [6]:

if 'label' in data_split.column_names:
    data_split = data_split.rename_column("label", "labels")

print("Sample preprocessed text:", data_split['full_text'][0])


Sample preprocessed text: law enforcement on high alert following threats against cops and whites on 911by blacklivesmatter and fyf911 terrorists video no comment is expected from barack obama members of the fyf911 or fukyoflag and blacklivesmatter movements called for the lynching and hanging of white people and cops they encouraged others on a radio show tuesday night to turn the tide and kill white people and cops to send a message about the killing of black people in americaone of the fyoflag organizers is called sunshine she has a radio blog show hosted from texas called sunshine s fing opinion radio show a snapshot of her fyf911 lolatwhitefear twitter page at 953 pm shows that she was urging supporters to call now fyf911 tonight we continue to dismantle the illusion of white below is a snapshot twitter radio call invite fyf911the radio show aired at 1000 pm eastern standard timeduring the show callers clearly call for lynching and killing of white peoplea 239 minute clip from the

In [7]:

# --- Dataset Splitting ---
train_test_split = data_split.train_test_split(test_size=0.2, stratify_by_column="labels", seed=42)

train_texts = [ex['full_text'] for ex in train_test_split['train']]
train_labels = np.array([ex['labels'] for ex in train_test_split['train']], dtype=np.int32)

test_texts = [ex['full_text'] for ex in train_test_split['test']]
test_labels = np.array([ex['labels'] for ex in train_test_split['test']], dtype=np.int32)


In [8]:

# --- Keras TextVectorization and Padding ---
MAX_TOKENS = 10000
MAX_SEQUENCE_LENGTH = 256

print("\nBuilding vocabulary with TextVectorization...")
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=MAX_TOKENS,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH,
    standardize=None 
)



Building vocabulary with TextVectorization...


In [9]:

vectorize_layer.adapt(train_texts)

VOCAB_SIZE = len(vectorize_layer.get_vocabulary())
print(f"Vocabulary size: {VOCAB_SIZE}")
print("Sample vocabulary:", vectorize_layer.get_vocabulary()[:20])

train_sequences = vectorize_layer(tf.constant(train_texts)).numpy()
test_sequences = vectorize_layer(tf.constant(test_texts)).numpy()

print("\nTrain Sequences Example (first 5 values):", train_sequences[0][:5])
print("Train Labels Example:", train_labels[0])


Vocabulary size: 10000
Sample vocabulary: ['', '[UNK]', 'the', 'to', 'of', 'and', 'a', 'in', 'that', 'is', 'for', 'on', 'it', 'he', 'with', 'was', 'as', 'said', 'by', 'trump']

Train Sequences Example (first 5 values): [3145 3674    1  124  902]
Train Labels Example: 1


In [10]:

# --- BiGRU Model Definition (TensorFlow Keras) ---
EMBEDDING_DIM = 100
GRU_UNITS = 256 # Units for the GRU layer
DROPOUT_RATE = 0.5
NUM_CLASSES = 2

model = Sequential([
    tf.keras.Input(shape=(MAX_SEQUENCE_LENGTH,)),
    Embedding(input_dim=VOCAB_SIZE, 
              output_dim=EMBEDDING_DIM,
              input_length=MAX_SEQUENCE_LENGTH,
              mask_zero=True),
    
    # Using Bidirectional wrapper around GRU layer
    Bidirectional(GRU(GRU_UNITS, return_sequences=False)), 
    
    Dropout(DROPOUT_RATE),
    Dense(NUM_CLASSES, activation='softmax')
])




In [11]:

# --- Compile the Model ---
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()


In [12]:

# --- Training Callbacks ---
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_bigru_tf_model.keras', save_best_only=True, monitor='val_loss', mode='min')


In [13]:

# --- Train the Model ---
EPOCHS = 10
BATCH_SIZE = 64

print(f"\nStarting training for {EPOCHS} epochs with batch size {BATCH_SIZE}...")
history = model.fit(
    train_sequences,
    train_labels,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(test_sequences, test_labels),
    callbacks=[early_stopping, model_checkpoint]
)
print("\n--- Training Finished ---")



Starting training for 10 epochs with batch size 64...
Epoch 1/10
[1m902/902[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m765s[0m 846ms/step - accuracy: 0.8534 - loss: 0.3118 - val_accuracy: 0.9792 - val_loss: 0.0601
Epoch 2/10
[1m902/902[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m680s[0m 754ms/step - accuracy: 0.9862 - loss: 0.0421 - val_accuracy: 0.9781 - val_loss: 0.0615
Epoch 3/10
[1m902/902[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m701s[0m 777ms/step - accuracy: 0.9951 - loss: 0.0153 - val_accuracy: 0.9812 - val_loss: 0.0551
Epoch 4/10
[1m902/902[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m813s[0m 901ms/step - accuracy: 0.9976 - loss: 0.0084 - val_accuracy: 0.9832 - val_loss: 0.0747
Epoch 5/10
[1m902/902[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m561s[0m 622ms/step - accuracy: 0.9989 - loss: 0.0039 - val_accuracy: 0.9816 - val_loss: 0.0840
Epoch 6/10
[1m902/902[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m692s[0m 768ms/step - accuracy: 0.9985 - loss

In [14]:

# --- Evaluate the Model on the Test Set ---
print("\n--- Evaluating Model on Test Set ---")
loss, accuracy = model.evaluate(test_sequences, test_labels, batch_size=BATCH_SIZE)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy*100:.2f}%")


--- Evaluating Model on Test Set ---
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 155ms/step - accuracy: 0.9821 - loss: 0.0504
Test Loss: 0.0551
Test Accuracy: 98.12%


In [15]:
from tensorflow.keras import metrics
from sklearn.metrics import classification_report, confusion_matrix

In [16]:

# --- Generate Comprehensive Classification Report ---
print("\n--- Comprehensive Classification Report ---")
# Get raw predictions (probabilities)
predictions_raw = model.predict(test_sequences)
# Convert probabilities to predicted class (0 or 1)
predicted_classes = np.argmax(predictions_raw, axis=1)




--- Comprehensive Classification Report ---
[1m451/451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 123ms/step


In [17]:
# Define target names based on your numerical labels (0 and 1)
# Assuming 0: real, 1: fake, based on your previous reports
target_names = ['real', 'fake']

print(classification_report(test_labels, predicted_classes, target_names=target_names))

print("\n--- Confusion Matrix ---")
print(confusion_matrix(test_labels, predicted_classes))

              precision    recall  f1-score   support

        real       0.98      0.98      0.98      7006
        fake       0.98      0.98      0.98      7421

    accuracy                           0.98     14427
   macro avg       0.98      0.98      0.98     14427
weighted avg       0.98      0.98      0.98     14427


--- Confusion Matrix ---
[[6892  114]
 [ 157 7264]]


In [18]:
model.save("bigru.h5")

