In [10]:
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D, Bidirectional, GlobalAveragePooling1D, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import pickle

In [11]:
ds = load_dataset("ErfanMoosaviMonazzah/fake-news-detection-dataset-English")
train_df = pd.DataFrame(ds['train'])

In [12]:
def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

train_df['cleaned_text'] = train_df['text'].apply(clean_text)

In [13]:
# Tokenization & Padding
max_words = 10000
max_len = 600  # Increased to handle longer real-time inputs
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['cleaned_text'])
X = tokenizer.texts_to_sequences(train_df['cleaned_text'])
X = pad_sequences(X, maxlen=max_len, padding='post')
y = np.array(train_df['label'])

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
# Define Model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=256, input_length=max_len),
    SpatialDropout1D(0.3),
    Bidirectional(LSTM(64, return_sequences=True)),
    GlobalAveragePooling1D(),  # Helps with large inputs
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])




In [16]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [20]:
lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1)


In [None]:
model.fit(X_train, y_train, epochs=25, batch_size=64, validation_data=(X_test, y_test), callbacks=[lr_reducer])


Epoch 1/25
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m481s[0m 1s/step - accuracy: 0.9978 - loss: 0.0116 - val_accuracy: 0.9852 - val_loss: 0.0504 - learning_rate: 0.0010
Epoch 2/25
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m513s[0m 1s/step - accuracy: 0.9977 - loss: 0.0067 - val_accuracy: 0.9883 - val_loss: 0.0521 - learning_rate: 0.0010
Epoch 3/25
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m468s[0m 1s/step - accuracy: 0.9987 - loss: 0.0054 - val_accuracy: 0.9905 - val_loss: 0.0398 - learning_rate: 0.0010
Epoch 4/25
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m473s[0m 1s/step - accuracy: 0.9993 - loss: 0.0033 - val_accuracy: 0.9870 - val_loss: 0.0641 - learning_rate: 0.0010
Epoch 5/25
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m634s[0m 2s/step - accuracy: 0.9992 - loss: 0.0031 - val_accuracy: 0.9882 - val_loss: 0.0369 - learning_rate: 0.0010
Epoch 6/25
[1m272/375[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m