In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.callbacks import EarlyStopping

# Load the dataset
df = pd.read_csv('WELFake_Dataset.csv')

# Display the first few rows of the dataset
print(df.head())


   Unnamed: 0                                              title  \
0           0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1           1                                                NaN   
2           2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3           3  Bobby Jindal, raised Hindu, uses story of Chri...   
4           4  SATAN 2: Russia unvelis an image of its terrif...   

                                                text  label  
0  No comment is expected from Barack Obama Membe...      1  
1     Did they post their votes for Hillary already?      1  
2   Now, most of the demonstrators gathered last ...      1  
3  A dozen politically active pastors came here f...      0  
4  The RS-28 Sarmat missile, dubbed Satan 2, will...      1  


In [7]:
# Check for missing values
print(df.isnull().sum())

# Drop rows with missing 'text' as it's required for prediction
df = df.dropna(subset=['text']).reset_index(drop=True)

# Combine 'title' and 'text' columns (if required)
df['combined_text'] = df['title'].fillna('') + ' ' + df['text']

# Display the dataset after preprocessing
print(df.head())


Unnamed: 0      0
title         558
text           39
label           0
dtype: int64
   Unnamed: 0                                              title  \
0           0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1           1                                                NaN   
2           2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3           3  Bobby Jindal, raised Hindu, uses story of Chri...   
4           4  SATAN 2: Russia unvelis an image of its terrif...   

                                                text  label  \
0  No comment is expected from Barack Obama Membe...      1   
1     Did they post their votes for Hillary already?      1   
2   Now, most of the demonstrators gathered last ...      1   
3  A dozen politically active pastors came here f...      0   
4  The RS-28 Sarmat missile, dubbed Satan 2, will...      1   

                                       combined_text  
0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...  
1     Did they po

In [8]:
# Define features and target variable
X = df['combined_text']
y = df['label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the splits
print(f"Training data: {X_train.shape}, Testing data: {X_test.shape}")


Training data: (57676,), Testing data: (14419,)


In [9]:
# Set tokenizer parameters
vocab_size = 10000
max_len = 200

# Initialize and fit the tokenizer
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure uniform length
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Convert target variable to numpy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)


In [10]:
# Define the LSTM model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=max_len),
    LSTM(64, return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Display model summary
model.summary()




In [11]:
# Initialize EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(
    X_train_pad, y_train,
    validation_data=(X_test_pad, y_test),
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping]
)


Epoch 1/10
[1m1803/1803[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 69ms/step - accuracy: 0.7182 - loss: 0.5553 - val_accuracy: 0.8247 - val_loss: 0.4213
Epoch 2/10
[1m1803/1803[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 88ms/step - accuracy: 0.8128 - loss: 0.4299 - val_accuracy: 0.9081 - val_loss: 0.2680
Epoch 3/10
[1m1803/1803[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 97ms/step - accuracy: 0.8297 - loss: 0.3688 - val_accuracy: 0.9433 - val_loss: 0.1548
Epoch 4/10
[1m1803/1803[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 94ms/step - accuracy: 0.9513 - loss: 0.1403 - val_accuracy: 0.9602 - val_loss: 0.1153
Epoch 5/10
[1m1803/1803[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 89ms/step - accuracy: 0.9724 - loss: 0.0865 - val_accuracy: 0.9675 - val_loss: 0.0890
Epoch 6/10
[1m1803/1803[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 89ms/step - accuracy: 0.9857 - loss: 0.0472 - val_accuracy: 0.9744 - val_loss: 0.089

In [12]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Generate a classification report
y_pred = (model.predict(X_test_pad) > 0.5).astype('int32')
print(classification_report(y_test, y_pred))


[1m451/451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - accuracy: 0.9676 - loss: 0.0896
Test Loss: 0.08898383378982544
Test Accuracy: 0.967542827129364
[1m451/451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      7010
           1       0.96      0.98      0.97      7409

    accuracy                           0.97     14419
   macro avg       0.97      0.97      0.97     14419
weighted avg       0.97      0.97      0.97     14419



In [13]:
# Save the trained model
model.save('model.h5')




In [22]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer

# Function to predict if the input text is fake news
def predict_fake_news(input_text):
    """
    Predict if the given input text is fake news.
    
    Parameters:
    input_text (str): The news text to be analyzed.
    
    Returns:
    str: The prediction ('Fake News' or 'Real News').
    float: The probability of being 'Fake News'.
    """
    try:
        print("Loading the trained model...")
        # Load the trained model
        model = load_model('model.h5')  # Ensure the 'model.h5' file is present in the working directory

        print("Initializing the tokenizer...")
        # Recreate the tokenizer with the same configuration as during training
        tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
        tokenizer.fit_on_texts(["dummy"])  # Dummy fit to initialize the tokenizer (can be replaced with saved vocab)

        max_len = 200  # Ensure the max_len matches the value used during training
        
        print("Tokenizing and padding input text...")
        # Tokenize the input text
        input_sequence = tokenizer.texts_to_sequences([input_text])
        print(f"Tokenized sequence: {input_sequence}")
        
        # Pad the sequence
        padded_sequence = pad_sequences(input_sequence, maxlen=max_len, padding='post', truncating='post')
        print(f"Padded sequence shape: {padded_sequence.shape}")
        
        # Make prediction
        print("Making prediction...")
        prediction = model.predict(padded_sequence)
        print(f"Prediction raw output: {prediction}")
        
        # Convert prediction to label
        probability = prediction[0][0]
        result = "Fake News" if probability >= 0.5 else "Real News"
        return result, probability

    except Exception as e:
        print(f"An error occurred: {e}")


In [24]:
text='''What does it take to channel the spirit of Jacques Cousteau and search for secret treasure?

For Jon Collins-Black, this question sparked a thrilling journey that led him to hide five treasure chests across the United States.

His new book, “There’s Treasure Inside,” offers hints for eager treasure hunters, sending them on an expedition to find hidden chests with a combined prize value of more than $2 million.

The inspiration
"There's Treasure Inside" contains all of the necessary clues to find one of Collins-Black's treasure chests.
"There's Treasure Inside" contains all of the necessary clues to find one of Collins-Black's treasure chests. Courtesy Jon Collins-Black
Collins-Black has been a lifelong fantasy enthusiast, immersing himself in games and mythical adventures such as Dungeons & Dragons since childhood.

By 2015, the successful musician and entrepreneur was looking for a change of pace and envisioned a project that would help him reconnect with his younger imagination.

Motivated by Forrest Fenn’s infamous treasure hunt launched back in 2010, Collins-Black dreamed of creating something more personal and accessible. Instead of Fenn’s single chest hidden in the Rocky Mountains, Collins-Black envisioned multiple troves allowing every person across the country the opportunity to be in closer proximity to one of the chests.

“I wanted to have the chests spread out to give people the optimistic, adventurous possibility,” he said.

With a creative background in writing, from poetry to children’s book publishing, Collins-Black combined his skills to produce “There’s Treasure Inside,” aimed to entertain even those who do not plan to look for the treasure.'''

In [25]:
predict_fake_news(text)



Loading the trained model...
Initializing the tokenizer...
Tokenizing and padding input text...
Tokenized sequence: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
Padded sequence shape: (1, 200)
Making prediction...




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 638ms/step
Prediction raw output: [[0.99375665]]


('Fake News', 0.99375665)

In [27]:
df['label'].value_counts()


1    37067
0    35028
Name: label, dtype: int64