# Importing Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GRU, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
from nltk.stem.porter import PorterStemmer
import re
import numpy as np
import tensorflow as tf

In [None]:
# Load the news CSV from Drive, then keep every column after the first one
# (positional slice 1:).
raw_frame = pd.read_csv('/content/drive/MyDrive/Fake_news/news.csv')
data = raw_frame.iloc[:, 1:]
data.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


# Preprocessing Data

In [None]:
# Merge title and body into a single text field per article.
# fillna('') guards against a missing title or body: NaN + str evaluates to NaN,
# and a NaN entry would make the regex-based cleaning further down raise a
# TypeError instead of processing the article.
data["final_text"] = data["title"].fillna('') + ' ' + data["text"].fillna('')
text_data = data['final_text']
labels = data['label']

In [None]:
# Convert the string labels to the integers 0 and 1.
# Fit on the untouched data['label'] column instead of on `labels` itself so the
# cell can be re-run safely: re-fitting on already-encoded integers would make
# the encoder learn the classes 0/1, and le.inverse_transform would then stop
# returning the original label strings.
le = LabelEncoder()
labels = le.fit_transform(data['label'])

In [None]:
# Inspect the encoded label array.
labels

array([0, 0, 1, ..., 0, 1, 1])

In [None]:
# Inspect the merged title + text series.
text_data

0       You Can Smell Hillary’s Fear Daniel Greenfield...
1       Watch The Exact Moment Paul Ryan Committed Pol...
2       Kerry to go to Paris in gesture of sympathy U....
3       Bernie supporters on Twitter erupt in anger ag...
4       The Battle of New York: Why This Primary Matte...
                              ...                        
6330    State Department says it can't find emails fro...
6331    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332    Anti-Trump Protesters Are Tools of the Oligarc...
6333    In Ethiopia, Obama seeks progress on peace, se...
6334    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: final_text, Length: 6335, dtype: object

## Downloading Stopwords

In [None]:
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# stopwords removal and stemming
ps = PorterStemmer()
# Load the stop words once and keep them in a set: the previous loop re-read the
# NLTK word list for every single token and searched it as a list.
stop = set(stopwords.words('english'))


def clean_text(raw_text):
    """Normalise one article for tokenization.

    Replaces every non-letter character with a space, lower-cases the text,
    drops English stop words and Porter-stems the remaining words.

    Args:
        raw_text: The article text as a string.

    Returns:
        The cleaned words joined by single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_text)
    tokens = letters_only.lower().split()
    return ' '.join(ps.stem(word) for word in tokens if word not in stop)


# Build a new Series from the raw column rather than assigning element by element:
# this avoids writing temporary list values into the DataFrame column and keeps
# the cell idempotent (re-running it never stems already-stemmed text).
text_data = data['final_text'].apply(clean_text)

text_data[0]

'smell hillari fear daniel greenfield shillman journal fellow freedom center new york writer focus radic islam final stretch elect hillari rodham clinton gone war fbi word unpreced thrown around often elect ought retir still unpreced nomine major polit parti go war fbi exactli hillari peopl done coma patient wake watch hour cnn hospit bed would assum fbi director jame comey hillari oppon elect fbi attack everyon obama cnn hillari peopl circul letter attack comey current media hit piec lambast target trump surpris clinton alli start run attack ad fbi fbi leadership warn entir left wing establish form lynch mob continu go hillari fbi credibl attack media democrat preemptiv head result investig clinton foundat hillari clinton covert struggl fbi agent obama doj peopl gone explos public new york time compar comey j edgar hoover bizarr headlin jame comey role recal hoover fbi fairli practic admit front spout nonsens boston globe publish column call comey resign outdon time editori claim scan

In [None]:
# Maximum number of words kept in the vocabulary.
max_words = 10_000
# Maximum sequence length used when padding.
max_sequence_length = 200

In [None]:
# Build the vocabulary from the cleaned corpus, capped at max_words entries.
# NOTE(review): the tokenizer is fitted on the full corpus, before the
# train/test split further down — confirm that this is acceptable.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(text_data)
# Encode each article as a list of word indices, then pad to max_sequence_length.
sequences = tokenizer.texts_to_sequences(text_data)
X = pad_sequences(sequences, maxlen=max_sequence_length)

In [None]:
# Inspect the padded index matrix.
X

array([[1558, 9056,  234, ...,   11,  377,  754],
       [ 142,    8,  202, ...,  796, 1130,  484],
       [1040,  275, 1531, ...,  997,  481, 4878],
       ...,
       [ 267, 2374, 1578, ...,  396,   51,  227],
       [2204,  590,  362, ...,  404, 1444,  944],
       [ 156,  737,  714, ...,    1, 2223, 3549]], dtype=int32)

In [None]:
# Uncomment the line below to save the array X for the next execution.
# np.save('/content/drive/MyDrive/Fake_news/X',X)

In [None]:
# Uncomment the line below to load the array X from its saved state.
# X = np.load('/content/drive/MyDrive/Fake_news/X.npy')
X

array([[1558, 9056,  234, ...,   11,  377,  754],
       [ 142,    8,  202, ...,  796, 1130,  484],
       [1040,  275, 1531, ...,  997,  481, 4878],
       ...,
       [ 267, 2374, 1578, ...,  396,   51,  227],
       [2204,  590,  362, ...,  404, 1444,  944],
       [ 156,  737,  714, ...,    1, 2223, 3549]], dtype=int32)

# Train-Test Split

In [None]:
# First carve off 20% of the data as the test set ...
X_dev, X_test, y_dev, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42
)
# ... then split 20% of the remainder off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_dev, y_dev, test_size=0.2, random_state=42
)

# Neural Network Architectures

In [None]:
# Size of each word-embedding vector (Embedding output_dim).
embedding_dim = 100
# Number of units in the recurrent layer of every model below.
lstm_units = 128

In [None]:
def LSTM_model():
  """Build and compile the LSTM binary classifier.

  The network is an Embedding layer (max_words x embedding_dim, inputs of
  length max_sequence_length), an LSTM layer with lstm_units units, and a
  one-unit sigmoid Dense output.

  Returns:
    The compiled Sequential model (binary cross-entropy loss, Adam optimizer,
    accuracy metric).
  """
  layers = [
      Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length),
      LSTM(units=lstm_units, dropout=0.2, recurrent_dropout=0.2),
      Dense(1, activation='sigmoid'),
  ]
  network = Sequential(layers)
  network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  return network

In [None]:
def GRU_model():
  """Build and compile the GRU binary classifier.

  The network is an Embedding layer (max_words x embedding_dim, inputs of
  length max_sequence_length), a GRU layer with lstm_units units, and a
  one-unit sigmoid Dense output.

  Returns:
    The compiled Sequential model (binary cross-entropy loss, Adam optimizer,
    accuracy metric).
  """
  layers = [
      Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length),
      GRU(units=lstm_units, dropout=0.2, recurrent_dropout=0.2),
      Dense(1, activation='sigmoid'),
  ]
  network = Sequential(layers)
  network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  return network

In [None]:
def Bidirection_model():
  """Build and compile the bidirectional-LSTM binary classifier.

  The network is an Embedding layer (max_words x embedding_dim, inputs of
  length max_sequence_length), an LSTM layer with lstm_units units wrapped in
  Bidirectional, and a one-unit sigmoid Dense output.

  Returns:
    The compiled Sequential model (binary cross-entropy loss, Adam optimizer,
    accuracy metric).
  """
  recurrent = LSTM(units=lstm_units, dropout=0.2, recurrent_dropout=0.2)
  layers = [
      Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length),
      Bidirectional(recurrent),
      Dense(1, activation='sigmoid'),
  ]
  network = Sequential(layers)
  network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  return network

# Training

In [None]:
# Shared callback: monitors val_loss with a patience of 3 and restores the
# best weights when training stops.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

## LSTM

In [None]:
# Train the LSTM classifier, validating on the held-out validation split.
model = LSTM_model()
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=64,
    callbacks=[early_stopping],
)



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [None]:
# Save the trained LSTM model to Drive.
model.save('/content/drive/MyDrive/Fake_news/lstm.keras')

In [None]:
# Uncomment the below line to skip training and load the saved model
# model = tf.keras.models.load_model('/content/drive/MyDrive/Fake_news/lstm.keras')



In [None]:
# Score the current model on the test split and report loss and accuracy.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

Test Loss: 0.2298, Test Accuracy: 0.9140


In [None]:
new_articles = ["The federal government is rolling out a trusted employer program that is meant to reduce red tape and make it easier for Canadian employers to bring in temporary foreign workers.Officials say the Recognized Employer Pilot program will be open for applications as soon as September, first to employers in agriculture, then to all others starting in January.",
                "Breaking: Aliens have landed!"]
# The tokenizer vocabulary was built from cleaned text (letters only, lower-cased,
# stop words removed, Porter-stemmed). New articles must go through the same steps;
# fed raw, most of their words are absent from the vocabulary and get dropped, so
# the model would be scoring a different kind of input than it was trained on.
cleaned_articles = [
    ' '.join(
        ps.stem(word)
        for word in re.sub('[^a-zA-Z]', ' ', article).lower().split()
        if word not in stop
    )
    for article in new_articles
]
new_sequences = tokenizer.texts_to_sequences(cleaned_articles)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_sequence_length)
predictions = model.predict(new_padded_sequences)



In [None]:
# model.predict returns one column per sample, i.e. shape (n, 1), while
# LabelEncoder.inverse_transform expects a 1-D array. Flatten with ravel() so
# the call no longer emits the column_or_1d DataConversionWarning.
predicted_classes = predictions.round().astype(int).ravel()
decoded_predictions = le.inverse_transform(predicted_classes)
print("Predictions:", decoded_predictions)

Predictions: ['REAL' 'FAKE']


  y = column_or_1d(y, warn=True)


## GRU

In [None]:
# Train the GRU classifier, validating on the held-out validation split.
model = GRU_model()
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=64,
    callbacks=[early_stopping],
)



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [None]:
# Save the trained GRU model to Drive.
model.save('/content/drive/MyDrive/Fake_news/gru.keras')

In [None]:
# Uncomment the below line to skip training and load the saved model
# model = tf.keras.models.load_model('/content/drive/MyDrive/Fake_news/gru.keras')



In [None]:
# Score the current model on the test split and report loss and accuracy.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

Test Loss: 0.2865, Test Accuracy: 0.8856


In [None]:
new_articles = ["The federal government is rolling out a trusted employer program that is meant to reduce red tape and make it easier for Canadian employers to bring in temporary foreign workers.Officials say the Recognized Employer Pilot program will be open for applications as soon as September, first to employers in agriculture, then to all others starting in January.",
                "Breaking: Aliens have landed!"]
# The tokenizer vocabulary was built from cleaned text (letters only, lower-cased,
# stop words removed, Porter-stemmed). New articles must go through the same steps;
# fed raw, most of their words are absent from the vocabulary and get dropped, so
# the model would be scoring a different kind of input than it was trained on.
cleaned_articles = [
    ' '.join(
        ps.stem(word)
        for word in re.sub('[^a-zA-Z]', ' ', article).lower().split()
        if word not in stop
    )
    for article in new_articles
]
new_sequences = tokenizer.texts_to_sequences(cleaned_articles)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_sequence_length)
predictions = model.predict(new_padded_sequences)



In [None]:
# model.predict returns one column per sample, i.e. shape (n, 1), while
# LabelEncoder.inverse_transform expects a 1-D array. Flatten with ravel() so
# the call no longer emits the column_or_1d DataConversionWarning.
predicted_classes = predictions.round().astype(int).ravel()
decoded_predictions = le.inverse_transform(predicted_classes)
print("Predictions:", decoded_predictions)

Predictions: ['REAL' 'FAKE']


  y = column_or_1d(y, warn=True)


## Bidirectional LSTM

In [None]:
# Train the bidirectional-LSTM classifier, validating on the held-out validation split.
model = Bidirection_model()
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=64,
    callbacks=[early_stopping],
)



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [None]:
# Save the trained bidirectional-LSTM model to Drive.
model.save('/content/drive/MyDrive/Fake_news/bidir.keras')

In [None]:
# Uncomment the below line to skip training and load the saved model
#model = tf.keras.models.load_model('/content/drive/MyDrive/Fake_news/bidir.keras')



In [None]:
# Score the current model on the test split and report loss and accuracy.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

Test Loss: 0.2376, Test Accuracy: 0.9116


In [None]:
new_articles = ["The federal government is rolling out a trusted employer program that is meant to reduce red tape and make it easier for Canadian employers to bring in temporary foreign workers.Officials say the Recognized Employer Pilot program will be open for applications as soon as September, first to employers in agriculture, then to all others starting in January.",
                "Breaking: Aliens have landed!"]
# The tokenizer vocabulary was built from cleaned text (letters only, lower-cased,
# stop words removed, Porter-stemmed). New articles must go through the same steps;
# fed raw, most of their words are absent from the vocabulary and get dropped, so
# the model would be scoring a different kind of input than it was trained on.
cleaned_articles = [
    ' '.join(
        ps.stem(word)
        for word in re.sub('[^a-zA-Z]', ' ', article).lower().split()
        if word not in stop
    )
    for article in new_articles
]
new_sequences = tokenizer.texts_to_sequences(cleaned_articles)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_sequence_length)
predictions = model.predict(new_padded_sequences)



In [None]:
# model.predict returns one column per sample, i.e. shape (n, 1), while
# LabelEncoder.inverse_transform expects a 1-D array. Flatten with ravel() so
# the call no longer emits the column_or_1d DataConversionWarning.
predicted_classes = predictions.round().astype(int).ravel()
decoded_predictions = le.inverse_transform(predicted_classes)
print("Predictions:", decoded_predictions)

Predictions: ['REAL' 'FAKE']


  y = column_or_1d(y, warn=True)
