## Fake News Detection Using RNN

In [None]:
# dependency

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import re
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
import seaborn as sns
# Apply the "ggplot" style sheet to every matplotlib figure drawn in this notebook.
plt.style.use("ggplot")

# tf things
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf

# to download
import gdown
import zipfile

In [None]:
# Make sure ./data exists, then download the zipped dataset into it.
data_dir = os.path.join(os.getcwd(), "data")
os.makedirs(data_dir, exist_ok=True)

url = "https://drive.google.com/uc?id=1eXjwRX-Ds48IbSRKAkcoIrEujKuMj5wN"
output = "data/data.zip"
gdown.download(url=url, output=output, quiet=True)


In [None]:
# Unzip the downloaded archive into the data directory.
archive_path = os.path.join("data", "data.zip")
with zipfile.ZipFile(archive_path, mode="r") as f:
    f.extractall(path="data")

In [None]:
# Read Fake.csv and True.csv into their own frames and show both shapes.
fake_df, real_df = (
    pd.read_csv(csv_path) for csv_path in ("data/Fake.csv", "data/True.csv")
)

(fake_df.shape, real_df.shape)

In [None]:
# Count missing values per column in the fake-news frame.

fake_df.isnull().sum()

In [None]:
# Count missing values per column in the real-news frame.

real_df.isnull().sum()

In [None]:
# Show five randomly chosen rows of the fake-news frame.

fake_df.sample(n=5)

In [None]:
# Show five randomly chosen rows of the real-news frame.

real_df.sample(n=5)

In [None]:
# List the distinct values of the `subject` column in the fake-news frame.

fake_df["subject"].unique()

In [None]:
# Number of fake-news articles per subject.

fake_df["subject"].value_counts()

In [None]:
# Number of real-news articles per subject.

real_df["subject"].value_counts()

In [None]:
# `date` and `subject` are not used as model inputs, so drop them.
# Re-binding the name (rather than inplace=True) together with errors="ignore"
# keeps this cell idempotent: running it a second time no longer raises KeyError.

unused_columns = ["date", "subject"]
real_df = real_df.drop(columns=unused_columns, errors="ignore")
fake_df = fake_df.drop(columns=unused_columns, errors="ignore")

In [None]:
# Attach the target label to each frame: 1 for real news, 0 for fake news.
for frame, label in ((real_df, 1), (fake_df, 0)):
    frame["class"] = label

In [None]:
# Class balance, drawn twice: absolute article counts, then share of the total.

class_sizes = {'Fake News': len(fake_df), 'Real News': len(real_df)}
bar_colors = {'Fake News': 'orange', 'Real News': 'green'}

plt.figure(figsize=(10, 5))
for news_type, size in class_sizes.items():
    plt.bar(news_type, size, color=bar_colors[news_type])
plt.title('Distribution of Fake News and Real News', size=15)
plt.xlabel('News Type', size=15)
plt.ylabel('# of News Articles', size=15)


total_len = len(fake_df) + len(real_df)
plt.figure(figsize=(10, 5))
for news_type, size in class_sizes.items():
    plt.bar(news_type, size / total_len, color=bar_colors[news_type])
plt.title('Distribution of Fake News and Real News', size=15)
plt.xlabel('News Type', size=15)
plt.ylabel('Proportion of News Articles', size=15)

In [None]:
# How many more fake articles than real ones (negative if real dominates).
article_gap = len(fake_df) - len(real_df)
print('Difference in news articles:', article_gap)

In [None]:
# Stack fake and real articles into one frame with a fresh 0..n-1 index.

labelled_frames = (fake_df, real_df)
news_df = pd.concat(labelled_frames, axis=0, ignore_index=True, sort=False)
news_df.sample(n=10)

In [None]:
# Fold the title into the body so the model sees a single text field per article.
# A space separator is required: without it the last word of the title and the
# first word of the body are glued into one bogus token.
# The guard keeps the cell idempotent, so a re-run neither prepends the title
# twice nor fails because `title` was already dropped.

if 'title' in news_df.columns:
    news_df['text'] = news_df['title'] + ' ' + news_df['text']
    news_df = news_df.drop(columns='title')
news_df.sample(5)

In [None]:
# Hold out 20% of the articles for testing; the seed is fixed for repeatability.

features, targets = news_df['text'], news_df['class']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.20,
    random_state=18,
)

(X_train.shape, X_test.shape)

In [None]:
# normalizing the data

# Patterns are compiled once and written as raw strings so that `\S`, `\.` and
# `\W` are not treated as (invalid) string escape sequences.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_NON_WORD_RE = re.compile(r'\W')


def normalize(data):
    """Lower-case and clean every document in ``data``.

    For each text: convert to lower case, strip URLs (``http(s)://...`` and
    ``www....``), replace every non-word character with a space, collapse runs
    of spaces into one, and trim leading/trailing spaces.

    Parameters
    ----------
    data : iterable of str
        The raw documents.

    Returns
    -------
    list of str
        The cleaned documents, in the same order as the input.
    """
    normalized = []
    for text in data:
        text = text.lower()
        text = _URL_RE.sub('', text)
        # After this substitution the only whitespace left is the plain space
        # (newlines and tabs are non-word characters too), so split/join both
        # collapses repeated spaces and trims the ends.
        text = _NON_WORD_RE.sub(' ', text)
        normalized.append(' '.join(text.split()))
    return normalized

# Clean both splits with the same normalisation routine.
X_train, X_test = normalize(X_train), normalize(X_test)

In [None]:
# Peek at the first five normalized training documents.
X_train[:5]

In [None]:
# Vocabulary cap handed to the tokenizer; the same value sizes the Embedding layer below.
max_vocab = 10000
tokenizer = Tokenizer(num_words=max_vocab)
# The word index is built from the training texts only; the test split is not seen here.
tokenizer.fit_on_texts(X_train)

In [None]:
# Convert each document of both splits into a list of integer token ids.
X_train, X_test = map(tokenizer.texts_to_sequences, (X_train, X_test))

In [None]:
# Container type, number of training documents, and token count of the first one.
print(type(X_train))
(len(X_train), len(X_train[0]))


In [None]:
# Bring every sequence to a fixed length of 256, padding at the end ('post').

pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
X_train = pad_sequences(X_train, maxlen=256, padding='post')
X_test = pad_sequences(X_test, maxlen=256, padding='post')

In [None]:
# (number of training documents, padded sequence length)
X_train.shape

In [None]:
# Inspect the first padded training sequence.
X_train[0]

In [None]:
# Two stacked bidirectional LSTMs on top of a learned embedding, followed by a
# small dense head. The last Dense layer has no activation, so the network
# outputs one raw score per article.

layers = tf.keras.layers

model = tf.keras.Sequential([
    layers.Embedding(input_dim=max_vocab, output_dim=128),
    layers.Bidirectional(layers.LSTM(units=64, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(units=16)),
    layers.Dense(units=64, activation='relu'),
    layers.Dropout(rate=0.5),
    layers.Dense(units=1),
])

model.summary()

We use early stopping, which halts training once the validation loss stops improving.

In [None]:
# Stop when `val_loss` has not improved for 2 epochs and restore the best weights.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# The model emits raw logits (final Dense(1) has no activation, and the loss is
# built with from_logits=True). The string shortcut 'accuracy' would threshold
# those logits at 0.5, which corresponds to a probability of about 0.62 rather
# than 0.5. An explicit BinaryAccuracy with threshold=0.0 measures what is
# intended; name='accuracy' keeps the `accuracy` / `val_accuracy` history keys.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

history = model.fit(X_train, y_train, epochs=10, validation_split=0.1, batch_size=30, shuffle=True, callbacks=[early_stop])

In [None]:
# Learning curves: loss and accuracy per epoch, training vs. validation.

history_dict = history.history
epochs = history.epoch

loss, val_loss = history_dict['loss'], history_dict['val_loss']
acc, val_acc = history_dict['accuracy'], history_dict['val_accuracy']

loss_fig, loss_ax = plt.subplots(figsize=(12, 9))
loss_ax.plot(epochs, loss, 'r', label='Training loss')
loss_ax.plot(epochs, val_loss, 'b', label='Validation loss')
loss_ax.set_title('Training and validation loss', size=20)
loss_ax.set_xlabel('Epochs', size=20)
loss_ax.set_ylabel('Loss', size=20)
loss_ax.legend(prop={'size': 20})
plt.show()

acc_fig, acc_ax = plt.subplots(figsize=(12, 9))
acc_ax.plot(epochs, acc, 'g', label='Training acc')
acc_ax.plot(epochs, val_acc, 'b', label='Validation acc')
acc_ax.set_title('Training and validation accuracy', size=20)
acc_ax.set_xlabel('Epochs', size=20)
acc_ax.set_ylabel('Accuracy', size=20)
acc_ax.legend(prop={'size': 20})
acc_ax.set_ylim((0.5, 1))
plt.show()

In [None]:
# Evaluate the trained model on the held-out test split (X_test / y_test).

model.evaluate(X_test, y_test)

In [None]:
# The network ends in Dense(1) with no activation and is trained with
# from_logits=True, so predict() returns raw logits, not probabilities.
pred = model.predict(X_test)

# Squash the logits through a sigmoid before applying the 0.5 cut-off.
# Thresholding the logits at 0.5 directly would mislabel every article whose
# predicted probability lies between 0.5 and roughly 0.62.
probabilities = tf.sigmoid(pred).numpy().ravel()
binary_predictions = (probabilities >= 0.5).astype(int).tolist()

In [None]:
# eval metrics
# scikit-learn metric functions take (y_true, y_pred) in that order. With the
# arguments swapped, accuracy is unaffected but precision and recall trade places.
print('Accuracy on testing set:', accuracy_score(y_test, binary_predictions))
print('Precision on testing set:', precision_score(y_test, binary_predictions))
print('Recall on testing set:', recall_score(y_test, binary_predictions))

In [None]:
# confusion matrix
# confusion_matrix(y_true, y_pred) puts true labels on rows and predictions on
# columns, which is what the axis labels below claim. Passing the arguments the
# other way round transposes the matrix relative to those labels.

matrix = confusion_matrix(y_test, binary_predictions, normalize='all')
plt.figure(figsize=(16, 10))
ax = plt.subplot()
sns.heatmap(matrix, annot=True, ax=ax)

# labels, title and ticks
ax.set_xlabel('Predicted Labels', size=20)
ax.set_ylabel('True Labels', size=20)
ax.set_title('Confusion Matrix', size=20)
ax.xaxis.set_ticklabels([0, 1], size=15)
ax.yaxis.set_ticklabels([0, 1], size=15)

Save the embedding weights for visualization.

In [None]:
# The first layer of the model is the Embedding; grab its weight matrix.
embedding_layer = model.layers[0]
weights, *_ = embedding_layer.get_weights()
print(weights.shape)  # shape: (vocab_size, embedding_dim)

In [None]:
# Words known to the tokenizer, cut to the first max_vocab - 1 entries.
word_index = list(tokenizer.word_index)[:max_vocab - 1]

In [None]:
import io

# Write one embedding vector per line (tab-separated) to fakenews_vecs.tsv and
# the matching word per line to fakenews_meta.tsv. The context manager
# guarantees both files are flushed and closed even if the loop raises.
with io.open('fakenews_vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('fakenews_meta.tsv', 'w', encoding='utf-8') as out_m:
    for num, word in enumerate(word_index):
        vec = weights[num + 1]  # skip 0, it's padding.
        out_m.write(word + "\n")
        out_v.write('\t'.join(str(x) for x in vec) + "\n")