In [42]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
from bs4 import BeautifulSoup
import re,string,unicodedata
import matplotlib.pyplot as plt
import tensorflow.keras.layers as tfl
from keras.preprocessing import text, sequence
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils as ku 
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [22]:
# Read the real-news and fake-news CSV files into two separate DataFrames.
true, false = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/fake-and-real-news-dataset/True.csv",
        "../input/fake-and-real-news-dataset/Fake.csv",
    )
)

In [23]:
# Binary target column: 1 for rows loaded from True.csv, 0 for rows loaded
# from Fake.csv.
true['category'], false['category'] = 1, 0

In [24]:
# Preview the first rows of the real-news frame (5 is pandas' default).
true.head(n=5)

In [25]:
# Preview the first rows of the fake-news frame (5 is pandas' default).
false.head(n=5)

In [26]:
# Stack real and fake articles into one frame. Both inputs carry their own
# 0..n-1 index, so ask pandas for a fresh index; otherwise every label below
# len(min(true, false)) would refer to two different rows.
news = pd.concat([true, false], ignore_index=True)

In [27]:
# Class balance: count the occurrences of real (1) and fake (0) articles.
sns.set_style("darkgrid")
sns.countplot(x=news['category'])

In [28]:
# Append each headline to its article body (separated by a space), then
# discard the title, subject and date columns.
news = (
    news
    .assign(text=news['text'] + " " + news['title'])
    .drop(columns=['title', 'subject', 'date'])
)

In [37]:
# Tokens to discard while cleaning: NLTK's English stop words plus every
# character in string.punctuation.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')).union(punctuation)

In [38]:
def strip_html(text):
    """Return ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup's built-in ``html.parser`` backend
    and the document's text content is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

# Removing the square brackets
def remove_between_square_brackets(text):
    """Delete every ``[...]`` span (brackets included) from ``text``."""
    # Raw string: '\[' is not a valid escape in a plain string literal.
    return re.sub(r'\[[^]]*\]', '', text)


# Removing URLs
def remove_urls(text):
    """Delete every run of non-whitespace characters that starts with ``http``.

    This helper was previously also named ``remove_between_square_brackets``,
    which shadowed the real bracket remover so brackets were never stripped.
    """
    return re.sub(r'http\S+', '', text)


# Removing the stopwords from text
def remove_stopwords(text):
    """Drop tokens whose lower-cased form is in the module-level ``stop`` set.

    ``text`` is split on whitespace and the surviving tokens are re-joined
    with single spaces; the original casing of kept tokens is preserved.
    """
    # str.split() with no arguments never yields surrounding whitespace, so
    # the tokens need no further stripping.
    kept = [word for word in text.split() if word.lower() not in stop]
    return " ".join(kept)


# Removing the noisy text
def denoise_text(text):
    """Run the full cleaning pipeline on one document and return the result.

    Steps, in order: strip HTML markup, remove ``[...]`` spans, remove URLs,
    remove stop words / punctuation tokens.
    """
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_urls(text)
    text = remove_stopwords(text)
    return text
# Clean every article in the text column with the denoising pipeline.
news['text'] = news['text'].map(denoise_text)

In [40]:
def get_corpus(text):
    """Flatten an iterable of documents into a single list of tokens.

    Each document is split on whitespace; tokens are returned in document
    order, duplicates included.
    """
    return [token.strip() for document in text for token in document.split()]
# Every token of every cleaned article, as one flat list.
corpus = get_corpus(news['text'])

In [44]:
# Split texts and labels into train and test parts; the fixed seed keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    news['text'],
    news['category'],
    random_state=27,
)

In [45]:
# Vocabulary cap: passed to the tokenizer as num_words and to the Embedding
# layer as its input dimension.
max_features = 10_000
# Sequence length passed to pad_sequences and to the Embedding layer.
maxlen = 300

In [46]:
# Build the vocabulary from the training texts only, then turn each training
# article into a fixed-length integer sequence. Uses the tensorflow.keras
# Tokenizer / pad_sequences that the notebook already imports, so the
# preprocessing comes from the same Keras package as the model.
# NOTE: x_train is overwritten with the padded array; re-run the split cell
# before re-running this one.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x_train)
tokenized_train = tokenizer.texts_to_sequences(x_train)
x_train = pad_sequences(tokenized_train, maxlen=maxlen)

In [47]:
# Encode the test articles with the tokenizer fitted on the training texts
# and pad them to the same length. Uses the tensorflow.keras pad_sequences
# the notebook already imports, matching the package the model is built with.
# NOTE: x_test is overwritten with the padded array.
tokenized_test = tokenizer.texts_to_sequences(x_test)
x_test = pad_sequences(tokenized_test, maxlen=maxlen)

In [56]:
# Model hyperparameters used by the model-definition and training cells.
embed_size = 100    # output dimension of the Embedding layer
epochs = 2          # number of epochs passed to model.fit
batch_size = 256    # batch size passed to model.fit

In [57]:
# Binary classifier: embedding -> bidirectional LSTM (full sequence output)
# -> LSTM -> L2-regularised dense layer -> single sigmoid unit.
model = Sequential()
model.add(Embedding(max_features, embed_size, input_length=maxlen))
model.add(Bidirectional(LSTM(150, return_sequences=True, dropout=0.25)))
model.add(LSTM(100, dropout=0.1))
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(1, activation='sigmoid'))

# The metric is registered under the name 'acc'; the plotting cell reads the
# history using that key.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [59]:
# Train the model and keep the per-epoch metrics in `history`.
# NOTE: the test split is passed as validation data.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)

In [64]:
# Per-epoch curves recorded by model.fit; the keys follow the 'acc' metric
# name given at compile time.
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Each chart gets its own explicitly created figure so the accuracy curves
# can never land on a figure left open by an earlier cell, and both charts
# carry axis labels so they can be read on their own.
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(acc, color='b', label='Train Accuracy')
ax_acc.plot(val_acc, color='r', label='Val Accuracy')
ax_acc.set_title('Training and Validation Accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

fig_loss, ax_loss = plt.subplots()
ax_loss.plot(loss, color='b', label='Train Loss')
ax_loss.plot(val_loss, color='r', label='Val Loss')
ax_loss.set_title('Training and Validation Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

plt.show()