In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import seaborn as sns
from wordcloud import WordCloud

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Conv1D, MaxPool1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Load the fake-news articles. The relative path uses forward slashes so it
# resolves on Windows, Linux and macOS alike (a backslash-separated path is
# only understood as a directory path on Windows).
fake = pd.read_csv('./dataset/fake.csv')
fake.head()

In [None]:
# Number of fake articles for each distinct value of the 'subject' column.
fake['subject'].value_counts()

In [None]:
# Bar chart of how many fake articles belong to each subject.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=fake, x='subject', ax=ax)

In [None]:
## Word cloud: WordCloud.generate() takes one single string, so every fake
## article body is concatenated into one space-separated text.
text = ' '.join(fake['text'])

In [None]:
# Build the word cloud on a 1920x1080 canvas and show it on a square figure
# with the axes hidden and no padding around the image.
wordcloud = WordCloud(width=1920, height=1080).generate(text)
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud)
ax.axis('off')
fig.tight_layout(pad=0)
plt.show()

In [None]:
# Load the real-news articles. The relative path uses forward slashes so it
# resolves on Windows, Linux and macOS alike (a backslash-separated path is
# only understood as a directory path on Windows).
real = pd.read_csv("./dataset/True.csv")
real.head()

In [None]:
# Word cloud of the real articles: join every body into one string, render it
# on a 1920x1080 canvas and show it without axes or padding.
text = ' '.join(real['text'])
wordcloud = WordCloud(width=1920, height=1080).generate(text)
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud)
ax.axis('off')
fig.tight_layout(pad=0)
plt.show()

In [None]:
# Collect the positions of real articles whose text does not start with a
# recognisable "<publisher> - <body>" prefix. A row counts as "unknown
# publisher" when:
#   * it is not a string (e.g. a missing value),
#   * it has no ' - ' separator at all (this also covers empty text), or
#   * the part before the first ' - ' is 260 characters or longer, i.e. too
#     long to be a publisher prefix (the original threshold, chosen from the
#     tweet length).
# Explicit conditions replace the previous assert + bare `except:`, which
# swallowed every exception (including KeyboardInterrupt) and would stop
# checking anything when Python runs with assertions disabled (-O).
MAX_PUBLISHER_LEN = 260

unknown_publisher = []
for index, row in enumerate(real.text.values):
    if not isinstance(row, str):
        unknown_publisher.append(index)
        continue
    record = row.split(' - ', maxsplit=1)
    if len(record) != 2 or len(record[0]) >= MAX_PUBLISHER_LEN:
        unknown_publisher.append(index)

In [None]:
# How many real articles were flagged as having no identifiable publisher.
len(unknown_publisher)

In [None]:
# Separate the publisher prefix from the article body for every real article.
# Rows flagged in `unknown_publisher` keep their full text and get the
# placeholder publisher 'Unknown'; all other rows are split on the first ' - '.
# The flagged positions are copied into a set so each membership test is O(1)
# instead of a linear scan of the list for every row.
unknown_positions = set(unknown_publisher)

temp_text = []
publisher = []

for index, row in enumerate(real.text.values):
    if index in unknown_positions:
        publisher.append('Unknown')
        temp_text.append(row)
    else:
        record = row.split(' - ', maxsplit=1)
        publisher.append(record[0])
        temp_text.append(record[1])


In [None]:
# Store the extracted publisher as a new column and replace 'text' with the
# body that no longer contains the publisher prefix.
real = real.assign(text=temp_text, publisher=publisher)

In [None]:
# Preview the real articles after the publisher/text split.
real.head()

Fake news analysis

In [None]:
# Positions of fake articles whose text is empty or whitespace-only.
fake_empty_index = []
for position, body in enumerate(fake.text.tolist()):
    if not str(body).strip():
        fake_empty_index.append(position)
len(fake_empty_index)

In [None]:
# Inspect the fake articles that have an empty text body.
fake.iloc[fake_empty_index]

In [None]:
## The fake dataframe has 630 rows with a missing text body, so for training
## the title is prepended to the text (separated by one space) in both frames.
for frame in (real, fake):
    frame['text'] = frame['title'] + " " + frame['text']

Text Pre-processing

In [None]:
# Normalise case: convert every value to a string and lowercase it.
for frame in (real, fake):
    frame['text'] = frame['text'].map(lambda value: str(value).lower())

In [None]:
# Target labels: 1 marks a real article, 0 marks a fake one.
real['class'], fake['class'] = 1, 0

In [None]:
# List the columns currently present in the real dataframe.
real.columns

In [None]:
# Keep only the model input ('text') and the target ('class') in both frames.
kept_columns = ['text', 'class']
real, fake = real[kept_columns], fake[kept_columns]

In [None]:
# Stack the real and fake articles into one dataset. ignore_index=True gives
# the result a fresh 0..n-1 index; without it both frames keep their own
# 0-based labels, so every label would appear twice and label-based lookups
# on `data` would be ambiguous.
cols = [real, fake]
data = pd.concat(cols, ignore_index=True)
data.sample(5)

In [None]:
## Remove special characters from the text

# !pip install spacy
# !python -m spacy download en_core_web_sm
# !pip install beautifulsoup4
# !pip install textblob

In [None]:
def remove_special_chars(x):
    """Strip punctuation/symbols from a string and normalise its whitespace.

    Every run of characters that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted. Whitespace of any kind (spaces,
    tabs, newlines) is then collapsed to single spaces and trimmed from both
    ends.

    Whitespace is deliberately kept by the regex: deleting newlines or tabs
    there would glue the neighbouring words together ("one\\ntwo" -> "onetwo")
    before the split/join step had a chance to normalise them.

    Args:
        x: The input text (str).

    Returns:
        The cleaned, single-spaced string.
    """
    x = re.sub(r'[^\w\s]+', "", x)
    x = ' '.join(x.split())
    return x


In [None]:
# Clean every article and store the result back in the dataframe. Series.apply
# returns a new Series, so without the assignment the cleaned text would only
# be displayed and then discarded, leaving `data['text']` unchanged.
data['text'] = data['text'].apply(remove_special_chars)
data['text'].head()

Vectorization - word2vec

In [None]:

# !pip install gensim

import gensim


In [None]:
# Target vector: the class labels as a NumPy array.
y = data['class'].to_numpy()

In [None]:
## Build a list of word lists: each article is split on whitespace.
X = []
for document in data['text']:
    X.append(document.split())

In [None]:
# Train Word2Vec on the tokenised articles.
#   vector_size=DIM : dimensionality of each word vector
#   window=10       : context window size
#   min_count=1     : no word is dropped for being rare
#   sg=1            : skip-gram training
DIM = 100
w2v_model = gensim.models.Word2Vec(
    sentences=X,
    vector_size=DIM,
    window=10,
    min_count=1,
    sg=1,
)

In [None]:
# Sanity check: show the learned vector for the word 'india'.
w2v_model.wv['india']

In [None]:
# Fit a Keras Tokenizer on the tokenised articles to build the word -> integer
# index mapping.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

In [None]:
# Replace every word with its integer index from the fitted tokenizer.
X = tokenizer.texts_to_sequences(X)
# Preview only the first 10 indices of the first 3 articles; printing the whole
# of X would dump every sequence of the corpus into the cell output.
[sequence[:10] for sequence in X[:3]]

In [None]:
## word_index maps each word to its integer index. Show only the first 10
## entries; displaying the whole dict would print the entire vocabulary.
dict(list(tokenizer.word_index.items())[:10])

In [None]:
## Inspect the distribution of article lengths (in tokens) to choose a
## padding/truncation length.
sequence_lengths = [len(sequence) for sequence in X]
plt.hist(sequence_lengths, bins=700)
plt.show()

In [None]:
# Bring every sequence to a fixed length of `maxlen` tokens so the articles can
# be fed to the network as one rectangular array.
maxlen = 1000
X = pad_sequences(X, maxlen=maxlen)

In [None]:
# Display the padded sequence array.
X

In [None]:
## The tokenizer's word indices start at 1; the extra row (+1) accounts for
## index 0, which is the value used for padding.
vocab = tokenizer.word_index
vocab_size = len(vocab) + 1

In [None]:
def get_weight_matrix(model, word_index=None, dim=None):
    """Build an embedding matrix whose row ``i`` is the vector of word index ``i``.

    Args:
        model: Object exposing ``model.wv``, a mapping that supports
            ``word in model.wv`` and ``model.wv[word]`` (e.g. a trained gensim
            Word2Vec model).
        word_index: Mapping of word -> integer row index. Defaults to the
            notebook-level ``vocab``; in that case the number of rows is the
            notebook-level ``vocab_size``. When given explicitly, the number
            of rows is the largest index plus one.
        dim: Length of each word vector. Defaults to the notebook-level ``DIM``.

    Returns:
        A ``(rows, dim)`` NumPy array. Rows with no corresponding word (such as
        row 0) and rows for words that ``model.wv`` does not contain stay
        all-zero.
    """
    if word_index is None:
        word_index = vocab
        n_rows = vocab_size
    else:
        n_rows = max(word_index.values(), default=0) + 1
    if dim is None:
        dim = DIM

    weight_matrix = np.zeros((n_rows, dim))

    for word, i in word_index.items():
        # Skip words the embedding model has never seen instead of raising
        # KeyError; their rows remain zero vectors.
        if word in model.wv:
            weight_matrix[i] = model.wv[word]

    return weight_matrix

In [None]:
# Embedding matrix built from the trained Word2Vec model, one row per word index.
embedding_vectors = get_weight_matrix(w2v_model)

In [None]:
# Binary classifier: a frozen embedding layer initialised with the Word2Vec
# vectors, a 128-unit LSTM, and a single sigmoid output unit.
model = Sequential([
    Embedding(vocab_size, output_dim=DIM, weights=[embedding_vectors], input_length=maxlen, trainable=False),
    LSTM(units=128),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Hold out 20% of the samples for testing. A fixed random_state makes the
# shuffle, and therefore the split and every metric computed from it,
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train for 6 epochs, setting aside 30% of the training data for validation.
model.fit(x_train, y_train, validation_split=0.3, epochs=6)

In [None]:
# Turn the sigmoid outputs into hard 0/1 predictions using a 0.5 threshold.
probabilities = model.predict(x_test)
y_pred = (probabilities >= 0.5).astype(int)

In [None]:
# Display the predicted class labels for the test set.
y_pred 

In [None]:
# Accuracy of the thresholded predictions against the held-out test labels.
accuracy_score(y_test, y_pred)

In [None]:
# classification_report returns a formatted string, so print() is used to
# render its line breaks.
print(classification_report(y_test, y_pred))