### Extract zipfile

In [1]:
import zipfile

zip_ref = zipfile.ZipFile('fake_news.zip')
zip_ref.extractall()

In [2]:
import pandas as pd

df = pd.read_csv('WELFake_Dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


### clean the dataset

#### Nulls

In [3]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,558
text,39
label,0


In [4]:
df.shape

(72134, 4)

In [5]:
df = df.dropna()

In [6]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,0
text,0
label,0


In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
5,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1


#### reset index

* we have removed some rows so we have to reset the index

In [8]:
df = df.reset_index()

In [9]:
df.head()

Unnamed: 0.1,index,Unnamed: 0,title,text,label
0,0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
2,3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
3,4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
4,5,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1


### X and y

In [10]:
X = df['title']
y = df['label']

In [11]:
X.head()

Unnamed: 0,title
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
2,"Bobby Jindal, raised Hindu, uses story of Chri..."
3,SATAN 2: Russia unvelis an image of its terrif...
4,About Time! Christian Group Sues Amazon and SP...


#### we are  going to predict the that the news is whether fake or true by the title of that news

### steps for word to vect by the embadding layer

#### get the data

In [12]:
message = X.copy()
message.head()

Unnamed: 0,title
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
2,"Bobby Jindal, raised Hindu, uses story of Chri..."
3,SATAN 2: Russia unvelis an image of its terrif...
4,About Time! Christian Group Sues Amazon and SP...


#### clean the data

import the neccessarry libraries

In [13]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
corpus = []
ps = PorterStemmer()

for i in range(0,len(message)):

  line = re.sub('[^a-zA-Z0-9]',' ',message[i])
  line = line.lower()
  line = line.split()
  line = [ps.stem(word) for word in line if word not in stopwords.words('english')]
  line = ' '.join(line)

  corpus.append(line)



In [15]:
corpus

['law enforc high alert follow threat cop white 9 11bi blacklivesmatt fyf911 terrorist video',
 'unbeliev obama attorney gener say charlott rioter peac protest home state north carolina video',
 'bobbi jindal rais hindu use stori christian convers woo evangel potenti 2016 bid',
 'satan 2 russia unv imag terrifi new supernuk western world take notic',
 'time christian group sue amazon splc design hate group',
 'dr ben carson target ir never audit spoke nation prayer breakfast',
 'hous intel chair trump russia fake stori evid anyth video',
 'sport bar owner ban nfl game show true american sport like speak rural america video',
 'latest pipelin leak underscor danger dakota access pipelin',
 'gop senat smack punchabl alt right nazi internet',
 'may brexit offer would hurt cost eu citizen eu parliament',
 'schumer call trump appoint offici overse puerto rico relief',
 'watch hilari ad call question health age clinton crime famili boss',
 'chang expect espn polit agenda despit huge subscrib 

#### vocabulary size

In [16]:
voc_size = 5000

#### OHE

In [17]:
from tensorflow.keras.preprocessing.text import one_hot

ohe_docs = [one_hot(word,voc_size) for word in corpus]
ohe_docs

[[1354,
  2026,
  1350,
  2863,
  4986,
  3384,
  4269,
  3588,
  3371,
  2429,
  1092,
  615,
  4099,
  4339],
 [3210,
  4164,
  1656,
  3580,
  2294,
  4036,
  4031,
  959,
  4993,
  942,
  3564,
  2337,
  3989,
  4339],
 [889, 207, 3699, 859, 1597, 3947, 892, 3785, 805, 2839, 3068, 1865, 880],
 [3070, 1470, 1375, 3577, 4693, 2826, 2554, 2340, 351, 2072, 2012, 4058],
 [2379, 892, 2065, 3556, 3015, 3784, 1276, 2034, 2065],
 [1279, 4437, 4280, 1102, 3213, 88, 1803, 349, 2155, 2013, 1224],
 [3215, 1011, 4559, 3310, 1375, 4426, 3947, 1261, 4244, 4339],
 [3956,
  704,
  2983,
  1808,
  3288,
  4168,
  3182,
  4134,
  3530,
  3956,
  4578,
  375,
  3057,
  3280,
  4339],
 [2768, 3769, 2712, 3973, 159, 3773, 4566, 3769],
 [4115, 4661, 716, 2700, 2984, 3289, 3932, 4351],
 [2436, 3459, 4822, 2838, 3458, 4957, 3602, 3366, 3602, 822],
 [4804, 2861, 3310, 2724, 1338, 2760, 1857, 968, 1700],
 [2189, 3735, 3399, 2861, 4820, 1995, 4500, 2383, 1498, 427, 2148],
 [3925, 3099, 3714, 2237, 3141, 2877, 

#### padding

In [18]:
df['count_by_words'] = df['title'].apply(lambda x: len(str(x).split()))
df['count_by_words'].max()

72

In [19]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

eq_sent_length = 75

pad_ohe_docs = pad_sequences(ohe_docs,padding = 'pre',maxlen=eq_sent_length)
pad_ohe_docs

array([[   0,    0,    0, ...,  615, 4099, 4339],
       [   0,    0,    0, ..., 2337, 3989, 4339],
       [   0,    0,    0, ..., 3068, 1865,  880],
       ...,
       [   0,    0,    0, ..., 4751, 3036, 1188],
       [   0,    0,    0, ...,  863, 4285, 1513],
       [   0,    0,    0, ..., 2180, 2383, 4545]], dtype=int32)

#### train test split

In [20]:
from sklearn.model_selection import train_test_split
import numpy as np

X_train,X_test,y_train,y_test = train_test_split(pad_ohe_docs,np.array(y),test_size=0.2,random_state=42)

#### Model Creation & Embadding layer :

* Converting word to vect and applying LSTM RNN on it

In [21]:
from tensorflow.keras.layers import Embedding,LSTM,Dense,Bidirectional
from tensorflow.keras.models import Sequential




#### initialize the model : change for the use of **bidirectional** lstm

In [22]:

model = Sequential([
    Embedding(input_dim=voc_size,
              output_dim=20,
              input_shape = (eq_sent_length,)),

    Bidirectional(LSTM(100)),         ## small change to use bidirectional lstm

    Dense(1,activation='sigmoid')
])

  super().__init__(**kwargs)


compile the model

In [23]:
model.compile(loss='binary_crossentropy',  # becuase of binary classification
              optimizer = 'adam',
              metrics = ['accuracy'])

fit the model

In [24]:
model.fit(X_train,y_train,
          batch_size=32,
          epochs = 10,
          validation_data=(X_test,y_test),
          )

Epoch 1/10
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 11ms/step - accuracy: 0.8091 - loss: 0.3997 - val_accuracy: 0.8958 - val_loss: 0.2510
Epoch 2/10
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 11ms/step - accuracy: 0.9087 - loss: 0.2281 - val_accuracy: 0.8994 - val_loss: 0.2449
Epoch 3/10
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 11ms/step - accuracy: 0.9269 - loss: 0.1870 - val_accuracy: 0.8982 - val_loss: 0.2469
Epoch 4/10
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 11ms/step - accuracy: 0.9357 - loss: 0.1658 - val_accuracy: 0.8982 - val_loss: 0.2511
Epoch 5/10
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 11ms/step - accuracy: 0.9444 - loss: 0.1452 - val_accuracy: 0.8974 - val_loss: 0.2657
Epoch 6/10
[1m1789/1789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 11ms/step - accuracy: 0.9515 - loss: 0.1277 - val_accuracy: 0.8978 - val_loss: 0.2761
Epoc

<keras.src.callbacks.history.History at 0x794aab2ec530>

predict the X_test

In [25]:
y_pred_prob = model.predict(X_test)
y_pred_prob

[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step


array([[8.9596942e-02],
       [1.4106522e-04],
       [1.2222596e-05],
       ...,
       [9.9997401e-01],
       [2.7321423e-03],
       [9.9567997e-01]], dtype=float32)

In [26]:
y_pred = np.where( y_pred_prob >= 0.5,1,0)
y_pred

array([[0],
       [0],
       [0],
       ...,
       [1],
       [0],
       [1]])

classification report

In [27]:
from sklearn.metrics import classification_report

print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90      6938
           1       0.91      0.89      0.90      7370

    accuracy                           0.90     14308
   macro avg       0.90      0.90      0.90     14308
weighted avg       0.90      0.90      0.90     14308



confusion matrix

In [8]:
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix
import matplotlib.pyplot as plt


cm = confusion_matrix(y_test,y_pred)

d = ConfusionMatrixDisplay(cm)
d.plot(cmap='Blues_r')
plt.xlabel('y_test')
plt.ylabel('y_pred')

NameError: name 'y_test' is not defined