## Fake News Classifier Using LSTM

Dataset: https://www.kaggle.com/c/fake-news/data#

#### Steps
1. Import the dataset
2. Split it into dependent and independent features
3. Clean the data (stemming, stop-word removal)
4. Fix a sentence length for the input and pad shorter sequences
5. One-hot representation, Embedding layer
6. LSTM neural network

In [None]:
import pandas as pd

In [None]:
# Load the Kaggle fake-news training set.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is its replacement and likewise drops malformed rows
# instead of raising.
df=pd.read_csv('/content/drive/MyDrive/class/9_deep_learning/rnn/lstm/fake_news_classifier/train.csv',on_bad_lines='skip',engine='python')
df.head()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
### Drop every row that has a missing value in any column
df = df.dropna(axis=0, how='any')

In [None]:
df.head()

In [None]:
## Independent features: every column except the target
x = df.drop(columns=['label'])

In [None]:
## Get the Dependent features
# Target column; it is converted to a NumPy array (y_final) before training.
y = df['label']

In [None]:
x.shape

In [None]:
y.shape

In [None]:
import tensorflow as tf

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [None]:
### Vocabulary size
# Passed to one_hot() as the bound for word indices and to the Embedding
# layer as its input dimension, so both must keep using this same value.
voc_size=5000

### One-Hot Representation

In [None]:
# Work on a copy of the features so the preprocessing below leaves x untouched.
messages=x.copy()

In [None]:
messages['title'][1]

In [None]:
messages.head(10)

In [None]:
# dropna() left gaps in the index; renumber rows 0..n-1 so the positional
# lookups in the preprocessing loop resolve. The old index is kept as a column.
messages = messages.reset_index()

In [None]:
messages.head(10)

In [None]:
import nltk
import re
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
### Dataset Preprocessing
# Turn every title into a cleaned, stemmed, space-joined string.

# Build the stop-word set and compile the pattern once. The original called
# stopwords.words('english') for every single token and tested membership
# against a list, which dominates the runtime on a dataset of this size.
stop_words = set(stopwords.words('english'))
non_letters = re.compile(r'[^a-zA-Z]')

corpus = []
# Iterate the column directly instead of indexing by label, so the loop does
# not depend on the index being exactly 0..n-1.
for title in messages['title']:
    # Keep letters only, lowercase, and tokenise on whitespace.
    tokens = non_letters.sub(' ', title).lower().split()
    # Drop stop words, stem the remainder, and re-join into one string.
    review = ' '.join(ps.stem(word) for word in tokens if word not in stop_words)
    corpus.append(review)

In [None]:
corpus

In [None]:
corpus[1]

In [None]:
# Hash each cleaned title into a list of integer word indices bounded by voc_size.
onehot_repr = list(map(lambda sentence: one_hot(sentence, voc_size), corpus))
onehot_repr

In [None]:
corpus[1]

In [None]:
onehot_repr[1]

### Embedding Representation

In [None]:
# Bring every index sequence to the same fixed length; padding='post' pads
# shorter sequences at the end.
sent_length = 20
embedded_docs = pad_sequences(onehot_repr, maxlen=sent_length, padding='post')
print(embedded_docs)

In [None]:
embedded_docs[1]

In [None]:
## Creating model
# Embedding -> LSTM(100) -> single sigmoid unit for binary fake/real output.
embedding_vector_features=40 ##features representation
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

In [None]:
len(embedded_docs),y.shape

In [None]:
import numpy as np

# Materialise the padded sequences and the labels as NumPy arrays for
# splitting and training.
x_final, y_final = np.array(embedded_docs), np.array(y)

In [None]:
x_final.shape,y_final.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_final,
    y_final,
    test_size=0.33,
    random_state=42,
)

## Model Training

In [None]:
# Train for 10 epochs in mini-batches of 64, reporting metrics on the
# held-out split after each epoch.
model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
)

## Performance Metrics And Accuracy

In [None]:
# Raw outputs of the model's single sigmoid unit for the held-out rows;
# they are thresholded into 0/1 labels in the next cell.
y_pred=model.predict(x_test)

In [None]:
# Convert the sigmoid outputs into hard labels: 1 when strictly above 0.5, else 0.
y_pred = (y_pred > 0.5).astype(int)

In [None]:
y_pred

In [None]:
y_test

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Counts of true vs. predicted labels on the held-out split.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import classification_report

# Text report of the per-class metrics on the held-out split.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

### Adding Dropout

In [None]:
from tensorflow.keras.layers import Dropout

## Same architecture as before, with Dropout(0.3) after the embedding and
## after the LSTM. This rebinds `model`, replacing the trained model above.
embedding_vector_features = 40
model = Sequential([
    Embedding(voc_size, embedding_vector_features, input_length=sent_length),
    Dropout(0.3),
    LSTM(100),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])