# FAKE NEWS CLASSIFICATION USING NLP TECHNIQUES


## PROJECT FLOW
1. Problem Statement
2. Data Gathering
3. Data Preprocessing
    A. Tokenization
    B. Convert to Lowercase
    C. Remove Stopwords
    D. Lemmatization / Stemming
4. Vectorization (Convert Text Data into Vector)
    A. Bag of Words
    B. TF-IDF
5. Model Building
6. Model Evaluation
7. Model Deployment
8. Prediction on Client Data

## REQUIRED LIBRARIES

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

## DATA GATHERING

In [None]:
df1 = pd.read_csv('DATASETS/politifact_fake.csv')
df1.head()

Unnamed: 0,id,news_url,title,tweet_ids
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...


In [None]:
df1.shape

(432, 4)

In [None]:
df1.isna().sum()

id            0
news_url      4
title         0
tweet_ids    40
dtype: int64

In [None]:
df1 = df1.dropna()

In [None]:
df1.isna().sum()

id           0
news_url     0
title        0
tweet_ids    0
dtype: int64

In [None]:
df1.shape

(389, 4)

In [None]:
# Assign the label as a scalar so pandas broadcasts it to every row; a list
# with a hard-coded length raises ValueError as soon as the row count changes.
df1['label'] = 'FAKE'
df1.head(10)

Unnamed: 0,id,news_url,title,tweet_ids,label
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,FAKE
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,FAKE
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,FAKE
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,FAKE
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,FAKE
5,politifact14404,gloria.tv/video/yRrtUtTCfPga6cq2VDJPcgQe4,Putin says: ‘Pope Francis Is Not A Man Of God’...,893290900637483009\t893290950700802048\t893290...,FAKE
7,politifact14795,https://web.archive.org/web/20171027105356/htt...,Saudi Arabia to Behead 6 School Girls for Bein...,923126512458616832\t923135295070990341\t923189...,FAKE
8,politifact14328,https://web.archive.org/web/20170702174006/htt...,Malia Obama Fired From Cushy Internship At Spa...,880455776107679747\t880457763876462598\t880461...,FAKE
9,politifact13775,http://beforeitsnews.com/opinion-conservative/...,Target to Discontinue Sale of Holy Bible,732741826084397057\t732741823534227456\t732741...,FAKE
12,politifact14376,http://dailyfeed.news/barack-obama-tweets-sick...,Barack Obama Tweets SICK Attack On John McCain...,888028346742964224\t888130872934780928\t888190...,FAKE


In [None]:
# drop=True discards the old (gappy) row labels instead of storing them in a
# new 'index' column. Keeping that column on df1 only would leave it NaN for
# every df2 row after pd.concat, and the later df.dropna() would then remove
# all REAL samples.
df1 = df1.reset_index(drop=True)
df1.head(10)

In [None]:
df2 = pd.read_csv('DATASETS/politifact_real.csv')
df2.head()

Unnamed: 0,id,news_url,title,tweet_ids
0,politifact14984,http://www.nfib-sbet.org/,National Federation of Independent Business,967132259869487105\t967164368768196609\t967215...
1,politifact12944,http://www.cq.com/doc/newsmakertranscripts-494...,comments in Fayetteville NC,942953459\t8980098198\t16253717352\t1668513250...
2,politifact333,https://web.archive.org/web/20080204072132/htt...,"Romney makes pitch, hoping to close deal : Ele...",
3,politifact4358,https://web.archive.org/web/20110811143753/htt...,Democratic Leaders Say House Democrats Are Uni...,
4,politifact779,https://web.archive.org/web/20070820164107/htt...,"Budget of the United States Government, FY 2008",89804710374154240\t91270460595109888\t96039619...


In [None]:
df2.shape

(624, 4)

In [None]:
df2.isna().sum()

id             0
news_url      57
title          0
tweet_ids    215
dtype: int64

In [None]:
df2 = df2.dropna()

In [None]:
df2.isna().sum()

id           0
news_url     0
title        0
tweet_ids    0
dtype: int64

In [None]:
df2.shape

(373, 4)

In [None]:
# Assign the label as a scalar so pandas broadcasts it to every row; a list
# with a hard-coded length raises ValueError as soon as the row count changes.
df2['label'] = 'REAL'
df2.head()

Unnamed: 0,id,news_url,title,tweet_ids,label
0,politifact14984,http://www.nfib-sbet.org/,National Federation of Independent Business,967132259869487105\t967164368768196609\t967215...,REAL
1,politifact12944,http://www.cq.com/doc/newsmakertranscripts-494...,comments in Fayetteville NC,942953459\t8980098198\t16253717352\t1668513250...,REAL
4,politifact779,https://web.archive.org/web/20070820164107/htt...,"Budget of the United States Government, FY 2008",89804710374154240\t91270460595109888\t96039619...,REAL
5,politifact14064,http://www.politifact.com/truth-o-meter/statem...,Donald Trump exaggerates when he says China ha...,690248006399049728\t690254026663821312\t690276...,REAL
6,politifact14474,https://www.law.cornell.edu/constitution/amend...,25th Amendment,1262604762\t10969740933\t11182364398\t17507543...,REAL


In [None]:
df = pd.concat([df1,df2])
df.head()

Unnamed: 0,id,news_url,title,tweet_ids,label
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,FAKE
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,FAKE
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,FAKE
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,FAKE
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,FAKE


In [None]:
df.reset_index(inplace=True)
df.head()

Unnamed: 0,index,id,news_url,title,tweet_ids,label
0,0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,FAKE
1,1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,FAKE
2,2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,FAKE
3,3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,FAKE
4,4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,FAKE


In [None]:
df.isna().sum()

index        0
id           0
news_url     0
title        0
tweet_ids    0
label        0
dtype: int64

In [None]:
df.shape

(762, 6)

In [None]:
# NOTE(review): the path 'DATASETS/.csv' has no file stem, and this statement
# rebinds `df`, replacing the concatenated df1/df2 frame built in the cells
# above — confirm which dataset is actually intended here.
df = pd.read_csv('DATASETS/.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


## DATA ANALYSIS

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 762 entries, 0 to 761
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   index      762 non-null    int64 
 1   id         762 non-null    object
 2   news_url   762 non-null    object
 3   title      762 non-null    object
 4   tweet_ids  762 non-null    object
 5   label      762 non-null    object
dtypes: int64(1), object(5)
memory usage: 35.8+ KB


In [None]:
df['label'].value_counts()

FAKE    389
REAL    373
Name: label, dtype: int64

In [None]:
df.shape

(762, 6)

In [None]:
df.isna().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [None]:
df = df.dropna()

In [None]:
df.isnull().sum()

title    0
label    0
dtype: int64

In [None]:
df.shape

(6335, 4)

In [None]:
#df.reset_index(inplace=True)
df.head()

Unnamed: 0,index,id,news_url,title,tweet_ids,label
0,0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,FAKE
1,1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,FAKE
2,2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,FAKE
3,3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,FAKE
4,4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,FAKE


In [None]:
#df['content'] = df['title'] + ' by ' + df['author']

In [None]:
df = df.drop(['id','news_url','tweet_ids','index'],axis=1)

In [None]:
df.head(10)

Unnamed: 0,title,label
0,BREAKING: First NFL Team Declares Bankruptcy O...,FAKE
1,Court Orders Obama To Pay $400 Million In Rest...,FAKE
2,UPDATE: Second Roy Moore Accuser Works For Mic...,FAKE
3,Oscar Pistorius Attempts To Commit Suicide,FAKE
4,Trump Votes For Death Penalty For Being Gay,FAKE
5,Putin says: ‘Pope Francis Is Not A Man Of God’...,FAKE
6,Saudi Arabia to Behead 6 School Girls for Bein...,FAKE
7,Malia Obama Fired From Cushy Internship At Spa...,FAKE
8,Target to Discontinue Sale of Holy Bible,FAKE
9,Barack Obama Tweets SICK Attack On John McCain...,FAKE


In [None]:
df['title'][19]

'Julia Roberts miracle baby at 50'

## DATA PREPROCESSING

### 1. TOKENIZATION

In [None]:
sample = 'The quick brown fox jumps over the lazy dog'
sample = sample.split()
sample

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

### 2. MAKE LOWERCASE

In [None]:
sample = [data.lower() for data in sample]
sample

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

### 3. REMOVE STOPWORDS

In [None]:
stop = stopwords.words('english')
print(stop[:10])
print(len(stop))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
179


In [None]:
sample = [data for data in sample if data not in stop]
sample

['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']

### 4. STEMMING

In [None]:
ps = PorterStemmer()
sample_stemming = [ps.stem(data) for data in sample]
sample_stemming

['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']

### 5. LEMMATIZATION

In [None]:
lm = WordNetLemmatizer()
sample_lemma = [lm.lemmatize(data) for data in sample]
sample_lemma

In [None]:
lm = WordNetLemmatizer()
# Build the stopword set once: set membership is O(1), and this avoids
# re-reading the NLTK stopword list for every single token.
stop_words = set(stopwords.words('english'))
corpus = []
# Iterate over the values directly so the loop does not depend on the frame
# having a contiguous 0..n-1 index.
for title in df['title']:
    # Replace every character that is not a letter or a digit with a space.
    # The brackets are required: without them the pattern only matches the
    # literal text "a-zA-Z0-9" at the start of the string and removes nothing.
    review = re.sub(r'[^a-zA-Z0-9]', ' ', title)
    tokens = review.lower().split()
    tokens = [lm.lemmatize(token) for token in tokens if token not in stop_words]
    corpus.append(' '.join(tokens))

In [None]:
len(corpus)

6335

In [None]:
df['title'][0]

'Did Miley Cyrus and Liam Hemsworth secretly get married?'

In [None]:
corpus[0]

'miley cyrus liam hemsworth secretly get married?'

## VECTORIZATION

In [None]:
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split performed further down, so the held-out rows contribute to
# the vocabulary and IDF weights — consider fitting on the training rows only.
tf = TfidfVectorizer()
# fit_transform's result is converted to a dense array via .toarray().
x = tf.fit_transform(corpus).toarray()
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
y = df['label']
y.head()

0    FAKE
1    FAKE
2    FAKE
3    FAKE
4    FAKE
Name: label, dtype: object

### SPLITTING DATA INTO TRAIN AND TEST

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=10, stratify=y)

In [None]:
len(x_train),len(y_train),len(x_test),len(y_test)

(609, 609, 153, 153)

## MODEL BUILDING

In [None]:
# Hyperparameter search for the Random Forest classifier.
rf_classifier = RandomForestClassifier(random_state=42)

# Grid of hyperparameters to search
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be a leaf node
    # 'auto' is no longer accepted by current scikit-learn releases; for
    # classifiers it was an alias of 'sqrt', so dropping it loses nothing.
    'max_features': ['sqrt', 'log2'],  # Number of features to consider for the best split
}

# n_jobs=-1 runs the candidate fits in parallel across all available cores.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# Search on the training split only, so the held-out test rows never
# influence the choice of hyperparameters.
grid_search.fit(x_train, y_train)

# Best hyperparameter combination found by the search
best_params = grid_search.best_params_

# Estimator refitted on the training split with the best hyperparameters
best_rf_model = grid_search.best_estimator_


In [None]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

In [None]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

In [None]:
pac = PassiveAggressiveClassifier()
pac.fit(x_train, y_train)

In [None]:
svm_model = svm.SVC(kernel='linear', C=1.0)
svm_model.fit(x_train, y_train)

In [None]:
mnb = MultinomialNB()
mnb.fit(x_train, y_train)

## MODEL EVALUATION

In [None]:
y_pred_rf = rf.predict(x_test)
rf_ac = accuracy_score(y_test, y_pred_rf)
print(f'Accuracy Score for Random Forest: {round(rf_ac*100,5)}')

Accuracy Score for Random Forest: 77.29258


In [None]:
y_train_pred_rf = rf.predict(x_train)
rf_train_ac = accuracy_score(y_train, y_train_pred_rf)
print(f'Training Accuracy Score for Random Forest: {round(rf_train_ac*100,5)}')

Training Accuracy Score for Random Forest: 99.62477


In [None]:
y_pred_lr = lr.predict(x_test)
lr_ac = accuracy_score(y_test,y_pred_lr)
print(f'Accuracy Score for LR: {round(lr_ac*100,5)}')

Accuracy Score for LR: 82.09607


In [None]:
y_pred_pac = pac.predict(x_test)
pac_ac = accuracy_score(y_test,y_pred_pac)
print(f'Accuracy Score for PAC: {round(pac_ac*100,5)}')

Accuracy Score for PAC: 79.0393


In [None]:
y_pred_svm = svm_model.predict(x_test)
svm_ac = accuracy_score(y_test,y_pred_svm)
print(f'Accuracy Score for SVM: {round(svm_ac*100,5)}')

Accuracy Score for SVM: 81.22271


In [None]:
# Test-set accuracy of the Multinomial Naive Bayes model.
y_pred_mnb = mnb.predict(x_test)
mnb_ac = accuracy_score(y_test, y_pred_mnb)
# The message previously said "SVM", which mislabelled this model's score.
print(f'Accuracy Score for MNB: {round(mnb_ac*100,5)}')

Accuracy Score for SVM: 80.78603


In [None]:
class Evaluation:
    """Print evaluation metrics for an already-fitted classifier.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    x_train, x_test : feature matrices for the training and testing splits.
    y_train, y_test : true labels for the training and testing splits.

    Both public methods print their results and return ``None``.
    """

    def __init__(self, model, x_train, x_test, y_train, y_test):
        self.model = model
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test

    def _report(self, features, labels, split_name):
        """Print accuracy (in percent), confusion matrix and classification report."""
        predictions = self.model.predict(features)

        # Only the accuracy is scaled to a percentage.
        accuracy = accuracy_score(labels, predictions)
        print(f"\nAccuracy Score on {split_name} Data Set", accuracy * 100)

        # The confusion matrix holds raw sample counts and must not be scaled.
        matrix = confusion_matrix(labels, predictions)
        print(f"\nConfusion Matrix on {split_name} Data Set\n", matrix)

        # classification_report returns a string; multiplying it by 100 would
        # just repeat the report 100 times, so it is printed once as-is.
        report = classification_report(labels, predictions)
        print(f"\nClassification Report on {split_name} Data Set\n", report)

    def train_evaluation(self):
        """Print the metrics computed on the training split."""
        self._report(self.x_train, self.y_train, "Training")

    def test_evaluation(self):
        """Print the metrics computed on the testing split."""
        self._report(self.x_test, self.y_test, "Testing")

### RANDOM FOREST
Random Forest is an ensemble method that can be effective for fake news detection, especially when combined with informative features.

In [None]:
Evaluation(rf,x_train,x_test,y_train,y_test).test_evaluation()


Accuracy Score on Testing Data Set 99.3437841779074

Confusion Matrix on Testing Data Set
 [[308100   2800]
 [   800 236900]]

Confusion Matrix on Testing Data Set
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      3109
           1       0.99      1.00      0.99      2377

    accuracy                           0.99      5486
   macro avg       0.99      0.99      0.99      5486
weighted avg       0.99      0.99      0.99      5486
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      3109
           1       0.99      1.00      0.99      2377

    accuracy                           0.99      5486
   macro avg       0.99      0.99      0.99      5486
weighted avg       0.99      0.99      0.99      5486
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      3109
           1       0.99      1.00      0.99      2377

    accuracy      

### PASSIVE AGGRESSIVE CLASSIFIER
This is a popular choice for text classification tasks, including fake news detection. It's known for its efficiency and ability to adapt to data streams.

In [None]:
Evaluation(pac,x_train,x_test,y_train,y_test).test_evaluation()


Accuracy Score on Testing Data Set 93.05504921618666

Confusion Matrix on Testing Data Set
 [[286900  24000]
 [ 14100 223600]]

Confusion Matrix on Testing Data Set
               precision    recall  f1-score   support

           0       0.95      0.92      0.94      3109
           1       0.90      0.94      0.92      2377

    accuracy                           0.93      5486
   macro avg       0.93      0.93      0.93      5486
weighted avg       0.93      0.93      0.93      5486
              precision    recall  f1-score   support

           0       0.95      0.92      0.94      3109
           1       0.90      0.94      0.92      2377

    accuracy                           0.93      5486
   macro avg       0.93      0.93      0.93      5486
weighted avg       0.93      0.93      0.93      5486
              precision    recall  f1-score   support

           0       0.95      0.92      0.94      3109
           1       0.90      0.94      0.92      2377

    accuracy     

### LOGISTIC REGRESSION
Logistic regression is often used for binary classification problems like fake news detection. It's interpretable and can provide insights into feature importance.

In [None]:
Evaluation(lr,x_train,x_test,y_train,y_test).test_evaluation()

#SOLVERS:
# liblinear : 92.253
# sag : 92.271
# saga : 92.271
# newton-cg : 92.271
# lbfgs : 92.271


Accuracy Score on Testing Data Set 98.1407218374043

Confusion Matrix on Testing Data Set
 [[302300   8600]
 [  1600 236100]]

Confusion Matrix on Testing Data Set
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      3109
           1       0.96      0.99      0.98      2377

    accuracy                           0.98      5486
   macro avg       0.98      0.98      0.98      5486
weighted avg       0.98      0.98      0.98      5486
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      3109
           1       0.96      0.99      0.98      2377

    accuracy                           0.98      5486
   macro avg       0.98      0.98      0.98      5486
weighted avg       0.98      0.98      0.98      5486
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      3109
           1       0.96      0.99      0.98      2377

    accuracy      

### NAIVE BAYES

### SUPPORT VECTOR MACHINE
Linear SVMs are known for their good performance in text classification tasks. They can be applied to fake news detection problems.

In [None]:
# Pass the fitted SVC instance: `svm` is the imported sklearn.svm module and
# has no predict() method.
Evaluation(svm_model,x_train,x_test,y_train,y_test).test_evaluation()

# linear : 93.47
# rbf :
# poly :
# sigmoid :



Accuracy Score on Testing Data Set 93.47429821363471

Confusion Matrix on Testing Data Set
 [[282100  28800]
 [  7000 230700]]

Confusion Matrix on Testing Data Set
               precision    recall  f1-score   support

           0       0.98      0.91      0.94      3109
           1       0.89      0.97      0.93      2377

    accuracy                           0.93      5486
   macro avg       0.93      0.94      0.93      5486
weighted avg       0.94      0.93      0.93      5486
              precision    recall  f1-score   support

           0       0.98      0.91      0.94      3109
           1       0.89      0.97      0.93      2377

    accuracy                           0.93      5486
   macro avg       0.93      0.94      0.93      5486
weighted avg       0.94      0.93      0.93      5486
              precision    recall  f1-score   support

           0       0.98      0.91      0.94      3109
           1       0.89      0.97      0.93      2377

    accuracy     

### LSTM & GRU NETWORKS

### ENSEMBLE METHODS

## PREDICTION PIPELINE

In [None]:
class Preprocessing:
    """Clean a single raw text string before it is vectorized.

    Parameters
    ----------
    data : str
        The raw text (e.g. a news title) supplied by the user.
    """

    def __init__(self, data):
        self.data = data

    def text_preprocessing_user(self):
        """Return a one-element list holding the cleaned text.

        Steps: replace non-alphanumeric characters with spaces, lowercase,
        split on whitespace, drop English stopwords, lemmatize, re-join.
        The result is wrapped in a list so it can be passed straight to a
        vectorizer's ``transform``.
        """
        lm = WordNetLemmatizer()
        # Build the stopword set once instead of reloading the NLTK list for
        # every token.
        stop_words = set(stopwords.words('english'))

        # The brackets are required: without them the pattern only matches the
        # literal text "a-zA-Z0-9" at the start of the string and removes
        # nothing.
        review = re.sub(r'[^a-zA-Z0-9]', ' ', self.data)
        tokens = review.lower().split()
        lemmas = [lm.lemmatize(token) for token in tokens if token not in stop_words]
        return [' '.join(lemmas)]

In [None]:
class Prediction:
    """Classify one raw news string as fake or real with a fitted model.

    Parameters
    ----------
    pred_data : str
        Raw text to classify.
    model : fitted estimator exposing ``predict``.
    vectorizer : optional
        Fitted vectorizer exposing ``transform``. When omitted, the
        module-level ``tf`` vectorizer is used, as before.
    """

    # Encodings that mean "fake": the string class used in the `label` column
    # and the integer 0 that the original comparison checked for.
    _FAKE_LABELS = {'FAKE', '0'}

    def __init__(self, pred_data, model, vectorizer=None):
        self.pred_data = pred_data
        self.model = model
        self.vectorizer = vectorizer

    def prediction_model(self):
        """Return 'The News is Fake' or 'The News is Real' for ``pred_data``."""
        preprocess_data = Preprocessing(self.pred_data).text_preprocessing_user()
        vectorizer = tf if self.vectorizer is None else self.vectorizer
        data = vectorizer.transform(preprocess_data)
        prediction = self.model.predict(data)
        # Compare on the string form so both 'FAKE'/'REAL' and 0/1 encoded
        # targets are handled; comparing a string label to the integer 0 is
        # always False and would report every input as real.
        if str(prediction[0]).upper() in self._FAKE_LABELS:
            return 'The News is Fake'
        return 'The News is Real'

In [None]:
data = 'Breitbart'
Prediction(data,rf).prediction_model()

'The News is Fake'