#Importing Libraries

In [1]:
import numpy as np 
import pandas as pd
import re
import nltk
# from nltk.tokenize import tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

#Data Gathering

In [2]:
news_data = pd.read_csv("/content/drive/MyDrive/train.csv")

#Data Analysis

In [3]:
news_data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


The news is classified as:
*   1: *Unreliable*
*   0: *Reliable*





In [4]:
news_data.shape

(20800, 5)

In [5]:
news_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [6]:
news_data['label'].value_counts()

1    10413
0    10387
Name: label, dtype: int64

The value count of both *Reliable and Unreliable* labels are almost equal so no need of balancing the column.

In [7]:
news_data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

Handling the null values by dropping them.

In [8]:
news_data = news_data.dropna()

In [9]:
news_data.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [10]:
news_data.shape

(18285, 5)

In [11]:
news_data.head(10)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0
10,10,Obama’s Organizing for Action Partners with So...,Aaron Klein,"Organizing for Action, the activist group that...",0
11,11,"BBC Comedy Sketch ""Real Housewives of ISIS"" Ca...",Chris Tomlinson,The BBC produced spoof on the “Real Housewives...,0


Resetting the index of the dataframe after dropping the null valued columns.

In [12]:
news_data.reset_index(inplace = True)

In [13]:
news_data.head()

Unnamed: 0,index,id,title,author,text,label
0,0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


Extracting the required dataset.

In [14]:
news_data['data'] = news_data['author']+" "+news_data['title']

In [15]:
news_data.head()

Unnamed: 0,index,id,title,author,text,label,data
0,0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...
3,3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy Iranian woman jailed for fictio...


In [16]:
news_data = news_data.drop(['id', 'author','text', 'title'] , axis= 1)

In [17]:
news_data.head(10)

Unnamed: 0,index,label,data
0,0,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,1,Consortiumnews.com Why the Truth Might Get You...
3,3,1,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,1,Howard Portnoy Iranian woman jailed for fictio...
5,5,0,Daniel Nussbaum Jackie Mason: Hollywood Would ...
6,7,0,Alissa J. Rubin Benoît Hamon Wins French Socia...
7,9,0,Megan Twohey and Scott Shane A Back-Channel Pl...
8,10,0,Aaron Klein Obama’s Organizing for Action Part...
9,11,0,"Chris Tomlinson BBC Comedy Sketch ""Real Housew..."


#Data Preprocessing


The following steps are to be taken:
- Remove symbols
- Convert String to lowercase
- Tokenize the string
- Remove Stopwords
- Lemmatization
- Join the list to a string
- Append in column








In [18]:
import spacy
nlp = spacy.load('en_core_web_sm')



In [19]:
corpus = []

for i in range(len(news_data)):
  content = re.sub('a-zA-Z0-9', ' ', news_data['data'][i]) 
  content = content.lower()
  content = nlp(content)
  content = [token for token in content if not nlp.vocab[token.text].is_stop]
  content = [token.lemma_ for token in content]
  content = " ".join(content)
  corpus.append(content)

In [20]:
len(corpus)

18285

In [21]:
news_data['data'][0]
corpus[0]

'darrell lucus house dem aide : comey letter jason chaffetz tweet'

#Vectorization

In [22]:
tf = TfidfVectorizer()
X = tf.fit_transform(corpus).toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [23]:
Y = news_data['label']
Y.head()

0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

#Splitting the dataset into training and testing sets



In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 10, stratify= Y)

In [25]:
len(X_train), len(Y_train)

(12799, 12799)

In [26]:
len(X_test), len(Y_test)

(5486, 5486)

#Model Building

In [27]:
rf = RandomForestClassifier()
rf.fit(X_train, Y_train)

#Model Evaluation

In [28]:
Y_pred = rf.predict(X_test)
accuracy_score(Y_test, Y_pred)

0.993437841779074

In [29]:
class Evaluation:
  def __init__(self, model, x_train, x_test, y_train, y_test):
    self.model = model
    self.x_train = x_train
    self.x_test = x_test
    self.y_train = y_train
    self.y_test = y_test

  def training_evaluation(self):
    y_pred_train = self.model.predict(self.x_train)
    acc_scr_train = accuracy_score(self.y_train, y_pred_train)
    print("The Accuracy Score of the model on testing data is: \n",acc_scr_train, "\n")
    
    con_mat_train = confusion_matrix(self.y_train, y_pred_train)
    print("The Confusion Matrix of the training data prediction is:\n",con_mat_train, "\n")

    class_report = classification_report(self.y_train, y_pred_train , target_names = ["Reliable", "Unreliable"])
    print("The Classification Report on the training data is: \n",class_report)

  def testing_evaluation(self):
    y_pred_test = self.model.predict(self.x_test)
    acc_scr_train = accuracy_score(self.y_test, y_pred_test)
    print("The Accuracy Score of the model on testing data is: \n",acc_scr_train, "\n")
    
    con_mat_train = confusion_matrix(self.y_test, y_pred_test)
    print("The Confusion Matrix of the testing data prediction is:\n",con_mat_train, "\n")

    class_report = classification_report(self.y_test, y_pred_test, target_names = ["Reliable", "Unreliable"])
    print("The Classification Report on the testing data is: \n",class_report)



### Chechking the accuracy on training dataset



In [30]:
Evaluation(rf, X_train, X_test, Y_train, Y_test).training_evaluation()

The Accuracy Score of the model on testing data is: 
 1.0 

The Confusion Matrix of the training data prediction is:
 [[7252    0]
 [   0 5547]] 

The Classification Report on the training data is: 
               precision    recall  f1-score   support

    Reliable       1.00      1.00      1.00      7252
  Unreliable       1.00      1.00      1.00      5547

    accuracy                           1.00     12799
   macro avg       1.00      1.00      1.00     12799
weighted avg       1.00      1.00      1.00     12799



### Chechking the accuracy on testing dataset


In [31]:
Evaluation(rf, X_train, X_test, Y_train, Y_test).testing_evaluation()

The Accuracy Score of the model on testing data is: 
 0.993437841779074 

The Confusion Matrix of the testing data prediction is:
 [[3081   28]
 [   8 2369]] 

The Classification Report on the testing data is: 
               precision    recall  f1-score   support

    Reliable       1.00      0.99      0.99      3109
  Unreliable       0.99      1.00      0.99      2377

    accuracy                           0.99      5486
   macro avg       0.99      0.99      0.99      5486
weighted avg       0.99      0.99      0.99      5486



#Prediction Pipeline

In [32]:
class Preprocessing:
  
  def __init__(self, data):
    self.data = data

  def text_preprocessing(self):
    prediction_data = [self.data]
    preprocessed_data = []

    for data in prediction_data:
      content = re.sub('a-zA-Z0-9', ' ', data) 
      content = content.lower()
      content = nlp(content)
      content = [token for token in content if not nlp.vocab[token.text].is_stop]
      content = [token.lemma_ for token in content]
      content = " ".join(content)
      preprocessed_data.append(content)
    
    return preprocessed_data


In [33]:
news_data['data'][5]

'Daniel Nussbaum Jackie Mason: Hollywood Would Love Trump if He Bombed North Korea over Lack of Trans Bathrooms (Exclusive Video) - Breitbart'

In [34]:
data = 'Jackie Mason: Hollywood Would Love Trump if He Bombed North Korea over Lack of Trans Bathrooms (Exclusive Video) - Breitbart'
Preprocessing(data).text_preprocessing()

['jackie mason : hollywood love trump bomb north korea lack tran bathroom ( exclusive video ) - breitbart']

In [35]:
class Prediction:
  
  def __init__(self, prediction_data, model):
    self.prediction_data = prediction_data
    self.model= model

  def prediction(self):
    preprocessed_data = Preprocessing(self.prediction_data).text_preprocessing()
    data_vector = tf.transform(preprocessed_data)
    prediction = self.model.predict(data_vector)

    if prediction[0] == 0:
      print("The news is reliable.")
    else :
      print("The news is fake!")

In [36]:
news_data['data'][5]

'Daniel Nussbaum Jackie Mason: Hollywood Would Love Trump if He Bombed North Korea over Lack of Trans Bathrooms (Exclusive Video) - Breitbart'

In [37]:
data = "Daniel Nussbaum Jackie Mason: Hollywood Would Love Trump if He Bombed North Korea over Lack of Trans Bathrooms (Exclusive Video) - Breitbart"
Prediction(data, rf).prediction()

The news is reliable.
