# FAKE NEWS DETECTION


###  Importing Dependencies 

In [4]:
#Importing Libraries 
import numpy as np 
import pandas as pd 
import re #Regular Expression
from nltk.corpus import stopwords #Natural Language Tool Kit
from nltk.stem.porter import PorterStemmer 
from sklearn.feature_extraction.text import TfidfVectorizer #Term frequency inverse document frequency
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score 

### Usage of each library  

- **numpy** : This library is used to create arrays in which we will store data.
- **pandas** : This library is very useful for reading CSV files and working with tabular data.
- **re** : The full name of this library is 'Regular Expression'. We will use it to remove non-alphabetic characters from the text.
- **nltk** : This is the Natural Language Toolkit. It is used whenever we want to work with the languages we speak on a daily basis. Here, we will use it to remove stopwords and to stem the content.
- **sklearn** : This library is called scikit-learn. We will use it to convert text to numeric data, to split the data, to apply logistic regression, and to calculate the accuracy of the model.

In [5]:
#Downloading the NLTK stopwords corpus that stopwords.words('english') reads from
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hasanali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
#Printing the English stopwords; these are the words filtered out of the text during stemming
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Data Pre-Processing

In [7]:
#Loading dataset into pandas DataFrame 
#'train.csv' is a relative path, so the file is looked up in the current working directory
news_dataset = pd.read_csv('train.csv') 

In [8]:
#Size of dataset as a (rows, columns) tuple
news_dataset.shape

(20800, 5)

In [9]:
#Displaying the first 5 rows of the dataset (head() returns 5 rows by default)
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [10]:
#Number of missing (NaN) values in each column of the dataset
news_dataset.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [11]:
#Replacing the null values in the dataset with empty string,
#so that concatenating 'author' and 'title' below never produces NaN
news_dataset = news_dataset.fillna('')

In [12]:
#merging the author name and news title into one 'content' column, separated by a single space
news_dataset['content'] = news_dataset['author'] + " " +  news_dataset['title']

In [13]:
#Previewing the first 5 rows of the merged 'content' column
news_dataset['content'].head() 

0    Darrell Lucus House Dem Aide: We Didn’t Even S...
1    Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2    Consortiumnews.com Why the Truth Might Get You...
3    Jessica Purkiss 15 Civilians Killed In Single ...
4    Howard Portnoy Iranian woman jailed for fictio...
Name: content, dtype: object

In [14]:
#separating the label from the remaining columns of the dataset
Y = news_dataset['label']
X = news_dataset.drop(columns=['label'])

In [15]:
#Previewing the first 5 rows of the feature columns (everything except 'label')
X.head() 

Unnamed: 0,id,title,author,text,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Howard Portnoy Iranian woman jailed for fictio...


In [16]:
#Previewing the first 5 labels
Y.head()

0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

### Stemming 

Stemming is the process of reducing a word to its root word.
<br>
<br>
Example: actress, actor, acting --> act
<br>
<br>
If you want to learn more about stemming, follow the link below:
<br>
https://towardsdatascience.com/stemming-lemmatization-what-ba782b7c0bd8

In [17]:
#We will use the PorterStemmer class imported from nltk to implement stemming
port_stem = PorterStemmer() 

In [18]:
#function to implement stemming
def stemming(content):
    """Turn a text string into space-separated, stemmed, lowercase words.

    Steps: replace every non-alphabetic character with a space, convert to
    lowercase, split on whitespace, drop English stopwords, stem each
    remaining word with ``port_stem`` and join the result with single spaces.

    Parameters
    ----------
    content : str
        The text to clean.

    Returns
    -------
    str
        The cleaned and stemmed text; an empty string when no word remains.
    """
    # Build the stopword lookup once and cache it on the function object.
    # Calling stopwords.words('english') inside the comprehension rebuilt the
    # list and scanned it linearly for every single word.
    stop_words = getattr(stemming, '_stop_words', None)
    if stop_words is None:
        stop_words = stemming._stop_words = frozenset(stopwords.words('english'))

    letters_only = re.sub('[^a-zA-Z]', ' ', content)  # keep alphabet characters only
    words = letters_only.lower().split()  # lowercase, then split into a list of words
    stemmed_words = [port_stem.stem(word) for word in words if word not in stop_words]
    return ' '.join(stemmed_words)  # join the list back into one text string

### Implementation of Stemming 

- First of all, we remove all the symbols and numbers from the content and keep only the lowercase and uppercase letters.
- After that, we convert every character to lowercase, so that from then on we only have to deal with lowercase letters.
- Then we split the content into a list of individual words.
- Thereafter, we remove the stopwords from the list, because stopwords do not make much difference, and reduce each remaining word to its root word.
- Finally, we join all the words in the list to make a single text again.

In [19]:
#Applying stemming function to content
#The input is rebuilt from 'author' and 'title' instead of being read back from 'content',
#so re-running this cell never stems text that was already stemmed
#(stemming an already-stemmed string can change it again)
news_dataset['content'] = (news_dataset['author'] + " " + news_dataset['title']).apply(stemming)

In [20]:
#Previewing the first 5 rows of the stemmed content
news_dataset['content'].head()

0    darrel lucu hous dem aid even see comey letter...
1    daniel j flynn flynn hillari clinton big woman...
2               consortiumnew com truth might get fire
3    jessica purkiss civilian kill singl us airstri...
4    howard portnoy iranian woman jail fiction unpu...
Name: content, dtype: object

In [21]:
#Separating the features (stemmed content) and the labels as arrays
X = news_dataset['content'].values 
Y = news_dataset['label'].values

In [22]:
#Printing the stemmed content array
print(X) 

['darrel lucu hous dem aid even see comey letter jason chaffetz tweet'
 'daniel j flynn flynn hillari clinton big woman campu breitbart'
 'consortiumnew com truth might get fire' ...
 'michael j de la merc rachel abram maci said receiv takeov approach hudson bay new york time'
 'alex ansari nato russia hold parallel exercis balkan'
 'david swanson keep f aliv']


In [23]:
#Printing the label array
print(Y) 

[1 0 1 ... 0 1 1]


### Text to Numeric  

We will use vectorizer to convert text to numeric values.

In [24]:
#Converting textual to numeric data
#Tf counts how many times a word occurs in one sample
#idf gives a lower weight to words that occur in many different samples
#The text is read from news_dataset['content'] (the same values X was built from) rather than
#from X itself: X is overwritten here with a sparse matrix, and feeding that matrix back into
#the vectorizer when this cell is re-run would fail
#fit_transform is equivalent to calling fit followed by transform
#NOTE(review): the vocabulary and idf weights are learned from every row, including the rows
#that train_test_split later assigns to the test set; fitting on the training rows only would
#avoid leaking test data into the features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(news_dataset['content'].values)

### Implementation of TfidfVectorizer

- Here, Tf means term frequency, which counts how many times a word occurs in one sample.
- idf means inverse document frequency. It detects the words that occur in a large number of different samples and gives them a lower weight. E.g. in the review section of the movie "End Game", you will see the word "Avengers" occurring in a large number of samples, so idf detects this type of word.

- We will use this function to convert text to numeric data.



In [25]:
#Printing the TF-IDF matrix; each line shows a (row, column) position followed by its weight
print(X)

  (0, 15686)	0.28485063562728646
  (0, 13473)	0.2565896679337957
  (0, 8909)	0.3635963806326075
  (0, 8630)	0.29212514087043684
  (0, 7692)	0.24785219520671603
  (0, 7005)	0.21874169089359144
  (0, 4973)	0.233316966909351
  (0, 3792)	0.2705332480845492
  (0, 3600)	0.3598939188262559
  (0, 2959)	0.2468450128533713
  (0, 2483)	0.3676519686797209
  (0, 267)	0.27010124977708766
  (1, 16799)	0.30071745655510157
  (1, 6816)	0.1904660198296849
  (1, 5503)	0.7143299355715573
  (1, 3568)	0.26373768806048464
  (1, 2813)	0.19094574062359204
  (1, 2223)	0.3827320386859759
  (1, 1894)	0.15521974226349364
  (1, 1497)	0.2939891562094648
  (2, 15611)	0.41544962664721613
  (2, 9620)	0.49351492943649944
  (2, 5968)	0.3474613386728292
  (2, 5389)	0.3866530551182615
  (2, 3103)	0.46097489583229645
  :	:
  (20797, 13122)	0.2482526352197606
  (20797, 12344)	0.27263457663336677
  (20797, 12138)	0.24778257724396507
  (20797, 10306)	0.08038079000566466
  (20797, 9588)	0.174553480255222
  (20797, 9518)	0.295420

### Splitting Data

In [26]:
#Holding out 20% of the samples for testing; stratify=Y keeps the label proportions the same
#in both parts and random_state=2 makes the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

## Training: Logistic Regression

In [27]:
#Creating a logistic regression classifier with scikit-learn's default settings
model = LogisticRegression()

In [28]:
#Training the classifier on the training features and labels
model.fit(X_train,Y_train)

### Evaluation

In [29]:
#accuracy score on training data
X_train_prediction = model.predict(X_train)
#accuracy_score takes the true labels first and the predicted labels second
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [30]:
#Reporting the accuracy measured on the training data
print("Accuracy score of the trainig data: ",training_data_accuracy)

Accuracy score of the trainig data:  0.9865985576923076


In [31]:
#accuracy score on testing data
X_test_prediction = model.predict(X_test)
#accuracy_score takes the true labels first and the predicted labels second
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [32]:
#Reporting the accuracy measured on the held-out testing data
print("Accuracy score of the testing data: ",testing_data_accuracy)

Accuracy score of the testing data:  0.9790865384615385


## Prediction

In [33]:
#Taking the sample at row 1 of the test set as the news item to classify
X_new = X_test[1]

prediction = model.predict(X_new)

print(prediction)

#predict returns an array with one label per input row, so the single label is read
#explicitly instead of relying on the truth value of a one-element array comparison
#label 0 is reported as real news, any other label as fake news
if prediction[0] == 0:
    print("News is real")
else:
    print("News is fake")

[0]
News is real


In [34]:
#Printing the true label of the same test sample, to compare it with the prediction above
print(Y_test[1])

0
