In [None]:
# About the dataset
# 1. id: unique id for a news article
# 2. title: the title of a news article
# 3. author: the author of the news article
# 4. text: the text of the article; could be incomplete
# 5. label: marks whether the news article is real or fake
#    (1: fake news, 0: real news)

In [1]:
# Import the dependencies
import numpy as np
import pandas as pd
import re
# re module is used for regex
from nltk.corpus import stopwords # natural language toolkit
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer # Convert a collection of raw documents to a matrix of TF-IDF features.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
# Fetch NLTK's 'stopwords' corpus; stopwords.words('english') is read from it below.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hoang\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [6]:
# Display NLTK's English stop-word list
english_stop_words = stopwords.words('english')
print(english_stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [7]:
# Data pre-processing: read the training CSV into a pandas DataFrame
train_csv_path = './train.csv'
new_dataset = pd.read_csv(train_csv_path)


In [9]:
# Inspect the dimensions and the first rows of the dataset.
# A bare expression is only echoed when it is the last statement of a cell,
# so the shape has to be printed explicitly to show up in the output.
print(new_dataset.shape)
print(new_dataset.head())

   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1  Ever get the feeling your life circles the rou...      0  
2  Why the Truth Might Get You Fired October 29, ...      1  
3  Videos 15 Civilians Killed In Single US Airstr...      1  
4  Print \nAn Iranian woman has been sentenced to...      1  


In [10]:
# Tally the missing entries in every column
missing_per_column = new_dataset.isnull().sum()
missing_per_column

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [11]:
# Missing values become empty strings so the string concatenation below never hits NaN
new_dataset = new_dataset.fillna('')

# 'content' = author name followed by the news title, separated by one space
author_col, title_col = new_dataset['author'], new_dataset['title']
new_dataset['content'] = author_col + ' ' + title_col

In [12]:
# Preview the merged author + title column
content_column = new_dataset['content']
print(content_column)

0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss 15 Civilians Killed In Single ...
4        Howard Portnoy Iranian woman jailed for fictio...
                               ...                        
20795    Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796    Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
20797    Michael J. de la Merced and Rachel Abrams Macy...
20798    Alex Ansary NATO, Russia To Hold Parallel Exer...
20799              David Swanson What Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object


In [14]:
# Separate the features from the label.
# `columns=` already targets the column axis, so the extra `axis=1` was redundant.
# NOTE: X and Y are reassigned further down, once 'content' has been stemmed.
X = new_dataset.drop(columns='label')
Y = new_dataset['label']

Stemming:
Stemming is the process of reducing a word to its root form (its stem).
Example:
acted, acting, acts ---> act

In [18]:
# Stemming
# Shared PorterStemmer instance; stemming() calls port_stem.stem() on each word.
port_stem = PorterStemmer()

In [22]:
_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, building it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def stemming(content):
    """Normalise a piece of text into a space-separated string of word stems.

    Steps: every character outside a-z / A-Z is replaced by a space, the text
    is lower-cased and split on whitespace, English stop words are dropped,
    and each remaining word is stemmed with ``port_stem``.

    Parameters
    ----------
    content : str
        Raw text to pre-process.

    Returns
    -------
    str
        The stemmed words joined by single spaces ('' if nothing remains).
    """
    # The original rebuilt the stop-word list inside the comprehension, i.e.
    # once per word, and searched it as a list; a cached set gives the same
    # result with a single load and O(1) membership tests.
    stop_words = _english_stop_words()
    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(port_stem.stem(word) for word in words if word not in stop_words)

In [27]:
# Run every row of 'content' through stemming() and store the result back.
# Quick manual check of the helper: stemming("eats 1 acted 2 acting 3")
stemmed_content = new_dataset['content'].apply(stemming)
new_dataset['content'] = stemmed_content

In [35]:
# Pull the pre-processed text (features) and the labels out as arrays
X, Y = new_dataset['content'].values, new_dataset['label'].values


In [34]:
# Shape of the feature array X
print(X.shape)

(20800,)


In [36]:
# Convert the stemmed text into TF-IDF feature vectors.
# fit_transform() learns the vocabulary/IDF weights and builds the matrix in
# one call, replacing the separate fit() + transform() over the same data.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split below, so test rows contribute to the vocabulary and IDF
# weights. Consider splitting first and fitting on the training portion only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [39]:
# Hold out 20% of the samples for testing; stratify on Y so both parts keep
# the same label proportions, and fix the seed for a repeatable split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [40]:
# Fit a logistic-regression classifier on the training split
model = LogisticRegression()
model.fit(X=X_train, y=Y_train)

LogisticRegression()

In [41]:
# Evaluation: accuracy on the training data.
# accuracy_score expects (y_true, y_pred), so the true labels go first.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [42]:
# Report the accuracy measured on the training split
print("Accuracy score of the training data: ", training_data_accuracy)

Accuracy score of the training data:  0.9865985576923076


In [43]:
# Evaluation: accuracy on the held-out test data.
# accuracy_score expects (y_true, y_pred), so the true labels go first.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [44]:
# Report the accuracy measured on the test split
print("Accuracy score of the test data: ", test_data_accuracy)

Accuracy score of the test data:  0.9790865384615385


In [46]:
# Making a predictive system: classify one held-out sample and show its true label
X_new = X_test[0]
prediction = model.predict(X_new)

print(Y_test[0])

# predict() returns an array with one entry per sample; compare that single
# entry explicitly instead of relying on the truthiness of a one-element array.
if prediction[0] == 0:
    print("This is real news")
else:
    print("This is fake news")

1
This is fake news
