In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-news-detection/data.h5
/kaggle/input/fake-news-detection/data.csv


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# NLP libraries to clean the text data
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

# Vectorization technique TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# For Splitting the dataset
from sklearn.model_selection import train_test_split

# Model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

#Accuracy measuring library
from sklearn.metrics import accuracy_score

In [4]:
data = pd.read_csv("/kaggle/input/fake-news-detection/data.csv")

In [5]:
data.head(50)

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1
5,http://beforeitsnews.com/sports/2017/09/jetnat...,JetNation FanDuel League; Week 4,JetNation FanDuel League; Week 4\n% of readers...,0
6,https://www.nytimes.com/2017/10/10/us/politics...,Kansas Tried a Tax Plan Similar to Trump’s. It...,"In 2012, Kansas lawmakers, led by Gov. Sam Bro...",1
7,https://www.reuters.com/article/us-india-cenba...,"India RBI chief: growth important, but not at ...",The Reserve Bank of India (RBI) Governor Urjit...,1
8,https://www.reuters.com/article/us-climatechan...,EPA chief to sign rule on Clean Power Plan exi...,"Scott Pruitt, Administrator of the U.S. Enviro...",1
9,https://www.reuters.com/article/us-air-berlin-...,Talks on sale of Air Berlin planes to easyJet ...,FILE PHOTO - An Air Berlin sign is seen at an ...,1


In [None]:
# this is common steps to solve this Fake News Detections:-

# 1. Data Analysis (EDA) -- Data analyst + Data scientist/ML Engineer
# 2. Data preprocessing (Data cleaning)
#    a. Removal of null values
#    b. Remove unwanted features -- Feature selection
#.   c. Text processing --> removal of common words (stopwords), removal of , . etc
# 3. Splitting Dataset
# 4. Vectorization -- Only for text related data -- Converting text into numbers
# 5. Model fitting
# 6. Accuracy testing, Testing on new data

In [6]:
# 1. Data analysis

data.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [7]:
data.shape

(4009, 4)

In [8]:
data.columns

Index(['URLs', 'Headline', 'Body', 'Label'], dtype='object')

In [9]:
data.isnull().sum()

URLs         0
Headline     0
Body        21
Label        0
dtype: int64

In [10]:
# Handle these null values
# I am replacing with ""
df = data.copy()

In [11]:
df['Body'] = df['Body'].fillna('')

In [12]:
df.isnull().sum()

URLs        0
Headline    0
Body        0
Label       0
dtype: int64

In [13]:
df['News'] = df['Headline'] + " " + df['Body']

In [14]:
df.head()

Unnamed: 0,URLs,Headline,Body,Label,News
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,Four ways Bob Corker skewered Donald Trump Ima...
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,Linklater's war veteran comedy speaks to moder...
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,Trump’s Fight With Corker Jeopardizes His Legi...
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,Egypt's Cheiron wins tie-up with Pemex for Mex...
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,Jason Aldean opens 'SNL' with Vegas tribute Co...


In [15]:
df.columns

Index(['URLs', 'Headline', 'Body', 'Label', 'News'], dtype='object')

In [16]:
# Drop the features that are not needed

features_dropped = ['URLs', 'Headline', 'Body']
df = df.drop(features_dropped, axis = 1)

In [17]:
df.columns

Index(['Label', 'News'], dtype='object')

In [18]:
df.head()

Unnamed: 0,Label,News
0,1,Four ways Bob Corker skewered Donald Trump Ima...
1,1,Linklater's war veteran comedy speaks to moder...
2,1,Trump’s Fight With Corker Jeopardizes His Legi...
3,1,Egypt's Cheiron wins tie-up with Pemex for Mex...
4,1,Jason Aldean opens 'SNL' with Vegas tribute Co...


In [None]:
# Text processing (Cleaning)
# 1. Remove symbols (, . : ; ") --> Just keep a-z A-Z
# 2. Convert everythin to lowercase
# 3. Removing stopwords (most common words like is, are, the, a, an, etc)
# 4. Stemming --> 

# S1: Virat played well --> virat play well

# S2: Virat is playing well. --> virat play well
    
    
# S3: Virat plays very well, this is shown in the last match.
#     Virat play very well show last match

In [19]:
ps = PorterStemmer()

def wordopt(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.split() # [viral, is, playing, well]
    text = [ps.stem(word) for word in text if not word in stopwords.words('english')] # ['virat', play, well]
    text = ' '.join(text)   # virat play well
    return text

In [20]:
print (wordopt('Virat is playing well'))

virat play well


In [21]:
wordopt('Virat is playing well.')

'virat play well'

In [22]:
wordopt('Virat plays very well, this is shown in the last match.')

'virat play well shown last match'

In [23]:
wordopt('Virat play very well show last match')

'virat play well show last match'

In [24]:
df['News'] = df['News'].apply(wordopt)

In [25]:
df.head()

Unnamed: 0,Label,News
0,1,four way bob corker skewer donald trump imag c...
1,1,linklat war veteran comedi speak modern americ...
2,1,trump fight corker jeopard legisl agenda feud ...
3,1,egypt cheiron win tie pemex mexican onshor oil...
4,1,jason aldean open snl vega tribut countri sing...


In [26]:
X = df['News']
Y = df['Label']

# Splitting

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)

In [42]:
x_train

716     congo elect mid vote commiss say anger opposit...
3654    n l prep joke harvey weinstein shelv critic ha...
1803    life lisa ling seri sex seri exclus cnn digit ...
614     sometim need send messag sometim need send mes...
3025    stage grief sad stage grief sad reader think s...
                              ...                        
2667    homemad tool help north korea missil program s...
1516    japanes journalist die work hour overtim vietn...
43      fantasi footbal run back rank week fantasi foo...
2279    tech industri fight white supremaci fuel embat...
3457    palestinian rival talk uniti show divis hama s...
Name: News, Length: 3006, dtype: object

In [43]:
x_test

1089    melania trump hit back ivana first ladi jibe i...
824     andrew black road exchang bet one britain prof...
95      audio releas hotel worker warn shooter vega ma...
391     snl alec baldwin trump call san juan mayor nas...
3664    king salman favorit golden escal betray visit ...
                              ...                        
2551    happen use sound deflect laser beam video happ...
3276    california fire least dead hundr report miss c...
556     pass game film review good mccown week jaguar ...
1581    new washington race flee senat marco rubio dig...
2699    bill make easier peopl commit mass shoot bill ...
Name: News, Length: 1003, dtype: object

In [27]:
# Important step in text processing -- Vectorization

# Process of converting strings (seq of words) into vector of integers(float)

vectorization = TfidfVectorizer() # 1 hot encoding, WordToVec
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [28]:
x_train

716     congo elect mid vote commiss say anger opposit...
3654    n l prep joke harvey weinstein shelv critic ha...
1803    life lisa ling seri sex seri exclus cnn digit ...
614     sometim need send messag sometim need send mes...
3025    stage grief sad stage grief sad reader think s...
                              ...                        
2667    homemad tool help north korea missil program s...
1516    japanes journalist die work hour overtim vietn...
43      fantasi footbal run back rank week fantasi foo...
2279    tech industri fight white supremaci fuel embat...
3457    palestinian rival talk uniti show divis hama s...
Name: News, Length: 3006, dtype: object

In [29]:
print (xv_train[0])

  (0, 198)	0.029680132198194827
  (0, 9984)	0.023358464905542248
  (0, 1235)	0.029150194388718957
  (0, 2850)	0.06886830258564972
  (0, 7107)	0.03966344730132401
  (0, 22582)	0.04939452115427487
  (0, 718)	0.01527158228539845
  (0, 25912)	0.034107626635224715
  (0, 15356)	0.052752529991567904
  (0, 1503)	0.03085405636758821
  (0, 21934)	0.020294722697141887
  (0, 6547)	0.05067117686344275
  (0, 6499)	0.03863402991691297
  (0, 11184)	0.05336628249561736
  (0, 4913)	0.03524024621266508
  (0, 6351)	0.02971669679984994
  (0, 5244)	0.021186906732384162
  (0, 193)	0.02751007271370141
  (0, 11752)	0.05849308401537834
  (0, 15838)	0.04290055079685083
  (0, 9054)	0.03752814043956347
  (0, 24574)	0.04098080023647552
  (0, 17892)	0.034621395605147716
  (0, 7944)	0.038851986412148376
  (0, 16387)	0.0240868152758198
  :	:
  (0, 11446)	0.05976973972454623
  (0, 3879)	0.027594726283679955
  (0, 26747)	0.04764861490082162
  (0, 6015)	0.0803310649331951
  (0, 20781)	0.0731840547907992
  (0, 7287)	0.225

In [None]:
# Model fitting
# Classification problem --> Binary classification
# 1. Logistic regression
# 2. SVM
# 3. RandomForestClassifier

In [38]:
# Method 1: Convert to dense array (only if matrix is small enough!)
dense_array = xv_test.toarray()

# Method 2: Get just the non-zero values
values = xv_test.data

# Method 3: Get coordinates and values together
rows, cols = xv_test.nonzero()
values = xv_test.data

# Method 4: Access specific elements
# For a single element at position (i,j):
# value = xv_test[i, j]

In [39]:
dense_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [30]:
# Initialise Logistic regressing
LR_model = LogisticRegression()

# Fitting training data
LR_model.fit(xv_train, y_train)

lr_y_pred = LR_model.predict(xv_test)

score = accuracy_score(y_test, lr_y_pred)

print ("Accuracy LR:", score)

Accuracy LR: 0.9700897308075773


In [40]:
lr_y_pred

array([1, 0, 0, ..., 0, 1, 0])

In [None]:
# 1. hello how are you doing is everything fine
# 2. how about you my day was amazing

In [74]:
# Put the text in a list
new_test = ['how about you my day was amazing']

In [78]:
# If you have a custom function wordopt, apply it to the string first
new_test = [wordopt('how about you my day was amazing')]

In [76]:
len(new_test)

1

In [79]:
new_test

['day amaz']

In [86]:
# Then transform
testing_= vectorization.transform(['how about you my day was amazing'])

In [87]:
testing_.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [88]:
LR_model.predict(testing_)

array([0])

In [31]:
svm_model = SVC(kernel='linear')
svm_model.fit(xv_train, y_train)
svm_y_pred = svm_model.predict(xv_test)
score = accuracy_score(y_test, svm_y_pred)

print ("Accuracy SVM:", score)

Accuracy SVM: 0.9820538384845464


In [89]:
svm_model.predict(testing_)

array([1])

In [32]:
RFC_model = RandomForestClassifier(random_state = 0)
RFC_model.fit(xv_train, y_train)
rfc_y_pred = RFC_model.predict(xv_test)

score = accuracy_score(y_test, rfc_y_pred)

print ("Accuracy RFC:", score)

Accuracy RFC: 0.9780658025922233


In [90]:
RFC_model.predict(testing_)

array([0])