<a href="https://colab.research.google.com/github/himanshuchoudhary94/FakeNewsDetection/blob/main/FakeNewsClassificationNB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
#loading the token for kaggle-collab 

!pip install --upgrade --force-reinstall --no-deps kaggle
from google.colab import files
files.upload()

Processing /root/.cache/pip/wheels/68/6d/9b/7a98271454edcba3b56328cbc78c037286e787d004c8afee71/kaggle-1.5.9-cp36-none-any.whl
Installing collected packages: kaggle
  Found existing installation: kaggle 1.5.9
    Uninstalling kaggle-1.5.9:
      Successfully uninstalled kaggle-1.5.9
Successfully installed kaggle-1.5.9


Saving kaggle.json to kaggle (3).json


{'kaggle.json': b'{"username":"himanshuchoudhary94","key":"c6d0da89704d5ed384ff4a64e2f851e2"}'}

In [44]:
#commands for data import

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c fake-news
!unzip \fake-news.zip

fake-news.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  fake-news.zip
replace submit.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: submit.csv              
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: test.csv                
replace train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: train.csv               


In [45]:
#initializing required libraries

import random
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [46]:
#reading csv file
train = pd.read_csv('train.csv')

In [47]:
#removing duplicates & filling NANs with string 'Not Available'

train = train.drop_duplicates(subset=['title'], keep = 'first')

In [48]:
#Using only title for classification

title_data = train[['title','label']]
title_data

Unnamed: 0,title,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,Why the Truth Might Get You Fired,1
3,15 Civilians Killed In Single US Airstrike Hav...,1
4,Iranian woman jailed for fictional unpublished...,1
...,...,...
20794,Trump: Putin ’Very Smart’ to Not Retaliate ove...,0
20795,Rapper T.I.: Trump a ’Poster Child For White S...,0
20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",0
20797,Macy’s Is Said to Receive Takeover Approach by...,0


In [49]:
#title_data[title_data['title']==].value_counts()
title_data.isna().sum()

title    1
label    0
dtype: int64

In [50]:
#droping the row of nan as only 1 such row
title_data.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [51]:
## Converting the read dataset into a list of tuples, each tuple(row) contianing the news title and it's label
data_set = []
for index,row in title_data.iterrows():
    data_set.append((row['title'], row['label']))

In [71]:
#Checking data_set Built

print(data_set[:5])

[('House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It', 1), ('FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart', 0), ('Why the Truth Might Get You Fired', 1), ('15 Civilians Killed In Single US Airstrike Have Been Identified', 1), ('Iranian woman jailed for fictional unpublished story about woman stoned to death for adultery', 1)]


In [53]:
#additional downloads required

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [54]:
#initialise the inbuilt Stemmer and the Lemmatizer
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [55]:
def preprocess(document, stem=True):
    'changes document to lower case, removes stopwords and lemmatizes/stems the remainder of the sentence'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]

    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]

    # join words to make sentence
    document = " ".join(words)

    return document

In [56]:
#Performing the preprocessing steps on all news titles
title_set = []
for (title, label) in data_set:
    words_filtered = [e.lower() for e in preprocess(title, stem=False).split() if len(e) >= 3]
    title_set.append((words_filtered, label))

In [60]:
#creating a single list of all words in the entire dataset for feature list creation

def get_words_in_title(titles):
    all_words = []
    for (title, label) in titles:
      all_words.extend(title)
    return all_words


#creating a final feature list using an intuitive FreqDist, to eliminate all the duplicate words

def get_word_features(wordlist):

    #print(wordlist[:10])
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

#creating the word features for the entire dataset
word_features = get_word_features(get_words_in_title(title_set))
print(len(word_features))

#creating slicing index at 80% threshold
sliceIndex = int((len(title_set)*.8))

#shuffle the pack to create a random and unbiased split of the dataset
random.shuffle(title_set)

train_titles, test_titles = title_set[:sliceIndex], title_set[sliceIndex:]

#creating a LazyMap of feature presence for each of the features with respect to each of the title
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

#creating the feature map of train and test data

training_set = nltk.classify.apply_features(extract_features, train_titles)
testing_set = nltk.classify.apply_features(extract_features, test_titles)



20589


In [61]:
print('Training set size : ', len(training_set))
print('Test set size : ', len(testing_set))

Training set size :  15842
Test set size :  3961


In [63]:
#Checking the training _set
print(training_set[:5])



In [65]:
#Training the classifier with NaiveBayes algorithm
fakeClassifier = nltk.NaiveBayesClassifier.train(training_set)

In [66]:
#storing the classifier on disk for later usage
import pickle
f = open('nb_fake_news_classifier.pickle', 'wb')
pickle.dump(fakeClassifier,f)
print('Classifier stored at ', f.name)
f.close()

Classifier stored at  nb_fake_news_classifier.pickle


In [67]:
from google.colab import files
files.download("nb_fake_news_classifier.pickle")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [68]:
!ls

 fake-news.zip	    kaggle.json			     submit.csv
'kaggle (1).json'   nb_fake_news_classifier.pickle   test.csv
'kaggle (2).json'   nb_spam_classifier.pickle	     train.csv
'kaggle (3).json'   sample_data


In [69]:
#print(nltk.classify.accuracy(spamClassifier, training_set))
print(nltk.classify.accuracy(spamClassifier, testing_set))

0.9522847765715728


In [70]:
print(spamClassifier.show_most_informative_features(50))

Most Informative Features
     contains(breitbart) = True                0 : 1      =    686.5 : 1.0
          contains(york) = True                0 : 1      =    221.2 : 1.0
        contains(finest) = True                1 : 0      =     44.6 : 1.0
          contains(time) = True                0 : 1      =     38.2 : 1.0
      contains(november) = True                1 : 0      =     38.0 : 1.0
         contains(brief) = True                0 : 1      =     27.2 : 1.0
          contains(milo) = True                0 : 1      =     26.6 : 1.0
       contains(comment) = True                1 : 0      =     26.5 : 1.0
         contains(alert) = True                1 : 0      =     21.7 : 1.0
     contains(halloween) = True                1 : 0      =     19.5 : 1.0
      contains(thursday) = True                0 : 1      =     18.4 : 1.0
      contains(clintons) = True                1 : 0      =     17.9 : 1.0
       contains(october) = True                1 : 0      =     17.0 : 1.0