In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize,sent_tokenize
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim 
from gensim.models import word2vec
from gensim.models import KeyedVectors
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.tokenize import RegexpTokenizer
word_vectors = KeyedVectors.load_word2vec_format('../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin',binary=True)
tokenizer = RegexpTokenizer("[a-zA-Z0-9]+")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df_fake = pd.read_csv('../input/fake-and-real-news-dataset/Fake.csv')
df_true = pd.read_csv('../input/fake-and-real-news-dataset/True.csv')

In [3]:
df_fake['Label'] = 1
df_true['Label'] = 0
df_total = pd.concat([df_true,df_fake],axis=0,ignore_index=True)

In [4]:
df_total['text'] = df_total['title'] + df_total['text']
df_total = df_total.drop(['title'], axis=1)

In [5]:
def clean_text(text):
    
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('Reuters','',text)
    return text

In [6]:
df_total['text'] = df_total['text'].apply(lambda x:clean_text(x))

In [7]:
stop_words = stopwords.words('english')
df_total['text'] = df_total['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [8]:
def lemmatize_words(text):
    wnl = nltk.stem.WordNetLemmatizer()
    lem = ' '.join([wnl.lemmatize(word) for word in text.split()])    
    return lem

In [9]:
df_total['text'] = df_total['text'].apply(lemmatize_words)

In [10]:
print(df_total.shape)

(44898, 4)


In [11]:
def getStemmedReview(review):
    #   review = review.replace("<br /><br />"," ")
    #Tokenize
    
    tokens = tokenizer.tokenize(review)
  #  new_tokens = [token for token in tokens if token not in sw]
    new_tokens = [token for token in tokens if  token.isalpha()]
    new_tokens = [token for token in new_tokens if len(token)>1]
    return len(new_tokens)

In [12]:
no_of_tokens = 0
for i in range(0,len(df_total)):
    temp = getStemmedReview(df_total['text'][i])
    no_of_tokens+=temp
mean_tokens = no_of_tokens/len(df_total)

In [13]:
print(mean_tokens)

235.26673793932915


In [14]:
x_ = df_total['text']
y_ = df_total['Label']

In [15]:
print(x_.shape)
print(y_.shape)

(44898,)
(44898,)


In [16]:

def getStemmedReview(review, mean_tokens):
    #   review = review.replace("<br /><br />"," ")
    #Tokenize
    
    tokens = tokenizer.tokenize(review)
  #  new_tokens = [token for token in tokens if token not in sw]
    new_tokens = [token for token in tokens if  token.isalpha()]
    new_tokens = [token for token in new_tokens if len(token)>1]
    if len(new_tokens) > mean_tokens:
        new_tokens = new_tokens[0:mean_tokens]
    else:
        temp = mean_tokens - len(new_tokens)
        for i in range(0,temp):
            new_tokens.append(0)
            
#    cleaned_review = ' '.join(new_tokens)

    arr = np.zeros((mean_tokens,300))
    for i in range(0,mean_tokens):
        try:
            emb = word_vectors[new_tokens[i]]
        except:
            emb = [0]*300
        
        arr[i] = emb
        #for j in range(0,300):
        #    arr[i][j] = emb[j]
            
    return arr

In [17]:
mean_len = 20
embedding_out = np.zeros((len(df_total), mean_len, 300))
#tokens_out = []
for i in range(0,len(df_total)):
    temp = getStemmedReview(df_total['text'][i], mean_len)
    embedding_out[i] = temp
  #  for j in range(0,10):
   #     for k in range(0,300):
    #        embedding_out[i][j][k] = temp[j][k]

In [18]:
print(embedding_out.shape)

(44898, 20, 300)


In [19]:
embedding_out_2d = embedding_out.reshape(embedding_out.shape[0], embedding_out.shape[1]*embedding_out.shape[2])

In [20]:
print(embedding_out_2d.shape)

(44898, 6000)


In [21]:
x_train, x_test, y_train, y_test = train_test_split(embedding_out_2d, y_, test_size=0.20, random_state=8)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(35918, 6000)
(8980, 6000)
(35918,)
(8980,)


In [22]:
from sklearn.naive_bayes import GaussianNB
clf_gnb = GaussianNB()
clf_gnb.fit(x_train, y_train)


GaussianNB()

In [23]:
pred_gnb = clf_gnb.predict(x_test)

In [24]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [25]:
print(accuracy_score(y_test, pred_gnb))
print(recall_score(y_test, pred_gnb))
print(precision_score(y_test, pred_gnb))
print(f1_score(y_test, pred_gnb))

0.9378619153674833
0.9680152510061427
0.9182238296162347
0.9424623633738916
