In [1]:
import pandas as pd
import numpy as np
import nltk as nlp
import sklearn
import matplotlib as plt
%matplotlib inline

from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics

In [2]:
# Shared TF-IDF vectorizer. English stop words come from NLTK (imported as
# `nlp`) and are excluded from the vocabulary.
# NOTE(review): assumes the NLTK "stopwords" corpus is already available
# locally (e.g. fetched via nltk.download('stopwords')) -- confirm.
english_stop_words = nlp.corpus.stopwords.words('english')
vectorizer = TfidfVectorizer(stop_words=english_stop_words)

In [3]:
def read_data(data_dir='./fake-and-real-news-dataset', frac=0.05, random_state=None):
    """Load the fake and real news CSVs, label them, and return a random sample.

    Parameters
    ----------
    data_dir : str or path-like
        Directory that contains ``Fake.csv`` and ``True.csv``.
    frac : float
        Fraction of the combined rows to keep; forwarded to
        ``DataFrame.sample``.
    random_state : int or None
        Seed for the sampling step. ``None`` (the default) keeps the draw
        non-deterministic, matching the original behaviour; pass an int for
        a reproducible sample.

    Returns
    -------
    pandas.DataFrame
        The sampled (and therefore shuffled) rows of both files, with an
        added integer ``label`` column: 0 for rows from ``Fake.csv`` and
        1 for rows from ``True.csv``.
    """
    import os  # function-scope import keeps this cell self-contained

    fake = pd.read_csv(os.path.join(data_dir, 'Fake.csv'))
    real = pd.read_csv(os.path.join(data_dir, 'True.csv'))
    fake['label'] = 0
    real['label'] = 1
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement. ignore_index=True gives every row a unique label instead
    # of repeating 0..n-1 once per source file.
    data = pd.concat([fake, real], ignore_index=True)
    return data.sample(frac=frac, random_state=random_state)

In [4]:
def split_data(data, test_size=0.3, random_state=None):
    """Split the labelled frame into train and test features and targets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``title``, ``text`` and ``label`` columns.
    test_size : float
        Share of rows held out for testing; forwarded to
        ``train_test_split``. Defaults to the 0.3 the notebook used.
    random_state : int or None
        Seed forwarded to ``train_test_split``. ``None`` (the default) keeps
        the split non-deterministic; pass an int so the reported accuracy is
        reproducible across runs.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the ``X`` parts hold the
        ``title``/``text`` columns and the ``y`` parts hold ``label``.
    """
    X = data[['title', 'text']]
    y = data['label']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [5]:
# Load the labelled sample, then carve it into train/test features and labels.
data = read_data()
(
    X_train,
    X_test,
    y_train,
    y_test,
) = split_data(data)

In [6]:
# X_train has two text columns (title, text). Flattening it row by row feeds
# the vectorizer one document per cell, in the order
# title_0, text_0, title_1, text_1, ... so the fitted matrix has two rows
# per training sample.
vectors = vectorizer.fit_transform(X_train.to_numpy().ravel())

In [7]:
# Fold the per-document matrix back to one row per training sample. Because
# the documents were fed in as title_i, text_i pairs, each reshaped row is the
# title's TF-IDF vector followed by the text's TF-IDF vector.
n_train = y_train.shape[0]
row, col = vectors.shape
# Exact integer division: avoids the float round-trip of int((row*col)/n).
vectors = vectors.reshape(n_train, (row * col) // n_train)

In [8]:
# Multinomial naive Bayes classifier; alpha is its additive smoothing parameter.
classifier = MultinomialNB(
    alpha=0.1,
)

In [9]:
# Train on the reshaped TF-IDF features; labels are passed as a flat 1-D array.
classifier.fit(vectors, y_train.to_numpy().ravel())

MultinomialNB(alpha=0.1)

In [10]:
# Vectorize the held-out titles/texts with the vocabulary and IDF weights
# already fitted on the training documents (transform only, no refit). The
# flattening order matches training: title_0, text_0, title_1, text_1, ...
test_vectors = vectorizer.transform(X_test.to_numpy().ravel())

In [11]:
# Fold the per-document test matrix back to one row per test sample, giving
# the same [title vector, text vector] layout the classifier was trained on.
n_test = y_test.shape[0]
row, col = test_vectors.shape
# Exact integer division: avoids the float round-trip of int((row*col)/n).
test_vectors = test_vectors.reshape(n_test, (row * col) // n_test)

In [12]:
# Predicted label (0 = fake, 1 = real) for every held-out sample.
prediction = classifier.predict(X=test_vectors)

In [13]:
# Share of held-out samples whose predicted label equals the true label.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=prediction)
accuracy

0.9332344213649851