In [1]:
#Make Necessary imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
#reading the dataset
df=pd.read_csv('news.csv')

In [3]:
#understanding data
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
#information About Dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
Unnamed: 0    6335 non-null int64
title         6335 non-null object
text          6335 non-null object
label         6335 non-null object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [5]:
#Description of Dataset
df.describe()

Unnamed: 0.1,Unnamed: 0
count,6335.0
mean,5280.415627
std,3038.503953
min,2.0
25%,2674.5
50%,5271.0
75%,7901.0
max,10557.0


In [6]:
#removing unwanted rows
df.drop('Unnamed: 0',axis=1,inplace=True)

In [7]:
#if want to use Count vectorizer
#import string

#def text_process(mess):
   # """
   # Takes in a string of text, then performs the following:
   # 1. Remove all punctuation
   # 2. Remove all stopwords
   # 3. Returns a list of the cleaned text
   # """
   # # Check characters to see if they are in punctuation
   # nopunc = [char for char in mess if char not in string.punctuation]

   # # Join the characters again to form the string.
   # nopunc = ''.join(nopunc)
    
   # # Now just remove any stopwords
   # return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [8]:
#raw documents into a matrix of TF-IDF features
Tfidf=TfidfVectorizer(stop_words='english',max_df=0.3)

In [9]:
#splitting Data For testing
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=0)

In [10]:
#transforming Data
tfidf_x_train=Tfidf.fit_transform(X_train)

In [11]:
#transtoming test data
tfidf_x_test=Tfidf.transform(X_test)

In [12]:
#initializing classifier
pac=PassiveAggressiveClassifier(C=1,max_iter=10)
pac.fit(tfidf_x_train,y_train)



PassiveAggressiveClassifier(C=1, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=10, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)

In [13]:
#predicting the output
y_pred=pac.predict(tfidf_x_test)

In [14]:
score=accuracy_score(y_test,y_pred)
score

0.9352801894238358

In [15]:
#finding the best parameters for PassiveAgressiveClassifier
from sklearn.model_selection import GridSearchCV
param={'C':[1,0.1,10,100,1000,10000],'max_iter':[1,10,20,50,100,200]}

In [16]:
grid=GridSearchCV(PassiveAggressiveClassifier(),param)
grid.fit(tfidf_x_train,y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=PassiveAggressiveClassifier(C=1.0, average=False,
                                                   class_weight=None,
                                                   early_stopping=False,
                                                   fit_intercept=True,
                                                   loss='hinge', max_iter=1000,
                                                   n_iter_no_change=5,
                                                   n_jobs=None,
                                                   random_state=None,
                                                   shuffle=True, tol=0.001,
                                                   validation_fraction=0.1,
                                                   verbose=0,
                                                   warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 0.1, 10, 100, 1

In [17]:
grid.best_params_

{'C': 1000, 'max_iter': 200}

In [18]:
pac=PassiveAggressiveClassifier(C=10,max_iter=20)

In [19]:
pac.fit(tfidf_x_train,y_train)

PassiveAggressiveClassifier(C=10, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=20, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)

In [20]:
pred3=pac.predict(tfidf_x_test)

In [21]:
#finding accuracy
accuracy_score(y_test,pred3)

0.936069455406472

In [22]:
#percent accuracy
round(score*100,2)

93.53

In [23]:
#testing other models
#Support vector classifier
from sklearn.svm import SVC
lm=SVC()
lm.fit(tfidf_x_train,y_train)
pred2=lm.predict(tfidf_x_test)
accuracy_score(y_test,pred2)



0.48539857932123126

In [24]:
#Support vector classifier
#naive_bayes
from sklearn.naive_bayes import MultinomialNB
fake_news_detect_model = MultinomialNB().fit(tfidf_x_train, y_train)
pred4=fake_news_detect_model.predict(tfidf_x_test)
accuracy_score(y_test,pred4)

0.8318863456985004