[Dataset](https://drive.google.com/file/d/1er9NJTLUA3qnRuyhfzuN0XUsoIC4a-_q/view)

In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
df=pd.read_csv('news.csv')

# EDA

## Data Understanding

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [4]:
df.sample(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
4896,4974,"Eric Trump: Dad's Campaign To Focus On, Fix In...","Donald Trump's son, Eric, said his father's pr...",REAL
5972,5594,Will James Comey Change the Outcome of the Ele...,Will James Comey Change the Outcome of the Ele...,FAKE
378,8263,David Duke wins Louisiana Senate Debate in Lan...,\nThe liberal media is going bat$hit crazy ove...,FAKE
2741,3939,Obama Pushes Castro on Human Rights During Joi...,President Barack Obama and Cuban President Rau...,REAL
5549,1258,Republican Debate: Trump's absence leaves thre...,The intentional winner of the Fox News Republi...,REAL


## Data Preparation

In [5]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [6]:
df.isnull().sum()

title    0
text     0
label    0
dtype: int64

In [7]:
df.loc[df.duplicated()]

Unnamed: 0,title,text,label
1492,"University of Missouri, please immediately fir...",To watch the video of photographer Tim Tai get...,REAL
1954,Loretta Lynch becomes first African-American w...,Washington (CNN) Loretta Lynch was sworn in as...,REAL
2336,Obamacare Enrollees Anxiously Await Supreme Co...,"“I’ve got my six-month, regular cancer checkup...",REAL
2856,Donald Trump is blatantly racist — and the med...,"Donald Trump, the actual Republican candidate ...",REAL
2952,"Black Agenda Report for Week of Oct 31, 2016","News, information and analysis from the black ...",FAKE
3081,The College Loan Bombshell Hidden in the Budget,In obscure data tables buried deep in its 2016...,REAL
3292,This astonishing chart shows how moderate Repu...,Political scientists have known for years that...,REAL
3301,OnPolitics | 's politics blog,Who has Trump appointed to his cabinet so far?...,REAL
3324,Senate GOP prepared to replace Obamacare subsi...,"Killing Obama administration rules, dismantlin...",REAL
4140,We Republicans Lost On Gay Rights. That’s A Go...,I’m not among those Republicans who have “evol...,REAL


In [8]:
df.drop_duplicates(inplace=True)

In [9]:
df.shape

(6306, 3)

# Model

In [10]:
labels=df.label
x_train,x_test,y_train,y_test=train_test_split(df['text'], labels, test_size=0.2, random_state=7)

In [11]:
#DataFlair - Initialize a TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
#DataFlair - Fit and transform train set, transform test set
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

In [12]:
tfidf_train

<5044x61672 sparse matrix of type '<class 'numpy.float64'>'
	with 1324215 stored elements in Compressed Sparse Row format>

In [13]:
#DataFlair - Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)
#DataFlair - Predict on the test set and calculate accuracy
y_pred=pac.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 93.98%


In [14]:
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

array([[622,  40],
       [ 36, 564]], dtype=int64)