In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
pd.set_option('display.max_columns', None)
pd.set_option("max_rows", None)

In [2]:
url1 = 'https://github.com/hanamoii/test/raw/main/REALnews.csv'
url2 = 'https://github.com/hanamoii/test/raw/main/FAKEnews.csv'
datareal = pd.read_csv(url1)
datafake = pd.read_csv(url2)

In [3]:
datareal = datareal.astype(str) 
datafake = datafake.astype(str)

In [4]:
print(datareal.shape)
datareal.head()

(3150, 4)


Unnamed: 0,ID,TITLE,TEXT,LABEL
0,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
1,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
2,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL
3,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL
4,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL


In [5]:
print(datafake.shape)
datafake.head()

(3150, 4)


Unnamed: 0,ID,TITLE,TEXT,LABEL
0,8476,You Can Smell Hillaryโ€s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,10142,Bernie supporters on Twitter erupt in anger ag...,"โ€” Kaydee King (@KaydeeKing) November 9, 2016...",FAKE
3,6903,"Tehran, USA","\nIโ€m not an immigrant, but my grandparent...",FAKE
4,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE


In [6]:
df = pd.concat([datareal, datafake])
df['NEWS'] = df['TITLE'] + ' ' + df['TEXT'] + ' ' +df['LABEL']
df.sample(frac = 1)

Unnamed: 0,ID,TITLE,TEXT,LABEL,NEWS
2895,5300,"""Russia is concentrating"" 2.0: Putin's new pol...","October 28, 2016 - By Eduard Popov for Fort R...",FAKE,"""Russia is concentrating"" 2.0: Putin's new pol..."
1596,4162,Battle over Scalia’s replacement already spill...,Conservative and liberal advocacy groups are g...,REAL,Battle over Scalia’s replacement already spill...
1995,8128,The Dream Team Of Hillary Clinton and Michelle...,3:18 pm The Democratic dream team of Hillary...,FAKE,The Dream Team Of Hillary Clinton and Michelle...
356,9275,Sesame Seeds for Knee Osteoarthritis,Sesame Seeds for Knee Osteoarthritis VN:F [1.9...,FAKE,Sesame Seeds for Knee Osteoarthritis Sesame Se...
1451,3152,Anti-Semitism growing in Europe,While I understand this sentiment is coming fr...,REAL,Anti-Semitism growing in Europe While I unders...
2790,3617,Defiant Charlie Hebdo to print 3M copies of la...,Muhammad will be back on the cover of the next...,REAL,Defiant Charlie Hebdo to print 3M copies of la...
2890,10160,Another Project Veritas Bombshell: Pro-Clinton...,We Are Change \nIn the fourth undercover video...,FAKE,Another Project Veritas Bombshell: Pro-Clinton...
193,3226,Is the GOP losing Walmart?,"(CNN) As goes Walmart , so goes the nation?\n\...",REAL,Is the GOP losing Walmart? (CNN) As goes Walma...
1761,7548,Itโ€s A Setup: Dems Claim Russians Will Under...,Tweet Home ยป Headlines ยป World News ยป Itโ€...,FAKE,Itโ€s A Setup: Dems Claim Russians Will Under...
1035,2695,Reddit administrators accused of censorship,Administrators at the popular online forum Red...,REAL,Reddit administrators accused of censorship Ad...


In [7]:
df = df[df['LABEL']!='']
print(df['LABEL'].unique())

['REAL' 'FAKE']


In [8]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['NEWS'].values)
X = X.toarray()

In [9]:
y = df['LABEL'].values

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.2, random_state=42)

In [11]:
model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB()

In [12]:
predictions = model.predict(X_test)

In [13]:
cm = confusion_matrix(y_test, predictions)

In [14]:
df[['NEWS', 'LABEL']].sample(n=20)

Unnamed: 0,NEWS,LABEL
3127,Are You Ready for Aftermath of the Election? W...,FAKE
835,How well-meaning political reformers are helpi...,REAL
308,The media unload on Donald Trump With Trump's ...,REAL
1736,Top 10 toxins that are poisoning your kids Top...,FAKE
1575,Jim Rogers: Itโ€s Time To Prepare; Economic A...,FAKE
1282,"After riots, Baltimore residents take to stree...",REAL
1005,How To Repair Strained Or Broken Relationships...,FAKE
1534,CHARLESTON โ€ON EGGSHELLSโ€ ON EVE OF TWO RA...,FAKE
2861,Hillary Clinton tributes fuel 2016 buzz Despit...,REAL
2201,"Over 20 Injured, Almost 40 Detained in Venezue...",FAKE


In [15]:
inputdata = input()
print(input)

Think the Iowa polls were bad? Wait until New
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x000002275E67A790>>


In [16]:
vectorized_sentence = vectorizer.transform([inputdata]).toarray()
model.predict(vectorized_sentence) 

array(['REAL'], dtype='<U4')

In [17]:
score=model.score(X_test, y_test)
if model.predict(vectorized_sentence)  == 'REAL':
    print('This news is real news.')
    print(f'Accuracy: {round(score*100,2)}%')
else:
    print('This news is fake news.')
    print(f'Accuracy : {round(score*100,2)} %')

This news is real news.
Accuracy: 95.95%
