In [1]:
import numpy as np
import pandas as pd

In [2]:
# Reading the csv files and storing the datasets in rn and fn

In [3]:
rn=pd.read_csv('Real.csv')
fn=pd.read_csv('Fake.csv')

In [4]:
# Creating an additional attribute 'label' and putting  for real news and 1 for fake news

In [5]:
rn['label']=0
fn['label']=1

In [6]:
# Lets take our training columns as 'Text' for our target column 'label'

In [7]:
dataset1=rn[['Text','label']]
dataset2=fn[['Text','label']]

In [8]:
# Concatenating dataset1 and dataset2 in a dataset

In [9]:
dataset=pd.concat((dataset1,dataset2))

In [10]:
#To check whether data points have any null values

In [11]:
dataset.isnull().sum()

Text     0
label    0
dtype: int64

In [12]:
#Checking no of 1's or 0's

In [13]:
dataset['label'].value_counts()

label
0    10
1    10
Name: count, dtype: int64

In [14]:
#Shuffling the data

In [15]:
dataset=dataset.sample(frac=1)

In [16]:
# For using NLP, we import following libraries

In [17]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [18]:
# Creating an instance for wordnetlemmatizer and stopwords for english language

In [19]:
ps=WordNetLemmatizer()
stopwords=stopwords.words('english')

In [20]:
# To convert our words in lower case and remove special case characters, we use clean_row function

In [21]:
def clean_row(row):
    row=row.lower()
    row=re.sub('[^a-zA-Z]',' ',row)
    token=row.split()
    news=[ps.lemmatize(word) for word in token if word not in stopwords]
    cleanned_news=' '.join(news)
    return cleanned_news

In [22]:
# To clean the text section, we type: 

In [23]:
dataset['Text']=dataset['Text'].apply(lambda x:clean_row(x))

In [24]:
print(dataset['Text'])

8           doomsday prophecy claim world end tomorrow
6    government announced new education policy focu...
3      government announced ban social medium platform
7    two major tech company announced merger promis...
2    researcher made breakthrough quantum computing...
4    stock market reached record high today boostin...
4    celebrity reportedly seen two different place ...
6    new scheme claim give away free money anyone sign
3    national election concluded peacefully high vo...
5    historic space mission launched today aiming e...
8    new gene therapy developed treat rare genetic ...
5     scientist allegedly invented time travel machine
0    report claim alien invaded major city causing ...
2       miracle cure discovered cure disease instantly
9    ancient fountain youth discovered promising et...
1    new pill claim help people lose weight without...
7       report indicate zombie outbreak remote village
0    world leader reached historic agreement combat...
1    scien

In [25]:
# To convert the cleaned data into vectors, we import

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
# and create its instance in 'vectorizer'

In [28]:
vectorizer=TfidfVectorizer(max_features=56,lowercase=False,ngram_range=(1,2))

In [29]:
# To put first and second columns in variable x,y

In [30]:
X=dataset.iloc[:20,0]
y=dataset.iloc[:20,1]

In [31]:
# To split the dataset into training and testing sets, we import the library

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
# Now to have 20% of the rows in test set

In [34]:
train_data,test_data,train_label,test_label=train_test_split(X,y,test_size=0.2,random_state=0)

In [35]:
# To vectorize the training data and convert the returned matrix to array

In [36]:
vec_train_data=vectorizer.fit_transform(train_data)
vec_train_data=vec_train_data.toarray()

In [37]:
print(vec_train_data.shape)

(16, 56)


In [38]:
# Same with test data

In [39]:
vec_test_data=vectorizer.fit_transform(test_data)
vec_test_data=vec_test_data.toarray()

In [40]:
print(vec_test_data.shape)

(4, 56)


In [41]:
# To convert these final training and test data to Dataframe

In [42]:
train_data=pd.DataFrame(vec_train_data,columns=vectorizer.get_feature_names_out())
test_data=pd.DataFrame(vec_test_data,columns=vectorizer.get_feature_names_out())

In [43]:
#Model

In [44]:
# To train the model using naive bayes, MultinomialNB library is imported

In [45]:
from sklearn.naive_bayes import MultinomialNB

In [46]:
# and its instance is created

In [47]:
clf=MultinomialNB()

In [48]:
# To fit the train data and train label

In [49]:
clf.fit(train_data,train_label)

In [50]:
# To predict the values of train data and store its predicted values in a variable 'y_pred'

In [51]:
y_pred_train=clf.predict(train_data)

In [52]:
print(y_pred_train)

[0 0 1 1 0 1 0 1 0 1 1 1 0 1 1 1]


In [53]:
print(train_label)

8    0
0    0
4    1
2    1
2    0
3    1
4    0
9    1
5    0
6    1
7    1
5    1
7    0
8    1
1    1
0    1
Name: label, dtype: int64


In [54]:
# To measure the accuracy of our model in predicting values, we import accuracy_score library as:

In [55]:
from sklearn.metrics import accuracy_score

In [56]:
# To print its accuracy on training data

In [57]:
print(accuracy_score(test_label,y_pred))

NameError: name 'y_pred' is not defined

In [63]:
txt =input("Enter the news text: ")
new=clean_row(txt)
pred=clf.predict(vectorizer.transform([new]).toarray())
if pred==1:
    print("Fake news !")
elif pred==0:
    print("Real news 

Enter the news text:  Yeh exists !


[1]




In [58]:
y_pred=clf.predict(test_data)

In [59]:
print(accuracy_score(test_label,y_pred))

0.25


In [60]:
print(y_pred)

[1 1 1 0]


In [62]:
print(test_label)

1    0
8    0
1    1
9    0
Name: label, dtype: int64
