Importing the dependencies

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Fetch the NLTK 'stopwords' corpus so that stopwords.words('english') can be used below.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\parth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Show the English stop-word list; these are the words the stemming step filters out.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data Preprocessing

In [4]:
# Read the two source CSVs into separate DataFrames:
# 'True.csv' is bound to `real` and 'Fake.csv' to `fake`.
real = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')

In [5]:
# Tag every row with its class: rows loaded from True.csv get 'real',
# rows loaded from Fake.csv get 'fake'.
real['label'] = 'real'
fake['label'] = 'fake'

In [6]:
# Stack the real and fake articles into a single DataFrame; ignore_index=True
# gives the combined frame a fresh 0..n-1 index instead of repeating each source's own.
combined_dataset = pd.concat([real, fake], ignore_index = True)

In [7]:
# Shuffle all rows (frac=1 samples the whole frame); random_state=42 makes the order reproducible.
# The original index labels are kept, which is why the index looks scrambled in later output.
combined_dataset = combined_dataset.sample(frac=1, random_state = 42)

In [8]:
# (rows, columns) of the combined, shuffled dataset.
combined_dataset.shape

(44898, 5)

In [9]:
# Preview the first 10 rows to sanity-check the columns and labels.
combined_dataset.head(10)

Unnamed: 0,title,text,subject,date,label
22216,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",News,"July 21, 2017",fake
27917,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,"May 7, 2016",fake
25007,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,News,"December 3, 2016",fake
1377,California AG pledges to defend birth control ...,SAN FRANCISCO (Reuters) - California Attorney ...,politicsNews,"October 6, 2017",real
32476,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...,politics,"Apr 25, 2017",fake
5766,"As private lawyer, Trump high court pick was f...",WASHINGTON (Reuters) - As a lawyer in private ...,politicsNews,"February 1, 2017",real
16230,Yemeni Salafist imam killed in Aden: sources,ADEN (Reuters) - A Salafist imam was shot dead...,worldnews,"October 28, 2017",real
20911,FBI says witnesses in U.S. probe into Malaysia...,KUALA LUMPUR (Reuters) - Potential witnesses t...,worldnews,"September 6, 2017",real
35838,An Easy To Read Chart Shows How Bernie Sanders...,The goal of socialism is communism. -Vladimi...,politics,"Feb 24, 2016",fake
33155,MMA FIGHTER JAKE SHIELDS Embarrasses Cowards I...,Opposing views and beliefs has much of this co...,politics,"Feb 4, 2017",fake


In [10]:
# Count missing values per column.
combined_dataset.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [11]:
# Build a single 'content' feature by joining each article's title and body with a space.
# NOTE: element-wise string concatenation propagates NaN, so a missing title or text
# would leave 'content' as NaN for that row.
combined_dataset['content'] = combined_dataset['title'] + ' ' + combined_dataset['text']

In [12]:
# Inspect the merged title + text column.
print(combined_dataset['content'])

22216     BREAKING: GOP Chairman Grassley Has Had Enoug...
27917     Failed GOP Candidates Remembered In Hilarious...
25007     Mike Pence’s New DC Neighbors Are HILARIOUSLY...
1377     California AG pledges to defend birth control ...
32476    AZ RANCHERS Living On US-Mexico Border Destroy...
                               ...                        
11284    Nigeria says U.S. agrees delayed $593 million ...
44732    Boiler Room #62 – Fatal Illusions Tune in to t...
38158    ATHEISTS SUE GOVERNOR OF TEXAS Over Display on...
860      Republican tax plan would deal financial hit t...
15795    U.N. refugee commissioner says Australia must ...
Name: content, Length: 44898, dtype: object


In [13]:
# Split the frame into the target (the 'label' column) and the features
# (every other column). `columns=` already identifies the axis, so no
# separate `axis` argument is passed.
Y = combined_dataset['label']
X = combined_dataset.drop(columns='label')

In [14]:
# Inspect the feature frame and the label series.
print(X)
print(Y)

                                                   title  \
22216   BREAKING: GOP Chairman Grassley Has Had Enoug...   
27917   Failed GOP Candidates Remembered In Hilarious...   
25007   Mike Pence’s New DC Neighbors Are HILARIOUSLY...   
1377   California AG pledges to defend birth control ...   
32476  AZ RANCHERS Living On US-Mexico Border Destroy...   
...                                                  ...   
11284  Nigeria says U.S. agrees delayed $593 million ...   
44732                  Boiler Room #62 – Fatal Illusions   
38158  ATHEISTS SUE GOVERNOR OF TEXAS Over Display on...   
860    Republican tax plan would deal financial hit t...   
15795  U.N. refugee commissioner says Australia must ...   

                                                    text          subject  \
22216  Donald Trump s White House is in chaos, and th...             News   
27917  Now that Donald Trump is the presumptive GOP n...             News   
25007  Mike Pence is a huge homophobe. He suppor

In [15]:
# Porter stemmer instance shared by the stemming() function defined below.
port_stem = PorterStemmer()

In [16]:
_STOP_WORDS_CACHE = None


def _english_stop_words():
  """Return the NLTK English stop words as a frozenset, loading them only once."""
  global _STOP_WORDS_CACHE
  if _STOP_WORDS_CACHE is None:
    _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
  return _STOP_WORDS_CACHE


def stemming(content):
  """Normalise one document for vectorisation.

  Steps, in order:
    1. replace every character that is not an ASCII letter with a space,
    2. lower-case the text and split it on whitespace,
    3. drop tokens that are English stop words,
    4. reduce each remaining token to its Porter stem,
    5. join the stems back together with single spaces.

  Parameters
  ----------
  content : str
      Raw text of a single document.

  Returns
  -------
  str
      Space-separated stems of the non-stop-word tokens.
  """
  # Look the stop words up once per call from a cached set. Calling
  # stopwords.words('english') inside the comprehension re-evaluated it for
  # every token and then scanned the resulting list linearly.
  stop_words = _english_stop_words()
  tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
  return ' '.join(port_stem.stem(token) for token in tokens if token not in stop_words)

In [17]:
# Normalise every article with stemming(). This overwrites 'content' in place, so
# re-running this cell would stem the already-stemmed text a second time.
combined_dataset['content'] = combined_dataset['content'].apply(stemming)

In [18]:
# Inspect the stemmed content.
print(combined_dataset['content'])

22216    break gop chairman grassley enough demand trum...
27917    fail gop candid rememb hilari mock eulog video...
25007    mike penc new dc neighbor hilari troll homopho...
1377     california ag pledg defend birth control insur...
32476    az rancher live us mexico border destroy nanci...
                               ...                        
11284    nigeria say u agre delay million fighter plane...
44732    boiler room fatal illus tune altern current ra...
38158    atheist sue governor texa display capitol grou...
860      republican tax plan would deal financi hit u u...
15795    u n refuge commission say australia must stop ...
Name: content, Length: 44898, dtype: object


In [19]:
# Pull the stemmed text and the labels out of the DataFrame as arrays.
# These rebind the X and Y names assigned in the earlier separation cell.
X = combined_dataset['content'].values
Y = combined_dataset['label'].values

In [20]:
# Inspect the raw text array and the label array.
print(X)
print(Y)

['break gop chairman grassley enough demand trump jr testimoni donald trump white hous chao tri cover russia problem mount hour refus acknowledg problem surround fake news hoax howev fact bear thing differ seem crack congression public leadership chuck grassley r iowa head senat judiciari committe fed demand donald trump jr former trump campaign manag paul manafort testifi committe regard infam shadi meet donald trump shadi russian lawyer promis dirt democrat presidenti nomine hillari clinton fact inform due well demand send signal team trump notabl fire special counsel robert mueller circumst despit fact seem seem trump white hous lay groundwork speak speak tweet regard grassley warn also anyon think senat grassley rest senat seriou need look warn alreadi given trump jr manafort either follow order serv subpoena forc compli refus held contempt congress carri seriou jail time even cruel craven creatur within gop sick donald trump corrupt scandal ridden white hous angri stage hostil tak

In [21]:
# Convert the stemmed text into TF-IDF feature vectors.
# fit_transform learns the vocabulary / IDF weights and vectorises the corpus in a
# single pass, instead of tokenising every document twice (fit, then transform).
# NOTE(review): the vectorizer is fitted on the full corpus before the train/test
# split, so its vocabulary and IDF statistics include the test articles. Consider
# fitting on the training split only (e.g. via a Pipeline) for an unbiased estimate.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [22]:
# Inspect the TF-IDF matrix (a sparse matrix: one row per article).
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6904081 stored elements and shape (44898, 89868)>
  Coords	Values
  (0, 570)	0.06055486453372135
  (0, 2286)	0.04253552689623891
  (0, 2301)	0.025666202113816185
  (0, 3031)	0.063093661763644
  (0, 3388)	0.047259753130124114
  (0, 6574)	0.07041583382179394
  (0, 7853)	0.0898677958851911
  (0, 7886)	0.059913473329118644
  (0, 9454)	0.048097596645320734
  (0, 10324)	0.08231690448262517
  (0, 11174)	0.03296114815015033
  (0, 11631)	0.04919300578107092
  (0, 12522)	0.05138750652591612
  (0, 12642)	0.06680808165549404
  (0, 13496)	0.06811316396970238
  (0, 13713)	0.07112856852592032
  (0, 13991)	0.053000278097355684
  (0, 14098)	0.037972261733148126
  (0, 14836)	0.0869210456407381
  (0, 14928)	0.06842646468851935
  (0, 15149)	0.04165264736324387
  (0, 15153)	0.051876038877662234
  (0, 15333)	0.08494945735141231
  (0, 15663)	0.056535937666043194
  (0, 15765)	0.029199968119368196
  :	:
  (44897, 77773)	0.03757368913834899
  (44897,

Splitting dataset into training and test data

In [23]:
# Hold out 20% of the rows for testing. stratify=Y keeps the label proportions
# the same in both splits; random_state=2 makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state = 2)

Training Logistic Regression Model

In [24]:
# Logistic regression classifier with scikit-learn's default hyperparameters.
model = LogisticRegression()

In [25]:
# Fit the classifier on the training split only.
model.fit(X_train, Y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


Evaluation

In [26]:
# Accuracy on the training split.
# accuracy_score's signature is (y_true, y_pred), so the ground-truth labels go
# first. The score itself does not depend on the order, but keeping the documented
# order avoids mistakes if this is later swapped for an asymmetric metric.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [27]:
# Report the fraction of training rows classified correctly.
print('Accuracy score of training data: ', training_data_accuracy)

Accuracy score of training data:  0.9919260537891865


In [28]:
# Accuracy on the held-out test split.
# accuracy_score's signature is (y_true, y_pred), so the ground-truth labels go
# first. The score itself does not depend on the order, but keeping the documented
# order avoids mistakes if this is later swapped for an asymmetric metric.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [29]:
# Report the fraction of held-out test rows classified correctly.
print('Accuracy score of test data: ', test_data_accuracy)

Accuracy score of test data:  0.9865256124721603


In [30]:
# Classify one held-out article (the test-matrix row at index 5) and report the verdict.
X_new = X_test[5]
prediction = model.predict(X_new)

print('The news is Real' if prediction[0] == 'real' else 'The news is Fake')

The news is Fake


In [31]:
# Ground-truth label of the test row at index 5, for comparison with the model's prediction.
print(Y_test[5])

fake


In [37]:
# Hand-written article used to try the trained model on unseen text.
sample_title = "India's GDP Grows by 7.8% in Q1 FY25, Says Finance Ministry"
sample_text = "India's economy grew at a rate of 7.8% in the first quarter of the financial year 2024-25, according to data released by the Ministry of Finance. The growth was driven by strong performances in the manufacturing and services sectors. Finance Minister Nirmala Sitharaman said the numbers reflect India's resilient economic fundamentals, despite global challenges such as rising oil prices and geopolitical tensions."

# Same preparation as the training data: title + space + body, then stemming(),
# then the already-fitted vectorizer (transform only, no re-fitting).
sample_input = stemming(" ".join((sample_title, sample_text)))
sample_vector = vectorizer.transform([sample_input])
prediction = model.predict(sample_vector)
print(prediction)

['real']
