In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  accuracy_score
from sklearn.metrics import  classification_report
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Minimal TF-IDF demonstration on a four-sentence toy corpus.
vectorizer = TfidfVectorizer()

# Example text documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Learn the vocabulary and IDF weights, then encode the documents
tfidf_matrix = vectorizer.fit_transform(documents)

# Vocabulary terms, one per matrix column.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. It returns an ndarray, so convert it to a plain list to
# keep the printed output identical.
feature_names = vectorizer.get_feature_names_out().tolist()

# Print the feature names and the dense TF-IDF matrix (one row per document)
print("Feature Names:", feature_names)
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())


Feature Names: ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
TF-IDF Matrix:
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]




In [3]:
# Fetch the NLTK stop-word corpus (nothing is re-downloaded when it is already up to date)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lakshya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Show the English stop-word list that stemming() later filters out of the articles
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

<h4>Data Pre processing :</h4>

In [5]:
# Raw strings keep the Windows backslashes literal; in a normal string,
# sequences such as "\F" and "\D" are invalid escapes that newer Python
# versions warn about.
# NOTE(review): absolute, machine-specific location — adjust DATA_DIR when
# running this notebook on another machine.
DATA_DIR = r"G:\Fake news analysis\Dataset"
df_fake = pd.read_csv(rf"{DATA_DIR}\Fake.csv")
df_true = pd.read_csv(rf"{DATA_DIR}\True.csv")


In [6]:
# Preview the first rows of the fake-news articles
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [7]:
# Preview the first rows of the true-news articles
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [8]:
# (rows, columns) of each raw frame
df_fake.shape, df_true.shape

((23481, 4), (21417, 4))

<h4>Inserting column 'label' as target feature</h4>

In [9]:
# Target encoding used throughout the notebook: 0 = fake article, 1 = true article
df_fake = df_fake.assign(label=0)
df_true = df_true.assign(label=1)

<u><b>conclusion : </b></U>new column is created named 'label'.

In [10]:
# Confirm that both frames gained the extra 'label' column
df_fake.shape, df_true.shape

((23481, 5), (21417, 5))

In [11]:
# Per-column count of missing values in the fake news dataset
df_fake.isna().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [12]:
# counting the number of missing values in the true news dataset
# (df_true, not df_fake, is the frame that must be inspected here)
df_true.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

<u><b>conclusion: </b></U>there is no missing values are present. 

Merging Two Dataframes:

In [13]:
# Stack the fake rows on top of the true rows. Each frame keeps its own row
# labels, so index values repeat in the merged frame.
df_merge = pd.concat(objs=[df_fake, df_true], ignore_index=False)
df_merge.tail()

Unnamed: 0,title,text,subject,date,label
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",1


In [14]:
# Columns available after merging
df_merge.columns

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')

A title can provide some initial insight, but it may be written to be attention-grabbing or provocative without reflecting the accuracy of the article's content.
It is therefore more reliable to analyse the content of the news article rather than its title.

In [15]:
# Keep only the article body and its label; 'title', 'subject' and 'date'
# are removed from the modelling frame.
df = df_merge.drop(columns=['title', 'subject', 'date'])

In [16]:
# Preview the reduced frame (text + label only)
df.head()

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [17]:
# shuffle the data frame

In [18]:
# Shuffle all rows (frac=1 draws the whole frame without replacement).
# A fixed seed makes the shuffled order, and everything derived from it,
# reproducible across kernel restarts.
df = df.sample(frac=1, random_state=2)
df

Unnamed: 0,text,label
19804,Notify the CDC. It's spreading. #BenCarson #Mo...,0
4603,WASHINGTON (Reuters) - A Native American tribe...,1
20828,BEIRUT (Reuters) - The Syrian army and its all...,1
12390,"Glenn Beck has just proven once again, (to any...",0
17698,The decision of actress Mila Kunis to make mon...,0
...,...,...
19740,Those who were shocked by the latest ABC / W...,0
6098,WASHINGTON (Reuters) - President Donald Trump’...,1
16289,UNITED NATIONS (Reuters) - The United Nations ...,1
13948,DUBLIN (Reuters) - Ireland s finance minister ...,1


In [19]:
# Renumber the shuffled rows 0..n-1; the previous row labels are kept in a new 'index' column
df = df.reset_index()


In [20]:
# Column names after the index reset
df.columns



Index(['index', 'text', 'label'], dtype='object')

Porter stemming: reduces words to a common stem, which is not always a dictionary word.
Examples: spreading → spread, notify → notifi.

In [22]:
# Single Porter stemmer instance shared by every call to stemming()
port_stem = PorterStemmer()

In [23]:
# Build the stop-word lookup once: stopwords.words('english') returns a freshly
# built list on every call, and a set gives constant-time membership tests
# instead of a linear scan per token.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise one article for TF-IDF vectorisation.

    Steps: replace every non-letter character with a space, lower-case,
    split on whitespace, drop English stop words, Porter-stem what is left,
    and join the stems back together with single spaces.

    Parameters
    ----------
    content : str
        Raw article text.

    Returns
    -------
    str
        Space-separated stems; an empty string if no token survives.
    """
    # [^a-zA-Z] matches anything that is not an ASCII letter (digits, punctuation, symbols)
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    # removing all stopwords if present, stemming the remaining tokens
    stems = [port_stem.stem(word) for word in tokens if word not in _ENGLISH_STOPWORDS]
    return ' '.join(stems)
    

In [24]:
# Run every article through stemming(); the text column is overwritten with the result
df['text'] = df['text'].map(stemming)

In [25]:
# Inspect the stemmed text column
df['text']

0        notifi cdc spread bencarson morningjo trumpflu...
1        washington reuter nativ american tribe montana...
2        beirut reuter syrian armi alli fight secur cor...
3        glenn beck proven anyon still give darn say li...
4        decis actress mila kuni make monthli donat abo...
                               ...                        
44893    shock latest abc washington post goal seek rep...
44894    washington reuter presid donald trump press se...
44895    unit nation reuter unit nation secur council d...
44896    dublin reuter ireland financ minist said prosp...
44897    thought muslim corner market abus gay sinc see...
Name: text, Length: 44898, dtype: object

defining dependent and independent variable

In [31]:
# Features (stemmed article text) and target (0 = fake, 1 = true)
X, Y = df['text'], df['label']

In [32]:
# Features and target must have the same number of entries
X.shape, Y.shape

((44898,), (44898,))

Converting Textual Data to Numerical Data

In [33]:
# Learn the vocabulary and IDF weights and encode the corpus in a single pass
# (fit followed by transform on the same data tokenises every article twice).
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so the IDF weights also reflect the test articles.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [35]:
# Sparse-matrix printout: (row, column) coordinates followed by the stored TF-IDF weight
print(X)

  (0, 79383)	0.3758084880540249
  (0, 73045)	0.16452688543051303
  (0, 72583)	0.11484828914099661
  (0, 56692)	0.3758084880540249
  (0, 55334)	0.1397576348704474
  (0, 54419)	0.20698663011448898
  (0, 50704)	0.3369084506488162
  (0, 33814)	0.1284148610624582
  (0, 30499)	0.35126529704003606
  (0, 14339)	0.12030279199222421
  (0, 12072)	0.26857354807862105
  (0, 6949)	0.3758084880540249
  (0, 6492)	0.365622143431256
  (1, 89118)	0.23016049573338118
  (1, 87825)	0.02522318934982076
  (1, 87706)	0.014394060915214009
  (1, 86404)	0.013198573668441103
  (1, 85953)	0.0711427107900979
  (1, 84829)	0.0442027095992614
  (1, 84544)	0.033107223183994464
  (1, 84468)	0.01811002115992825
  (1, 84357)	0.04858639931049042
  (1, 83779)	0.020767780207669742
  (1, 83385)	0.029994668458244878
  (1, 81406)	0.017722824578288377
  :	:
  (44897, 14819)	0.025809439577667997
  (44897, 14787)	0.029840544980012925
  (44897, 13699)	0.025815296351911595
  (44897, 13254)	0.05706083074582626
  (44897, 12726)	0.04882

Splitting Data to test and train data

In [36]:
# 80/20 split, stratified on the label so both parts keep the same class ratio;
# the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

Train the Model

In [40]:
# Logistic-regression classifier created with the library's default settings
model = LogisticRegression()

In [41]:
# Fit on the training split only; the test split stays unseen until evaluation
model.fit(X_train,Y_train)

Evaluation

In [42]:
# accuracy score on training data
X_train_prediction = model.predict(X_train)
# accuracy_score expects (y_true, y_pred); keeping that order means the call
# stays correct if it is ever swapped for an asymmetric metric.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [43]:
# Report the fraction of training articles classified correctly
print("accuracy score on training data",training_data_accuracy)

accuracy score on training data 0.9907288824544797


Conclusion: training accuracy ≈ 99%

In [44]:
# accuracy score on testing data
X_test_prediction = model.predict(X_test)
# accuracy_score expects (y_true, y_pred); keeping that order means the call
# stays correct if it is ever swapped for an asymmetric metric.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [45]:
# Report the fraction of held-out test articles classified correctly
print("accuracy score on testing data",test_data_accuracy)

accuracy score on testing data 0.9859688195991091


Conclusion: test accuracy ≈ 98.6%

<h4>Making a Predictive System</h4>

In [66]:
# Classify a single held-out article (row 2 of the test matrix)
new_news = X_test[2]
prediction = model.predict(new_news)
# Label 0 is reported as fake; any other label is reported as true
verdict = "news is fake" if prediction[0] == 0 else "news is True"
print(verdict, prediction)

news is fake [0]
