In [1]:
import pandas as pd
import numpy as np


In [2]:
import re  # used to search specific texts from paragraph
# regular expression

In [3]:
from nltk.corpus import stopwords # nltk stands for natural language toolkit   
# provides lists of frequently used words that add little value to the data (e.g. "the", "a")

In [4]:
from nltk.stem.porter import PorterStemmer  # used to remove prefix and suffix of words and gives root words

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer   # converts text into feature vectors and metrics

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [8]:
# Load the training set from train.csv (path is relative to the notebook's working directory).
df= pd.read_csv("train.csv")

In [9]:
# (rows, columns) of the loaded frame
df.shape

(20800, 5)

In [10]:
# Count missing values in each column
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [11]:
# Replace every missing value with an empty string so the text columns
# can be concatenated later without producing NaN.
df= df.fillna("")

In [12]:
# Re-check: every column should now report zero missing values
df.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [13]:
# Fetch the NLTK stop-word corpus used by the text-cleaning step.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cells at the top.

import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
# stemming (reducing each word to its root form)
port_stemmer = PorterStemmer()

# Build the stop-word lookup once. Calling stopwords.words("english") for every
# token rebuilds the whole list and scans it linearly each time; a frozenset
# gives constant-time membership checks.
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))


def stemming(content):
    """Normalize a text string into space-separated stemmed tokens.

    Every non-alphabetic character is replaced with a space, the text is
    lower-cased and split on whitespace, English stop words are dropped,
    and each remaining word is reduced to its Porter stem.

    Parameters
    ----------
    content : str
        Raw text to clean.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces
        (an empty string when no token survives the filtering).
    """
    # keep letters only; anything else becomes a separator
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    stems = [
        port_stemmer.stem(word)
        for word in tokens
        if word not in _ENGLISH_STOPWORDS
    ]
    return " ".join(stems)
    

In [15]:
# Combine title and author into a single text field used as the model input,
# then display the frame to inspect the new column.
df["content"]= df["title"]+ " " + df["author"]
df

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Consortiumne...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...
...,...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0,Rapper T.I.: Trump a ’Poster Child For White S...
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0,"N.F.L. Playoffs: Schedule, Matchups and Odds -..."
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0,Macy’s Is Said to Receive Takeover Approach by...
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1,"NATO, Russia To Hold Parallel Exercises In Bal..."


In [16]:
# Clean and stem every combined title/author string.
# The column is overwritten, so re-running this cell stems already-stemmed text.
df["content"] = df["content"].map(stemming)

In [17]:
# Features are the stemmed text; targets are the labels (both as NumPy arrays)
X, y = df["content"].values, df["label"].values


In [18]:
# TF-IDF vectorizer with default settings; turns each text into a weighted term vector
vectorizer= TfidfVectorizer()

TF = term frequency
IDF = inverse document frequency


TF-IDF measures how often a particular word appears in a document and down-weights words that appear in many documents, assigning each word a numeric weight.

In [19]:
# Learn the vocabulary and IDF weights from every row of X.
# NOTE(review): this runs before the train/test split, so held-out rows
# influence the fitted vocabulary/IDF — consider fitting on the training split only.
vectorizer.fit(X)

In [20]:
# Replace the raw text array with its TF-IDF matrix.
# NOTE(review): X is overwritten, so re-running this cell alone would feed the
# matrix back into transform — a separate name would keep the cell re-runnable.
X=vectorizer.transform(X)

In [21]:
# Show the stored (row, column) -> weight entries of the TF-IDF matrix
print(X)

  (np.int32(0), np.int32(267))	0.2701012497770876
  (np.int32(0), np.int32(2483))	0.36765196867972083
  (np.int32(0), np.int32(2959))	0.24684501285337127
  (np.int32(0), np.int32(3600))	0.3598939188262558
  (np.int32(0), np.int32(3792))	0.27053324808454915
  (np.int32(0), np.int32(4973))	0.23331696690935097
  (np.int32(0), np.int32(7005))	0.2187416908935914
  (np.int32(0), np.int32(7692))	0.24785219520671598
  (np.int32(0), np.int32(8630))	0.2921251408704368
  (np.int32(0), np.int32(8909))	0.36359638063260746
  (np.int32(0), np.int32(13473))	0.2565896679337956
  (np.int32(0), np.int32(15686))	0.2848506356272864
  (np.int32(1), np.int32(1497))	0.2939891562094648
  (np.int32(1), np.int32(1894))	0.15521974226349364
  (np.int32(1), np.int32(2223))	0.3827320386859759
  (np.int32(1), np.int32(2813))	0.19094574062359204
  (np.int32(1), np.int32(3568))	0.26373768806048464
  (np.int32(1), np.int32(5503))	0.7143299355715573
  (np.int32(1), np.int32(6816))	0.1904660198296849
  (np.int32(1), np.in

In [22]:
# Hold out 20% of the rows for testing, keeping the label balance the same in both splits
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [23]:
# Fit a logistic-regression classifier (library defaults) on the training split
lr= LogisticRegression()
lr.fit(x_train,y_train)

## checking accuracy for training data

In [24]:
# Predict labels for the rows the model was trained on
y_pred_train= lr.predict(x_train)

In [25]:
# Training accuracy. scikit-learn's signature is accuracy_score(y_true, y_pred),
# so the true labels go first (the accuracy value itself is unaffected by the order).
accuracy_score(y_train, y_pred_train)

0.9871995192307692

## checking accuracy for test data

In [26]:
# Predict labels for the held-out test rows
y_pred= lr.predict(x_test)

In [27]:
# Test accuracy. scikit-learn's signature is accuracy_score(y_true, y_pred),
# so the true labels go first (the accuracy value itself is unaffected by the order).
accuracy_score(y_test, y_pred)

0.9752403846153846

## checking cross_val_score

In [28]:
# Mean accuracy over 50 cross-validation folds.
# NOTE(review): X was vectorized on all rows beforehand, so each fold's
# held-out rows already shaped the features being scored.
cross_val_score(lr, X, y, cv=50, scoring="accuracy").mean()

np.float64(0.979375)

## cross checking data

In [41]:
# Inspect the first test sample: its TF-IDF entries followed by its true label
print(x_test[0], y_test[0])

  (np.int32(0), np.int32(1691))	0.27334097688945147
  (np.int32(0), np.int32(3173))	0.30220187476862226
  (np.int32(0), np.int32(10889))	0.4074828305477443
  (np.int32(0), np.int32(11054))	0.43934964127276355
  (np.int32(0), np.int32(11811))	0.2736526668277463
  (np.int32(0), np.int32(12672))	0.29724177794048273
  (np.int32(0), np.int32(14625))	0.3334903874308643
  (np.int32(0), np.int32(15154))	0.28147520515507646
  (np.int32(0), np.int32(16226))	0.2197304744445337
  (np.int32(0), np.int32(16820))	0.2700450057325363 0


In [34]:
# Take the first test row as the sample to classify.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest of the session.
input=x_test[0]

In [39]:
# Classify the selected sample and report the verdict (label 0 is treated as real news)
pred = lr.predict(input)
print("News is real" if pred == 0 else "News is fake")

News is real
