In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords  #remove a an the
from nltk.stem.porter import PorterStemmer #   gives the root word like processing root word is process 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RJANKAR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# printing the English stop words
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Data Pre-Processing

In [4]:
# load the training data; 'train.csv' is resolved relative to the current working directory
news_dataset = pd.read_csv('train.csv')

In [5]:
news_dataset.head(3)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1


In [6]:
news_dataset.shape

(20800, 5)

In [7]:
#counting the missing value in the dataset 
news_dataset.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [8]:
# replacing the null values with empty string
news_dataset = news_dataset.fillna('')

In [9]:
news_dataset.head(5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [10]:
# merging the author name and news title 
news_dataset['content']= news_dataset['author']+' '+news_dataset['title']

In [11]:
news_dataset['content'].head(3)

0    Darrell Lucus House Dem Aide: We Didn’t Even S...
1    Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2    Consortiumnews.com Why the Truth Might Get You...
Name: content, dtype: object

In [12]:
# separating the data & label
# `columns=` already names the axis, so the extra `axis=1` argument is not needed
x = news_dataset.drop(columns='label')
y = news_dataset['label']

In [13]:
x.head(3)

Unnamed: 0,id,title,author,text,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Consortiumnews.com Why the Truth Might Get You...


In [14]:
y.head(3)

0    1
1    0
2    1
Name: label, dtype: int64

In [15]:
# Stemming 
# Stemming is the process of reducing a word to its root (stem) by stripping suffixes
# example : processing, processed, processes --> root word is --> process

In [16]:
Port_stem = PorterStemmer()

In [17]:
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the English stop words as a frozenset, fetching them from NLTK only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def stemming(input_content):
    """Normalise a piece of text into a space-separated string of stemmed words.

    Steps:
      1. replace every character that is not an ASCII letter with a space,
      2. lower-case the text and split it on whitespace,
      3. drop English stop words,
      4. stem each remaining word with ``Port_stem``.

    Parameters
    ----------
    input_content : str
        Raw text to clean.

    Returns
    -------
    str
        The stemmed words joined by single spaces ('' when nothing remains).
    """
    # Fetch the stop words once per call (cached across calls) instead of once
    # per word; a frozenset also makes each membership test O(1).
    stop_words = _english_stopwords()

    letters_only = re.sub('[^a-zA-Z]', ' ', input_content)  # digits/symbols -> spaces
    words = letters_only.lower().split()
    stemmed_words = [Port_stem.stem(word) for word in words if word not in stop_words]
    return ' '.join(stemmed_words)

In [None]:
# clean and stem every 'content' entry.
# NOTE: this overwrites the column, so re-running the cell applies stemming to already-stemmed text
news_dataset['content']  = news_dataset['content'].apply(stemming)

In [None]:
print(news_dataset['content'])  # no (upper case,numbers,symbols) and all are root words and remove stopwards

In [None]:
# separating the data and label as arrays (via .values) for the vectorizer / model
x = news_dataset['content'].values
y = news_dataset['label'].values

In [None]:
x

In [None]:
y

In [None]:
y.shape

In [None]:
# converting textual data to numerical data
# TF-IDF = term frequency (how often a word occurs in a document) weighted by
# inverse document frequency (words that occur in many documents count less)
vectorizer = TfidfVectorizer()
# fit_transform learns the vocabulary / IDF weights and vectorizes in a single pass
# NOTE(review): the vectorizer is fitted on the whole corpus before the train/test
# split further down, so statistics from the test rows leak into the features;
# consider splitting first and fitting on the training texts only
x = vectorizer.fit_transform(x)

In [None]:
print(x)

 ### splitting the dataset
 #### x and y split into train and test data

In [None]:
# hold out 20% of the rows for testing; stratify on y and fix random_state so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2,stratify=y,random_state=2)

### Model Building
#### training the model using LogisticRegression

In [None]:
# logistic-regression classifier created with the library's default settings
classifier = LogisticRegression()
classifier.fit(x_train, y_train) # fit the model on the training features and labels

### Accuracy of the model

In [None]:
# accuracy score on the training data (the same samples the model was fitted on)
x_train_prediction = classifier.predict(x_train)
# accuracy_score's signature is (y_true, y_pred): pass the true labels first
traning_data_accuracy = accuracy_score(y_train, x_train_prediction)

In [None]:
print('accuracy score of training data:',traning_data_accuracy)

#### test our model on test data to see the accuracy

In [None]:
# accuracy score on the held-out test data
x_test_prediction = classifier.predict(x_test)
# accuracy_score's signature is (y_true, y_pred): pass the true labels first
test_data_accuracy = accuracy_score(y_test, x_test_prediction)

In [None]:
print('accuracy score of test data:',test_data_accuracy)

### Making a prediction system

In [None]:
# classify a single held-out sample and report the verdict
# (predicted label 0 is reported as real, any other label as fake)
x_news = x_test[5]
prediction = classifier.predict(x_news)
print(prediction)
print("The news is Real" if prediction[0] == 0 else "the news is fake")

In [None]:
# true label of the sample that was predicted above (x_test[5]), for comparison
print(y_test[5])