In [None]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Download the NLTK 'stopwords' corpus used by the stop-word filtering in
# the preprocessing step of this notebook.
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Show the English stop-word list so the reader can see which tokens will be
# dropped during preprocessing.
english_stop_words = stopwords.words('english')
print(english_stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# Read the training CSV into a DataFrame, then display its (rows, columns)
# shape as the cell output.
train_csv_path = '/content/train.csv'
news_data = pd.read_csv(train_csv_path)
news_data.shape

(20800, 5)

In [None]:
# Count the missing entries in each column.
missing_per_column = news_data.isna().sum()
missing_per_column

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [None]:
# Replace every missing value with an empty string so the text columns can be
# concatenated without producing NaN.
news_data = news_data.fillna(value='')

In [None]:
# Build the 'content' feature: author name followed by the article title,
# separated by a single space.
author_and_title = news_data['author'] + ' ' + news_data['title']
news_data['content'] = author_and_title
print(news_data['content'])

0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss 15 Civilians Killed In Single ...
4        Howard Portnoy Iranian woman jailed for fictio...
                               ...                        
20795    Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796    Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
20797    Michael J. de la Merced and Rachel Abrams Macy...
20798    Alex Ansary NATO, Russia To Hold Parallel Exer...
20799              David Swanson What Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object


In [None]:
# Separate the target column from the rest of the frame:
#   Y -> the 'label' column
#   X -> every column except 'label'
Y = news_data['label']
X = news_data.drop(columns=['label'])

In [None]:
# Stemming reduces each word to its root form; one shared PorterStemmer
# instance is reused for every token.
port_stem = PorterStemmer()

In [None]:
# Build the stop-word lookup once. The original re-evaluated
# stopwords.words('english') and scanned the resulting list for every single
# token; a set built up front gives the same filtering with O(1) membership tests.
_STOP_WORDS = frozenset(stopwords.words('english'))


def stem(content):
  """Normalise a piece of text into a space-separated string of stemmed tokens.

  Steps:
    1. Replace every character that is not an ASCII letter with a space.
    2. Lower-case the text and split it on whitespace.
    3. Drop tokens found in the NLTK English stop-word list.
    4. Stem each remaining token with the module-level ``port_stem``.

  Args:
    content: The raw text to clean (a ``str``).

  Returns:
    A ``str`` with the surviving stemmed tokens joined by single spaces;
    the empty string when no token survives.
  """
  letters_only = re.sub('[^a-zA-Z]', ' ', content)
  tokens = letters_only.lower().split()
  stemmed_tokens = [port_stem.stem(word) for word in tokens if word not in _STOP_WORDS]
  return ' '.join(stemmed_tokens)

In [None]:
# Run the stemming/stop-word cleanup over every row of 'content' and store the
# cleaned text back in the same column.
stemmed_content = news_data['content'].apply(stem)
news_data['content'] = stemmed_content
print(news_data['content'])

0        darrel lucu hou dem aid even see comey letter ...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exerci b...
20799                            david swanson keep f aliv
Name: content, Length: 20800, dtype: object


In [None]:
# Pull the cleaned text (features) and the labels (target) out of the frame as
# plain NumPy arrays.
X, Y = news_data['content'].values, news_data['label'].values


[1 0 1 ... 0 1 1]


In [None]:
# Split into train and test BEFORE fitting the vectorizer.
# Fitting TF-IDF on the full corpus lets test-set vocabulary and IDF statistics
# leak into the features the model is trained on, which can inflate the held-out
# accuracy. The split arguments are unchanged from the original cell.
X_train_text, X_test_text, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=2
)

# convert text into numerical data: learn vocabulary/IDF from the training
# text only, then apply that fitted transform to the test text.
vector = TfidfVectorizer()
X_train = vector.fit_transform(X_train_text)
X_test = vector.transform(X_test_text)

# Keep X as the vectorised full corpus (as the original cell left it) so any
# later cell that reads X still receives a TF-IDF matrix.
X = vector.transform(X)

In [None]:
# Create the classifier: a logistic regression with scikit-learn's default
# settings. Training happens in the next cell.
md = LogisticRegression()

In [None]:
# Fit the logistic regression on the training features and labels.
md.fit(X=X_train, y=Y_train)

LogisticRegression()

In [None]:
# accuracy score for training data
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# ground-truth labels first and the model's predictions second.
X_train_prediciton = md.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediciton)
print(training_data_accuracy)

0.9866586538461538


In [None]:
# accuracy score for testing data
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# ground-truth labels first and the model's predictions second.
X_testing_prediciton = md.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_testing_prediciton)
print(testing_data_accuracy)


0.9790865384615385


In [None]:
# Prediction System
# Classify the first held-out sample and report the verdict:
# a predicted label of 0 is reported as real news, anything else as fake.
X_new = X_test[0]
prediction = md.predict(X_new)
print(prediction)

verdict = "News is real" if prediction[0] == 0 else "News is fake"
print(verdict)
  

[1]
News is fake
