In [1]:
# Import scikit-learn and NLTK
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jamshaid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import pandas as pd

df = pd.read_csv(r'D:\FIVERR projects\Fake News Classification using LR\news.csv')

df.head(4)


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE


In [4]:
# check missing values


df.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [8]:
# replace missing values with empty string


#df = df.fillna('')

#df.isnull().sum()



df['label'] = df['label'].replace({'FAKE': 0, 'REAL': 1})

In [9]:
#df['content'] = df['title']+' '+ df['author']

df.head(4)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0


In [5]:
# Initialize Porter Stemmer and stop words
port_stem = PorterStemmer()
stop_words = set(stopwords.words('english'))
import re

In [10]:
def stemming(text):

    stemmed_content = re.sub('[^a-zA-Z]', ' ', text)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if word not in stop_words ]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content



df['text'] = df['text'].apply(stemming)


In [12]:
# seprate the data labels


X = df['text'].values
y = df['label'].values

y

array([0, 0, 1, ..., 0, 1, 1], dtype=int64)

In [13]:
# convert text data into the numerical data 


vectorizer = TfidfVectorizer()

vectorizer.fit(X)

X = vectorizer.transform(X)

print(X)

  (0, 43266)	0.0305089294798837
  (0, 43143)	0.018330936254574128
  (0, 42836)	0.03723430652002139
  (0, 42832)	0.024123281735309754
  (0, 42805)	0.03342157965058952
  (0, 42772)	0.02373158199733929
  (0, 42770)	0.008769006159949829
  (0, 42704)	0.01586413299776395
  (0, 42645)	0.016735394976427433
  (0, 42558)	0.028489938333701406
  (0, 42557)	0.01687960148746075
  (0, 42520)	0.0383844928542425
  (0, 42478)	0.04282388601939193
  (0, 42381)	0.034991614640544406
  (0, 42326)	0.03598071033413201
  (0, 42312)	0.01846435959961643
  (0, 42233)	0.013971188044141832
  (0, 42202)	0.06213972123547004
  (0, 42097)	0.011535614274685978
  (0, 42066)	0.027174758879218847
  (0, 41990)	0.028859093884229654
  (0, 41929)	0.032891625478617195
  (0, 41881)	0.01652269636508778
  (0, 41833)	0.053741305697756876
  (0, 41801)	0.07118481130240924
  :	:
  (6334, 3905)	0.02940423648335081
  (6334, 3904)	0.027151902202961768
  (6334, 3727)	0.03642146927330242
  (6334, 3720)	0.02107984191394252
  (6334, 3444)	0.0

In [14]:
# Split the data into training and testing 


X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.2, random_state= 42)

In [15]:
# train the model 

model = LogisticRegression()

model.fit(X_train,y_train)



In [16]:
train_data_prediction = model.predict(X_train)


training_accuracy_score = accuracy_score(y_train,train_data_prediction)

print("model accuracy with train data", training_accuracy_score)


model accuracy with train data 0.9530386740331491


In [17]:
train_data_prediction = model.predict(X_test)


training_accuracy_score = accuracy_score(y_test,train_data_prediction)

print("model accuracy with test data", training_accuracy_score)

model accuracy with test data 0.9194948697711128
