Fake News Detection


In [None]:
# Import the required libraries

import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Fetch NLTK's stop-word corpus, which stopwords.words('english') reads from.
import nltk
nltk.download('stopwords')

In [None]:
# Show the English stop-word list that the stemming step filters out.
print(stopwords.words('english'))

Data Preprocessing

In [None]:
# Load the labelled training data. Forward slashes are used because they
# resolve on Windows, Linux and macOS alike, whereas a backslash-separated
# path is only understood on Windows.
news_dataset = pd.read_csv('./dataset/train.csv')

In [None]:
# (rows, columns) of the loaded training frame
news_dataset.shape

In [None]:
# Preview the first five rows. Label encoding: real = 0, fake = 1.
news_dataset.head(5)

In [None]:
## count the missing values in each column
news_dataset.isna().sum()

In [None]:
## substitute an empty string for every missing value
news_dataset = news_dataset.fillna(value='')

In [None]:
# build the 'content' column as "<author> <title>"
news_dataset['content'] = news_dataset['author'].add(' ').add(news_dataset['title'])
news_dataset['content'].head()

In [None]:
## separate the feature columns from the label column
y = news_dataset['label']
X = news_dataset.drop(columns=['label'])

Stemming: the process of reducing a word to its root form by stripping its affixes (the Porter stemmer used here strips suffixes).
Ex: acting, acted, acts ----------> act

In [None]:
# Single Porter stemmer instance, reused by stemming() for every token.
port_stem = PorterStemmer()

In [None]:
# Holds the English stop-word set once it has been loaded, so that it is
# built a single time instead of once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def stemming(content):
    """Normalize a piece of text into a string of stemmed, stop-word-free tokens.

    Steps: replace every non-ASCII-letter character with a space, lowercase,
    split on whitespace, drop English stop words, stem each remaining token
    with the module-level ``port_stem``, and re-join with single spaces.

    Parameters
    ----------
    content : str
        Raw text to normalize.

    Returns
    -------
    str
        Space-separated stemmed tokens; an empty string when nothing remains.
    """
    # Set membership is O(1); the list of stop words is fetched once and reused
    # rather than being requested again for every token.
    stop_words = _english_stopwords()
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    return ' '.join(
        port_stem.stem(token) for token in tokens if token not in stop_words
    )

In [None]:
# Stem every row of 'content' in place (re-running this cell stems the
# already-stemmed text again).
news_dataset['content'] = news_dataset['content'].map(stemming)

In [None]:
# Inspect the stemmed 'content' column.
print(news_dataset['content'])

In [None]:
# stemmed text as features, 'label' as target
X, y = news_dataset['content'].values, news_dataset['label'].values

In [None]:
## convert the text into numeric TF-IDF feature vectors
# fit_transform learns the vocabulary/IDF weights and produces the
# document-term matrix in a single pass over the corpus.
# NOTE(review): the vectorizer is fit on the full corpus before the
# train/test split performed in a later cell, so held-out rows influence the
# vocabulary and IDF weights. Consider fitting on the training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [None]:
# Inspect the vectorized feature matrix.
print(X)

In [None]:
## hold out 20% of the rows for testing, stratified on the labels

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

Logistic Regression

In [None]:
# Logistic-regression classifier constructed with no arguments (library defaults).
model = LogisticRegression()

In [None]:
# Fit the classifier on the training split.
model.fit(X_train, Y_train)

In [None]:
### Prediction on training data

# accuracy_score expects (y_true, y_pred): ground-truth labels first, then
# the model's predictions.
X_train_pred = model.predict(X_train)
training_acc = accuracy_score(Y_train, X_train_pred)

In [None]:
# Report the accuracy measured on the training split.
print("Training accuracy", training_acc)

In [None]:
### Prediction on the held-out test split

# accuracy_score expects (y_true, y_pred): ground-truth labels first, then
# the model's predictions.
X_test_pred = model.predict(X_test)
testing_acc = accuracy_score(Y_test, X_test_pred)
print("testing accuracy", testing_acc)

Preparing the test.csv data

In [None]:
# Forward slashes resolve on Windows, Linux and macOS alike.
test_data = pd.read_csv('./dataset/test.csv')

### check missing values
# Printed explicitly: only a cell's last expression is displayed automatically,
# so a bare expression in the middle of the cell would be silently discarded.
print(test_data.isnull().sum())
test_data = test_data.fillna('')

# merging the author and title column
test_data['content'] = test_data['author'] + ' ' + test_data['title']

# NOTE(review): to score this file, apply stemming() to 'content' and then
# call transform() on the `vectorizer` that was already fitted on the training
# data. Fitting a new TfidfVectorizer here would yield a feature space that
# does not match the one `model` was trained on.
# TODO confirm whether test.csv contains a 'label' column; without one,
# accuracy cannot be computed and only predictions can be produced.
test_data.head()