In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

df = pd.read_csv(r'C:\Users\mhdha\Desktop\corona_fake.csv')

In [2]:
df.shape

(1164, 4)

In [3]:
df.head()

Unnamed: 0,title,text,source,label
0,Due to the recent outbreak for the Coronavirus...,"You just need to add water, and the drugs and ...",coronavirusmedicalkit.com,Fake
1,,Hydroxychloroquine has been shown to have a 10...,RudyGiuliani,Fake
2,,Fact: Hydroxychloroquine has been shown to hav...,CharlieKirk,Fake
3,,The Corona virus is a man made virus created i...,JoanneWrightForCongress,Fake
4,,Doesn’t @BillGates finance research at the Wuh...,JoanneWrightForCongress,Fake


In [4]:
df.isnull().sum()

title     82
text      10
source    20
label      5
dtype: int64

In [5]:
df.fillna('', inplace=True)
df.head()

Unnamed: 0,title,text,source,label
0,Due to the recent outbreak for the Coronavirus...,"You just need to add water, and the drugs and ...",coronavirusmedicalkit.com,Fake
1,,Hydroxychloroquine has been shown to have a 10...,RudyGiuliani,Fake
2,,Fact: Hydroxychloroquine has been shown to hav...,CharlieKirk,Fake
3,,The Corona virus is a man made virus created i...,JoanneWrightForCongress,Fake
4,,Doesn’t @BillGates finance research at the Wuh...,JoanneWrightForCongress,Fake


In [6]:
df['label'].value_counts()

TRUE    584
Fake    345
fake    230
          5
Name: label, dtype: int64

## Label pre-processing

In [7]:
df.loc[df['label'] == 'fake', 'label'] = 'FAKE'
df.loc[df['label'] == 'Fake', 'label'] = 'FAKE'
df['label'].value_counts()

TRUE    584
FAKE    575
          5
Name: label, dtype: int64

In [8]:
# Drop empty string
df = df[df['label']!='']
df.shape

(1159, 4)

In [9]:
def convert_label(label):
    if label =='FAKE':
        return 0
    elif label =='TRUE':
        return 1
    
df['label'] = df['label'].apply(convert_label)
df['label'].value_counts()

1    584
0    575
Name: label, dtype: int64

## Text pre-processing

In [10]:
def clean(text):
    # Set of stopwords in English
    stop_words = set(stopwords.words('english'))
    
    # delete numbers
    text = re.sub('[^a-zA-Z]',' ', text)
    
    # All in lower case
    text = text.lower()
    
    # delete html tags
    text = re.sub(r'<[^>]*>', '', text)
    
    # delete twitter usernames
    text = re.sub(r'@[A-Za-z0-9]+','', text)
    
    # delete urls
    text = re.sub('https?://[A-za-z0-9]','', text)
    
    # delete numbers
    text = re.sub('[^a-zA-Z]',' ', text)
    
    # seperate strings
    word_tokens = word_tokenize(text)
    
    # delete stopwords
    filtered_sentence = []
    for word_token in word_tokens:
        if word_token not in stop_words:
            filtered_sentence.append(word_token)
            
    text = (' '.join(filtered_sentence))
    return text

In [11]:
word_tokenize('Hello world 1 2')

['Hello', 'world', '1', '2']

In [12]:
for column in df.columns:
    if column != 'label':
        df[column] = df[column].apply(clean)        
df.head()

Unnamed: 0,title,text,source,label
0,due recent outbreak coronavirus covid world he...,need add water drugs vaccines ready administer...,coronavirusmedicalkit com,0
1,,hydroxychloroquine shown effective rate treati...,rudygiuliani,0
2,,fact hydroxychloroquine shown effective rate t...,charliekirk,0
3,,corona virus man made virus created wuhan labo...,joannewrightforcongress,0
4,,billgates finance research wuhan lab corona vi...,joannewrightforcongress,0


In [13]:
# Train_Test split
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:2], df['label'], test_size=0.2, random_state=11)

print(X_train.shape)
print(X_test.shape)

(927, 2)
(232, 2)


In [14]:
# Vectorize
title_vectorizer = CountVectorizer()
text_vectorizer = CountVectorizer()

In [15]:
X_train_title = title_vectorizer.fit_transform(X_train['title']).toarray()
X_train_text = text_vectorizer.fit_transform(X_train['text']).toarray()

print('Title shape\t:', X_train_title.shape)
print('Title text\t:', X_train_text.shape)

Title shape	: (927, 1942)
Title text	: (927, 19351)


In [16]:
X_test_title = title_vectorizer.transform(X_test['title']).toarray()
X_test_text = text_vectorizer.transform(X_test['text']).toarray()

print(X_test_title.shape)
print(X_test_text.shape)

(232, 1942)
(232, 19351)


In [17]:
X_train_title_text = np.hstack((X_train_title, X_train_text))
X_test_title_text = np.hstack((X_test_title, X_test_text))

print(X_train_title_text.shape)
print(X_test_title_text.shape)

(927, 21293)
(232, 21293)


## Machine Learning model

In [18]:
clf = MultinomialNB()
clf.fit(X_train_title_text, y_train)
print('Accuracy on train data\t:', clf.score(X_train_title_text, y_train))
print('Accuracy on test data\t:', clf.score(X_test_title_text, y_test))

Accuracy on train data	: 0.9676375404530745
Accuracy on test data	: 0.9267241379310345


## Testing

In [21]:
test_title = 'Covid 19 found in toilet paper'
test_text = 'strain of deadly virus breeds rapidly in tissue-fibres'

In [20]:
test_title = clean(test_title)
test_text = clean(test_text)

print(test_title)
print(test_text)

covid found toilet paper
strain deadly virus vreeds rapidly tissue fibres


In [25]:
test_title_vec = title_vectorizer.transform([test_title]).toarray()
test_text_vec = text_vectorizer.transform([test_text]).toarray()

print(X_test_title_vec.shape)
print(X_test_text_vec.shape)

(1, 1942)
(1, 19351)


In [27]:
test_title_text = np.hstack((test_title_vec, test_text_vec))
print(test_title_text.shape)

(1, 21293)


In [28]:
print('Prediction \t\t:', clf.predict(test_title_text))
print('Prediction per class \t:', clf.predict_proba(test_title_text))

Prediction 		: [0]
Prediction per class 		: [[0.73354526 0.26645474]]


The output is 0, which means the news is fake with probability about 73.35%