In [3]:
import pickle
import re
import warnings

import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

In [4]:
# Load the raw dataset from a CSV file in the working directory.
df = pd.read_csv('covid_fake.csv')

In [7]:
# Preview the first rows to see the available columns (title, text, source, label).
df.head()

Unnamed: 0,title,text,source,label
0,Due to the recent outbreak for the Coronavirus...,"You just need to add water, and the drugs and ...",coronavirusmedicalkit.com,Fake
1,,Hydroxychloroquine has been shown to have a 10...,RudyGiuliani,Fake
2,,Fact: Hydroxychloroquine has been shown to hav...,CharlieKirk,Fake
3,,The Corona virus is a man made virus created i...,JoanneWrightForCongress,Fake
4,,Doesn’t @BillGates finance research at the Wuh...,JoanneWrightForCongress,Fake


In [9]:
# (rows, columns) of the raw frame.
df.shape

(1164, 4)

In [11]:
# Class balance of the raw labels. The displayed counts list both 'Fake' and
# 'fake', so the spellings must be unified before the column is used as a target.
df['label'].value_counts()

label
TRUE    584
Fake    345
fake    230
Name: count, dtype: int64

In [13]:
# Number of missing values in each column.
df.isna().sum()

title     82
text      10
source    20
label      5
dtype: int64

In [17]:
def preprocessor(text):
    """Normalise one raw document before tokenisation.

    Steps, in order: strip HTML-like tags, drop every character that is
    neither a word character nor whitespace, turn newlines into spaces,
    and lowercase the result.

    Parameters
    ----------
    text : str
        Raw document. ``None`` and float NaN (how pandas represents a
        missing cell) are accepted and treated as an empty document; any
        other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    # Missing cells arrive as None or float NaN; NaN is the only float that
    # is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove anything that looks like an HTML tag.
    text = re.sub('<[^>]*>', '', text)

    # Remove punctuation and symbols (keeps word characters and whitespace).
    text = re.sub(r'[^\w\s]', '', text)

    # A newline separates words, so replace it with a space; deleting it
    # would glue the last word of one line to the first word of the next.
    text = re.sub(r'[\n]', ' ', text)

    return text.lower()

    

# Restart-and-Run-All guard: no visible cell creates `title_text`, so build it
# here when it is absent instead of failing with a KeyError.
# NOTE(review): the fallback assumes the column was title + ' ' + text with
# missing values replaced by 'missing' (the displayed row 3 starts with
# 'missing') - confirm against the cell that originally built it.
if 'title_text' not in df.columns:
    df['title_text'] = df['title'].fillna('missing') + ' ' + df['text'].fillna('missing')

# Clean every document with `preprocessor`.
df['title_text'] = df['title_text'].apply(preprocessor)

# Show one cleaned document; .loc avoids chained indexing.
df.loc[3, 'title_text']

'missing the corona virus is a man made virus created in a wuhan laboratory ask billgates who financed it'

In [19]:
# Single stemmer instance shared by every call to the tokenizer below.
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token.

    The stems are returned as a list, in the order the tokens appear.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [21]:
# TF-IDF features over the cleaned `title_text` column. Lowercasing and
# preprocessing are switched off here because the column is cleaned before it
# reaches the vectorizer; tokenisation is delegated to `tokenizer_porter`.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so IDF weights and vocabulary are learned from rows that
# later end up in the test set - consider fitting on the training rows only.
tfidf = TfidfVectorizer(
    tokenizer=tokenizer_porter,
    preprocessor=None,
    lowercase=False,
    strip_accents=None,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
)

# Sparse document-term matrix, one row per document.
X = tfidf.fit_transform(df['title_text'])

# Target vector.
# NOTE(review): the raw label counts displayed earlier contain both 'Fake' and
# 'fake' plus missing values, while the final report shows only FAKE/TRUE; the
# step that normalises labels is not in the visible cells - confirm it runs
# before this point.
y = df['label'].values

In [23]:
# (documents, vocabulary size) of the TF-IDF matrix.
X.shape

(1164, 27020)

In [25]:
# Ordered 70/30 hold-out: with shuffle=False the last 30% of rows become the
# test set. random_state is passed through unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=False,
    random_state=0,
)

In [28]:
# Logistic regression with 5-fold cross-validation scored by accuracy,
# fitted on the training split using all available cores.
clf = LogisticRegressionCV(cv=5, scoring='accuracy', random_state=0,
                           n_jobs=-1, verbose=0, max_iter=300)

clf.fit(X_train, y_train)

# Persist the fitted classifier. The `with` block closes the file even if
# pickling raises, which the manual open()/close() pair did not guarantee.
# NOTE(review): only the classifier is saved; the fitted `tfidf` vectorizer is
# also required to transform new raw text before this model can score it.
with open('fake_news_model.sav', 'wb') as fake_news_model:
    pickle.dump(clf, fake_news_model)

In [30]:
filename = 'fake_news_model.sav'

# Reload the classifier written by the training cell. The `with` block closes
# the file handle, which the bare open() passed to pickle.load never did.
# NOTE(security): unpickling can execute arbitrary code - only load files that
# this notebook (or another trusted source) produced.
with open(filename, 'rb') as model_file:
    saved_clf = pickle.load(model_file)

# Score the reloaded model on the held-out split.
saved_clf.score(X_test, y_test)

0.9314285714285714

In [32]:
# Predict on the held-out split and print per-class precision, recall and F1.
# The sklearn.metrics import lives in the notebook's top import cell.
y_pred = clf.predict(X_test)

print("---Test Set Results---")

print(classification_report(y_test, y_pred))

---Test Set Results---
              precision    recall  f1-score   support

        FAKE       0.92      0.89      0.91       132
        TRUE       0.94      0.95      0.95       218

    accuracy                           0.93       350
   macro avg       0.93      0.92      0.93       350
weighted avg       0.93      0.93      0.93       350

