In [1]:
from sklearn.model_selection import train_test_split
import json
import pandas as pd
from sklearn import preprocessing, model_selection, feature_extraction, naive_bayes, metrics

In [2]:
# Json to csv
def json2df(json_obj):
    
    ls = []
    for row in json_obj:
        ls.append([
            row['title'],
            row['content'],
            row['author'],
            row['brand_id'],
            row['date'],
            row['url']
        ])
    
    col = [
            'title',
            'content',
            'author',
            'date',
            'url',
            'brand_id'
        ]
    
    return pd.DataFrame(ls, columns=col)

In [3]:
# Import dataset

## Read news
with open('./data/real_news_dump.json') as file:
    real_news = json.load(file)

real_news_df = json2df(real_news)

## Fake news
with open('./data/fake_news_dump.json') as file:
    fake_news = json.load(file)
    
fake_news_df = json2df(fake_news)

# label dataset

In [13]:
# label real dataset
real_label = [True]*len(real_news_df)
real_news_df['label'] = real_label
real_news_df = real_news_df.dropna()

In [112]:
len(real_news_df)

1000

In [14]:
# label fake dataset
fake_label = [False]*len(fake_news_df)
fake_news_df['label'] = fake_label
fake_news_df = fake_news_df.dropna()

In [111]:
len(fake_news_df)

674

In [15]:
# compact
compact_df = pd.concat([real_news_df, fake_news_df], ignore_index=True)

In [16]:
# X, y
X = compact_df['title']
y = compact_df['label']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2)

# Pipeline

In [18]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('tfidf', feature_extraction.text.TfidfVectorizer()),
    ('clf', naive_bayes.BernoulliNB())
])
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                             fit_prior=True))],
         verbose=False)

In [19]:
import numpy as np
pred = text_clf.predict(X_test)

In [20]:
np.mean(pred == y_test)

0.6157517899761337

In [21]:
from sklearn import metrics

In [25]:
print(metrics.classification_report(y_test, pred, target_names=['False', 'True']))

              precision    recall  f1-score   support

       False       0.00      0.00      0.00       161
        True       0.62      1.00      0.76       258

    accuracy                           0.62       419
   macro avg       0.31      0.50      0.38       419
weighted avg       0.38      0.62      0.47       419



In [27]:
metrics.confusion_matrix(y_test, pred, labels=[True, False])

array([[258,   0],
       [161,   0]])

419

# SGDClassifier

In [81]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([
    ('tfidf', feature_extraction.text.TfidfVectorizer()),
    ('clf', SGDClassifier())
])
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
           

In [82]:
import numpy as np
pred = text_clf.predict(X_test)

In [83]:
np.mean(pred == y_test)

0.8305489260143198

In [84]:
print(metrics.classification_report(y_test, pred, target_names=['False', 'True']))

              precision    recall  f1-score   support

       False       0.70      0.99      0.82       161
        True       0.99      0.73      0.84       258

    accuracy                           0.83       419
   macro avg       0.85      0.86      0.83       419
weighted avg       0.88      0.83      0.83       419



In [85]:
y_test.iloc[1]

True

In [86]:
TP, TN, FP, FN = ([], [], [], [])
for i in range(len(pred)):
    # TP
    if y_test.iloc[i] == True and pred[i] == True:
        TP.append(i)
    # TN
    if y_test.iloc[i] == False and pred[i] == False:
        TN.append(i)
    # FP
    if y_test.iloc[i] == True and pred[i] == False:
        FP.append(i)
    # FN
    if y_test.iloc[i] == False and pred[i] == True:
        FN.append(i)
print('TP: {}, TN: {}, FP: {}, FN:{}'.format(len(TP)/len(pred), len(TN)/len(pred), len(FP)/len(pred), len(FN)/len(pred)))

TP: 0.4486873508353222, TN: 0.3818615751789976, FP: 0.16706443914081145, FN:0.002386634844868735


In [87]:
X_test.iloc[FN]

1129    加州／舊金山／台灣政府要求戴口罩，但是沒有要求要穿衣服
Name: title, dtype: object

# AdaBoostClassifier

In [89]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
text_clf = Pipeline([
    ('tfidf', feature_extraction.text.TfidfVectorizer()),
    ('clf', AdaBoostClassifier())
])
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=1.0, n_estimators=5

In [90]:
import numpy as np
pred = text_clf.predict(X_test)

In [91]:
np.mean(pred == y_test)

0.7279236276849642

In [92]:
print(metrics.classification_report(y_test, pred, target_names=['False', 'True']))

              precision    recall  f1-score   support

       False       0.59      1.00      0.74       161
        True       1.00      0.56      0.72       258

    accuracy                           0.73       419
   macro avg       0.79      0.78      0.73       419
weighted avg       0.84      0.73      0.72       419



In [93]:
TP, TN, FP, FN = ([], [], [], [])
for i in range(len(pred)):
    # TP
    if y_test.iloc[i] == True and pred[i] == True:
        TP.append(i)
    # TN
    if y_test.iloc[i] == False and pred[i] == False:
        TN.append(i)
    # FP
    if y_test.iloc[i] == True and pred[i] == False:
        FP.append(i)
    # FN
    if y_test.iloc[i] == False and pred[i] == True:
        FN.append(i)
print('TP: {}, TN: {}, FP: {}, FN:{}'.format(len(TP)/len(pred), len(TN)/len(pred), len(FP)/len(pred), len(FN)/len(pred)))

TP: 0.3436754176610978, TN: 0.38424821002386633, FP: 0.2720763723150358, FN:0.0


# Decision Tree

In [100]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
text_clf = Pipeline([
    ('tfidf', feature_extraction.text.TfidfVectorizer()),
    ('clf', DecisionTreeClassifier())
])
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                                        max_depth=None, max_features=None,
        

In [101]:
import numpy as np
pred = text_clf.predict(X_test)

In [102]:
np.mean(pred == y_test)

0.8257756563245824

In [103]:
print(metrics.classification_report(y_test, pred, target_names=['False', 'True']))

              precision    recall  f1-score   support

       False       0.69      1.00      0.82       161
        True       1.00      0.72      0.84       258

    accuracy                           0.83       419
   macro avg       0.84      0.86      0.83       419
weighted avg       0.88      0.83      0.83       419



In [104]:
TP, TN, FP, FN = ([], [], [], [])
for i in range(len(pred)):
    # TP
    if y_test.iloc[i] == True and pred[i] == True:
        TP.append(i)
    # TN
    if y_test.iloc[i] == False and pred[i] == False:
        TN.append(i)
    # FP
    if y_test.iloc[i] == True and pred[i] == False:
        FP.append(i)
    # FN
    if y_test.iloc[i] == False and pred[i] == True:
        FN.append(i)
print('TP: {}, TN: {}, FP: {}, FN:{}'.format(len(TP)/len(pred), len(TN)/len(pred), len(FP)/len(pred), len(FN)/len(pred)))

TP: 0.441527446300716, TN: 0.38424821002386633, FP: 0.17422434367541767, FN:0.0


# Random Forest

In [66]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
text_clf = Pipeline([
    ('tfidf', feature_extraction.text.TfidfVectorizer()),
    ('clf', RandomForestClassifier())
])
text_clf.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                               

In [67]:
import numpy as np
pred = text_clf.predict(X_test)

In [68]:
np.mean(pred == y_test)

0.7995226730310262

In [69]:
print(metrics.classification_report(y_test, pred, target_names=['False', 'True']))

              precision    recall  f1-score   support

       False       0.66      1.00      0.79       161
        True       1.00      0.67      0.81       258

    accuracy                           0.80       419
   macro avg       0.83      0.84      0.80       419
weighted avg       0.87      0.80      0.80       419



In [70]:
TP, TN, FP, FN = ([], [], [], [])
for i in range(len(pred)):
    # TP
    if y_test.iloc[i] == True and pred[i] == True:
        TP.append(i)
    # TN
    if y_test.iloc[i] == False and pred[i] == False:
        TN.append(i)
    # FP
    if y_test.iloc[i] == True and pred[i] == False:
        FP.append(i)
    # FN
    if y_test.iloc[i] == False and pred[i] == True:
        FN.append(i)
print('TP: {}, TN: {}, FP: {}, FN:{}'.format(len(TP), len(TN), len(FP), len(FN)))

TP: 174, TN: 161, FP: 84, FN:0


# ComplementNB

In [105]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import ComplementNB
text_clf = Pipeline([
    ('tfidf', feature_extraction.text.TfidfVectorizer()),
    ('clf', ComplementNB())
])
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 ComplementNB(alpha=1.0, class_prior=None, fit_prior=True,
                              norm=False))],
         verbose=False)

In [106]:
import numpy as np
pred = text_clf.predict(X_test)

In [107]:
np.mean(pred == y_test)

0.8305489260143198

In [108]:
print(metrics.classification_report(y_test, pred, target_names=['False', 'True']))

              precision    recall  f1-score   support

       False       0.70      0.99      0.82       161
        True       0.99      0.73      0.84       258

    accuracy                           0.83       419
   macro avg       0.85      0.86      0.83       419
weighted avg       0.88      0.83      0.83       419



In [109]:
TP, TN, FP, FN = ([], [], [], [])
for i in range(len(pred)):
    # TP
    if y_test.iloc[i] == True and pred[i] == True:
        TP.append(i)
    # TN
    if y_test.iloc[i] == False and pred[i] == False:
        TN.append(i)
    # FP
    if y_test.iloc[i] == True and pred[i] == False:
        FP.append(i)
    # FN
    if y_test.iloc[i] == False and pred[i] == True:
        FN.append(i)
print('TP: {}, TN: {}, FP: {}, FN:{}'.format(len(TP)/len(pred), len(TN)/len(pred), len(FP)/len(pred), len(FN)/len(pred)))

TP: 0.4486873508353222, TN: 0.3818615751789976, FP: 0.16706443914081145, FN:0.002386634844868735
