In [35]:
import pandas as pd
import numpy as np
import pickle as pkl
import os

from glob import glob
from collections import Counter

## Defining variables

In [2]:
PATH = '../'

PROCESSED_DATA_PATH = os.path.join(PATH, 'data/processed/')

DF_FAKE_PATH = glob(PROCESSED_DATA_PATH + '*')[0]
DF_LEGIT_PATH = glob(PROCESSED_DATA_PATH + '*')[1]

## Loading DataFrames

In [3]:
df_fake = pkl.load(open(DF_FAKE_PATH, 'rb'))

df_legit = pkl.load(open(DF_LEGIT_PATH, 'rb'))

### DataFrame with Fake News

In [4]:
df_fake.head()

Unnamed: 0,DATE,FAKE,TEXT_CLEAN,TITLE_CLEAN
0,26/04/2019,True,comica nao nojenta cena aconteceu ultima terca...,maria rosario perde dentadura durante votacao ...
1,22/04/2019,True,finalmente professoraheleypresente reuniao min...,bolsonaro condecorou professora heley abreu he...
2,24/04/2019,True,audiencia ser grande hoje horario brasilia rec...,hoje record vai entrevistar bolsonaro minutos ...
3,25/04/2019,True,ibaneis rocha governador distrito federal fica...,ibaneis rocha governador viaja bebado vexame a...
4,21/04/2019,True,mandou dinheiro narcotrafico rio farc sequestr...,marcelo odebrecht diz governo deu dinheiro tra...


In [5]:
df_fake.shape

(1153, 4)

### DataFrame with Legit News

In [6]:
df_legit.head()

Unnamed: 0,DATE,FAKE,TEXT_CLEAN,TITLE_CLEAN
0,04/06/2019,False,atual mandato presidencial brasil comecou pouc...,comecam soar alarmes sobre sustentabilidade pr...
1,02/06/2019,False,poucas horas antes milhares manifestantes irem...,corte contingenciamento certo guerra narrativa...
2,03/06/2019,False,tamanho figura publica mede importancia inimig...,trump insulta prefeito londres inicio visita r...
3,26/05/2019,False,tantos fenomenos imparaveis trazidos revolucao...,lider ninguem meio
4,03/06/2019,False,apos semanas audiencias publicas projeto refor...,reforma previdencia pesara sobre pobres ricos


### Unifying DataFrames

In [8]:
df = pd.concat((df_fake, df_legit), axis=0)

## Modeling

In [19]:
from imblearn.under_sampling import RandomUnderSampler

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline

In [23]:
y_all = df['FAKE']

In [38]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.9)

In [39]:
vectorizer.fit(df['TEXT_CLEAN'] + df['TITLE_CLEAN'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=5,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [40]:
X_all_text = vectorizer.transform(df['TEXT_CLEAN'])

In [41]:
X_all_title = vectorizer.transform(df['TITLE_CLEAN'])

In [44]:
X_all_text.shape

(17179, 197598)

In [43]:
X_all_title.shape

(17179, 197598)

## Training with texts corpus

In [12]:
rus = RandomUnderSampler(random_state=42)

In [46]:
X_resampled, y_resampled = rus.fit_resample(X_all_text, y_all)

In [47]:
Counter(y_resampled)

Counter({False: 1153, True: 1153})

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

In [49]:
clf = RandomForestClassifier(n_estimators=300, n_jobs=3, verbose=1)

In [50]:
clf.fit(X_train, y_train)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    2.6s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   10.8s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:   16.6s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=3,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [51]:
y_pred = clf.predict(X_test)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:    0.4s finished


### Testing with actual test set (with texts)

In [52]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.98      0.96      0.97       356
        True       0.96      0.98      0.97       336

   micro avg       0.97      0.97      0.97       692
   macro avg       0.97      0.97      0.97       692
weighted avg       0.97      0.97      0.97       692



In [53]:
print(confusion_matrix(y_test, y_pred))

[[341  15]
 [  8 328]]


### Testing with entire texts corpus

In [54]:
y_pred_all_texts = clf.predict(X_all_text)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    1.5s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    6.9s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:   10.8s finished


In [64]:
print(classification_report(y_all, y_pred_all_texts))

              precision    recall  f1-score   support

       False       1.00      0.95      0.98     16026
        True       0.61      0.99      0.75      1153

   micro avg       0.96      0.96      0.96     17179
   macro avg       0.80      0.97      0.86     17179
weighted avg       0.97      0.96      0.96     17179



In [65]:
print(confusion_matrix(y_all, y_pred_all_texts))

[[15285   741]
 [    8  1145]]


### Testing with entire titles corpus

In [58]:
y_pred_all_titles = clf.predict(X_all_title)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.8s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:    1.3s finished


In [60]:
print(classification_report(y_all, y_pred_all_titles))

              precision    recall  f1-score   support

       False       0.00      0.00      0.00     16026
        True       0.07      1.00      0.13      1153

   micro avg       0.07      0.07      0.07     17179
   macro avg       0.03      0.50      0.06     17179
weighted avg       0.00      0.07      0.01     17179



  'precision', 'predicted', average, warn_for)


In [68]:
print(confusion_matrix(y_all, y_pred_all_titles))

[[    0 16026]
 [    0  1153]]


## Training with titles corpus

In [69]:
rus = RandomUnderSampler(random_state=42)

In [73]:
X_resampled, y_resampled = rus.fit_resample(X_all_title, y_all)

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

In [75]:
clf = RandomForestClassifier(n_estimators=300, n_jobs=3, verbose=1)

In [76]:
clf.fit(X_train, y_train)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    4.2s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   17.3s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:   27.2s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=3,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [77]:
y_pred = clf.predict(X_test)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:    0.3s finished


### Testing with actual test set (with titles)

In [78]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.97      0.98      0.98       356
        True       0.98      0.97      0.98       336

   micro avg       0.98      0.98      0.98       692
   macro avg       0.98      0.98      0.98       692
weighted avg       0.98      0.98      0.98       692



In [79]:
print(confusion_matrixon_matrix(y_test, y_pred))

[[349   7]
 [  9 327]]


### Testing with entire texts corpus

In [80]:
y_pred_all_texts = clf.predict(X_all_text)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    1.8s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    8.3s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:   12.8s finished


In [81]:
print(classification_report(y_all, y_pred_all_texts))

              precision    recall  f1-score   support

       False       0.94      0.99      0.97     16026
        True       0.56      0.15      0.23      1153

   micro avg       0.94      0.94      0.94     17179
   macro avg       0.75      0.57      0.60     17179
weighted avg       0.92      0.94      0.92     17179



In [82]:
print(confusion_matrix(y_all, y_pred_all_texts))

[[15894   132]
 [  983   170]]


### Testing with entire titles corpus

In [83]:
y_pred_all_titles = clf.predict(X_all_title)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    1.7s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:    2.6s finished


In [84]:
print(classification_report(y_all, y_pred_all_titles))

              precision    recall  f1-score   support

       False       1.00      0.98      0.99     16026
        True       0.81      0.99      0.89      1153

   micro avg       0.98      0.98      0.98     17179
   macro avg       0.90      0.99      0.94     17179
weighted avg       0.99      0.98      0.98     17179



In [85]:
print(confusion_matrix(y_all, y_pred_all_titles))

[[15757   269]
 [    9  1144]]


## Training with both corpus

In [113]:
X_all_text_title = vectorizer.transform(np.concatenate([df['TEXT_CLEAN'], df['TITLE_CLEAN']]))

In [114]:
rus = RandomUnderSampler(random_state=42)

In [115]:
X_resampled, y_resampled = rus.fit_resample(X_all_text_title, np.concatenate([y_all, y_all]))

In [116]:
Counter(y_resampled)

Counter({False: 2306, True: 2306})

In [117]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

In [118]:
clf = RandomForestClassifier(n_estimators=300, n_jobs=3, verbose=1)

In [119]:
clf.fit(X_train, y_train)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    8.1s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   34.1s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:   52.5s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=3,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [120]:
y_pred = clf.predict(X_test)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.4s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:    0.6s finished


### Testing with actual test set (with texts)

In [121]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.95      0.78      0.86       724
        True       0.80      0.96      0.87       660

   micro avg       0.87      0.87      0.87      1384
   macro avg       0.88      0.87      0.87      1384
weighted avg       0.88      0.87      0.87      1384



In [122]:
print(confusion_matrix(y_test, y_pred))

[[566 158]
 [ 28 632]]


### Testing with entire texts corpus

In [123]:
y_pred_all_texts = clf.predict(X_all_text)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    1.7s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    7.8s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:   11.8s finished


In [124]:
print(classification_report(y_all, y_pred_all_texts))

              precision    recall  f1-score   support

       False       1.00      0.97      0.99     16026
        True       0.72      0.98      0.83      1153

   micro avg       0.97      0.97      0.97     17179
   macro avg       0.86      0.97      0.91     17179
weighted avg       0.98      0.97      0.97     17179



In [125]:
print(confusion_matrix(y_all, y_pred_all_texts))

[[15586   440]
 [   28  1125]]


### Testing with entire titles corpus

In [126]:
y_pred_all_titles = clf.predict(X_all_title)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.4s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    1.8s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:    2.8s finished


In [127]:
print(classification_report(y_all, y_pred_all_titles))

              precision    recall  f1-score   support

       False       1.00      0.54      0.70     16026
        True       0.14      1.00      0.24      1153

   micro avg       0.57      0.57      0.57     17179
   macro avg       0.57      0.77      0.47     17179
weighted avg       0.94      0.57      0.67     17179



In [128]:
print(confusion_matrix(y_all, y_pred_all_titles))

[[8640 7386]
 [   0 1153]]
