In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import os

from glob import glob
from collections import Counter

## Defining variables

In [2]:
PATH = '../'

PROCESSED_DATA_PATH = os.path.join(PATH, 'data/processed/')

DF_FAKE_PATH = glob(PROCESSED_DATA_PATH + '*')[0]
DF_LEGIT_PATH = glob(PROCESSED_DATA_PATH + '*')[1]

## Loading DataFrames

In [3]:
df_fake = pkl.load(open(DF_FAKE_PATH, 'rb'))

df_legit = pkl.load(open(DF_LEGIT_PATH, 'rb'))

### DataFrame with Fake News

In [4]:
df_fake.head()

Unnamed: 0,DATE,FAKE,TEXT_CLEAN,TITLE_CLEAN
0,26/04/2019,True,comica nojenta cena aconteceu ultima terca fei...,maria rosario perde dentadura durante votacao ...
1,22/04/2019,True,finalmente professoraheleypresente reuniao min...,bolsonaro condecorou professora heley abreu he...
2,24/04/2019,True,audiencia ser grande hoje horario brasilia rec...,hoje record vai entrevistar bolsonaro minutos
3,25/04/2019,True,ibaneis rocha governador distrito federal fica...,ibaneis rocha governador viaja bebado vexame a...
4,21/04/2019,True,mandou dinheiro narcotrafico rio farc sequestr...,marcelo odebrecht diz governo deu dinheiro tra...


In [5]:
df_fake.shape

(1153, 4)

### DataFrame with Legit News

In [6]:
df_legit.head()

Unnamed: 0,DATE,FAKE,TEXT_CLEAN,TITLE_CLEAN
0,04/06/2019,False,atual mandato presidencial brasil comecou pouc...,comecam soar alarmes sobre sustentabilidade pr...
1,02/06/2019,False,poucas horas antes milhares manifestantes irem...,corte contingenciamento certo guerra narrativa...
2,03/06/2019,False,tamanho figura publica mede importancia inimig...,trump insulta prefeito londres inicio visita r...
3,26/05/2019,False,tantos fenomenos imparaveis trazidos revolucao...,lider ninguem meio
4,03/06/2019,False,apos semanas audiencias publicas projeto refor...,reforma previdencia pesara sobre pobres ricos


### Unifying DataFrames

In [7]:
df = pd.concat((df_fake, df_legit), axis=0)

## Modeling

In [8]:
from imblearn.under_sampling import RandomUnderSampler

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline

In [9]:
y_all = df['FAKE']

In [10]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.9)

In [11]:
vectorizer.fit(df['TEXT_CLEAN'] + df['TITLE_CLEAN'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=5,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [12]:
X_all_text = vectorizer.transform(df['TEXT_CLEAN'])

In [13]:
X_all_title = vectorizer.transform(df['TITLE_CLEAN'])

In [14]:
X_all_text.shape

(16858, 187777)

In [15]:
X_all_title.shape

(16858, 187777)

## Training with texts corpus

In [16]:
rus = RandomUnderSampler(random_state=42)

In [17]:
X_resampled, y_resampled = rus.fit_resample(X_all_text, y_all)

In [18]:
Counter(y_resampled)

Counter({False: 1153, True: 1153})

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

In [20]:
clf = RandomForestClassifier(n_estimators=300, n_jobs=3, verbose=1)

In [21]:
clf.fit(X_train, y_train)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    1.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    4.9s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:    7.1s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=3,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [22]:
y_pred = clf.predict(X_test)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:    0.4s finished


### Testing with actual test set (with texts)

In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.98      0.97      0.97       356
        True       0.97      0.98      0.97       336

   micro avg       0.97      0.97      0.97       692
   macro avg       0.97      0.97      0.97       692
weighted avg       0.97      0.97      0.97       692



In [24]:
print(confusion_matrix(y_test, y_pred))

[[346  10]
 [  8 328]]


### Testing with entire texts corpus

In [25]:
y_pred_all_texts = clf.predict(X_all_text)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    1.4s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    6.8s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:   10.4s finished


In [26]:
print(classification_report(y_all, y_pred_all_texts))

              precision    recall  f1-score   support

       False       1.00      0.96      0.98     15705
        True       0.65      0.99      0.79      1153

   micro avg       0.96      0.96      0.96     16858
   macro avg       0.83      0.98      0.88     16858
weighted avg       0.98      0.96      0.97     16858



In [27]:
print(confusion_matrix(y_all, y_pred_all_texts))

[[15096   609]
 [    8  1145]]


### Testing with entire titles corpus

In [28]:
y_pred_all_titles = clf.predict(X_all_title)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.5s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:    0.7s finished


In [29]:
print(classification_report(y_all, y_pred_all_titles))

              precision    recall  f1-score   support

       False       0.00      0.00      0.00     15705
        True       0.07      1.00      0.13      1153

   micro avg       0.07      0.07      0.07     16858
   macro avg       0.03      0.50      0.06     16858
weighted avg       0.00      0.07      0.01     16858



  'precision', 'predicted', average, warn_for)


In [30]:
print(confusion_matrix(y_all, y_pred_all_titles))

[[    0 15705]
 [    0  1153]]


## Training with titles corpus

In [31]:
rus = RandomUnderSampler(random_state=42)

In [32]:
X_resampled, y_resampled = rus.fit_resample(X_all_title, y_all)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

In [34]:
clf = RandomForestClassifier(n_estimators=300, n_jobs=3, verbose=1)

In [35]:
clf.fit(X_train, y_train)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    4.4s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   19.8s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:   30.7s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=3,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [36]:
y_pred = clf.predict(X_test)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:    0.2s finished


### Testing with actual test set (with titles)

In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.80      0.92      0.85       356
        True       0.89      0.76      0.82       336

   micro avg       0.84      0.84      0.84       692
   macro avg       0.85      0.84      0.84       692
weighted avg       0.85      0.84      0.84       692



In [38]:
print(confusion_matrixon_matrix(y_test, y_pred))

NameError: name 'confusion_matrixon_matrix' is not defined

### Testing with entire texts corpus

In [None]:
y_pred_all_texts = clf.predict(X_all_text)

In [None]:
print(classification_report(y_all, y_pred_all_texts))

In [None]:
print(confusion_matrix(y_all, y_pred_all_texts))

### Testing with entire titles corpus

In [None]:
y_pred_all_titles = clf.predict(X_all_title)

In [None]:
print(classification_report(y_all, y_pred_all_titles))

In [None]:
print(confusion_matrix(y_all, y_pred_all_titles))

## Training with both corpus

In [None]:
X_all_text_title = vectorizer.transform(np.concatenate([df['TEXT_CLEAN'], df['TITLE_CLEAN']]))

In [None]:
rus = RandomUnderSampler(random_state=42)

In [None]:
X_resampled, y_resampled = rus.fit_resample(X_all_text_title, np.concatenate([y_all, y_all]))

In [None]:
Counter(y_resampled)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

In [None]:
clf = RandomForestClassifier(n_estimators=300, n_jobs=3, verbose=1)

In [None]:
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)

### Testing with actual test set (with texts)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
print(confusion_matrix(y_test, y_pred))

### Testing with entire texts corpus

In [None]:
y_pred_all_texts = clf.predict(X_all_text)

In [None]:
print(classification_report(y_all, y_pred_all_texts))

In [None]:
print(confusion_matrix(y_all, y_pred_all_texts))

### Testing with entire titles corpus

In [None]:
y_pred_all_titles = clf.predict(X_all_title)

In [None]:
print(classification_report(y_all, y_pred_all_titles))

In [None]:
print(confusion_matrix(y_all, y_pred_all_titles))