In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import os

from glob import glob
from time import time
from collections import Counter

In [2]:
PATH = '../'

PROCESSED_DATA_PATH = os.path.join(PATH, 'data/processed/')

DF_FAKE_PATH = os.path.join(PROCESSED_DATA_PATH, 'df_fake_clean.pkl')
DF_LEGIT_PATH = os.path.join(PROCESSED_DATA_PATH, 'df_legit_clean.pkl')

In [3]:
df_fake = pkl.load(open(DF_FAKE_PATH, 'rb'))

df_legit = pkl.load(open(DF_LEGIT_PATH, 'rb'))

In [4]:
df = pd.concat((df_fake, df_legit), axis=0)

In [5]:
from imblearn.under_sampling import RandomUnderSampler

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

from sklearn.calibration import CalibratedClassifierCV

from sklearn.pipeline import Pipeline

In [6]:
df.head()

Unnamed: 0,DATE,FAKE,TEXT_CLEAN,TITLE_CLEAN,TEXT_LEN_CHAR,TEXT_LEN_TOKEN,TITLE_LEN_CHAR,TITLE_LEN_TOKEN
0,20/09/2019,1,bolsonar demit president caix exig result prox...,bolsonar demit president caix apos assessor ga...,256,39,62,10
1,25/09/2019,1,tod mund tinh certeza inclusiv mandante revel ...,marc valeri grav audi fal cas cels daniel edua...,134,22,65,12
2,02/10/2019,1,dilm diz assim janot mat mor suicid boat,dilm diz assim janot mat mor suicid boat,40,8,40,8
3,20/09/2019,1,bolsonar traz ultrassom destro celul cancerige...,bolsonar traz ultrassom destro celul cancerige...,59,8,59,8
4,06/10/2019,1,empres japa quer invest brasil expect govern b...,empres quer invest brasil caus bolsonar boat,354,52,44,7


In [19]:
y_all = df['FAKE']

In [20]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.9)

In [21]:
vectorizer.fit(df['TEXT_CLEAN'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=None,
                min_df=5, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [31]:
sorted(vectorizer.vocabulary_, key=lambda x: vectorizer.vocabulary_[x])

['aara',
 'aara reis',
 'abad',
 'abadi',
 'abaf',
 'abaix',
 'abaix acord',
 'abaix algum',
 'abaix alguns',
 'abaix anos',
 'abaix apos',
 'abaix argument',
 'abaix assin',
 'abaix cad',
 'abaix cart',
 'abaix cinc',
 'abaix defes',
 'abaix dess',
 'abaix detalh',
 'abaix diss',
 'abaix ditadura',
 'abaix dois',
 'abaix dol',
 'abaix esper',
 'abaix esperado',
 'abaix expect',
 'abaix indic',
 'abaix inflaca',
 'abaix integr',
 'abaix lei',
 'abaix limit',
 'abaix linh',
 'abaix list',
 'abaix med',
 'abaix met',
 'abaix minim',
 'abaix ministr',
 'abaix nivel',
 'abaix not',
 'abaix outr',
 'abaix par',
 'abaix pel',
 'abaix polit',
 'abaix pont',
 'abaix prec',
 'abaix president',
 'abaix prev',
 'abaix princip',
 'abaix process',
 'abaix projet',
 'abaix qua',
 'abaix registr',
 'abaix reprodu',
 'abaix salari',
 'abaix segund',
 'abaix text',
 'abaix tod',
 'abaix transcrica',
 'abaix ultim',
 'abaix vot',
 'abaixo',
 'abaixo assin',
 'abaixo part',
 'abal',
 'abal alicerc',
 'ab

In [30]:
vectorizer.vocabulary_

{'bolsonar': 23830,
 'demit': 52242,
 'president': 160972,
 'caix': 27155,
 'exig': 79004,
 'result': 179512,
 'prox': 167744,
 'sortei': 194268,
 'sej': 185443,
 'aprov': 15493,
 'antes': 12422,
 'apos': 14634,
 'sair': 182918,
 'assessor': 17678,
 'sab': 182345,
 'ter': 198676,
 'ido': 98027,
 'grup': 94260,
 'part': 143236,
 'trabalh': 203169,
 'jair': 105308,
 'irrit': 104874,
 'resolv': 178716,
 'demiti': 52256,
 'demit president': 52254,
 'president caix': 161105,
 'result prox': 179663,
 'sej aprov': 185476,
 'aprov antes': 15509,
 'sair par': 182970,
 'par assessor': 139469,
 'apos sab': 14916,
 'ter ido': 199087,
 'ido par': 98030,
 'par grup': 140709,
 'assessor part': 17731,
 'part trabalh': 144206,
 'jair bolsonar': 105309,
 'tod': 201495,
 'mund': 126471,
 'tinh': 200645,
 'certeza': 32577,
 'inclusiv': 100052,
 'revel': 180341,
 'assim': 17769,
 'depoiment': 53880,
 'marc': 115936,
 'valerio': 208710,
 'ninguem': 129410,
 'ouv': 136906,
 'ainda': 6827,
 'valeri': 208708,


In [10]:
X_all = vectorizer.transform(df['TEXT_CLEAN'])

In [11]:
rus = RandomUnderSampler(random_state=42)

In [12]:
X_resampled, y_resampled = rus.fit_resample(X_all, y_all)

In [13]:
model_list = [
    LogisticRegression(),
    RandomForestClassifier(n_estimators=1000),
    GaussianNB(),
    LinearSVC(),
    SVC(),
    XGBClassifier(n_estimators=100)
]

In [14]:
n_folds = 10
n_jobs = 3

columns = ['model', 'mean'] + ['fold_{}'.format(i) for i in range(1, n_folds + 1)]
df_results = pd.DataFrame()
df_results

line = {'model': [], 'mean': []}
for i in range(n_folds):
    line['fold_{}'.format(i)] = []

for i, model in enumerate(model_list):
    print('Currently running {}'.format(type(model).__name__))
    time_start = time()
    try:
        cv = cross_val_score(model, X_resampled, y_resampled, n_jobs=n_jobs, cv=n_folds, verbose=0)
    except:
        cv = cross_val_score(model, X_resampled.toarray(), y_resampled, n_jobs=1, cv=n_folds, verbose=0)
        
    line['model'].append(type(model).__name__)
    line['mean'].append(cv.mean())
    
    for j, item in enumerate(cv):
        line['fold_{}'.format(j)].append(item)
        
    print('Finished running {}'.format(type(model).__name__))
    print('Time elapsed {}s'.format(time() - time_start))
    print()
df_results = df_results.append(pd.DataFrame(line))
df_results

Currently running LogisticRegression
Finished running LogisticRegression
Time elapsed 2.695570230484009s

Currently running RandomForestClassifier
Finished running RandomForestClassifier
Time elapsed 273.42269802093506s

Currently running GaussianNB
Finished running GaussianNB
Time elapsed 198.69913172721863s

Currently running LinearSVC
Finished running LinearSVC
Time elapsed 24.91645097732544s

Currently running SVC
Finished running SVC
Time elapsed 60.0497362613678s

Currently running XGBClassifier




Finished running XGBClassifier
Time elapsed 466.8922896385193s



Unnamed: 0,model,mean,fold_0,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9
0,LogisticRegression,0.978019,0.97619,0.98,0.992,0.968,0.988,0.96,0.944,0.992,0.992,0.988
1,RandomForestClassifier,0.973229,0.964286,0.976,0.988,0.956,0.984,0.98,0.936,0.984,0.984,0.98
2,GaussianNB,0.630743,0.571429,0.572,0.592,0.596,0.632,0.58,0.624,0.788,0.736,0.616
3,LinearSVC,0.982022,0.972222,0.98,0.992,0.98,0.992,0.976,0.952,0.996,0.992,0.988
4,SVC,0.922844,0.944444,0.948,0.948,0.948,0.988,0.624,0.904,0.996,0.964,0.964
5,XGBClassifier,0.964835,0.956349,0.972,0.98,0.948,0.98,0.96,0.94,0.964,0.984,0.964


In [15]:
df_results.sort_values(by='mean')

Unnamed: 0,model,mean,fold_0,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9
2,GaussianNB,0.630743,0.571429,0.572,0.592,0.596,0.632,0.58,0.624,0.788,0.736,0.616
4,SVC,0.922844,0.944444,0.948,0.948,0.948,0.988,0.624,0.904,0.996,0.964,0.964
5,XGBClassifier,0.964835,0.956349,0.972,0.98,0.948,0.98,0.96,0.94,0.964,0.984,0.964
1,RandomForestClassifier,0.973229,0.964286,0.976,0.988,0.956,0.984,0.98,0.936,0.984,0.984,0.98
0,LogisticRegression,0.978019,0.97619,0.98,0.992,0.968,0.988,0.96,0.944,0.992,0.992,0.988
3,LinearSVC,0.982022,0.972222,0.98,0.992,0.98,0.992,0.976,0.952,0.996,0.992,0.988


In [22]:
df

Unnamed: 0,DATE,FAKE,TEXT_CLEAN,TITLE_CLEAN,TEXT_LEN_CHAR,TEXT_LEN_TOKEN,TITLE_LEN_CHAR,TITLE_LEN_TOKEN
0,03/08/2019,1,vam assin petica pel cassaca mandat bolsonaro ...,petica par impeachment bolsonar precis milho a...,275,39,85,12
1,04/08/2019,1,lul vitim golp polit merec estar preso vam ass...,petica lul livr contribu par liberdad presiden...,221,34,56,9
2,05/08/2019,1,professor cont dilm matou mari kozel filh mari...,mari kozel filh assassin dilm tir boat,157,26,38,7
3,03/08/2019,1,vergonh president oab ment pai mort milit pra ...,felip sant cruz president oab ment sobr mort p...,135,25,80,15
4,06/08/2019,1,part diabolico band abrind burac estrad feder ...,esquerd abrind burac estrad nord par consegu v...,129,20,54,9
5,06/08/2019,1,entra aca stf par imped bolsonar reduz prec ga...,entra aca stf par imped bolsonar reduz prec ga...,771,125,59,11
6,06/08/2019,1,filh president oab anos idade receb mil rea le...,filh president oab receb mil lei rouanet par a...,500,80,63,12
7,08/08/2019,1,vest medico bolsonar brinca urologista vou met...,bolsonar vest medic diz urologista vou met ded...,346,50,63,10
8,06/08/2019,1,toffoli glenn greenwald dav mirand flagr conve...,toffoli glenn greenwald david mirand flagr con...,333,49,63,9
9,09/08/2019,1,bolsonar michell gast milho carta corpor set m...,bolsonar michell gast milho carta corpor set m...,53,9,53,9


In [29]:
list(zip(df.index, df['TEXT_CLEAN']))

[(0,
  'vam assin petica pel cassaca mandat bolsonaro comet crim respons suficient par abrir process impeachment vam pression assin peticao precis colh assinatur par pression abertur process impeachment contr president bolsonaro crim respons comet dev ser pun cassaca mandato assinem'),
 (1,
  'lul vitim golp polit merec estar preso vam assin petica favor liberdad noss companheir contribu par reconhec inocencia petica pel liberdad president lul vam lut pel liberdad lul mostr pov compactu injustic prisao lulalivr'),
 (2,
  'professor cont dilm matou mari kozel filh mari kozel filh estav sentinel quand presidant mat tir hoj homenag rua nom trist epoc depoiment mar document provam'),
 (3,
  'vergonh president oab ment pai mort milit pra ganh mil mes bols ditadur hor cort bols ditadur president oab ment pra receb mil mes mane'),
 (4,
  'part diabolico band abrind burac estrad feder nord par consegu verbas estrad boa andar maf petist atraves maquinari sabot governo'),
 (5,
  'entra aca stf p

In [9]:
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(list(zip(df.index, df['TEXT_CLEAN'])), df['FAKE'])

In [10]:
X_all = [X[1] for X in X_resampled]

In [11]:

pipe = Pipeline(
    [('vect', TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.9)),
     ('clf', CalibratedClassifierCV(LinearSVC()))]
)

In [12]:
pipe.fit(X_all, y_resampled)



Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.9, max_features=None,
                                 min_df=5, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0,
                                                                 class_weigh

In [13]:
y_pred = pipe.predict(df['TEXT_CLEAN'])

In [14]:
print(classification_report(df['FAKE'], y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     15701
           1       0.83      1.00      0.91      1301

    accuracy                           0.98     17002
   macro avg       0.92      0.99      0.95     17002
weighted avg       0.99      0.98      0.99     17002



In [15]:
print(confusion_matrix(df['FAKE'], y_pred))

[[15437   264]
 [    0  1301]]


In [30]:
n_jobs = 3
n_folds = 10
cv = cross_val_score(pipe, X_all, y_resampled, n_jobs=n_jobs, cv=n_folds, verbose=0)

In [31]:
cv.mean()

0.9777069876688197

In [32]:
cv

array([0.98091603, 0.98461538, 0.98846154, 0.97692308, 0.98076923,
       0.98076923, 0.93846154, 1.        , 0.97307692, 0.97307692])

In [49]:
with open('pipeline.pkl', 'wb') as f:
    pkl.dump(pipe, f)

In [53]:
len(X_all)

2502

In [55]:
len(y_resampled)

2502

In [56]:
y_pred = pipe.predict(X_all)