In [1]:

import pickle as pkl
import seaborn as sns
import matplotlib.pyplot as plt
import re
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import math

from tqdm import tqdm_notebook
from collections import Counter
from glob import glob
from unidecode import unidecode
from nltk.corpus import stopwords
from datetime import datetime

import utils

%matplotlib inline

sns.set(style="darkgrid")

In [2]:
# Repository root, relative to the directory this notebook is run from.
PATH = '../'

# Directory holding the pre-processed (pickled) datasets.
PROCESSED_DATA_PATH = os.path.join(
    PATH,
    'data/processed/',
)

# Pickle files for the two corpora used in this notebook.
DF_PRIME_TRUNCATED_PATH = os.path.join(
    PROCESSED_DATA_PATH,
    'df_prime_truncated.pkl',
)
DF_USP_TRUNCATED_PATH = os.path.join(
    PROCESSED_DATA_PATH,
    'df_usp_truncated_clean.pkl',
)

# Stop-word collection provided by the project-local `utils` module.
STOPWORDS = utils.get_stopwords()

In [3]:
# Load the pickled dataset stored at DF_USP_TRUNCATED_PATH.
# The context manager guarantees the file handle is closed after loading.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open(DF_USP_TRUNCATED_PATH, 'rb') as fakebr_file:
    df_fakebr = pkl.load(fakebr_file)

In [4]:
from imblearn.under_sampling import RandomUnderSampler

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

from sklearn.calibration import CalibratedClassifierCV

from sklearn.pipeline import Pipeline

In [5]:
# The path constants, STOPWORDS and df_fakebr are already defined by the
# configuration and loading cells earlier in this notebook; they were duplicated
# here verbatim, so only the dataset that is not loaded yet is read in this cell.
# The context manager guarantees the file handle is closed after loading.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open(DF_PRIME_TRUNCATED_PATH, 'rb') as prime_file:
    df_prime = pkl.load(prime_file)

In [9]:
# 70/30 train/test split of df_prime: 'TEXT_CLEAN' is the input text, 'FAKE' the label.
(X_train_prime, X_test_prime,
 y_train_prime, y_test_prime) = train_test_split(
    df_prime['TEXT_CLEAN'],
    df_prime['FAKE'],
    random_state=42,
    test_size=0.3,
)

In [10]:
# 70/30 train/test split of df_fakebr: 'TEXT_CLEAN' is the input text, 'FAKE' the label.
(X_train_fakebr, X_test_fakebr,
 y_train_fakebr, y_test_fakebr) = train_test_split(
    df_fakebr['TEXT_CLEAN'],
    df_fakebr['FAKE'],
    random_state=42,
    test_size=0.3,
)

In [11]:
# Combined training set: fakebr samples first, followed by the prime samples.
X_train_all = [*X_train_fakebr, *X_train_prime]
y_train_all = [*y_train_fakebr, *y_train_prime]

In [12]:
# Build a combined hold-out set in which both corpora are equally represented.
X_aux = list(X_test_fakebr) + list(X_test_prime)
y_aux = list(y_test_fakebr) + list(y_test_prime)
# Tag every row with the corpus it came from.
src_aux = len(y_test_fakebr) * ['fakebr'] + len(y_test_prime) * ['prime']

df_resample = pd.DataFrame({'X': X_aux, 'y': y_aux, 'src': src_aux})

# The sampler's target is the corpus tag ('src'), not the class label, so the
# larger corpus is randomly undersampled down to the size of the smaller one.
# A fixed seed keeps the resulting test set (and every metric computed on it)
# reproducible across kernel restarts.
rus = RandomUnderSampler(random_state=42)
resampled = rus.fit_resample(df_resample[['X', 'y']], df_resample['src'])

# Depending on the imbalanced-learn version, fit_resample returns either a NumPy
# array or a DataFrame for DataFrame input. Iterating a DataFrame yields column
# labels rather than rows, so normalise to a 2-D array before unpacking.
resampled_rows = np.asarray(resampled[0])
X_test_all = [row[0] for row in resampled_rows]
y_test_all = [row[1] for row in resampled_rows]

In [39]:
# TF-IDF over unigrams and bigrams, capped at 10000 features.
vectorizer_list = [
    TfidfVectorizer(
        ngram_range=(1, 2),
        min_df=5,
        max_df=0.9,
        max_features=10000,
    ),
]

# Vectorizer followed by a logistic-regression classifier with default settings.
pipe = Pipeline(steps=[
    ('vect', vectorizer_list[0]),
    ('clf', LogisticRegression()),
])

# Search space for the 'clf' step: 7 x 2 x 4 = 56 parameter combinations.
param_grid = dict(
    clf__C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
    clf__solver=['lbfgs', 'saga'],
    clf__tol=[1e-5, 1e-4, 1e-3, 1e-2],
)

In [40]:
# 5-fold cross-validated grid search over param_grid, run on 4 parallel workers
# and fitted on the fakebr training split only.
gscv = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    n_jobs=4,
    verbose=10,
)
gscv.fit(X_train_fakebr, y_train_fakebr)

Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    3.9s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    6.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   10.4s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   13.5s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   20.7s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   25.3s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:   32.4s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   38.1s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:   49.9s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:   59.2s
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed:  2.0min
[Parallel(

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.9,
                                                        max_features=10000,
                                                        min_df=5,
                                                        ngram_range=(1, 2),
                                         

In [18]:
# Report the best cross-validation score and the parameters that achieved it.
print(
    "Best parameter (CV score={:.3f}):".format(gscv.best_score_),
    gscv.best_params_,
    sep='\n',
)


Best parameter (CV score=0.903):
{'clf__C': 100, 'clf__solver': 'saga', 'clf__tol': 0.0001}


In [52]:
# Classifier settings matching the best_params_ printed by the grid-search cell.
clf_kwargs = dict(C=100, solver='saga', tol=0.0001)

# Final model: same TF-IDF configuration as the grid search, plus the tuned classifier.
# NOTE(review): the saved output of the fit cell shows min_df=1 / max_df=1.0 for the
# vectorizer, so the recorded metrics were presumably produced with different
# vectorizer settings than the ones below -- re-run to confirm.
tfidf_step = (
    'vect',
    TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.9, max_features=10000),
)
classifier_step = ('clf', LogisticRegression(**clf_kwargs))
pipe = Pipeline(steps=[tfidf_step, classifier_step])


In [53]:
# Train the final pipeline on the fakebr training split only.
pipe.fit(X=X_train_fakebr, y=y_train_fakebr)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=10000,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=100, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling

In [54]:
# Predict on the source-balanced test set that combines both corpora.
y_pred = pipe.predict(X=X_test_all)

In [55]:
# Per-class precision / recall / F1 on the combined test set.
print(classification_report(y_true=y_test_all, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.87      0.88      0.88       371
           1       0.88      0.87      0.87       367

    accuracy                           0.87       738
   macro avg       0.87      0.87      0.87       738
weighted avg       0.87      0.87      0.87       738



In [56]:
# Confusion matrix on the combined test set (rows: true label, columns: predicted label).
print(confusion_matrix(y_true=y_test_all, y_pred=y_pred))

[[326  45]
 [ 48 319]]


In [28]:
# Overall accuracy on the combined test set.
accuracy_score(y_true=y_test_all, y_pred=y_pred)

0.8766937669376694

In [None]:
# TODO: state explicitly that, for training purposes, the two datasets are used separately throughout this work