In [1]:
import pickle as pkl
import seaborn as sns
import matplotlib.pyplot as plt
import re
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import math

from tqdm import tqdm_notebook
from collections import Counter
from glob import glob
from unidecode import unidecode
from nltk.corpus import stopwords
from datetime import datetime

import utils

%matplotlib inline

sns.set(style="darkgrid")

In [2]:
PATH = '../'

PROCESSED_DATA_PATH = os.path.join(PATH, 'data/processed/')

DF_PRIME_TRUNCATED_PATH = os.path.join(PROCESSED_DATA_PATH, 'df_prime_truncated.pkl')
DF_USP_TRUNCATED_PATH = os.path.join(PROCESSED_DATA_PATH, 'df_usp_truncated_clean.pkl')


STOPWORDS = utils.get_stopwords()

In [3]:
df_prime = pkl.load(open(DF_PRIME_TRUNCATED_PATH, 'rb'))

df_fakebr = pkl.load(open(DF_USP_TRUNCATED_PATH, 'rb'))

In [4]:
from imblearn.under_sampling import RandomUnderSampler

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

from sklearn.calibration import CalibratedClassifierCV

from sklearn.pipeline import Pipeline

In [5]:
vectorizer_list = [
    CountVectorizer,
    TfidfVectorizer
]

vect_kwargs = {
    'ngram_range': (1, 2),
#     'min_df': 5,
#     'max_df': 0.9,
    'max_features': 10000
}


model_list = [
    GaussianNB,
    LogisticRegression,
    RandomForestClassifier,
    SVC
]



In [6]:
for clf in model_list:
    print(type(clf()).__name__)

GaussianNB
LogisticRegression
RandomForestClassifier
SVC


In [7]:
from sklearn.base import TransformerMixin
class DenseTransformer(TransformerMixin):

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.todense()

In [8]:
from time import time 

def make_pipelines(X_train, y_train,
            vectorizer_list=[TfidfVectorizer],
            model_list=[LogisticRegression],
            vect_kwargs= {
                'ngram_range': (1, 2),
                'min_df': 5,
                'max_df': 0.9,
                'max_features': 10000
            },
            n_jobs=3):
    
    pipelines = {'vectorizer': [], 'model': [], 'pipeline': []}
            
    for vect in vectorizer_list:
        for clf in model_list:
            
            clf_name = type(clf()).__name__
            
            if clf_name == 'LogisticRegression':
                clf_kwargs = {'solver': 'lbfgs'}
            elif clf_name == 'RandomForestClassifier':
                clf_kwargs = {'n_estimators': 1000}
            elif clf_name == 'SVC':
                clf_kwargs = {'kernel': 'linear', 'C': 0.1}
            else:
                clf_kwargs = {}                
            
            time_start = time()
            pipe = Pipeline([('vect', vect(**vect_kwargs)), ('clf', clf(**clf_kwargs))])

            try:
                pipe.fit(X_train, y_train)
                
            except:
                pipe = Pipeline([('vect', vect(**vect_kwargs)),
                                 ('todense', DenseTransformer()),
                                 ('clf', clf(**clf_kwargs))])
                pipe.fit(X_train, y_train)
                

            
            pipelines['vectorizer'].append(type(vect()).__name__)
            pipelines['model'].append(type(clf()).__name__)
            pipelines['pipeline'].append(pipe)
            
            print('Finished running {} + {}'.format(type(vect()).__name__, type(clf()).__name__))
            print('Time elapsed {}s'.format(time() - time_start))
            print()
            
    return pipelines

In [9]:
def test_pipelines(X_test, y_test, pipelines):

    results = {
        'vectorizer': [],
        'model': [],
#         'y_pred': [],
        'accuracy': [],
        'f1_score': [],
#         'precision': [],
#         'recall': []
    }
    
    for i, row in pd.DataFrame(pipelines).iterrows():
        y_pred = row['pipeline'].predict(X_test)
        
        results['vectorizer'].append(row['vectorizer'])
        results['model'].append(row['model'])
#         results['y_pred'].append(y_pred)
        results['accuracy'].append(accuracy_score(y_test, y_pred))
        results['f1_score'].append(f1_score(y_test, y_pred))
#         results['precision'].append(precision_score(y_test, y_pred))
#         results['recall'].append(recall_score(y_test, y_pred))
        
    return results

In [10]:
X_train_fakebr, X_test_fakebr, y_train_fakebr, y_test_fakebr = \
    train_test_split(df_fakebr['TEXT_CLEAN'], df_fakebr['FAKE'], test_size=0.3)

In [11]:
X_train_prime, X_test_prime, y_train_prime, y_test_prime = \
    train_test_split(df_prime['TEXT_CLEAN'], df_prime['FAKE'], test_size=0.3)

In [26]:
len(y_test_fakebr), len(y_test_prime), (1254+ 369)

(1254, 369, 1623)

In [12]:
X_aux = list(X_test_fakebr) + list(X_test_prime)
y_aux = list(y_test_fakebr) + list(y_test_prime)
src_aux = len(list(y_test_fakebr)) * ['fakebr'] + len(list(y_test_prime)) * ['prime']

df_resample = pd.DataFrame({'X': X_aux, 'y': y_aux, 'src': src_aux})
rus = RandomUnderSampler()
resampled = rus.fit_resample(df_resample[['X', 'y']], df_resample['src'])

X_test_all = [item[0] for item in resampled[0]]
y_test_all = [item[1] for item in resampled[0]]

# Montando os pipelines para o Fake.Br corpus

In [13]:
pipelines_fakebr = make_pipelines(X_train_fakebr, y_train_fakebr, vectorizer_list, model_list)

Finished running CountVectorizer + GaussianNB
Time elapsed 3.448272943496704s

Finished running CountVectorizer + LogisticRegression
Time elapsed 1.462090015411377s

Finished running CountVectorizer + RandomForestClassifier
Time elapsed 16.926737546920776s

Finished running CountVectorizer + SVC
Time elapsed 4.502958297729492s

Finished running TfidfVectorizer + GaussianNB
Time elapsed 3.148686647415161s

Finished running TfidfVectorizer + LogisticRegression
Time elapsed 1.2826895713806152s

Finished running TfidfVectorizer + RandomForestClassifier
Time elapsed 20.344597578048706s

Finished running TfidfVectorizer + SVC
Time elapsed 10.229337692260742s



### Teste com o próprio conjunto de teste

In [14]:
aux = test_pipelines(X_test_fakebr, y_test_fakebr, pipelines_fakebr)
pd.DataFrame(aux)

Unnamed: 0,vectorizer,model,accuracy,f1_score
0,CountVectorizer,GaussianNB,0.8126,0.814522
1,CountVectorizer,LogisticRegression,0.88118,0.879742
2,CountVectorizer,RandomForestClassifier,0.866826,0.864996
3,CountVectorizer,SVC,0.864434,0.862237
4,TfidfVectorizer,GaussianNB,0.8126,0.81215
5,TfidfVectorizer,LogisticRegression,0.89075,0.889605
6,TfidfVectorizer,RandomForestClassifier,0.866826,0.864777
7,TfidfVectorizer,SVC,0.834131,0.845007


### Teste com o conjunto geral

In [15]:
aux = test_pipelines(X_test_all, y_test_all, pipelines_fakebr)
pd.DataFrame(aux)

Unnamed: 0,vectorizer,model,accuracy,f1_score
0,CountVectorizer,GaussianNB,0.777778,0.792929
1,CountVectorizer,LogisticRegression,0.845528,0.849604
2,CountVectorizer,RandomForestClassifier,0.853659,0.861538
3,CountVectorizer,SVC,0.819783,0.823373
4,TfidfVectorizer,GaussianNB,0.749322,0.760673
5,TfidfVectorizer,LogisticRegression,0.875339,0.881137
6,TfidfVectorizer,RandomForestClassifier,0.831978,0.841026
7,TfidfVectorizer,SVC,0.814363,0.838253


# Montando os pipelines para o conjunto primário

In [16]:
pipelines_prime = make_pipelines(X_train_prime, y_train_prime, vectorizer_list, model_list)

Finished running CountVectorizer + GaussianNB
Time elapsed 1.0033164024353027s

Finished running CountVectorizer + LogisticRegression
Time elapsed 0.5505266189575195s

Finished running CountVectorizer + RandomForestClassifier
Time elapsed 3.340068817138672s

Finished running CountVectorizer + SVC
Time elapsed 0.7270548343658447s

Finished running TfidfVectorizer + GaussianNB
Time elapsed 1.0223793983459473s

Finished running TfidfVectorizer + LogisticRegression
Time elapsed 0.48166608810424805s

Finished running TfidfVectorizer + RandomForestClassifier
Time elapsed 3.3294780254364014s

Finished running TfidfVectorizer + SVC
Time elapsed 1.004218339920044s



### Teste com o próprio conjunto de teste

In [17]:
aux = test_pipelines(X_test_prime, y_test_prime, pipelines_prime)
pd.DataFrame(aux)

Unnamed: 0,vectorizer,model,accuracy,f1_score
0,CountVectorizer,GaussianNB,0.818428,0.79257
1,CountVectorizer,LogisticRegression,0.921409,0.924675
2,CountVectorizer,RandomForestClassifier,0.921409,0.920981
3,CountVectorizer,SVC,0.910569,0.914729
4,TfidfVectorizer,GaussianNB,0.794038,0.75641
5,TfidfVectorizer,LogisticRegression,0.937669,0.938992
6,TfidfVectorizer,RandomForestClassifier,0.932249,0.931507
7,TfidfVectorizer,SVC,0.926829,0.926829


### Teste com o conjunto geral

In [18]:
aux = test_pipelines(X_test_all, y_test_all, pipelines_prime)
pd.DataFrame(aux)

Unnamed: 0,vectorizer,model,accuracy,f1_score
0,CountVectorizer,GaussianNB,0.715447,0.673913
1,CountVectorizer,LogisticRegression,0.813008,0.832117
2,CountVectorizer,RandomForestClassifier,0.804878,0.8125
3,CountVectorizer,SVC,0.802168,0.821516
4,TfidfVectorizer,GaussianNB,0.697832,0.642055
5,TfidfVectorizer,LogisticRegression,0.846883,0.854569
6,TfidfVectorizer,RandomForestClassifier,0.821138,0.826772
7,TfidfVectorizer,SVC,0.831978,0.839378


In [48]:
vect_kwargs = {
    'ngram_range': (1, 2),
    'max_features': 10000,
}

clf_kwargs = {'C': 10, 'class_weight': 'balanced', 'solver': 'saga', 'tol': 0.001}

pipe = Pipeline([
    ('vect', TfidfVectorizer(**vect_kwargs)),
    ('clf', LogisticRegression(**clf_kwargs))
])

pipe.fit(X_train_fakebr, y_train_fakebr)

y_pred = pipe.predict(X_test_fakebr)

print(classification_report(y_test_fakebr, y_pred))

print(confusion_matrix(y_test_fakebr, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.89      0.89       643
           1       0.88      0.90      0.89       611

    accuracy                           0.89      1254
   macro avg       0.89      0.89      0.89      1254
weighted avg       0.89      0.89      0.89      1254

[[570  73]
 [ 61 550]]


In [49]:
vect_kwargs = {
    'ngram_range': (1, 2),
    'max_features': 10000,
}

clf_kwargs = {'C': 10, 'class_weight': None, 'solver': 'saga', 'tol': 0.001}

pipe = Pipeline([
    ('vect', TfidfVectorizer(**vect_kwargs)),
    ('clf', LogisticRegression(**clf_kwargs))
])

pipe.fit(X_train_fakebr, y_train_fakebr)

y_pred = pipe.predict(X_test_fakebr)

print(classification_report(y_test_fakebr, y_pred))

print(confusion_matrix(y_test_fakebr, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89       643
           1       0.88      0.90      0.89       611

    accuracy                           0.89      1254
   macro avg       0.89      0.89      0.89      1254
weighted avg       0.89      0.89      0.89      1254

[[569  74]
 [ 60 551]]
