In [1]:
import pickle as pkl
import seaborn as sns
import matplotlib.pyplot as plt
import re
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import math

from tqdm import tqdm_notebook
from collections import Counter
from glob import glob
from unidecode import unidecode
from nltk.corpus import stopwords
from datetime import datetime

import utils

%matplotlib inline

# Use seaborn's "darkgrid" style as the global plotting theme.
sns.set(style="darkgrid")

In [2]:
# Base directory that every data path below is built from.
PATH = '../'

# Directory holding the processed datasets.
PROCESSED_DATA_PATH = os.path.join(PATH, 'data/processed/')

# Pickle files read by the data-loading cell (one per corpus).
DF_PRIME_TRUNCATED_PATH = os.path.join(PROCESSED_DATA_PATH, 'df_prime_truncated.pkl')
DF_USP_TRUNCATED_PATH = os.path.join(PROCESSED_DATA_PATH, 'df_usp_truncated_clean.pkl')


# Presumably a stopword collection; it comes from the project-local `utils` module.
# NOTE(review): STOPWORDS is not referenced by any cell visible here - confirm it is still needed.
STOPWORDS = utils.get_stopwords()

In [3]:
# Load the two pickled corpora. The context managers close each file handle as
# soon as it has been read; a bare `open(...)` would leave that to the
# garbage collector.
# NOTE(security): unpickling can execute arbitrary code - only load pickle
# files that come from a trusted source.
with open(DF_PRIME_TRUNCATED_PATH, 'rb') as prime_file:
    df_prime = pkl.load(prime_file)

with open(DF_USP_TRUNCATED_PATH, 'rb') as fakebr_file:
    df_fakebr = pkl.load(fakebr_file)

In [4]:
from imblearn.under_sampling import RandomUnderSampler

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

from sklearn.calibration import CalibratedClassifierCV

from sklearn.pipeline import Pipeline

In [5]:
# Text vectorizer classes to evaluate (classes, not instances).
vectorizer_list = [CountVectorizer, TfidfVectorizer]

# Keyword arguments meant for the vectorizers (no min_df / max_df cut-offs).
# NOTE(review): the make_pipelines calls visible in this notebook never pass
# this dict, so that function's own default vect_kwargs is what actually gets
# used - confirm which setting is intended.
vect_kwargs = dict(ngram_range=(1, 2), max_features=10000)

# Classifier classes that are paired with every vectorizer above.
model_list = [GaussianNB, LogisticRegression, RandomForestClassifier, SVC]



In [6]:
# Show the names of the classifiers under evaluation. The class's own
# __name__ is read directly, so no estimator is constructed just to print it.
for clf in model_list:
    print(clf.__name__)

GaussianNB
LogisticRegression
RandomForestClassifier
SVC


In [7]:
from sklearn.base import BaseEstimator, TransformerMixin


class DenseTransformer(TransformerMixin, BaseEstimator):
    """Pipeline step that converts a sparse matrix into a dense ``numpy.ndarray``.

    It sits between a vectorizer and a classifier that cannot consume sparse
    input. Inheriting from ``BaseEstimator`` gives the step ``get_params`` /
    ``set_params``, which scikit-learn uses when cloning or displaying a
    pipeline.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense array.

        ``toarray()`` is used rather than ``todense()``: the latter returns a
        ``numpy.matrix``, which recent scikit-learn releases refuse as input.
        Objects without a ``toarray`` method are passed through unchanged.
        """
        if hasattr(X, 'toarray'):
            return X.toarray()
        return X

In [8]:
from time import time 

def make_pipelines(X_train, y_train,
            vectorizer_list=None,
            model_list=None,
            vect_kwargs=None,
            n_jobs=3):
    """Fit one pipeline for every (vectorizer, classifier) combination.

    Parameters
    ----------
    X_train, y_train
        Training inputs and labels, handed unchanged to ``Pipeline.fit``.
    vectorizer_list : list of callables, optional
        Vectorizer classes; defaults to ``[TfidfVectorizer]``.
    model_list : list of callables, optional
        Classifier classes; defaults to ``[LogisticRegression]``.
    vect_kwargs : dict, optional
        Keyword arguments for every vectorizer. Defaults to 1-2 grams,
        ``min_df=5``, ``max_df=0.9`` and ``max_features=10000``.
    n_jobs : int, optional
        Forwarded to ``RandomForestClassifier``; the other classifiers
        configured here do not receive it.

    Returns
    -------
    dict
        Three parallel lists under the keys ``'vectorizer'`` (class name),
        ``'model'`` (class name) and ``'pipeline'`` (the fitted ``Pipeline``).
    """
    # None sentinels replace mutable default arguments; the effective
    # defaults are the same as before.
    if vectorizer_list is None:
        vectorizer_list = [TfidfVectorizer]
    if model_list is None:
        model_list = [LogisticRegression]
    if vect_kwargs is None:
        vect_kwargs = {
            'ngram_range': (1, 2),
            'min_df': 5,
            'max_df': 0.9,
            'max_features': 10000
        }

    pipelines = {'vectorizer': [], 'model': [], 'pipeline': []}

    for vect in vectorizer_list:
        # Resolve the display name once per vectorizer instead of building a
        # throw-away instance each time the name is needed.
        vect_name = type(vect()).__name__

        for clf in model_list:
            clf_name = type(clf()).__name__

            if clf_name == 'LogisticRegression':
                clf_kwargs = {'solver': 'lbfgs'}
            elif clf_name == 'RandomForestClassifier':
                clf_kwargs = {'n_estimators': 1000, 'n_jobs': n_jobs}
            elif clf_name == 'SVC':
                clf_kwargs = {'kernel': 'linear', 'C': 0.1}
            else:
                clf_kwargs = {}

            time_start = time()
            pipe = Pipeline([('vect', vect(**vect_kwargs)), ('clf', clf(**clf_kwargs))])

            try:
                pipe.fit(X_train, y_train)
            except (TypeError, ValueError):
                # Some classifiers (e.g. GaussianNB) reject the sparse matrix
                # a vectorizer produces; scikit-learn reports that with a
                # TypeError (ValueError is kept as a safety net). Retry with
                # a densifying step in between. Anything else - including
                # KeyboardInterrupt - is no longer swallowed.
                pipe = Pipeline([('vect', vect(**vect_kwargs)),
                                 ('todense', DenseTransformer()),
                                 ('clf', clf(**clf_kwargs))])
                pipe.fit(X_train, y_train)

            pipelines['vectorizer'].append(vect_name)
            pipelines['model'].append(clf_name)
            pipelines['pipeline'].append(pipe)

            print('Finished running {} + {}'.format(vect_name, clf_name))
            print('Time elapsed {}s'.format(time() - time_start))
            print()

    return pipelines

In [9]:
def test_pipelines(X_test, y_test, pipelines):
    """Score every fitted pipeline on a single test set.

    ``pipelines`` is anything ``pd.DataFrame`` accepts that yields the columns
    ``'vectorizer'``, ``'model'`` and ``'pipeline'`` (the structure returned
    by ``make_pipelines``). The result is a dict of parallel lists with the
    keys ``'vectorizer'``, ``'model'``, ``'accuracy'`` and ``'f1_score'``,
    one entry per pipeline and in the same order.
    """
    frame = pd.DataFrame(pipelines)

    # One prediction vector per pipeline, reused for both metrics.
    predictions = [fitted.predict(X_test) for fitted in frame['pipeline']]

    return {
        'vectorizer': list(frame['vectorizer']),
        'model': list(frame['model']),
        'accuracy': [accuracy_score(y_test, y_pred) for y_pred in predictions],
        'f1_score': [f1_score(y_test, y_pred) for y_pred in predictions],
    }

In [10]:
# 70/30 hold-out split of the Fake.Br corpus with a fixed seed.
X_train_fakebr, X_test_fakebr, y_train_fakebr, y_test_fakebr = train_test_split(
    df_fakebr['TEXT_CLEAN'],
    df_fakebr['FAKE'],
    test_size=0.3,
    random_state=42,
)

In [11]:
# 70/30 hold-out split of the primary dataset with a fixed seed.
X_train_prime, X_test_prime, y_train_prime, y_test_prime = train_test_split(
    df_prime['TEXT_CLEAN'],
    df_prime['FAKE'],
    test_size=0.3,
    random_state=42,
)

In [12]:
# Combined training set: the Fake.Br samples first, then the primary-set samples.
X_train_all = [*X_train_fakebr, *X_train_prime]
y_train_all = [*y_train_fakebr, *y_train_prime]

In [13]:
# Pool both test splits and tag every row with the corpus it came from.
X_aux = list(X_test_fakebr) + list(X_test_prime)
y_aux = list(y_test_fakebr) + list(y_test_prime)
src_aux = len(y_test_fakebr) * ['fakebr'] + len(y_test_prime) * ['prime']

df_resample = pd.DataFrame({'X': X_aux, 'y': y_aux, 'src': src_aux})

# Under-sample on the *source* tag (not on the FAKE label) so that both
# corpora contribute the same number of rows to the combined test set.
# The fixed seed keeps that test set identical from one run to the next.
rus = RandomUnderSampler(random_state=42)
resampled = rus.fit_resample(df_resample[['X', 'y']], df_resample['src'])

# Depending on the imbalanced-learn version the resampled features come back
# as an ndarray or as a DataFrame. Iterating a DataFrame yields its column
# labels rather than its rows, so normalise to a 2-D array first.
resampled_rows = np.asarray(resampled[0])

X_test_all = [item[0] for item in resampled_rows]
y_test_all = [item[1] for item in resampled_rows]

# Montando os pipelines para o Fake.Br corpus

In [14]:
# Fit every vectorizer/classifier combination on the Fake.Br training split.
pipelines_fakebr = make_pipelines(
    X_train_fakebr,
    y_train_fakebr,
    vectorizer_list=vectorizer_list,
    model_list=model_list,
)

Finished running CountVectorizer + GaussianNB
Time elapsed 3.048135757446289s

Finished running CountVectorizer + LogisticRegression
Time elapsed 1.3238298892974854s

Finished running CountVectorizer + RandomForestClassifier
Time elapsed 15.843529462814331s

Finished running CountVectorizer + SVC
Time elapsed 4.3408522605896s

Finished running TfidfVectorizer + GaussianNB
Time elapsed 3.0571601390838623s

Finished running TfidfVectorizer + LogisticRegression
Time elapsed 1.2846944332122803s

Finished running TfidfVectorizer + RandomForestClassifier
Time elapsed 16.31509280204773s

Finished running TfidfVectorizer + SVC
Time elapsed 7.7526161670684814s



### Teste com o próprio conjunto de teste

In [15]:
# Fake.Br pipelines evaluated on the Fake.Br test split.
aux = test_pipelines(
    X_test=X_test_fakebr,
    y_test=y_test_fakebr,
    pipelines=pipelines_fakebr,
)
pd.DataFrame(aux)

Unnamed: 0,vectorizer,model,accuracy,f1_score
0,CountVectorizer,GaussianNB,0.802233,0.813814
1,CountVectorizer,LogisticRegression,0.880383,0.884793
2,CountVectorizer,RandomForestClassifier,0.860447,0.866106
3,CountVectorizer,SVC,0.870813,0.875385
4,TfidfVectorizer,GaussianNB,0.794258,0.800926
5,TfidfVectorizer,LogisticRegression,0.88118,0.885824
6,TfidfVectorizer,RandomForestClassifier,0.855662,0.861727
7,TfidfVectorizer,SVC,0.855662,0.866814


### Teste com o conjunto geral

In [16]:
# Fake.Br pipelines evaluated on the combined test set.
aux = test_pipelines(
    X_test=X_test_all,
    y_test=y_test_all,
    pipelines=pipelines_fakebr,
)
pd.DataFrame(aux)

Unnamed: 0,vectorizer,model,accuracy,f1_score
0,CountVectorizer,GaussianNB,0.742547,0.750656
1,CountVectorizer,LogisticRegression,0.838753,0.836314
2,CountVectorizer,RandomForestClassifier,0.846883,0.850331
3,CountVectorizer,SVC,0.822493,0.822252
4,TfidfVectorizer,GaussianNB,0.726287,0.727763
5,TfidfVectorizer,LogisticRegression,0.861789,0.862903
6,TfidfVectorizer,RandomForestClassifier,0.837398,0.840426
7,TfidfVectorizer,SVC,0.833333,0.844501


# Montando os pipelines para o conjunto primário

In [17]:
# Fit every vectorizer/classifier combination on the primary-set training split.
pipelines_prime = make_pipelines(
    X_train_prime,
    y_train_prime,
    vectorizer_list=vectorizer_list,
    model_list=model_list,
)

Finished running CountVectorizer + GaussianNB
Time elapsed 0.8237624168395996s

Finished running CountVectorizer + LogisticRegression
Time elapsed 0.4497408866882324s

Finished running CountVectorizer + RandomForestClassifier
Time elapsed 2.944206476211548s

Finished running CountVectorizer + SVC
Time elapsed 0.6370797157287598s

Finished running TfidfVectorizer + GaussianNB
Time elapsed 0.9766843318939209s

Finished running TfidfVectorizer + LogisticRegression
Time elapsed 0.4059126377105713s

Finished running TfidfVectorizer + RandomForestClassifier
Time elapsed 3.0864310264587402s

Finished running TfidfVectorizer + SVC
Time elapsed 0.9278669357299805s



### Teste com o próprio conjunto de teste

In [18]:
# Primary-set pipelines evaluated on the primary-set test split.
aux = test_pipelines(
    X_test=X_test_prime,
    y_test=y_test_prime,
    pipelines=pipelines_prime,
)
pd.DataFrame(aux)

Unnamed: 0,vectorizer,model,accuracy,f1_score
0,CountVectorizer,GaussianNB,0.799458,0.771605
1,CountVectorizer,LogisticRegression,0.915989,0.917333
2,CountVectorizer,RandomForestClassifier,0.888889,0.883853
3,CountVectorizer,SVC,0.905149,0.906166
4,TfidfVectorizer,GaussianNB,0.769648,0.728435
5,TfidfVectorizer,LogisticRegression,0.899729,0.898072
6,TfidfVectorizer,RandomForestClassifier,0.886179,0.881356
7,TfidfVectorizer,SVC,0.891599,0.888889


### Teste com o conjunto geral

In [19]:
# Primary-set pipelines evaluated on the combined test set.
aux = test_pipelines(
    X_test=X_test_all,
    y_test=y_test_all,
    pipelines=pipelines_prime,
)
pd.DataFrame(aux)

Unnamed: 0,vectorizer,model,accuracy,f1_score
0,CountVectorizer,GaussianNB,0.714092,0.659128
1,CountVectorizer,LogisticRegression,0.800813,0.807843
2,CountVectorizer,RandomForestClassifier,0.785908,0.783562
3,CountVectorizer,SVC,0.791328,0.8
4,TfidfVectorizer,GaussianNB,0.708672,0.638655
5,TfidfVectorizer,LogisticRegression,0.810298,0.807163
6,TfidfVectorizer,RandomForestClassifier,0.791328,0.785515
7,TfidfVectorizer,SVC,0.798103,0.784993


# Montando os pipelines com ambos os conjuntos

In [20]:
# Fit every vectorizer/classifier combination on the combined training set.
pipelines_all = make_pipelines(
    X_train_all,
    y_train_all,
    vectorizer_list=vectorizer_list,
    model_list=model_list,
)

Finished running CountVectorizer + GaussianNB
Time elapsed 4.306814193725586s

Finished running CountVectorizer + LogisticRegression
Time elapsed 1.758148431777954s

Finished running CountVectorizer + RandomForestClassifier
Time elapsed 23.827953338623047s

Finished running CountVectorizer + SVC
Time elapsed 6.686055660247803s

Finished running TfidfVectorizer + GaussianNB
Time elapsed 4.261123895645142s

Finished running TfidfVectorizer + LogisticRegression
Time elapsed 1.6262803077697754s

Finished running TfidfVectorizer + RandomForestClassifier
Time elapsed 24.30170774459839s

Finished running TfidfVectorizer + SVC
Time elapsed 13.58487319946289s



### Teste com o conjunto geral

In [21]:
# Pipelines trained on both corpora, evaluated on the combined test set.
aux = test_pipelines(
    X_test=X_test_all,
    y_test=y_test_all,
    pipelines=pipelines_all,
)
pd.DataFrame(aux)

Unnamed: 0,vectorizer,model,accuracy,f1_score
0,CountVectorizer,GaussianNB,0.822493,0.829205
1,CountVectorizer,LogisticRegression,0.897019,0.897297
2,CountVectorizer,RandomForestClassifier,0.850949,0.848485
3,CountVectorizer,SVC,0.892954,0.894244
4,TfidfVectorizer,GaussianNB,0.815718,0.814208
5,TfidfVectorizer,LogisticRegression,0.898374,0.898236
6,TfidfVectorizer,RandomForestClassifier,0.857724,0.856753
7,TfidfVectorizer,SVC,0.898374,0.900398
