# Basic approach of vectorization and classification

In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import os

from glob import glob
from collections import Counter

## Defining variables

In [2]:
PATH = '../'

PROCESSED_DATA_PATH = os.path.join(PATH, 'data/processed/')

DF_FAKE_PATH = os.path.join(PROCESSED_DATA_PATH, 'df_fake_clean.pkl')
DF_LEGIT_PATH = os.path.join(PROCESSED_DATA_PATH, 'df_legit_clean.pkl')

## Loading DataFrames

In [3]:
df_fake = pkl.load(open(DF_FAKE_PATH, 'rb'))

df_legit = pkl.load(open(DF_LEGIT_PATH, 'rb'))

### DataFrame with Fake News

In [4]:
df_fake.head()

Unnamed: 0,DATE,FAKE,TEXT_CLEAN,TITLE_CLEAN,TEXT_LEN_CHAR,TEXT_LEN_TOKEN,TITLE_LEN_CHAR,TITLE_LEN_TOKEN
0,03/08/2019,1,vamos assinar peticao cassacao mandato bolsona...,peticao impeachment bolsonaro precisa milhoes ...,328,36,81,9
1,04/08/2019,1,lula vitima golpe politico merece estar preso ...,peticao lula livre contribui liberdade preside...,242,30,54,7
2,05/08/2019,1,professor contou dilma matou mario kozel filho...,mario kozel filho assassinado dilma tiros,170,24,41,6
3,03/08/2019,1,vergonha presidente oab mentiu pai morto milit...,felipe santa cruz presidente oab mentiu sobre ...,158,25,82,13
4,06/08/2019,1,partido diabolico bandidos abrindo buracos est...,esquerda abrindo buracos estradas nordeste con...,155,19,59,7


### DataFrame with Legit News

In [5]:
df_legit.head()

Unnamed: 0,DATE,FAKE,TEXT_CLEAN,TITLE_CLEAN,TEXT_LEN_CHAR,TEXT_LEN_TOKEN,TITLE_LEN_CHAR,TITLE_LEN_TOKEN
0,04/06/2019,0,atual mandato presidencial brasil comecou pouc...,comecam soar alarmes sobre sustentabilidade pr...,3992,495,65,7
1,02/06/2019,0,poucas horas antes milhares manifestantes irem...,corte contingenciamento certo guerra narrativa...,4665,579,56,6
2,03/06/2019,0,tamanho figura publica mede importancia inimig...,trump insulta prefeito londres inicio visita r...,3245,398,56,8
3,26/05/2019,0,tantos fenomenos imparaveis trazidos revolucao...,lider ninguem meio,2071,246,18,3
4,03/06/2019,0,apos semanas audiencias publicas projeto refor...,reforma previdencia pesara sobre pobres ricos,10220,1256,45,6


### Unifying DataFrames

In [6]:
df = pd.concat((df_fake, df_legit), axis=0)

## Modeling

In [7]:
from imblearn.under_sampling import RandomUnderSampler

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline

In [8]:
y_all = df['FAKE']

### Vectorization

This step is necessary because models don't behave well with words. 

In this step, each token will be transformed into a number and, for that, the TF-IDF vectorization will be used.

TF-IDF vectorization consists in the product of two terms:

- <b>TF or Term Frequency:</b> increases the importance of a token as it appears more in the corpus

- <b>IDF or Inverse Document Frequency:</b> penalizes the importance of a token as they appear in more documents
    - The idea here is that, for instance, if a token appears in every document, it probably won't help to differ one document class from another
    
For a deeper insight of the method, I reccomend checking these sites:
- <a href="http://www.tfidf.com/">tfidf</a>
- <a href="https://pt.wikipedia.org/wiki/Tf%E2%80%93idf">Wikipedia</a>

In [9]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.9)

In [10]:
vectorizer.fit(df['TEXT_CLEAN'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=None,
                min_df=5, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

It's important to mention that I made tests with both the TEXT corpus and the TITLE corpus. The latter didn't perform as expected.

In [11]:
X_all_text = vectorizer.transform(df['TEXT_CLEAN'])

In [12]:
X_all_text.shape

(16950, 185645)

In [13]:
X_all_title = vectorizer.transform(df['TITLE_CLEAN'])

In [14]:
X_all_title.shape

(16950, 185645)

### Resampling

As the samples of fake news and legit news aren't balanced (the legit news corpus has a LOT more entries than the fake news corpus), I opted for a resampling technique.

For that, I used <a href="https://imbalanced-learn.readthedocs.io/en/stable/index.html">imblearn's</a> RandomUnderSampler, which will randomly choose entries from the largest corpus so that both corpora will have the same amount of samples.

In [15]:
rus = RandomUnderSampler(random_state=42)

In [16]:
X_resampled, y_resampled = rus.fit_resample(X_all_text, y_all)

In [17]:
Counter(y_resampled)

Counter({0: 1251, 1: 1251})

### Splitting the data for TRAIN and TEST


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

### Cross validation
<a href="https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html">Cross validation</a> will help to determine if the model is performing well. It will divide the data in folds, train and test them, ensuring that each fold will be tested once.

In [19]:
cv = cross_val_score(RandomForestClassifier(n_estimators=1000, n_jobs=3), X_train, y_train, n_jobs=-1, cv=5, verbose=10)
print(cv)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.1min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  1.1min remaining:   42.3s


[0.97150997 0.96       0.99142857 0.98285714 0.97428571]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.3min finished


### Training model

For this test, the <a href="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html">RandomForestClassifier</a> was used. It is an ensemble method that consists in generating <b>n</b> decision trees based on random features.

In [20]:
clf = RandomForestClassifier(n_estimators=1000, n_jobs=3, verbose=0)

In [21]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=3, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

### Making pipeline

This part isn't totally necessary, but, for practical reasons, a pipeline was made.

In [23]:
pipe = Pipeline((['vect', vectorizer], ['clf', clf]))

with open('pipe.pkl', 'wb') as f:
    pkl.dump(pipe, f)

In [24]:
cv.mean()

0.9760162800162799

### Testing with actual test set (with texts)

In [22]:
y_pred = clf.predict(X_test)

In [25]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       371
           1       0.98      0.98      0.98       380

    accuracy                           0.98       751
   macro avg       0.98      0.98      0.98       751
weighted avg       0.98      0.98      0.98       751



In [26]:
print(confusion_matrix(y_test, y_pred))

[[362   9]
 [  7 373]]


### Testing with entire texts corpus

In [27]:
y_pred_all_texts = clf.predict(X_all_text)

In [28]:
print(classification_report(y_all, y_pred_all_texts))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98     15699
           1       0.70      0.99      0.82      1251

    accuracy                           0.97     16950
   macro avg       0.85      0.98      0.90     16950
weighted avg       0.98      0.97      0.97     16950



In [29]:
print(confusion_matrix(y_all, y_pred_all_texts))

[[15154   545]
 [    7  1244]]


## Simulating real life uses

### Text cleaning

In [30]:
from unidecode import unidecode
import re
from nltk.corpus import stopwords

STOPWORDS = set(unidecode(sw) for sw in stopwords.words('portuguese'))
def clean_text(text):
    # Making sure we're dealing with strings and lowering the characters
    text = str(text).lower()
    
    # Stripping accents
    text = unidecode(text)
    
    # Removing characters that aren't alphanumeric
    text = re.sub(r'[^a-zA-Z0-9 ]', ' ', text)
    
    # Removing tokens that intercalate between letters and digits
    text = re.sub(r'\w*([a-zA-Z][0-9]|[0-9][a-zA-Z])\w*', ' ', text)
    
    # Removing digits
    text = re.sub(r'\d', ' ', text)
    
    # Removing tokens with letters that appear more than twice in a row
    text = re.sub(r'\w*([a-zA-Z])\1{2,}\w*', ' ', text)
    
    # Removing extra spaces
    text = re.sub('\s+', ' ', text)
    
    # Removing words with length equal or lower than 2 or are in STOPWORDS
    return ' '.join([token for token in text.split() if len(token) > 2 and token not in STOPWORDS])

In [31]:
def evaluate(msg):

    input_test = vectorizer.transform([clean_text(msg)])

    out = clf.predict(input_test)
    proba = int(max(clf.predict_proba(input_test)[0]) * 100)

    if out:
        print('Tenho {}% de certeza que esta notícia é FALSA'.format(proba))
    else:
        print('Tenho {}% de certeza que esta notícia é VERDADEIRA'.format(proba))
    


In [32]:
msg = """

O presidente Jair Bolsonaro assinou nesta segunda-feira (19) a medida provisória que irá transferir o Coaf (Conselho de Controle de Atividades Financeiras) do Ministério da Economia para o Banco Central. Decisão será publicada no Diário Oficial da União desta terça-feira (20).

Até o fim da gestão Temer, o Coaf era ligado ao antigo Ministério da Fazenda. Com a reforma ministerial do início do governo Bolsonaro, foi transferido para o Ministério da Justiça. O Congresso, no entanto, devolveu o órgão para o Ministério da Economia.

O órgão está em crise desde que decisão do ministro do Supremo Tribunal Federal, Dias Toffoli, suspendeu investigações com base no compartilhamento de dados detalhados de órgãos de controle, como o Coaf, sem autorização judicial.

"""
evaluate(msg)

Tenho 58% de certeza que esta notícia é VERDADEIRA


In [33]:
pipe = Pipeline((['vect', vectorizer], ['clf', clf]))

In [35]:
from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=[0, 1])

ModuleNotFoundError: No module named 'lime'

In [None]:
msg = """

PT entra com ação no STF para impedir que Bolsonaro reduza o preço do gás. De acordo com uma fonte de dentro do Congresso, 
integrantes do Partido dos Trabalhadores enviaram nesta segunda-feira (05) para o STF, uma ação exigindo que o presidente 
Jair Bolsonaro seja impedido de prosseguir com o projeto de redução do preço do gás no país. Para eles, a medida do presidente 
é inconstitucional e tem objetivo único de aumentar sua popularidade perante a população mais pobre. Bolsonaro, em parceria 
com o ministro da Economia Paulo Guedes, pretende reduzir o preço do gás doméstico em até 50% em 2 meses. 

Essa é uma das medidas do ministro para melhorar a economia do país de curto a médio prazo. A presidente do PT, 
Gleisi Hoffmann, afirmou que o partido fará de tudo para que o preço do gás continue como está. “Nós acabamos de enviar 
uma ação para que o STF barre essa medida populista e descabida desse fascista. Essa redução no preço do gás trará grandes 
prejuízos para a nossa querida Petrobrás e para os cofres públicos. Em qualquer país sério isso seria considerado crime de 
improbidade. Se depender de nós, o preço do gás continuará como está, pois a população pobre não precisa de migalhas desse 
presidente autoritário.” Disse a deputada durante entrevista. […]

"""

msg = clean_text(msg)

result = pipe.predict([msg])

if result == 1:
    veredict = "falsa"
else:
    veredict = "verdadeira"

print("Tenho {:.0f}% de certeza de que a notícia é {}.".format(max(pipe.predict_proba([msg])[0]) * 100, veredict))

exp = explainer.explain_instance(msg, pipe.predict_proba, num_features=10)
exp.show_in_notebook()

In [None]:
exp.as_list()

In [None]:
# To use other features
# https://stackoverflow.com/questions/41937786/add-column-to-a-sparse-matrix

# import scipy
# X_all_text_title = scipy.sparse.hstack((X_all_text_title, X_length[:, None]))

In [None]:
# https://www.datacamp.com/community/tutorials/introduction-t-sne

In [36]:
Counter(y_all[1001:1501])

Counter({1: 250, 0: 250})

In [37]:
df[1001:1501]

Unnamed: 0,DATE,FAKE,TEXT_CLEAN,TITLE_CLEAN,TEXT_LEN_CHAR,TEXT_LEN_TOKEN,TITLE_LEN_CHAR,TITLE_LEN_TOKEN
1001,11/08/2014,1,boato aecio neves concede entrevista bebado es...,aecio neves concede entrevista bebado estadao,51,7,45,6
1002,11/05/2014,1,montagem steven seagal apoia campanha dilma,montagem steven seagal apoia campanha dilma,43,6,43,6
1003,13/05/2014,1,alerta falso projeto acabar familia votado semana,alerta falso projeto acabar familia votado semana,49,7,49,7
1004,09/05/2014,1,balela amante dilma entra justica cobra pensao...,balela amante dilma entra justica cobra pensao...,57,8,57,8
1005,24/05/2014,1,boato jair bolsonaro vai candidatar presidente...,jair bolsonaro vai candidatar presidente eleicoes,55,7,49,6
1006,28/05/2014,1,primeiro ministro china sugere solucoes melhor...,ministro china aponta sugestoes melhorar brasil,469,52,47,6
1007,11/05/2014,1,informacao errada presidente paraguai elimina ...,informacao errada presidente paraguai elimina ...,59,7,59,7
1008,12/05/2014,1,balela sbt faz enquete dar programa rachel she...,balela sbt faz enquete dar programa rachel she...,54,8,54,8
1009,29/05/2014,1,hoax carta aberta presidente dilma rousseff se...,hoax carta aberta presidente dilma rousseff se...,56,8,56,8
1010,25/05/2014,1,informacao errada ministros vao ganhar bolsa c...,informacao errada ministros vao ganhar bolsa c...,57,8,57,8


In [38]:
import seaborn as sns
np.array(sns.color_palette("hls", 2))

array([[0.86  , 0.3712, 0.34  ],
       [0.34  , 0.8288, 0.86  ]])

In [39]:
def fashion_scatter(x, colors):
    # choose a color palette with seaborn.
    num_classes = len(np.unique(colors))
    palette = np.array(sns.color_palette(["#c52f33", "#2A8DC7"], num_classes))

    # create a scatter plot.
    f = plt.figure(figsize=(8, 8))
    ax = plt.subplot(aspect='equal')
    sc = ax.scatter(x[:,0], x[:,1], lw=0, s=40, c=palette[colors.astype(np.int)])
    plt.xlim(-25, 25)
    plt.ylim(-25, 25)
    ax.axis('off')
    ax.axis('tight')

    # add the labels for each digit corresponding to the label
    txts = []

    for i in range(num_classes):

        # Position of each label at median of data points.

        xtext, ytext = np.median(x[colors == i, :], axis=0)
        txt = ax.text(xtext, ytext, str(i), fontsize=24, color="#222222", bbox=dict(facecolor='#eeeeee', alpha=0.8))
        txts.append(txt)

    return f, ax, sc, txts

In [40]:
idx_min = 0
idx_max = 2502
features = X_all_text[idx_min:idx_max]

In [41]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import time
time_start = time.time()

pca = PCA(n_components=4)
pca_result = pca.fit_transform(features.toarray())
print('PCA done! Time elapsed: {} seconds'.format(time.time()-time_start))

MemoryError: 

In [None]:
print('Variance explained per principal component: {}'.format(pca.explained_variance_ratio_))

In [None]:
df_pca = pd.DataFrame(columns = ['pca1','pca2'])

df_pca['pca1'] = pca_result[:,0]
df_pca['pca2'] = pca_result[:,1]

print('Variance explained per principal component: {}'.format(pca.explained_variance_ratio_))

In [None]:
fashion_scatter(df_pca.values, df['FAKE'][idx_min:idx_max])

In [None]:
Counter(y_all[:2502])

In [None]:
from sklearn.manifold import TSNE
import time
time_start = time.time()

fashion_tsne = TSNE(random_state=42).fit_transform(X_all_text[:2502].toarray())

print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [None]:
fashion_scatter(fashion_tsne, df['FAKE'][:2502])