# Treinando um Classificador de Notícias Falsas no Corpus Fake.br

_O trabalho aqui realizado é uma replicação dos experimentos descritos em [1]._

## Os Dados

In [1]:
import pandas as pd
import numpy as np
import os
import utils

In [2]:
# Load the metadata table of each class, then add the binary label column
# "true": 0 marks fake news, 1 marks true news.
noticias_fake = utils.import_metadata("fake")
noticias_true = utils.import_metadata("true")

noticias_fake["true"] = 0
noticias_true["true"] = 1

In [3]:
# Stack the fake and true metadata into one frame (fake rows first).
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` is the drop-in equivalent and, like `append`, keeps each
# frame's original index labels.
# NOTE(review): if both inputs are indexed 0..N-1 the combined index contains
# duplicates — confirm this is what the later column-wise join expects.
noticias = pd.concat([noticias_fake, noticias_true])

# Remove the metadata columns listed below from the combined frame.
colunas_descartadas = [
    "author",
    "link",
    "category",
    "date_of_publication",
    "number_of_tokens",
    "words_without_punct",
    "number_of_types",
    "number_of_links",
    "upper_case_words",
    "id",
]
noticias = noticias.drop(columns=colunas_descartadas)

In [4]:
# Read the article bodies of both classes: fake first, then true.
noticias_fake_corpo, noticias_true_corpo = (
    utils.import_texto(classe) for classe in ("fake", "true")
)

In [5]:
# Replace each raw text collection with its normalized version
# (normalization itself is implemented in utils.normaliza_texto).
noticias_fake_corpo, noticias_true_corpo = (
    utils.normaliza_texto(corpo)
    for corpo in (noticias_fake_corpo, noticias_true_corpo)
)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Build one space-joined document per article: every fake article first, then
# every true article, each group ordered by its sorted keys.
# NOTE(review): assumes each value is an iterable of token strings — confirm
# against utils.normaliza_texto.
corpus = [
    " ".join(noticias_fake_corpo[noticia])
    for noticia in sorted(noticias_fake_corpo.keys())
]

# Bug fix: the original loop bound the misspelled name `nocicia` but indexed
# with `noticia`, i.e. the last key left over from the fake-news loop, so every
# true-news document was built from that single stale key (or raised KeyError).
# Each true article is now looked up by its own key.
corpus.extend(
    " ".join(noticias_true_corpo[noticia])
    for noticia in sorted(noticias_true_corpo.keys())
)

# Learn the vocabulary and produce the document-term count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

In [7]:
# Dense bag-of-words frame with one column per vocabulary term.
# `CountVectorizer.get_feature_names()` was removed in scikit-learn 1.2, so the
# column names are derived from `vocabulary_` (term -> column index) sorted by
# column index; this yields the same ordering on every scikit-learn version.
bow = pd.DataFrame(
    X.toarray(),
    columns=sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get),
)

# Keep only the columns from position 520 onward.
# NOTE(review): 520 is an unexplained magic number — presumably it skips
# unwanted leading tokens of the sorted vocabulary; confirm and document.
bow = bow.iloc[:, 520:]

In [8]:
# Join the metadata features with the bag-of-words columns, row-aligned on the
# index of `noticias`. The `join_axes` keyword of `pd.concat` was removed in
# pandas 1.0; reindexing `bow` onto `noticias.index` before concatenating
# reproduces the same alignment and works on old and new pandas alike.
# NOTE(review): if `noticias.index` repeats labels (e.g. each class numbered
# from 0), rows of both classes are paired with the same `bow` rows — verify
# that the index matches the row order of `bow`.
result = pd.concat([noticias, bow.reindex(noticias.index)], axis=1)

# Non-immediacy feature: sum of the two personal-pronoun columns.
result["non_immediacy"] = (
    noticias.sing_first_sec_personal_pronouns
    + noticias.plural_first_personal_pronouns
)

In [9]:
# Free memory: drop the intermediates that are no longer needed once `result`
# has been assembled.
del noticias_fake, noticias_true
del noticias_fake_corpo, noticias_true_corpo
del vectorizer, corpus, X, bow

In [10]:
import ml

# Model 1: train and evaluate using the feature set selected by "pos"
# (POS tags, per the original cell title).
metricas_pos_f = ml.train_evaluate(result, features="pos")

In [11]:
# Model 2: train and evaluate on the bag-of-words features ("bow").
metricas_bow = ml.train_evaluate(result, features="bow")

  'precision', 'predicted', average, warn_for)


In [23]:
# Model 3: train and evaluate on POS tags combined with bag of words ("pos+bow").
metricas_pos_bow = ml.train_evaluate(result, features="pos+bow")

In [16]:
# Model 4: train and evaluate on the pausality feature ("pau").
metricas_pau = ml.train_evaluate(result, features="pau")

In [18]:
# Model 5: train and evaluate on the emotiveness feature ("emo").
metricas_emo = ml.train_evaluate(result, features="emo")

In [20]:
# Model 6: train and evaluate on the uncertainty feature ("unc").
metricas_unc = ml.train_evaluate(result, features="unc")

In [30]:
# Model 7: train and evaluate on the non-immediacy feature ("nim").
metricas_nim = ml.train_evaluate(result, features="nim")

In [31]:
# Model 8: train and evaluate on pausality + emotiveness + uncertainty +
# non-immediacy together ("p+e+u+n").
metricas_peun = ml.train_evaluate(result, features="p+e+u+n")

In [None]:
# Model 9: train and evaluate on bag of words plus emotiveness ("bow+e").
metricas_bow_emo = ml.train_evaluate(result, features="bow+e")

In [None]:
# Model 10: train and evaluate without passing `features`.
# NOTE(review): the original cell title says this uses all features — confirm
# that this is the default of ml.train_evaluate.
metricas_all = ml.train_evaluate(result)

In [33]:
# Show the evaluation report of Model 8 (the "p+e+u+n" feature set).
# print() is used so the report's line breaks are rendered; this assumes the
# report is a string, as its concatenation in the report-writing cell suggests.
print(metricas_peun)

              precision    recall  f1-score   support

           0       0.86      0.91      0.88      3600
           1       0.91      0.85      0.88      3600

    accuracy                           0.88      7200
   macro avg       0.88      0.88      0.88      7200
weighted avg       0.88      0.88      0.88      7200



## Referências

[1] Monteiro, Rafael A., et al. "Contributions to the Study of Fake News in Portuguese: New Corpus and Automatic Detection Results." International Conference on Computational Processing of the Portuguese Language. Springer, Cham, 2018.

In [None]:
# Append every model's evaluation report to relatorio.txt, each one preceded
# by a short label and a single space.
# The reports are collected before the file is opened, so a missing metric
# (e.g. a model cell that was never run) fails before anything is written.
relatorios = [
    ("POS", metricas_pos_f),
    ("BoW", metricas_bow),
    ("POS + BoW", metricas_pos_bow),
    ("PAU", metricas_pau),
    ("Emo", metricas_emo),
    ("Unc", metricas_unc),
    ("NIM", metricas_nim),
    ("PEUN", metricas_peun),
    ("BoW + Emo", metricas_bow_emo),
    ("Todas", metricas_all),
]

# `with` guarantees the file handle is closed even if a write raises; the
# original open()/close() pair leaked the handle in that case.
with open("relatorio.txt", "a") as relatorio:
    for rotulo, metricas in relatorios:
        relatorio.write(rotulo + " " + metricas)
