# Notebook de preprocessamento - junção dos arquivos

### Carregando dataframe com os dados dos textos normalizados das notícias reais

In [76]:
import glob
import pandas as pd

lista_arquivos_true = glob.glob('Fake.br-Corpus-master/Fake.br-Corpus-master/size_normalized_texts/true/*.txt')
lista_textos = []
lista_fake = []
for nome_arquivo in lista_arquivos_true:
    text = []
    with open(nome_arquivo , 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            text.append(line.strip())
          
    lista_textos.append([''.join(text)])
df_true = pd.DataFrame(lista_textos,columns=["texto normalizado"])  

### Incluindo uma coluna ao dataframe com a classe fake = 0 (notícias reais)

In [77]:
df_true["fake"] = 0

In [78]:
df_true.head() 

Unnamed: 0,texto normalizado,fake
0,O Podemos decidiu expulsar o deputado federal...,0
1,"Bolsonaro é um liberal completo, diz president...",0
2,Ministro do STF libera Andrea Neves de prisão ...,0
3,"Apesar da abundância, cresce preocupação com p...",0
4,"Por que Harvard e MIT levarão Dilma, Moro e Su...",0


In [73]:
df_true.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600 entries, 0 to 3599
Data columns (total 2 columns):
texto normalizado    3600 non-null object
fake                 3600 non-null int64
dtypes: int64(1), object(1)
memory usage: 56.3+ KB


### Carregando dataframe com os dados dos textos normalizados das notícias fake

In [79]:
lista_arquivos_fake = glob.glob('Fake.br-Corpus-master/Fake.br-Corpus-master/size_normalized_texts/fake/*.txt')
lista_textos_fake = []
for nome_arquivo in lista_arquivos_fake:
    text = []
    with open(nome_arquivo , 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            text.append(line.strip())
          
    lista_textos_fake.append([''.join(text)])
    
    
df_fake = pd.DataFrame(lista_textos_fake,columns=["texto normalizado"])  

### Incluindo uma coluna ao dataframe com a classe fake = 1 (notícias fake)

In [80]:
df_fake["fake"] = 1

In [81]:
df_fake.head()

Unnamed: 0,texto normalizado,fake
0,Kátia Abreu diz que vai colocar sua expulsão e...,1
1,"Dr. Ray peita Bolsonaro, chama-o de “conservad...",1
2,Reinaldo Azevedo desmascarado pela Polícia Fed...,1
3,Relatório assustador do BNDES mostra dinheiro ...,1
4,"Radialista americano fala sobre o PT: ""Eles ve...",1


In [4]:
df_fake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600 entries, 0 to 3599
Data columns (total 2 columns):
texto normalizado    3600 non-null object
fake                 3600 non-null int64
dtypes: int64(1), object(1)
memory usage: 56.3+ KB


### Carregando dataframe com os metadados das notícias reais

In [82]:
lista_arquivos_meta = glob.glob('Fake.br-Corpus-master/Fake.br-Corpus-master/full_texts/true-meta-information/*.txt')
lista_textos_meta = []
lista_int_meta = []

lista_author = []
lista_link = []
lista_category = []
lista_tokens = []
lista_n_types = []
lista_n_of_l_i_n = []
lista_verbs = []
lista_n_sub_imp_v = []
lista_nouns = []
lista_adj = []
lista_adv = []
lista_n_mod_v = []
lista_pron = []
lista_p_w_spe_err = []
lista_emot = []
lista_div = []
for nome_arquivo in lista_arquivos_meta:

    with open(nome_arquivo , 'r', encoding='utf-8') as f:
        lines = f.readlines()
        lista_author.append(lines[0].strip())
        lista_link.append(lines[1].strip())
        lista_category.append(lines[2].strip())
        lista_tokens.append(int(str.strip(lines[4])))
        lista_n_types.append(str.strip(lines[6]))
        lista_n_of_l_i_n.append(lines[7].strip())
        lista_verbs.append(str.strip(lines[9]))
        lista_n_sub_imp_v.append(str.strip(lines[10]))
        lista_nouns.append(str.strip(lines[11]))
        lista_adj.append(str.strip(lines[12]))
        lista_adv.append(str.strip(lines[13]))
        lista_n_mod_v.append(str.strip(lines[14]))
        lista_pron.append(str.strip(lines[17]))
        lista_p_w_spe_err.append(str.strip(lines[22]))
        lista_emot.append(str.strip(lines[23]))
        lista_div.append(str.strip(lines[24]))
        
lista_textos_meta = {"author":lista_author,"link":lista_link,"category":lista_category}
lista_int_meta = {"number of tokens":lista_tokens,"number of types":lista_n_types,
                                  "number of links inside the news":lista_n_of_l_i_n,"number of verbs":lista_verbs,
                                  "number of subjuntive and imperative verbs":lista_n_sub_imp_v,"number of nouns":lista_nouns,
                                   "number of adjectives":lista_adj,"number of adverbs":lista_adv,
                                  "number of modal verbs (mainly auxiliary verbs)":lista_n_mod_v,"number of pronouns":lista_pron
                             }
lista_float_meta = {"percentage of news with speeling errors":lista_p_w_spe_err,"emotiveness":lista_emot,
                                   "diversity":lista_div
                 }

df_meta_text = pd.DataFrame(lista_textos_meta,columns=["author","link","category"])

df_meta_int = pd.DataFrame(lista_int_meta,columns=["number of tokens","number of types",
"number of links inside the news","number of verbs","number of subjuntive and imperative verbs","number of nouns","number of adjectives","number of adverbs",
"number of modal verbs (mainly auxiliary verbs)","number of pronouns"],dtype=int)

df_meta_float = pd.DataFrame(lista_float_meta,columns=["percentage of news with speeling errors","emotiveness","diversity"],dtype=float)

df_meta = df_meta_text.join(df_meta_int)

df_meta = df_meta.join(df_meta_float)

df_meta.head()

Unnamed: 0,author,link,category,number of tokens,number of types,number of links inside the news,number of verbs,number of subjuntive and imperative verbs,number of nouns,number of adjectives,number of adverbs,number of modal verbs (mainly auxiliary verbs),number of pronouns,percentage of news with speeling errors,emotiveness,diversity
0,Naira Trindade,http://politica.estadao.com.br/blogs/coluna-do...,politica,168,107,,24,2,43,5,4,3,7,0.0,0.134328,0.722973
1,Marco Rodrigo Almeida,http://www1.folha.uol.com.br/poder/2018/01/194...,politica,1028,474,,135,2,237,56,45,14,63,0.001156,0.271505,0.547977
2,"Fernando Zuba , Pedro Ângelo E Renan Ramalho",https://g1.globo.com/mg/minas-gerais/noticia/s...,politica,540,232,,69,0,146,10,20,7,19,0.0,0.139535,0.487395
3,"Por Anderson Viegas, G1 MS",https://g1.globo.com/mato-grosso-do-sul/notici...,politica,8634,2199,0.0,1053,29,2170,443,278,174,384,0.002504,0.223705,0.2898
4,Por BBC,https://g1.globo.com/educacao/noticia/por-que-...,politica,955,452,0.0,96,1,262,36,50,8,33,0.0,0.240223,0.54921


In [24]:
df_meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600 entries, 0 to 3599
Data columns (total 16 columns):
author                                            3600 non-null object
link                                              3600 non-null object
category                                          3600 non-null object
number of tokens                                  3600 non-null int32
number of types                                   3600 non-null int32
number of links inside the news                   3600 non-null object
number of verbs                                   3600 non-null int32
number of subjuntive and imperative verbs         3600 non-null int32
number of nouns                                   3600 non-null int32
number of adjectives                              3600 non-null int32
number of adverbs                                 3600 non-null int32
number of modal verbs (mainly auxiliary verbs)    3600 non-null int32
number of pronouns                               

### Fazendo uma junção das colunas do dataframe do texto das notícias reais e seus metadados

In [83]:
df_true_completo = df_true.join(df_meta)
df_true_completo.head()

Unnamed: 0,texto normalizado,fake,author,link,category,number of tokens,number of types,number of links inside the news,number of verbs,number of subjuntive and imperative verbs,number of nouns,number of adjectives,number of adverbs,number of modal verbs (mainly auxiliary verbs),number of pronouns,percentage of news with speeling errors,emotiveness,diversity
0,O Podemos decidiu expulsar o deputado federal...,0,Naira Trindade,http://politica.estadao.com.br/blogs/coluna-do...,politica,168,107,,24,2,43,5,4,3,7,0.0,0.134328,0.722973
1,"Bolsonaro é um liberal completo, diz president...",0,Marco Rodrigo Almeida,http://www1.folha.uol.com.br/poder/2018/01/194...,politica,1028,474,,135,2,237,56,45,14,63,0.001156,0.271505,0.547977
2,Ministro do STF libera Andrea Neves de prisão ...,0,"Fernando Zuba , Pedro Ângelo E Renan Ramalho",https://g1.globo.com/mg/minas-gerais/noticia/s...,politica,540,232,,69,0,146,10,20,7,19,0.0,0.139535,0.487395
3,"Apesar da abundância, cresce preocupação com p...",0,"Por Anderson Viegas, G1 MS",https://g1.globo.com/mato-grosso-do-sul/notici...,politica,8634,2199,0.0,1053,29,2170,443,278,174,384,0.002504,0.223705,0.2898
4,"Por que Harvard e MIT levarão Dilma, Moro e Su...",0,Por BBC,https://g1.globo.com/educacao/noticia/por-que-...,politica,955,452,0.0,96,1,262,36,50,8,33,0.0,0.240223,0.54921


In [84]:
df_true_completo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600 entries, 0 to 3599
Data columns (total 18 columns):
texto normalizado                                 3600 non-null object
fake                                              3600 non-null int64
author                                            3600 non-null object
link                                              3600 non-null object
category                                          3600 non-null object
number of tokens                                  3600 non-null int32
number of types                                   3600 non-null int32
number of links inside the news                   3600 non-null object
number of verbs                                   3600 non-null int32
number of subjuntive and imperative verbs         3600 non-null int32
number of nouns                                   3600 non-null int32
number of adjectives                              3600 non-null int32
number of adverbs                               

In [85]:
df_true_completo.shape

(3600, 18)

### Carregando dataframe com os metadados das notícias fake

In [86]:
lista_arquivos_fake_meta = glob.glob('Fake.br-Corpus-master/Fake.br-Corpus-master/full_texts/fake-meta-information/*.txt')
lista_textos_fake_meta = []

lista_author = []
lista_link = []
lista_category = []
lista_tokens = []
lista_n_types = []
lista_n_of_l_i_n = []
lista_verbs = []
lista_n_sub_imp_v = []
lista_nouns = []
lista_adj = []
lista_adv = []
lista_n_mod_v = []
lista_pron = []
lista_p_w_spe_err = []
lista_emot = []
lista_div = []
for nome_arquivo in lista_arquivos_fake_meta:
    
    with open(nome_arquivo , 'r') as f:
        lines = f.readlines()
        lista_author.append(lines[0].strip())
        lista_link.append(lines[1].strip())
        lista_category.append(lines[2].strip())
        lista_tokens.append(lines[4])
        lista_n_types.append(lines[6])
        lista_n_of_l_i_n.append(lines[7].strip())
        lista_verbs.append(lines[9])
        lista_n_sub_imp_v.append(lines[10])
        lista_nouns.append(lines[11])
        lista_adj.append(lines[12])
        lista_adv.append(lines[13])
        lista_n_mod_v.append(lines[14])
        lista_pron.append(lines[17])
        lista_p_w_spe_err.append(lines[22])
        lista_emot.append(lines[23])
        lista_div.append(lines[24])
        
    
lista_textos_meta = {"author":lista_author,"link":lista_link,"category":lista_category}
lista_int_meta = {"number of tokens":lista_tokens,"number of types":lista_n_types,
                                  "number of links inside the news":lista_n_of_l_i_n,"number of verbs":lista_verbs,
                                  "number of subjuntive and imperative verbs":lista_n_sub_imp_v,"number of nouns":lista_nouns,
                                   "number of adjectives":lista_adj,"number of adverbs":lista_adv,
                                  "number of modal verbs (mainly auxiliary verbs)":lista_n_mod_v,"number of pronouns":lista_pron
                             }
lista_float_meta = {"percentage of news with speeling errors":lista_p_w_spe_err,"emotiveness":lista_emot,
                                   "diversity":lista_div
                 }

df_meta_text = pd.DataFrame(lista_textos_meta,columns=["author","link","category"])

df_meta_int = pd.DataFrame(lista_int_meta,columns=["number of tokens","number of types",
"number of links inside the news","number of verbs","number of subjuntive and imperative verbs","number of nouns","number of adjectives","number of adverbs",
"number of modal verbs (mainly auxiliary verbs)","number of pronouns"],dtype=int)

df_meta_float = pd.DataFrame(lista_float_meta,columns=["percentage of news with speeling errors","emotiveness","diversity"],dtype=float)

df_meta = df_meta_text.join(df_meta_int)

df_meta = df_meta.join(df_meta_float)

df_meta.head()

Unnamed: 0,author,link,category,number of tokens,number of types,number of links inside the news,number of verbs,number of subjuntive and imperative verbs,number of nouns,number of adjectives,number of adverbs,number of modal verbs (mainly auxiliary verbs),number of pronouns,percentage of news with speeling errors,emotiveness,diversity
0,mrk,https://ceticismopolitico.com/2017/11/30/katia...,politica,211,120,0,30,1,46,7,13,5,26,0.0,0.263158,0.648649
1,,https://ceticismopolitico.com/2017/11/24/dr-ra...,politica,289,163,0,56,8,64,11,18,11,20,0.007874,0.241667,0.641732
2,,https://afolhabrasil.com.br/politica/reinaldo-...,politica,304,170,0,45,1,88,9,8,8,18,0.003636,0.12782,0.618182
3,,https://www.diariodobrasil.org/relatorio-assus...,politica,639,316,1,87,7,175,39,21,14,34,0.001748,0.229008,0.552448
4,,https://www.diariodobrasil.org/radialista-amer...,politica,128,82,0,21,0,31,6,8,1,12,0.0,0.269231,0.738739


### Fazendo uma junção das colunas do dataframe do texto das notícias fake e seus metadados

In [87]:
df_fake_completo = df_fake.join(df_meta)
df_fake_completo.head()

Unnamed: 0,texto normalizado,fake,author,link,category,number of tokens,number of types,number of links inside the news,number of verbs,number of subjuntive and imperative verbs,number of nouns,number of adjectives,number of adverbs,number of modal verbs (mainly auxiliary verbs),number of pronouns,percentage of news with speeling errors,emotiveness,diversity
0,Kátia Abreu diz que vai colocar sua expulsão e...,1,mrk,https://ceticismopolitico.com/2017/11/30/katia...,politica,211,120,0,30,1,46,7,13,5,26,0.0,0.263158,0.648649
1,"Dr. Ray peita Bolsonaro, chama-o de “conservad...",1,,https://ceticismopolitico.com/2017/11/24/dr-ra...,politica,289,163,0,56,8,64,11,18,11,20,0.007874,0.241667,0.641732
2,Reinaldo Azevedo desmascarado pela Polícia Fed...,1,,https://afolhabrasil.com.br/politica/reinaldo-...,politica,304,170,0,45,1,88,9,8,8,18,0.003636,0.12782,0.618182
3,Relatório assustador do BNDES mostra dinheiro ...,1,,https://www.diariodobrasil.org/relatorio-assus...,politica,639,316,1,87,7,175,39,21,14,34,0.001748,0.229008,0.552448
4,"Radialista americano fala sobre o PT: ""Eles ve...",1,,https://www.diariodobrasil.org/radialista-amer...,politica,128,82,0,21,0,31,6,8,1,12,0.0,0.269231,0.738739


### Adicionando os dados do dataframe das notícias reais aos de fake

In [93]:
df_completo = df_true_completo.append(df_fake_completo)

In [94]:
df_completo.shape

(7200, 18)

### Salvando em arquivo o dataframe

In [95]:
df_completo.to_csv('fake_news_fakebrcorpus.csv', sep=';', encoding='utf-8')

### Recuperando o arquivo para um novo dataframe para testar

In [96]:
df_recuperado = pd.read_csv('fake_news_fakebrcorpus.csv', sep=';', encoding='utf-8')

In [97]:
df_recuperado.tail()

Unnamed: 0.1,Unnamed: 0,texto normalizado,fake,author,link,category,number of tokens,number of types,number of links inside the news,number of verbs,number of subjuntive and imperative verbs,number of nouns,number of adjectives,number of adverbs,number of modal verbs (mainly auxiliary verbs),number of pronouns,percentage of news with speeling errors,emotiveness,diversity
7195,3595,"FT: ""O julgamento de um populista que usou din...",1,,https://www.diariodobrasil.org/ft-o-julgamento...,politica,185,101,0,27,0,46,9,4,4,10,0.0,0.178082,0.619632
7196,3596,Nota fiscal emitida pela Petrobras mostra que ...,1,,https://www.diariodobrasil.org/nota-fiscal-emi...,politica,148,86,1,19,0,38,9,2,4,2,0.0,0.192982,0.651515
7197,3597,"Estão estancando a sangria da ""Lava-Jato"" bem ...",1,,https://www.diariodobrasil.org/estao-estancand...,politica,313,181,0,43,2,76,14,8,6,17,0.0,0.184874,0.665441
7198,3598,"(RJ) Cidadão finge estar possuído pela ""pomba ...",1,,https://www.diariodobrasil.org/rj-cidadao-fing...,sociedade_cotidiano,131,82,0,29,0,23,7,1,6,12,0.0,0.153846,0.796117
7199,3599,Autor de best-seller cita 5 expressões que ind...,1,,https://www.diariodobrasil.org/autor-de-best-s...,tv_celebridades,502,216,0,86,6,90,23,27,9,55,0.004762,0.284091,0.514286
