In [1]:
import json
from pathlib import Path
import pandas as pd
import numpy as np

pd.set_option('expand_frame_repr', True)
pd.set_option('display.column_space', 6)
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.max_columns', None)
pd.set_option('display.show_dimensions', True)

# Data Preparation

The intent of this notebook is to tabulated the raw data from multiple files into one CSV file. The new data will be avaliable at the path defined in the variable `INTERMEDIATE_DATA_FOLDER` below.

In [2]:
CURRENT_WORK_DIRECTORY = Path().resolve().parents[0]

RAW_DATA_FOLDER = CURRENT_WORK_DIRECTORY / 'data/raw/'
INTERMEDIATE_DATA_FOLDER = CURRENT_WORK_DIRECTORY / 'data/interim/'
FULL_DATASET_PATH = INTERMEDIATE_DATA_FOLDER/ "fake_true_news.csv"

FAKE_DATA_FOLDER = RAW_DATA_FOLDER / 'fake'
TRUE_DATA_FOLDER = RAW_DATA_FOLDER / 'true'
FAKE_META_FOLDER = RAW_DATA_FOLDER / 'fake-meta-information'
TRUE_META_FOLDER = RAW_DATA_FOLDER / 'true-meta-information'

INTERMEDIATE_DATA_FOLDER.mkdir(exist_ok=True, parents=True)

## Preparing text data

First, we are going to load the full texts into a pandas dataframe for both fake and true news (indexed by the number of the file).

In [3]:
def create_text_df(folder):
    df_dict = {}
    for filepath in folder.glob("*.txt"):
        with open(filepath, 'r', encoding='utf-8') as f:
            df_dict[filepath.stem] = f.read() 
    return pd.DataFrame.from_dict(df_dict, orient='index', columns=['text'])

fake_text_df = create_text_df(FAKE_DATA_FOLDER)
true_text_df = create_text_df(TRUE_DATA_FOLDER)

In [4]:
display(fake_text_df.shape)
fake_text_df.head()

(3600, 1)

Unnamed: 0,text
1,Kátia Abreu diz que vai colocar sua expulsão e...
10,"Dr. Ray peita Bolsonaro, chama-o de conservad..."
100,Reinaldo Azevedo desmascarado pela Polícia Fed...
1000,Relatório assustador do BNDES mostra dinheiro ...
1001,"Radialista americano fala sobre o PT: ""Eles ve..."


In [5]:
display(true_text_df.shape)
true_text_df.head()

(3600, 1)

Unnamed: 0,text
1,﻿O Podemos decidiu expulsar o deputado federa...
10,"Bolsonaro é um liberal completo, diz president..."
100,Ministro do STF libera Andrea Neves de prisão ...
1000,"Apesar da abundância, cresce preocupação com p..."
1001,"Por que Harvard e MIT levarão Dilma, Moro e Su..."


## Metadata Dataset

On this section the metadata available as jsons are all tabulated into 2 dataframes (for both classes). The description for each variable (extracted from the source) can be found in *references/news_data_dictionary.json* file on the root of this repository.

In [6]:
def create_metadata_df(folder, metadata_columns):
    df_dict = {}
    df_dict = {k:[] for k in metadata_columns}
    df_dict["index"] = []
    
    for filepath in list(folder.glob("*.txt")):
        with open(filepath, 'r') as f:    
            df_dict["index"].append(filepath.stem.split("-")[0])
            for col, value in zip(metadata_columns, f.readlines()):
                df_dict[col].append(value[0:-1])
    
    df = pd.DataFrame(df_dict)
    df.index.name = None
    return df

metadata_columns = [
    "author", "link", "category", "date_of_publication",
    "tokens", "words_no_punctuation", "types", "links_inside", 
    "upper_words", "verbs", "subjuntive_imperative_verbs",
    "nouns", "adjectives", "adverbs", "modal_verbs", 
    "singular_first_second_personal_pronouns",
    "plural_first_personal_pronouns", "pronouns",
    "pausality", "characters", "average_sentence_length",
    "average_word_lenght", "percentage_spelling_errors",
    "emotiveness", "diversity"
]

fake_metadata_df = create_metadata_df(FAKE_META_FOLDER, metadata_columns)
true_metadata_df = create_metadata_df(TRUE_META_FOLDER, metadata_columns)

In [7]:
display(fake_metadata_df.shape)
fake_metadata_df.head()

(3600, 26)

Unnamed: 0,author,link,category,date_of_publication,tokens,words_no_punctuation,types,links_inside,upper_words,verbs,...,plural_first_personal_pronouns,pronouns,pausality,characters,average_sentence_length,average_word_lenght,percentage_spelling_errors,emotiveness,diversity,index
0,mrk,https://ceticismopolitico.com/2017/11/30/katia...,politica,2017-11-30,211,185,120,0,6,30,...,0,26,2.0,815,14.2308,4.40541,0.0,0.263158,0.64864,1
1,,https://ceticismopolitico.com/2017/11/24/dr-ra...,politica,2017-11-24,289,254,163,0,0,56,...,0,20,2.5,1205,18.1429,4.74409,0.00787402,0.241667,0.64173,10
2,,https://afolhabrasil.com.br/politica/reinaldo-...,politica,2017-05-23,304,275,170,0,0,45,...,0,18,1.8125,1344,17.1875,4.88727,0.00363636,0.12782,0.61818,100
3,,https://www.diariodobrasil.org/relatorio-assus...,politica,24/07/2017,639,572,316,1,14,87,...,0,34,2.68,3122,22.88,5.45804,0.00174825,0.229008,0.55244,1000
4,,https://www.diariodobrasil.org/radialista-amer...,politica,25/07/2017,128,111,82,0,1,21,...,0,12,0.894737,515,5.84211,4.63964,0.0,0.269231,0.73873,1001


In [8]:
display(true_metadata_df.shape)
true_metadata_df.head()

(3600, 26)

Unnamed: 0,author,link,category,date_of_publication,tokens,words_no_punctuation,types,links_inside,upper_words,verbs,...,plural_first_personal_pronouns,pronouns,pausality,characters,average_sentence_length,average_word_lenght,percentage_spelling_errors,emotiveness,diversity,index
0,Naira Trindade,http://politica.estadao.com.br/blogs/coluna-do...,politica,13/12/2017,168,148,107,,0,24,...,0,7,3.33333,761,24.6667,5.14189,0.0,0.134328,0.72297,1
1,Marco Rodrigo Almeida,http://www1.folha.uol.com.br/poder/2018/01/194...,politica,12/1/2018,1028,865,474,,9,135,...,1,63,2.85965,4205,15.1754,4.86127,0.00115607,0.271505,0.54797,10
2,"Fernando Zuba , Pedro Ângelo E Renan Ramalho",https://g1.globo.com/mg/minas-gerais/noticia/s...,politica,7/12/2017,540,476,232,,8,69,...,0,19,3.04762,2399,22.6667,5.03992,0.0,0.139535,0.48739,100
3,"Por Anderson Viegas, G1 MS",https://g1.globo.com/mato-grosso-do-sul/notici...,politica,31/10/2017 10h17,8634,7588,2199,0.0,52,1053,...,2,384,3.36334,37381,24.3987,4.92633,0.00250395,0.223705,0.289,1000
4,Por BBC,https://g1.globo.com/educacao/noticia/por-que-...,politica,05/04/2017 09h08,955,823,452,0.0,10,96,...,0,33,3.14286,4239,19.5952,5.15067,0.0,0.240223,0.5492,1001


Some of the null values are defined as a "None" string and can difficult the next steps. We need to fix this behavior by replacing the string with np.nan

In [9]:
print("# of 'None' strings")
print(f'True: {len(fake_metadata_df[fake_metadata_df.isin(["None"]).any(axis=1)])}')
print(f'True: {len(true_metadata_df[true_metadata_df.isin(["None"]).any(axis=1)])}')

fake_metadata_df = fake_metadata_df.replace("None", np.nan)
true_metadata_df = true_metadata_df.replace("None", np.nan)

# of 'None' strings
True: 3528
True: 1393


We also need to correct the data types for the variables. These types were defined manually. **NOTE:** the variable *date_of_publication* have a lot of different formats that could not be identified by the pandas library, since we are not going to use this data it will be loaded as a string.

In [10]:
metadata_dtypes = {
    "author": "string", "link": "string", "category": "string",
    "date_of_publication": "string",
    "tokens": "float", "words_no_punctuation": "float",
    "types": "float","links_inside": "float", "upper_words": "float",
    "verbs": "float", "subjuntive_imperative_verbs": "float", "nouns": "float", 
    "adjectives": "float", "adverbs": "float","modal_verbs": "float", 
    "singular_first_second_personal_pronouns": "float",
    "plural_first_personal_pronouns": "float", "pronouns": "float","characters": "float",
    "pausality": "float", "average_sentence_length": "float",
    "average_word_lenght": "float", "percentage_spelling_errors": "float",
    "emotiveness": "float", "diversity": "float"
}

fake_metadata_df = fake_metadata_df.astype(metadata_dtypes, errors='raise').set_index("index", drop=True)
true_metadata_df = true_metadata_df.astype(metadata_dtypes, errors='raise').set_index("index", drop=True)

## Creating full dataset

The only thing left is to merge the data into one unique csv. This data will be available at the `FULL_DATASET_PATH` at the beginning of this notebook.

### Fake News Dataset

In [11]:
fake_df = pd.concat([fake_text_df, fake_metadata_df], axis=1, sort=False)
fake_df.index = fake_df.index.astype(int)
fake_df = fake_df.reset_index().rename(columns={"index": "file_index"})
fake_df = fake_df.sort_index()

fake_df.shape

(3600, 27)

### True News Dataset

In [12]:
true_df = pd.concat([true_text_df, true_metadata_df], axis=1, sort=False)
true_df.index = true_df.index.astype(int)
true_df = true_df.reset_index().rename(columns={"index": "file_index"})
true_df = true_df.sort_index()

true_df.shape

(3600, 27)

### Full Dataset

In [13]:
result = pd.concat([true_df, fake_df], keys=['True', 'Fake'])
result = result.reset_index(level=0)
result = result.rename(columns={"level_0": "class"})
result.index.name = None

display(result.shape)
display(result.head())

result.to_csv(FULL_DATASET_PATH, index=False)

(7200, 28)

Unnamed: 0,class,file_index,text,author,link,category,date_of_publication,tokens,words_no_punctuation,types,...,singular_first_second_personal_pronouns,plural_first_personal_pronouns,pronouns,pausality,characters,average_sentence_length,average_word_lenght,percentage_spelling_errors,emotiveness,diversity
0,True,1,﻿O Podemos decidiu expulsar o deputado federa...,Naira Trindade,http://politica.estadao.com.br/blogs/coluna-do...,politica,13/12/2017,168.0,148.0,107.0,...,0.0,0.0,7.0,3.33333,761.0,24.6667,5.14189,0.0,0.134328,0.72297
1,True,10,"Bolsonaro é um liberal completo, diz president...",Marco Rodrigo Almeida,http://www1.folha.uol.com.br/poder/2018/01/194...,politica,12/1/2018,1028.0,865.0,474.0,...,3.0,1.0,63.0,2.85965,4205.0,15.1754,4.86127,0.001156,0.271505,0.54797
2,True,100,Ministro do STF libera Andrea Neves de prisão ...,"Fernando Zuba , Pedro Ângelo E Renan Ramalho",https://g1.globo.com/mg/minas-gerais/noticia/s...,politica,7/12/2017,540.0,476.0,232.0,...,0.0,0.0,19.0,3.04762,2399.0,22.6667,5.03992,0.0,0.139535,0.48739
3,True,1000,"Apesar da abundância, cresce preocupação com p...","Por Anderson Viegas, G1 MS",https://g1.globo.com/mato-grosso-do-sul/notici...,politica,31/10/2017 10h17,8634.0,7588.0,2199.0,...,0.0,2.0,384.0,3.36334,37381.0,24.3987,4.92633,0.002504,0.223705,0.289
4,True,1001,"Por que Harvard e MIT levarão Dilma, Moro e Su...",Por BBC,https://g1.globo.com/educacao/noticia/por-que-...,politica,05/04/2017 09h08,955.0,823.0,452.0,...,0.0,0.0,33.0,3.14286,4239.0,19.5952,5.15067,0.0,0.240223,0.5492
