In [1]:
%%capture

# Move up one directory; the relative paths used later in this notebook
# (data/..., references/...) are resolved from there.
%cd ..
# Load the autoreload extension and use mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [15]:
from pathlib import Path
import pandas as pd
import numpy as np
import json

# Data Acquisition and Preparation

In [16]:
# Working-directory-relative locations used by the rest of the notebook.
RAW_DATA_FOLDER = Path("data/raw/")  # source text and metadata files
INTERMEDIATE_DATA_FOLDER = Path("data/interim/")  # merged CSV is written here
REFERENCE_FOLDER = Path("references/")  # data dictionary JSON is written here

## Functions

In [17]:
def create_text_dataframe(folder):
    """Load every ``*.txt`` file directly inside ``folder`` into a one-column frame.

    Parameters
    ----------
    folder : pathlib.Path
        Directory whose ``*.txt`` files are read as UTF-8 text.

    Returns
    -------
    pandas.DataFrame
        One row per file, indexed by the file name without its extension,
        with the complete file content in the ``text`` column.
    """
    contents_by_stem = {
        txt_path.stem: txt_path.read_text(encoding='utf-8')
        for txt_path in folder.glob("*.txt")
    }
    return pd.DataFrame.from_dict(contents_by_stem, orient='index', columns=['text'])

def create_metadata_datasets(folder, metadata_columns, metadata_dtypes):
    """Build a typed metadata frame from the one-value-per-line ``*.txt`` files in ``folder``.

    Line *i* of each file is stored under ``metadata_columns[i]``. Lines beyond
    the listed columns are ignored; if a file has fewer lines than columns,
    the remaining columns are left missing for that row.

    Parameters
    ----------
    folder : pathlib.Path
        Directory containing the metadata ``*.txt`` files.
    metadata_columns : list of str
        Column names, in the same order as the lines of each file.
    metadata_dtypes : dict
        Column -> dtype mapping handed to ``DataFrame.astype``. Because
        ``errors='ignore'`` is used, a column whose cast fails keeps its
        original dtype instead of raising.

    Returns
    -------
    pandas.DataFrame
        One row per file, indexed by the part of the file name that precedes
        the first ``"-"``. The literal string ``"None"`` is converted to NaN.
    """
    records = []
    for filepath in folder.glob("*.txt"):
        # Explicit UTF-8, matching create_text_dataframe, so non-ASCII values
        # do not depend on the platform's default encoding.
        with open(filepath, 'r', encoding='utf-8') as f:
            # Strip only the line terminator: slicing off the last character
            # would truncate a final line that has no trailing newline.
            values = [line.rstrip("\n") for line in f]

        record = dict(zip(metadata_columns, values))
        record["index"] = filepath.stem.split("-")[0]
        records.append(record)

    # Passing `columns` keeps the column order stable and also yields a
    # correctly shaped (empty) frame when the folder has no matching files.
    df = pd.DataFrame(records, columns=[*metadata_columns, "index"])
    df = df.replace("None", np.nan)
    df = df.astype(metadata_dtypes, errors='ignore').set_index("index", drop=True)
    df.index.name = None

    return df

## Preparing Dataset

In [18]:
# Sub-folders of the raw data directory: one text folder and one metadata
# folder for each class of news.
FAKE_DATA_FOLDER = RAW_DATA_FOLDER.joinpath('fake')
TRUE_DATA_FOLDER = RAW_DATA_FOLDER.joinpath('true')
FAKE_META_FOLDER = RAW_DATA_FOLDER.joinpath('fake-meta-information')
TRUE_META_FOLDER = RAW_DATA_FOLDER.joinpath('true-meta-information')

### Text datasets

In [19]:
# Read the article bodies of both classes, fake first and then true.
fake_text_df, true_text_df = (
    create_text_dataframe(text_folder)
    for text_folder in (FAKE_DATA_FOLDER, TRUE_DATA_FOLDER)
)

### Metadata Datasets

In [20]:
# Column names for the metadata files, in the order the values appear in each
# file (one value per line). These strings become column names in the exported
# CSV, so the existing spellings are kept exactly as they are.
metadata_columns = [
    # provenance
    "author",
    "link",
    "category",
    "date_of_publication",
    # counts
    "tokens",
    "words_no_punctuation",
    "types",
    "links_inside",
    "upper_words",
    "verbs",
    "subjuntive_imperative_verbs",
    "nouns",
    "adjectives",
    "adverbs",
    "modal_verbs",
    "singular_first_second_personal_pronouns",
    "plural_first_personal_pronouns",
    "pronouns",
    # derived measures
    "pausality",
    "characters",
    "average_sentence_length",
    "average_word_lenght",
    "percentage_spelling_errors",
    "emotiveness",
    "diversity",
]

# Human-readable description of each metadata column. Entries are positionally
# paired with `metadata_columns` (the two lists are zipped to build the data
# dictionary), so the order and length of both lists must stay in sync.
metadata_translate = [
    "author", "link", "category", "date of publication", "number of tokens",
    "number of words without punctuation", "number of types",
    "number of links inside the news", "number of words in upper case",
    "number of verbs", "number of subjunctive and imperative verbs",
    "number of nouns", "number of adjectives", "number of adverbs",
    "number of modal verbs (mainly auxiliary verbs)",
    "number of singular first and second personal pronouns",
    "number of plural first personal pronouns", "number of pronouns",
    "pausality", "number of characters", "average sentence length",
    "average word length", "percentage of news with spelling errors",
    "emotiveness", "diversity"
]

# Target dtype for each metadata column, applied with DataFrame.astype.
# NOTE(review): the displayed metadata rows still show raw strings such as
# "22/03/2018 07h10" for date_of_publication, so the datetime cast appears to
# be skipped silently by astype(errors='ignore') — confirm.
# Numeric columns use float, presumably so that missing values (NaN) fit.
metadata_dtypes = {
    **dict.fromkeys(("author", "link", "category"), "string"),
    "date_of_publication": "datetime64[ns]",
    **dict.fromkeys(
        (
            "tokens",
            "words_no_punctuation",
            "types",
            "links_inside",
            "upper_words",
            "verbs",
            "subjuntive_imperative_verbs",
            "nouns",
            "adjectives",
            "adverbs",
            "modal_verbs",
            "singular_first_second_personal_pronouns",
            "plural_first_personal_pronouns",
            "pronouns",
            "characters",
            "pausality",
            "average_sentence_length",
            "average_word_lenght",
            "percentage_spelling_errors",
            "emotiveness",
            "diversity",
        ),
        "float",
    ),
}

In [21]:
# Parse the metadata files of both classes, fake first and then true.
fake_metadata_df, true_metadata_df = (
    create_metadata_datasets(meta_folder, metadata_columns, metadata_dtypes)
    for meta_folder in (FAKE_META_FOLDER, TRUE_META_FOLDER)
)

In [22]:
# Distinct values of links_inside in the true-news metadata.
true_metadata_df.links_inside.unique()

array([ 0., nan])

In [23]:
# Rows of the true-news metadata whose links_inside value is missing.
true_metadata_df[true_metadata_df.links_inside.isna()]

Unnamed: 0,author,link,category,date_of_publication,tokens,words_no_punctuation,types,links_inside,upper_words,verbs,...,singular_first_second_personal_pronouns,plural_first_personal_pronouns,pronouns,pausality,characters,average_sentence_length,average_word_lenght,percentage_spelling_errors,emotiveness,diversity
2175,Por G1 DF,https://g1.globo.com/df/distrito-federal/notic...,sociedade_cotidiano,22/03/2018 07h10,741.0,640.0,361.0,,5.0,76.0,...,0.0,0.0,18.0,3.06061,3143.0,19.3939,4.91094,0.003125,0.184116,0.56406
1179,"Luiz Vassallo, Rafael Moraes Moura, Breno Pir...",http://politica.estadao.com.br/blogs/fausto-ma...,tv_celebridades,23 de setembro de 2017,1289.0,1173.0,529.0,,14.0,146.0,...,0.0,0.0,62.0,2.76190,5997.0,27.9286,5.11253,0.002558,0.235294,0.45090
2335,"Por Erick Gimenes e José Vianna, G1 PR, Curit...",https://g1.globo.com/pr/parana/noticia/moro-de...,tv_celebridades,28/04/2017 18h52,429.0,387.0,206.0,,1.0,64.0,...,0.0,0.0,17.0,2.62500,1951.0,24.1875,5.04134,0.000000,0.125714,0.53200
243,"Bernardo Caram, Alessandra Modzeleski E Fernan...",https://g1.globo.com/politica/noticia/rodrigo-...,politica,14/12/2017,599.0,505.0,241.0,,4.0,88.0,...,3.0,4.0,34.0,3.35714,2392.0,18.0357,4.73663,0.000000,0.163090,0.47722
1596,Por G1,https://g1.globo.com/pop-arte/cinema/noticia/i...,sociedade_cotidiano,07/09/2017 08h32,989.0,829.0,469.0,,4.0,125.0,...,0.0,0.0,57.0,4.10256,3967.0,21.2564,4.78528,0.001206,0.273973,0.56574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1127,Por BBC,https://g1.globo.com/politica/noticia/essa-ide...,politica,19/05/2017 09h24,1014.0,855.0,416.0,,2.0,150.0,...,1.0,1.0,86.0,3.11765,4237.0,16.7647,4.95556,0.001170,0.355114,0.48650
2931,"Por Matheus Rodrigues e Cássio Bruno, G1 Rio",https://g1.globo.com/rj/rio-de-janeiro/noticia...,politica,19/11/2017 12h53,974.0,812.0,430.0,,8.0,111.0,...,1.0,0.0,37.0,3.44681,3892.0,17.2766,4.79310,0.001232,0.195531,0.52955
3392,Por BBC,https://g1.globo.com/economia/tecnologia/notic...,politica,21/03/2018 13h40,1149.0,1035.0,458.0,,8.0,175.0,...,1.0,2.0,67.0,2.65116,5271.0,24.0698,5.09275,0.001932,0.175055,0.44251
2922,"Por Fernanda Borges, G1 GO",https://g1.globo.com/goias/noticia/ex-deputado...,politica,14/04/2017 11h39,1797.0,1563.0,624.0,,19.0,223.0,...,4.0,0.0,76.0,3.54545,7698.0,23.6818,4.92514,0.000000,0.168142,0.39923


In [24]:
# Distinct values of links_inside in the fake-news metadata, for comparison.
fake_metadata_df.links_inside.unique()

array([ 0.,  1.,  2.,  3.,  4.,  5.,  9., 11.,  6.,  8.])

## Merging Created Datasets

### Fake Dataset

In [25]:
# Join text and metadata column-wise on their shared index labels, order the
# rows by the numeric value of those labels, then replace the labels with a
# plain 0..n-1 index.
fake_df = (
    pd.concat([fake_text_df, fake_metadata_df], axis=1, sort=False)
    .pipe(lambda merged: merged.set_axis(merged.index.astype(int), axis=0))
    .sort_index()
    .reset_index(drop=True)
)

In [26]:
# Check links_inside on the merged fake frame. Inspecting fake_metadata_df
# here would only repeat the pre-merge check and say nothing about the merge.
fake_df.links_inside.unique()

array([ 0.,  1.,  2.,  3.,  4.,  5.,  9., 11.,  6.,  8.])

### True Dataset

In [27]:
# Join text and metadata column-wise on their shared index labels, order the
# rows by the numeric value of those labels, then replace the labels with a
# plain 0..n-1 index.
true_df = (
    pd.concat([true_text_df, true_metadata_df], axis=1, sort=False)
    .pipe(lambda merged: merged.set_axis(merged.index.astype(int), axis=0))
    .sort_index()
    .reset_index(drop=True)
)

In [28]:
# Distinct values of links_inside in the merged true-news frame.
true_df.links_inside.unique()

array([nan,  0.])

## Merge All Datasets

In [29]:
# Stack both classes. The concat keys form the outer index level, which is
# then moved into a regular "class" column holding 'True' or 'Fake'.
result = pd.concat([true_df, fake_df], keys=['True', 'Fake'])
result = result.reset_index(level=0).rename(columns={"level_0": "class"})

# to_csv does not create missing directories, so make sure the target exists.
INTERMEDIATE_DATA_FOLDER.mkdir(parents=True, exist_ok=True)
result.to_csv(INTERMEDIATE_DATA_FOLDER/"fake_true_news.csv", index=False)

## Columns Information

In [30]:
# Data dictionary: column name -> human-readable description. The text column
# comes first, followed by the metadata columns paired with their descriptions.
columns_info = {
    'text': 'Text extracted from the news',
    **dict(zip(metadata_columns, metadata_translate)),
}

In [31]:
# Persist the data dictionary as JSON. The target folder is created if needed,
# and the `with` block closes the file, so no explicit close() is required.
REFERENCE_FOLDER.mkdir(parents=True, exist_ok=True)
with open(REFERENCE_FOLDER / "news_data_dictionary.json", "w", encoding="utf-8") as f:
    json.dump(columns_info, f)