In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import itertools

# Feature Engineering

This notebook shows the feature-engineering steps applied after the cleaning done in the "2 - Cleaning" notebook.

In [3]:
# Parent of the directory the kernel is running in (presumably the project
# root when the notebook is started from its own folder -- verify if moved).
CURRENT_WORK_DIRECTORY = Path.cwd().resolve().parents[0]
# Folder that the cleaned CSV is read from and the feature CSV is written to.
PROCESSED_DATA_FOLDER = CURRENT_WORK_DIRECTORY.joinpath('data/processed/')

## Loading Data

In [7]:
# Read the cleaned dataset; the first CSV column becomes the row index.
cleaned_csv_path = PROCESSED_DATA_FOLDER / "fake_true_news_cleaned.csv"
data_df = (
    pd.read_csv(cleaned_csv_path, index_col=0)
    # keep the label column as a pandas categorical
    .astype({"class": 'category'})
)

# Show the frame dimensions, then preview the first rows as the cell output.
display(data_df.shape)
data_df.head()

(7182, 23)

Unnamed: 0,class,text,link,tokens,words_no_punctuation,types,upper_words,verbs,subjuntive_imperative_verbs,nouns,...,singular_first_second_personal_pronouns,plural_first_personal_pronouns,pronouns,pausality,characters,average_sentence_length,average_word_lenght,percentage_spelling_errors,emotiveness,diversity
3,True,apesar da abundancia cresce preocupacao com pr...,https://g1.globo.com/mato-grosso-do-sul/notici...,8634.0,7588.0,2199.0,52.0,1053.0,29.0,2170.0,...,0.0,2.0,384.0,3.36334,37381.0,24.3987,4.92633,0.002504,0.223705,0.289
3021,True,cartas democracia cubana apoio qualquer coisa ...,"http://opiniao.estadao.com.br/noticias/geral,c...",8279.0,7348.0,2805.0,157.0,1148.0,64.0,1936.0,...,19.0,15.0,602.0,2.07812,36369.0,16.4018,4.94951,0.022319,0.300259,0.38173
509,True,cartas pac 2 ano viii dl na cerimonia que marc...,"http://opiniao.estadao.com.br/noticias/geral,c...",7608.0,6674.0,2782.0,233.0,1002.0,56.0,1863.0,...,13.0,4.0,482.0,2.54496,33862.0,18.1853,5.07372,0.023524,0.290052,0.41684
2766,True,reveja todos os videos do quadro partiurs do j...,https://g1.globo.com/rs/rio-grande-do-sul/noti...,7589.0,6641.0,1857.0,45.0,919.0,28.0,1912.0,...,0.0,0.0,298.0,2.48168,30348.0,17.3848,4.56979,0.005421,0.235959,0.27962
2589,True,cartas apoio a cuba mocao de solidariedade nao...,"http://opiniao.estadao.com.br/noticias/geral,c...",7516.0,6645.0,2680.0,148.0,1015.0,65.0,1791.0,...,8.0,9.0,528.0,2.41274,32770.0,18.4072,4.93153,0.021069,0.28047,0.40331


## Ratio Variables

To minimize the bias caused by true news articles being longer (having more tokens) than fake ones, our approach is to express some of the count variables as a percentage of the number of tokens.

In [8]:
def calculate_ratio(s1, s2):
    """Return ``s1`` expressed as a percentage of ``s2``.

    Parameters
    ----------
    s1 : numerator(s); in this notebook a column of counts.
    s2 : denominator(s); in this notebook the ``tokens`` column.

    Returns
    -------
    The element-wise quotient ``s1 / s2`` scaled by 100.

    Notes
    -----
    No guard against zero denominators is applied; the result for those
    entries is whatever ``/`` yields for the operand types.
    """
    quotient = s1 / s2
    return 100 * quotient

# Count columns to be re-expressed as a percentage of `tokens`.
# The list order determines the order in which the `percent_*` columns are
# appended to the frame.
count_columns = [
    'words_no_punctuation',
    'adjectives',
    'adverbs',
    'nouns',
    'subjuntive_imperative_verbs',
    'upper_words',
    'verbs',
    'modal_verbs',
    'singular_first_second_personal_pronouns',
    'plural_first_personal_pronouns',
    'pronouns',
]

# Add one `percent_<name>` column per raw count column.
for count_column in count_columns:
    data_df[f'percent_{count_column}'] = calculate_ratio(data_df[count_column], data_df.tokens)

# The raw counts are replaced by their percentages, so drop them.
# NOTE: this makes the cell non-re-runnable without reloading `data_df`.
data_df = data_df.drop(columns=count_columns)

## Exporting Data

In [9]:
# Write the feature table to the same folder the cleaned input was read from.
features_csv_path = PROCESSED_DATA_FOLDER.joinpath("fake_true_news_cleaned_features.csv")
data_df.to_csv(features_csv_path)