# Data Preprocessing

In this notebook, the JSON files extracted by the crawlers are loaded and normalized:
 - Dates are changed to the format **dd/mm/yyyy**
 - Texts are cleaned (removing non-alphabetic characters, lowering the case, etc.)

In [1]:
import re
import pandas as pd
import numpy as np
import json
import pickle as pkl
import os
from nltk.corpus import stopwords

from unidecode import unidecode
from glob import glob

## Defining variables

In [2]:
# Project root, relative to the folder this notebook runs from.
PATH = '../'

# Folder holding the raw JSON dumps produced by the crawlers.
RAW_DATA_PATH = os.path.join(PATH, 'data/raw/')

# glob() returns paths in arbitrary, filesystem-dependent order, so the listing
# is sorted once to make the positional picks below reproducible.
# NOTE(review): assumes the raw folder holds exactly the Boatos, El Pais and G1
# dumps and that their file names sort in that order -- confirm.
_raw_files = sorted(glob(RAW_DATA_PATH + '*'))

BOATOS_PATH = _raw_files[0]
ELPAIS_PATH = _raw_files[1]
G1_PATH = _raw_files[2]

# Folder the cleaned DataFrames are pickled into.
OUTPUT_PATH = os.path.join(PATH, 'data/processed/')

# Portuguese stop words, transliterated to plain ASCII with unidecode.
STOPWORDS = set(unidecode(sw) for sw in stopwords.words('portuguese'))

## Making FAKE NEWS DataFrame
### Loading JSON

In [3]:
# Load the crawled Boatos articles. The encoding is pinned to UTF-8 so the
# Portuguese text is decoded the same way on every platform instead of
# depending on the locale default.
with open(BOATOS_PATH, 'r', encoding='utf-8') as f:
    boatos = json.load(f)

# Collapse each article's 'text' into a single space-separated string
# (presumably it arrives as a list of paragraphs -- verify against the crawler).
for item in boatos:
    item['text'] = ' '.join(item['text'])

### Making DataFrame

In [4]:
# Build the fake-news frame with upper-case column names, then label every
# row as fake (FAKE = 1).
df_fake = pd.DataFrame(boatos).rename(
    columns={'date': 'DATE', 'text': 'TEXT', 'title': 'TITLE'}
)
df_fake['FAKE'] = 1
df_fake.head()

Unnamed: 0,DATE,TEXT,TITLE,FAKE
0,03/08/2019,Vamos assinar essa petição pela cassação do ma...,\nPetição para o impeachment de Bolsonaro prec...,1
1,04/08/2019,Lula foi vítima de um golpe político e não mer...,\nPetição Lula Livre contribui para liberdade ...,1
2,05/08/2019,Seu professor já te contou que a Dilma me mato...,\nMario Kozel Filho foi assassinado por Dilma ...,1
3,03/08/2019,VERGONHA PRESIDENTE DA OAB MENTIU QUE O PAI FO...,"\nFelipe Santa Cruz, presidente da OAB, mentiu...",1
4,06/08/2019,Partido DIABÓLICO!! Bandidos do PT estão abrin...,\nPT e esquerda estão abrindo buracos em estra...,1


## Making LEGIT NEWS DataFrame
### Loading JSONs

In [5]:
# Load the crawled G1 and El Pais articles. The encoding is pinned to UTF-8 so
# the Portuguese text is decoded the same way on every platform instead of
# depending on the locale default.
with open(G1_PATH, 'r', encoding='utf-8') as f:
    g1 = json.load(f)

with open(ELPAIS_PATH, 'r', encoding='utf-8') as f:
    elpais = json.load(f)

# Collapse each article's 'text' into a single space-separated string
# (presumably it arrives as a list of paragraphs -- verify against the crawler).
for articles in (g1, elpais):
    for item in articles:
        item['text'] = ' '.join(item['text'])

### Normalizing El Pais dates

In [6]:
# Upper-case month abbreviations mapped to their two-digit month number.
# Both 'SET' and 'SEP' resolve to September ('09').
month_dict = dict(
    JAN='01',
    FEV='02',
    MAR='03',
    ABR='04',
    MAI='05',
    JUN='06',
    JUL='07',
    AGO='08',
    SET='09',
    SEP='09',
    OUT='10',
    NOV='11',
    DEZ='12',
)

In [7]:
# Rewrite each El Pais date from "<day> <month name> <year>" to day/month/year.
for item in elpais:
    # First "<digits> <letters> <digits>" run in the raw date string
    found = re.search(r'(\d+)\s+([a-zA-Z]+)\s+(\d+)', item['date'].strip())

    # Nothing that looks like a date: leave the original value untouched
    if found is None:
        continue

    day, month, year = found.groups()

    # Month names are looked up case-insensitively (the regex accepts lower
    # case, the month_dict keys are upper case); a name that is not in
    # month_dict is kept exactly as written.
    month = month_dict.get(month.upper(), month)

    # One-digit days are zero-padded (e.g. 1/12/1999 -> 01/12/1999). Joining
    # the captured parts makes the result independent of which whitespace
    # characters separated them in the source string.
    item['date'] = '/'.join([day.zfill(2), month, year])

In [8]:
# Keep only the records dated 2013 or later.
# The list is rebuilt instead of deleting entries while iterating over it:
# deleting inside the loop shifts the remaining items left, so the record that
# follows each deleted one would never be checked.
elpais = [item for item in elpais if int(item['date'].split('/')[2]) >= 2013]

In [9]:
# Sanity check: collect the distinct day, month and year values found in the
# El Pais dates and report how many of each there are.
day = set()
month = set()
year = set()

for item in elpais:
    # Raw string so that \d reaches the regex engine without relying on
    # Python's handling of unknown string escapes.
    date = re.findall(r'\d+', item['date'])
    day.add(date[0])
    month.add(date[1])
    year.add(date[2])

print('Dias: ',len(day))
print('Meses', len(month))
print('Anos:', len(year), '-', year)

Dias:  31
Meses 12
Anos: 7 - {'2017', '2019', '2018', '2016', '2015', '2013', '2014'}


### Normalizing G1 dates

In [10]:
# Reduce each G1 date to its first "<digits>/<digits>/<digits>" occurrence.
for item in g1:
    # Search the current article's own date (not a fixed list position)
    found = re.search(r'\d+/\d+/\d+', item['date'].strip())

    # No date-like pattern: leave the original value untouched
    if found is None:
        continue

    item['date'] = found.group(0)

### Making DataFrame

In [11]:
# El Pais articles first, then G1, in a new list so neither source is modified.
legit_news = [*elpais, *g1]

# Build the legit-news frame with upper-case column names, then label every
# row as not fake (FAKE = 0).
df_legit = pd.DataFrame(legit_news).rename(
    columns={'date': 'DATE', 'text': 'TEXT', 'title': 'TITLE'}
)
df_legit['FAKE'] = 0
df_legit.head()

Unnamed: 0,DATE,TEXT,TITLE,FAKE
0,04/06/2019,O atual mandato presidencial no Brasil começou...,Começam a soar os alarmes sobre a sustentabili...,0
1,02/06/2019,Poucas horas antes de milhares de manifestant...,"Corte ou contingenciamento, quem está certo na...",0
2,03/06/2019,Se o tamanho de uma figura pública se mede pel...,Trump insulta prefeito de Londres no início de...,0
3,26/05/2019,Um dos tantos fenômenos imparáveis trazidos pe...,O líder e eu (e ninguém no meio),0
4,03/06/2019,"Após semanas de audiências públicas, o projeto...",A reforma da Previdência pesará mais sobre os ...,0


## Cleaning texts

In [12]:
def clean_text(text):
    """Normalize a text into space-separated, lower-case ASCII tokens.

    The value is converted to ``str``, transliterated to ASCII with
    ``unidecode`` and lower-cased; every character other than a-z is treated
    as a separator. Tokens of length 2 or less and tokens present in
    ``STOPWORDS`` are dropped.

    Parameters
    ----------
    text : object
        Value to clean; it is passed through ``str()`` first.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (possibly empty).
    """
    # Transliterate first and lower-case afterwards, so that any upper-case
    # ASCII produced by the transliteration is normalized as well.
    text = unidecode(str(text)).lower()

    # Every run of non-alphabetic characters becomes a single separator
    text = re.sub(r'[^a-z]+', ' ', text)

    # Dropping short tokens (length <= 2) and stop words
    return ' '.join(
        token for token in text.split()
        if len(token) > 2 and token not in STOPWORDS
    )

### Cleaning and dumping Fake News DataFrame

In [13]:
# Rows whose TEXT is empty fall back to their TITLE as text.
empty_text = df_fake['TEXT'].apply(len) == 0
df_fake.loc[empty_text, ['TEXT']] = df_fake[empty_text]['TITLE']

In [14]:
df_fake['TEXT_CLEAN'] = df_fake['TEXT'].apply(clean_text)

# Titles additionally get the words "boato"/"boatos" removed.
# 'boatos?' consumes the optional plural "s" greedily; the former 'boato(|s)'
# tried the empty alternative first and left a stray "s" behind for "boatos".
# split()/join then collapses the gap left inside the title and trims the ends.
df_fake['TITLE_CLEAN'] = (
    df_fake['TITLE']
    .apply(clean_text)
    .apply(lambda x: ' '.join(re.sub(r'boatos?', '', x).split()))
)

In [15]:
# The raw TEXT and TITLE columns are no longer needed once the *_CLEAN
# versions exist; remove them from the frame in place.
df_fake.drop(columns=['TEXT', 'TITLE'], inplace=True)

In [16]:
# Pickle the cleaned fake-news frame. The context manager closes the file
# handle (flushing the data to disk) as soon as the dump finishes.
with open(os.path.join(OUTPUT_PATH, 'df_fake_clean.pkl'), 'wb') as f:
    pkl.dump(df_fake, f)

### Cleaning and dumping Legit News DataFrame

In [17]:
# Keep only the rows with a non-empty TEXT. The explicit copy makes the result
# an independent frame, so the column assignments below write to it directly
# rather than to a slice of the unfiltered frame.
df_legit = df_legit[df_legit['TEXT'].apply(len) > 0].copy()

df_legit['TEXT_CLEAN'] = df_legit['TEXT'].apply(clean_text)
df_legit['TITLE_CLEAN'] = df_legit['TITLE'].apply(clean_text)

In [18]:
# The raw TEXT and TITLE columns are no longer needed once the *_CLEAN
# versions exist; remove them from the frame in place.
df_legit.drop(columns=['TEXT', 'TITLE'], inplace=True)

In [19]:
# Pickle the cleaned legit-news frame. The context manager closes the file
# handle (flushing the data to disk) as soon as the dump finishes.
with open(os.path.join(OUTPUT_PATH, 'df_legit_clean.pkl'), 'wb') as f:
    pkl.dump(df_legit, f)