# Pré-processamento de um conjunto de artigos

Escolher uma base de artigos e realizar o pré-processamento, sem utilizar a biblioteca spaCy para ajudar na remoção de palavras.

O pré-processamento se resume a retirar stop words, normalizar o texto, remover números e remover ruídos.

# Imports

In [1]:
# Bibliotecas nativas
import io
import os
import re
import string
import unicodedata
import xml.etree.ElementTree as ET
import zipfile
import pickle
# Pré processamento e estruturas de dados
import numpy as np
import nltk
import pandas as pd
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
from langdetect import DetectorFactory, detect
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy
from spacy.tokenizer import Tokenizer

import enchant




In [2]:
# spaCy pipeline loaded through the 'en' shortcut, with the 'parser' and 'ner'
# components disabled. It is the pipeline used by remove_useless_words.
# NOTE(review): the 'en' shortcut only resolves on spaCy versions that still
# support shortcut links — confirm the installed version.
nlp = spacy.load('en',disable=['parser','ner'])
# Portuguese pipeline.
# NOTE(review): nlp_pt is not referenced anywhere else in the visible part of this file.
nlp_pt = spacy.load('pt_core_news_sm')

In [3]:
# Stand-alone tokenizer built only from the vocabulary of `nlp` (no prefix,
# suffix or infix rules are passed). It is applied to the title column later on.
tokenizer = Tokenizer(nlp.vocab)

# Funções

In [4]:
def strip_html(text):
    """Return the textual content of ``text`` after parsing it as HTML.

    The markup is parsed with BeautifulSoup's built-in ``html.parser`` and
    only the text found by ``get_text()`` is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()
def remove_between_square_brackets(text):
    """Delete every ``[...]`` span, brackets included, from ``text``.

    A span starts at ``[`` and ends at the first following ``]``; the
    characters in between may be anything except ``]``.

    Args:
        text: String to clean.

    Returns:
        ``text`` with all bracketed spans removed.
    """
    # Raw string: in a regular string literal ``\[`` and ``\]`` are invalid
    # escape sequences, which newer Python versions warn about.
    return re.sub(r'\[[^]]*\]', '', text)
def detect_portuguese_text(text):
    """Return ``text`` if it is detected as English, otherwise ``np.nan``.

    Despite the name, this keeps English text: anything whose detected
    language is not ``'en'`` is replaced by NaN, as is any input for which
    language detection fails.

    Args:
        text: Text to classify with ``langdetect.detect``.

    Returns:
        The unchanged ``text`` when the detected language is ``'en'``;
        ``np.nan`` otherwise.
    """
    try:
        language = detect(text)
    except Exception:
        # Best effort: a value that cannot be classified is treated as
        # "not English". Unlike a bare ``except``, this does not swallow
        # KeyboardInterrupt or SystemExit.
        return np.nan
    return text if language == 'en' else np.nan
# Holds the NLTK English word set once it has been loaded, so that the corpus
# is not re-read and re-hashed on every call.
_ENGLISH_VOCAB_CACHE = {}


def _english_vocabulary():
    """Return the NLTK ``words`` corpus as a set, loading it only once."""
    if "words" not in _ENGLISH_VOCAB_CACHE:
        _ENGLISH_VOCAB_CACHE["words"] = set(nltk.corpus.words.words())
    return _ENGLISH_VOCAB_CACHE["words"]


def remove_portuguese_text(text):
    """Drop alphabetic tokens that are not in the NLTK English word list.

    ``text`` is split with ``nltk.wordpunct_tokenize``. A token is kept when
    its lower-cased form is in the NLTK ``words`` corpus, or when it is not
    purely alphabetic (numbers, punctuation, mixed tokens).

    Args:
        text: String to filter.

    Returns:
        The kept tokens joined by single spaces.
    """
    words = _english_vocabulary()
    return " ".join(w for w in nltk.wordpunct_tokenize(text)
                    if w.lower() in words or not w.isalpha())
def remove_useless_words(text):
    """Keep only the lemmas of verbs in ``text`` that are not stop words.

    The text is run through the module-level ``nlp`` pipeline. Each token
    tagged ``VERB`` that is not a stop word contributes its lemma followed by
    one space, so a non-empty result ends with a trailing space.

    NOTE(review): a function with the same name is defined again further down
    in this file; once that definition runs, it replaces this one.
    """
    verb_lemmas = [
        token.lemma_
        for token in nlp(text)
        if token.pos_ in ['VERB'] and not token.is_stop
    ]
    return "".join(lemma + " " for lemma in verb_lemmas)
def denoise_text(text):
    """Strip HTML markup and ``[...]`` spans from ``text``.

    Only these two cleaning steps are applied, in this order. The calls to
    ``remove_portuguese_text``, ``remove_useless_words`` and
    ``detect_portuguese_text`` that used to follow are disabled.
    """
    without_markup = strip_html(text)
    return remove_between_square_brackets(without_markup)

In [5]:
def remove_non_ascii(words):
    """Return ASCII-only versions of the given words.

    Each item is decomposed with Unicode NFKD normalization and every
    character that cannot be encoded as ASCII is dropped, so accented letters
    lose their diacritics (``"ação"`` becomes ``"acao"``).

    Args:
        words: Iterable of plain strings or of token-like objects exposing a
            ``lemma_`` string attribute. For token-like objects the lemma is
            the text that gets normalized.

    Returns:
        A list of ASCII strings, one per input item, in the same order.
    """
    new_words = []
    for word in words:
        # Plain strings (e.g. stop-word lists) are used as they are; anything
        # else is expected to carry its text in ``lemma_``.
        text = word if isinstance(word, str) else word.lemma_
        ascii_text = unicodedata.normalize('NFKD', text).encode(
            'ascii', 'ignore').decode('ascii')
        new_words.append(ascii_text)
    return new_words
def remove_punctuation(words):
    """Strip punctuation from each word and drop words that become empty.

    Every character that is neither a word character (``\\w``) nor whitespace
    is removed from each string in ``words``.
    """
    cleaned_words = (re.sub(r'[^\w\s]', '', word) for word in words)
    return [cleaned for cleaned in cleaned_words if cleaned]
def replace_numbers(words):
    """Replace every all-digit word with its textual representation.

    Words for which ``str.isdigit()`` is true are converted with
    ``number_to_words`` from an ``inflect`` engine; all other words are kept
    unchanged, in their original order.
    """
    # NOTE(review): ``inflect`` is not imported in the visible part of this
    # file, so calling this function raises NameError unless it is imported
    # elsewhere.
    engine = inflect.engine()
    return [
        engine.number_to_words(word) if word.isdigit() else word
        for word in words
    ]
def remove_number(words):
    """Return the words that are not made up solely of digits.

    Args:
        words: Iterable of strings.

    Returns:
        A list with every word for which ``str.isdigit()`` is false, in the
        original order. Empty strings are kept, since ``"".isdigit()`` is
        false.
    """
    # ``str.isdigit`` never raises ValueError, so no exception handling is
    # needed around the check.
    return [word for word in words if not word.isdigit()]
def normalize(words):
    """Normalize a list of words: ASCII-fold them, then strip punctuation.

    ``remove_non_ascii`` runs first and ``remove_punctuation`` runs on its
    output.
    """
    # Disabled steps: to_lowercase, remove_number, replace_numbers.
    ascii_words = remove_non_ascii(words)
    return remove_punctuation(ascii_words)

In [6]:
def remove_small_words(wordst):
    """Return only the items of ``wordst`` that are longer than one character."""
    return [word for word in wordst if len(word) > 1]
def remove_useless_words(text):
    """Lemmatize ``text`` and drop its adjectives.

    The text is run through the module-level ``nlp`` pipeline. Every token
    whose part-of-speech tag is not ``ADJ`` contributes its lemma followed by
    one space, so a non-empty result ends with a trailing space.

    This redefines the function of the same name declared earlier in this
    file.

    Args:
        text: String to process.

    Returns:
        The space-terminated lemmas of all non-adjective tokens, concatenated
        in document order.
    """
    doc = nlp(text)
    return "".join(
        token.lemma_ + " " for token in doc if token.pos_ != 'ADJ')
def _ascii_stopword_set(language):
    """Return the NLTK stop words for ``language`` with diacritics stripped."""
    # The stop-word list holds plain strings, so they are ASCII-folded here
    # directly rather than through a helper that expects token objects.
    return {
        unicodedata.normalize('NFKD', word).encode(
            'ascii', 'ignore').decode('ascii')
        for word in stopwords.words(language)
    }


def remove_stopwords_pt(wordst):
    """Return the items of ``wordst`` that are not Portuguese stop words.

    Stop words come from the NLTK ``portuguese`` list and are ASCII-folded
    before the comparison.

    Args:
        wordst: Iterable of word strings.

    Returns:
        A list of the words not found among the stop words, in input order.
    """
    stop = _ascii_stopword_set('portuguese')
    return [w for w in wordst if w not in stop]


def remove_stopwords_en(wordst):
    """Return the items of ``wordst`` that are not English stop words.

    Stop words come from the NLTK ``english`` list and are ASCII-folded
    before the comparison.

    Args:
        wordst: Iterable of word strings.

    Returns:
        A list of the words not found among the stop words, in input order.
    """
    stop = _ascii_stopword_set('english')
    return [w for w in wordst if w not in stop]

# 'en_US' dictionary from enchant; remove_portuguese uses d.check() to test
# whether a lemma is a known word.
d = enchant.Dict('en_US')
def remove_portuguese(words, max_unknown_rate=0.2):
    """Keep the English lemmas of ``words``, or nothing if too many are unknown.

    Each item's lemma is checked against the module-level enchant dictionary
    ``d``. If the share of lemmas that fail the check is greater than
    ``max_unknown_rate``, the whole input is discarded.

    Args:
        words: Iterable of token-like objects exposing a ``lemma_`` string.
        max_unknown_rate: Highest tolerated fraction (0 to 1) of lemmas not
            found in the dictionary. Defaults to ``0.2``.

    Returns:
        A list with the lemmas accepted by the dictionary, in input order, or
        an empty list when the unknown rate exceeds ``max_unknown_rate`` or
        the input is empty.
    """
    lemmas = [word.lemma_ for word in words]
    if not lemmas:
        return []

    english_words = [lemma for lemma in lemmas if d.check(lemma)]
    unknown_rate = (len(lemmas) - len(english_words)) / len(lemmas)
    if unknown_rate > max_unknown_rate:
        return []
    return english_words

# Inicialização

In [43]:
# Year of the article dataset to process; it is used to build the input and output paths below.
year = '2014'

In [44]:
# Load the raw articles for `year`. Columns are separated by the literal
# three-character sequence "*|*". pandas treats a multi-character separator as
# a regular expression, so it is written as a raw string, and the Python
# parsing engine is requested explicitly because the C engine does not support
# regex separators (pandas otherwise warns and falls back on its own).
df_artigos = pd.read_csv(
    '../datasets/artigos/nopreprocessed/artigos_' + year + '.csv',
    sep=r"\*\|\*",
    engine="python",
)
# Discard rows with any missing value and renumber the index.
df_artigos = df_artigos.dropna().reset_index(drop=True)

# Shuffle the rows (fixed seed, so the order is reproducible) and renumber again.
df_artigos = df_artigos.sample(
    frac=1, random_state=29).reset_index(drop=True)

df_artigos.head(10)

  """Entry point for launching an IPython kernel.


Unnamed: 0,autores,titulo,ano
0,Helena Barreto Matzenauer&Wiliam Correa Marque...,A operacionalidade do modelo multicriterio de ...,2014.0
1,Bernard Gendron&Abilio Lucena&Alexandre Salles...,"Benders Decomposition, Branch-and-Cut, and Hyb...",2014.0
2,João dos Santos Vila da Silva&Sandra Mara Alve...,Caracterizacao ambiental da unidade de planeja...,2014.0
3,Felype Santiago&Rohit Gheyi&Gustavo Soares&Pau...,A Toolset for Checking SPL Refinements,2014.0
4,"WANICK SARINHO, SILVIA&MENEZES JÚNIOR, JÚLIO V...",Experience In The Development Of a Mobile Diag...,2014.0
5,Nemésio Freitas Duarte Filho&Lucas Bortolini F...,Contributions for the Architectural Design of ...,2014.0
6,"UEYAMA, JO&FREITAS, HEITOR&FAICAL, BRUNO S.&FI...",Exploiting the use of unmanned aerial vehicles...,2014.0
7,Luiz Olmes de Carvalho&Willian Dener de Olivei...,A wider concept for similarity joins,2014.0
8,"CUADRAT, RAFAEL R. C.&DÁVILA, ALBERTO M. R.&CR...",An Orthology-Based Analysis of Pathogenic Prot...,2014.0
9,Fabricio Herpich&Rafaela Ribeiro Jardim&Ricard...,CYBERCIEGE: uma abordagem de jogos serios na e...,2014.0


In [45]:
# Work on a copy so that the raw DataFrame df_artigos stays untouched.
df_pre = df_artigos.copy()

In [46]:
#df_pre['titulo'] = df_pre['titulo'].apply(lambda x: x.lower())

In [47]:
# Step 1: strip HTML markup and [...] spans from every title.
df_pre['titulo'] = df_pre['titulo'].apply(denoise_text)

# Step 2: apply remove_useless_words to every title. The definition in effect
# is the most recent one executed; in file order that is the one that
# lemmatizes the title and drops adjectives.
df_pre['titulo'] = df_pre['titulo'].apply(remove_useless_words)

# Step 3: run the stand-alone tokenizer on each title string; the later steps
# iterate over its result and read token attributes.
df_pre['titulo'] = df_pre['titulo'].apply(tokenizer)

In [48]:
def new_normalize(tokens):
    """Filter ``tokens`` down to plain ASCII alphabetic content words.

    A token is kept only when it is ASCII and alphabetic and is not a digit,
    a punctuation mark or a stop word, as reported by its ``is_*`` flags.
    The kept token objects are returned in their original order.
    """
    return [
        token
        for token in tokens
        if token.is_ascii
        and not (token.is_digit or token.is_punct or token.is_stop)
        and token.is_alpha
    ]

In [49]:
# Keep only ASCII alphabetic tokens that are not digits, punctuation or stop words.
df_pre['titulo'] = df_pre['titulo'].apply(new_normalize)

In [50]:
# Replace each token list by the lemmas accepted by the en_US dictionary, or by
# an empty list when too many lemmas fail the check (see remove_portuguese).
df_pre['titulo'] = df_pre['titulo'].apply(remove_portuguese)

In [51]:
# Drop the rows whose title ended up with no words at all.
df_pre = df_pre[df_pre['titulo'].apply(len) >= 1]

In [52]:
# Join each list of words back into a single space-separated string.
df_pre['titulo'] = df_pre['titulo'].apply(lambda x: ' '.join(x))

In [53]:
# Preview the first 10 preprocessed rows.
df_pre.head(10)

Unnamed: 0,autores,titulo,ano
1,Bernard Gendron&Abilio Lucena&Alexandre Salles...,bender decomposition branch cut hybrid algorit...,2014.0
4,"WANICK SARINHO, SILVIA&MENEZES JÚNIOR, JÚLIO V...",experience development mobile diagnosis suppor...,2014.0
5,Nemésio Freitas Duarte Filho&Lucas Bortolini F...,contribution architectural design mobile learn...,2014.0
6,"UEYAMA, JO&FREITAS, HEITOR&FAICAL, BRUNO S.&FI...",exploit use vehicle provide resilience sensor ...,2014.0
7,Luiz Olmes de Carvalho&Willian Dener de Olivei...,concept similarity join,2014.0
10,Gilberto Câmara&Max J. Egenhofer&Karine Reis F...,field generic datum type big spatial datum,2014.0
11,Jeongyeon Seo&Jae-Kwan Kim&Joonghyun Ryu&Carli...,protein structure determination algorithm base...,2014.0
12,Fabiano Belem&Eder Martins&Jussara Marques Alm...,personalize object center tag recommendation m...,2014.0
13,"SOUZA SANTANA-VIEIRA, DAYSE DRIELLY&PEREIRA MI...",rapid differentiation closely citrus fluoresce...,2014.0
19,Antônio Vicente Lourenço Damaso&Nelson Rosa&Pa...,use net evaluate power consumption wireless se...,2014.0


In [54]:
# Remove rows whose preprocessed title repeats an earlier one, keeping the first occurrence.
df_pre = df_pre.drop_duplicates(subset=['titulo'],keep = 'first')

In [55]:
# Show (rows, columns) of the DataFrame after de-duplication.
df_pre.shape

(2184, 3)

In [56]:
# Array with the preprocessed title strings, in DataFrame row order.
artigos = df_pre['titulo'].values

In [57]:
# Write one preprocessed title per line. The context manager guarantees that
# the file is flushed and closed even if writing fails. The encoding is set
# explicitly so that the output does not depend on the platform default.
with open('../datasets/artigos/preprocessed/'+year+'/artigosPre.txt', 'w',
          encoding='utf-8') as f:
    f.writelines(title + "\n" for title in artigos)

- Separar em grupos procurando artigos iguais

In [58]:
# Save the preprocessed DataFrame (index included) as a '|'-separated CSV file.
df_pre.to_csv('../datasets/artigos/preprocessed/'+year+'/artigosPre.csv',sep="|")