# Pré-processamento de um conjunto de artigos

Escolher uma base de artigos e realizar o pré-processamento, sem utilizar a biblioteca spaCy para ajudar na remoção de palavras.

O pré-processamento se resume a retirar stop words, normalizar o texto, remover números e remover ruídos.

# Imports

In [4]:
# Bibliotecas nativas
import io
import os
import re
import string
import unicodedata
import xml.etree.ElementTree as ET
import zipfile
import pickle
# Pré processamento e estruturas de dados
import numpy as np
import nltk
import pandas as pd
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
from langdetect import DetectorFactory, detect
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy
from spacy.tokenizer import Tokenizer

import enchant


ModuleNotFoundError: No module named 'enchant'

In [None]:
# Load the pipeline registered under the name 'en' with its parser and NER
# components disabled.
# NOTE(review): 'en' is a shortcut model name — confirm it resolves in the
# installed spaCy version.
nlp = spacy.load('en',disable=['parser','ner'])
# Load the 'pt_core_news_sm' pipeline; `nlp_pt` is not referenced anywhere
# else in the visible part of this file.
nlp_pt = spacy.load('pt_core_news_sm')

In [None]:
# Tokenizer built on the vocabulary of `nlp`; it is applied to the title
# column later in the notebook.
tokenizer = Tokenizer(nlp.vocab)

# Funções

In [None]:
def strip_html(text):
    """Return the text content of *text* after parsing it as HTML.

    The input is parsed with BeautifulSoup using the "html.parser" backend
    and the result of ``get_text()`` is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()
def remove_between_square_brackets(text):
    """Delete every ``[...]`` group, brackets included, from *text*.

    A group starts at ``[`` and ends at the first following ``]``; an opening
    bracket with no closing bracket is left untouched.

    Args:
        text: the string to clean.

    Returns:
        The string with all bracketed groups removed.
    """
    # Raw string: in a plain literal "\[" and "\]" are invalid escape
    # sequences, which newer Python versions warn about.
    return re.sub(r'\[[^]]*\]', '', text)
def detect_portuguese_text(text):
    """Return *text* when it is detected as English, otherwise NaN.

    Despite the name, the function keeps English text: anything whose
    detected language is not ``'en'`` is replaced by ``np.nan``.

    Args:
        text: the value handed to ``detect``.

    Returns:
        *text* unchanged if the detected language is ``'en'``; ``np.nan`` if
        another language is detected or if detection fails.
    """
    try:
        language = detect(text)
    except Exception:
        # Any detection failure is treated as "not English".  Catching
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return np.nan
    return text if language == 'en' else np.nan
def remove_portuguese_text(text):
    """Keep only the tokens of *text* found in NLTK's ``words`` corpus.

    The text is split with ``nltk.wordpunct_tokenize``.  Alphabetic tokens are
    kept only when their lower-cased form is in the corpus; non-alphabetic
    tokens are always kept.  The surviving tokens are joined with single
    spaces.
    """
    vocabulary = set(nltk.corpus.words.words())

    kept = []
    for piece in nltk.wordpunct_tokenize(text):
        if not piece.isalpha() or piece.lower() in vocabulary:
            kept.append(piece)
    return " ".join(kept)
def remove_useless_words(text):
    """Return the lemmas of the non-stop-word verbs in *text*.

    The text is run through the module-level ``nlp`` pipeline; each token
    tagged ``VERB`` that is not a stop word contributes its lemma followed by
    a space, so a non-empty result ends with a trailing space.

    NOTE: a second function with this same name is defined further down in
    this file and replaces this one when both definitions are executed in
    order.
    """
    verb_lemmas = [
        token.lemma_ + " "
        for token in nlp(text)
        if token.pos_ == 'VERB' and not token.is_stop
    ]
    return "".join(verb_lemmas)
def denoise_text(text):
    """Strip HTML markup and bracketed spans from *text*.

    Applies ``strip_html`` first and ``remove_between_square_brackets`` on its
    result.
    """
    # Steps that are currently switched off:
    #     text = remove_portuguese_text(text)
    #     text = remove_useless_words(text)
    #     text = detect_portuguese_text(text)
    return remove_between_square_brackets(strip_html(text))

In [None]:
def remove_non_ascii(words):
    """Fold each word to plain ASCII, dropping characters with no ASCII form.

    Each word is NFKD-normalised so that accented letters decompose into a
    base letter plus combining marks; encoding to ASCII with ``'ignore'`` then
    discards the marks and any other non-ASCII character.

    Args:
        words: iterable of token-like objects exposing ``lemma_`` (their lemma
            is used) or of plain strings (used as they are).

    Returns:
        A list of ASCII-only strings, one per input item, in the same order.
    """
    folded = []
    for word in words:
        # Callers in this file pass both token objects and plain strings
        # (e.g. NLTK stop-word lists), so fall back to the item itself when
        # it has no `lemma_` attribute.
        raw = getattr(word, 'lemma_', word)
        ascii_bytes = unicodedata.normalize('NFKD', raw).encode('ascii', 'ignore')
        folded.append(ascii_bytes.decode('utf-8', 'ignore'))
    return folded
def remove_punctuation(words):
    """Strip punctuation from every word and drop words left empty.

    Any character that is neither a word character (``\\w``) nor whitespace is
    removed from each word; words that become the empty string are omitted
    from the result.
    """
    stripped = (re.sub(r'[^\w\s]', '', word) for word in words)
    return [word for word in stripped if word != '']
def replace_numbers(words):
    """Replace every all-digit word with its textual representation.

    Words for which ``str.isdigit()`` is true are passed to
    ``inflect``'s ``number_to_words``; all other words are kept unchanged.

    NOTE(review): `inflect` is not among the imports visible in this file —
    confirm it is imported before this function is called.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(word) if word.isdigit() else word
        for word in words
    ]
def remove_number(words):
    """Return *words* without the entries made up only of digits.

    A word is dropped when ``isdigit()`` is true for it.  If that check raises
    ``ValueError`` the word is printed and left out of the result.
    """
    kept = []
    for candidate in words:
        try:
            is_numeric = candidate.isdigit()
        except ValueError:
            print(candidate)
            continue
        if not is_numeric:
            kept.append(candidate)
    return kept
def normalize(words):
    """Run the active normalisation steps over *words*.

    The words go through ``remove_non_ascii`` and then
    ``remove_punctuation``; the resulting list is returned.
    """
    # Steps that are currently switched off:
    #     words = to_lowercase(words)
    #     words = remove_number(words)
    #     words = replace_numbers(words)
    return remove_punctuation(remove_non_ascii(words))

In [2]:
def remove_small_words(wordst):
    """Return the entries of *wordst* that are longer than one character."""
    return [entry for entry in wordst if len(entry) > 1]
def remove_useless_words(text):
    """Rebuild *text* from its token lemmas, leaving out adjectives.

    The text is run through the module-level ``nlp`` pipeline.  Every token
    whose part-of-speech tag is not ``ADJ`` contributes its lemma followed by
    a space, so a non-empty result ends with a trailing space.
    """
    kept_lemmas = [token.lemma_ for token in nlp(text) if token.pos_ != 'ADJ']
    return "".join(lemma + " " for lemma in kept_lemmas)
def remove_stopwords_pt(wordst):
    """Return *wordst* without Portuguese stop words.

    The NLTK Portuguese stop-word list is folded to plain ASCII (accents
    stripped) before the comparison.

    Args:
        wordst: iterable of word strings.

    Returns:
        A list with the words that are not in the folded stop-word list, in
        their original order.
    """
    # The stop words are plain strings, so they are folded directly with
    # unicodedata.  A set makes each membership test O(1).
    stop = {
        unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('ascii')
        for word in stopwords.words('portuguese')
    }
    return [w for w in wordst if w not in stop]
def remove_stopwords_en(wordst):
    """Return *wordst* without English stop words.

    The NLTK English stop-word list is folded to plain ASCII before the
    comparison.

    Args:
        wordst: iterable of word strings.

    Returns:
        A list with the words that are not in the folded stop-word list, in
        their original order.
    """
    # The stop words are plain strings, so they are folded directly with
    # unicodedata.  A set makes each membership test O(1).
    stop = {
        unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('ascii')
        for word in stopwords.words('english')
    }
    return [w for w in wordst if w not in stop]

# Module-level enchant dictionary for the 'en_US' tag; remove_portuguese
# checks every lemma against it.
d = enchant.Dict('en_US')
def remove_portuguese(words):
    """Return the dictionary-accepted lemmas of *words*, or an empty list.

    The ``lemma_`` of every item is checked against the module-level enchant
    dictionary ``d``.  When more than 20% of the items are rejected the whole
    input is discarded and an empty list is returned; otherwise the accepted
    lemmas are returned in their original order.

    Args:
        words: sized collection of objects exposing ``lemma_``.

    Returns:
        A list of lemma strings (possibly empty).
    """
    total = len(words)
    accepted = [
        lemma
        for lemma in (token.lemma_ for token in words)
        if d.check(lemma)
    ]
    # Share of items the dictionary did not recognise; an empty input counts
    # as a share of zero.
    rejected_share = (total - len(accepted)) / total if total > 0 else 0
    return [] if rejected_share > 0.2 else accepted

NameError: name 'enchant' is not defined

# Inicialização

In [7]:
# Load the 2017 article list.  The column separator is a regular expression
# matching the literal sequence "*|*".  It is written as a raw string so the
# backslashes are not treated as (invalid) string escapes, and the python
# engine is requested explicitly because regex separators are not handled by
# the default C engine (pandas would otherwise fall back with a warning).
df_artigos = pd.read_csv(
    '../datasets/artigos/nopreprocessed/artigos_2017.csv',
    sep=r"\*\|\*",
    engine="python",
)
# Drop rows with any missing value and renumber the index.
df_artigos.dropna(inplace=True)
df_artigos = df_artigos.reset_index(drop=True)

# Shuffle the rows; the fixed random_state is there to make the order
# reproducible.
df_artigos = df_artigos.sample(
    frac=1, random_state=29).reset_index(drop=True)

df_artigos.head(10)

  """Entry point for launching an IPython kernel.


Unnamed: 0,autores,titulo,ano
0,"GARCIA, LUAN FONSECA&GRACIOLLI, VINICIUS&DE RO...",A Conceptual Framework for Rock Data Integrati...,2017
1,"Araújo, Ricardo de A.&Adriano Lorena Inacio de...",A morphological neural network for binary clas...,2017
2,"ALVES PEREIRA, LUIS F.&JANSSENS, ELINE&George ...",Inline discrete tomography system: Application...,2017
3,"Túlio Ângelo Machado Toffolo&ESPRIT, ELINE&Ton...",A two-dimensional heuristic decomposition appr...,2017
4,"ROSEMBACK, ROBERTA GUERRA&RANGEL RIGOTTI, JOSÉ...","Demografia, planejamento territorial e a quest...",2017
5,"MAROTTA, MARCELO A.&KIST, MAICON&WICKBOLDT, JU...",Design considerations for software-defined wir...,2017
6,"SILVA, GUSTAVO R.L.&MEDEIROS, RAFAEL R.&JAIMES...",CUDA-based parallelization of Power Iteration ...,2017
7,Adrialdo Azanha&João Batista de Camargo Junior...,ERP: uma investigacao sobre a decisao entre co...,2017
8,Rogério Ricalde Torres&Taise Cristine Buske&To...,Alem do equipamento,2017
9,"DOUGLAS, DAVID&SANTANNA, JOSÉ JAIR&Ricardo de ...",Booters: can anything justify distributed deni...,2017


In [8]:
# Work on a copy so that df_artigos keeps the unprocessed titles.
df_pre = df_artigos.copy()

In [9]:
#df_pre['titulo'] = df_pre['titulo'].apply(lambda x: x.lower())

In [10]:
# Clean each title in three passes: strip HTML and bracketed spans, rebuild
# the title from the lemmas kept by remove_useless_words, then tokenize it.
df_pre['titulo'] = (
    df_pre['titulo']
    .apply(denoise_text)
    .apply(remove_useless_words)
    .apply(tokenizer)
)

In [11]:
def new_normalize(tokens):
    """Return the tokens worth keeping from *tokens*.

    A token is kept when it is ASCII and alphabetic and is not a digit, a
    punctuation mark or a stop word, as reported by its ``is_ascii``,
    ``is_alpha``, ``is_digit``, ``is_punct`` and ``is_stop`` flags.  The
    token objects themselves are returned, in their original order.
    """
    return [
        token
        for token in tokens
        if token.is_ascii
        and not token.is_digit
        and not token.is_punct
        and not token.is_stop
        and token.is_alpha
    ]

In [12]:
# Keep only the tokens accepted by new_normalize in each title.
df_pre['titulo'] = df_pre['titulo'].apply(new_normalize)

In [13]:
# Replace each token list with its dictionary-accepted lemmas, or with an
# empty list when remove_portuguese rejects more than 20% of the tokens.
df_pre['titulo'] = df_pre['titulo'].apply(remove_portuguese)

In [14]:
# Drop the rows whose title ended up with no words.
df_pre = df_pre[df_pre['titulo'].apply(len) >= 1]

In [15]:
# Turn each list of words back into a single space-separated string.
df_pre['titulo'] = df_pre['titulo'].apply(' '.join)

In [16]:
# Preview the first ten processed rows.
df_pre.head(10)

Unnamed: 0,autores,titulo,ano
0,"GARCIA, LUAN FONSECA&GRACIOLLI, VINICIUS&DE RO...",conceptual framework rock datum integration re...,2017
1,"Araújo, Ricardo de A.&Adriano Lorena Inacio de...",network classification problem,2017
2,"ALVES PEREIRA, LUIS F.&JANSSENS, ELINE&George ...",tomography system application product inspection,2017
3,"Túlio Ângelo Machado Toffolo&ESPRIT, ELINE&Ton...",decomposition approach container load problem,2017
5,"MAROTTA, MARCELO A.&KIST, MAICON&WICKBOLDT, JU...",design consideration software define network c...,2017
13,"FERNANDES, CHRYSTINNE OLIVEIRA&Carlos José Per...",software framework remote patient monitor use ...,2017
15,Julio Cesar Santos dos Anjos&Tatiana Galibus&C...,sec approach secure big datum process cloud,2017
17,Leyvison Rafael V. da Conceição&Livia M. Carne...,synthesis macaw palm oil use catalyst comprise...,2017
19,Gildárcio Sousa Gonçalves&Luiz Alberto Vieira ...,agile interdisciplinary approach safety critic...,2017
20,"THOMAZINHO, HELLEN CHRISTINE SERODIO&Alexandre...",case study strategy maintain software numb user,2017


In [17]:
# Keep only the first row for each distinct processed title.
df_pre = df_pre.drop_duplicates(subset=['titulo'],keep = 'first')

In [18]:
# Show the (rows, columns) size of the frame after de-duplication.
df_pre.shape

(2226, 3)

In [19]:
# Pull the processed titles out of the frame as an array.
artigos = df_pre['titulo'].values

In [20]:
# Write one processed title per line.  The context manager closes the file
# even if the write raises.
with open('../datasets/artigos/preprocessed/2017/artigosPre.txt', 'w') as f:
    f.writelines(title + "\n" for title in artigos)

- Separar em grupos procurando artigos iguais

In [21]:
# Save the processed frame as a '|'-separated CSV file.
df_pre.to_csv('../datasets/artigos/preprocessed/2017/artigosPre.csv',sep="|")