# Pré-Processamento dos Documentos

A limpeza dos dados é um processo essencial para garantir a qualidade e a confiabilidade das informações armazenadas em um banco de dados. A limpeza dos dados envolve a identificação e a correção de erros, inconsistências, duplicidades e valores ausentes nos dados. A arquitetura do armazenamento é a forma como os dados são organizados, estruturados e acessados em um banco de dados. Uma das opções de arquitetura é o formato YAML, que significa YAML Ain't Markup Language. O YAML é um formato de serialização de dados que usa uma sintaxe simples e legível para representar estruturas de dados como listas, mapas, sequências e escalares. O YAML é compatível com diversas linguagens de programação e pode ser usado para armazenar dados de forma hierárquica e flexível.

<!-- <hr style="border-width: 1px" width="95%" > -->
<div></div> 

In [1]:
# Importa os módulos necessários
import os    # Módulo para lidar com funções do sistema operacional
import gc    # Módulo para realizar coleta de lixo e gerenciamento de memória

import numpy as np   # Módulo para trabalhar com matrizes e funções matemáticas
import pandas as pd  # Módulo para trabalhar com dataframes e séries em Python

import nltk # Módulo para processamento de linguagem natural



<div></div> 

## Estruturação dos Arquivos

<div></div> 

In [None]:
# caminho das queries 
query_path = '../data/emails/mini_newsgroups/misc.forsale/'

# caminho dos documentos
docs_path = '../data/emails/20_newsgroups/misc.forsale/'

In [None]:
def read_files(doc_dir):
    # Use a list comprehension to get a list of file paths
    database = [{'filepath': doc_dir,
                 'filename': filename,
                 'text': open(os.path.join(doc_dir, filename), 'r').read().strip()}
                 for filename in os.listdir(doc_dir)]

    return database


In [None]:
# Import das bases
database_docs = read_files(query_path)
database_query = read_files(query_path)

base_docs = pd.DataFrame(database_docs)
base_query = pd.DataFrame(database_query)

# Marcação das bases
base_docs['tag'] = 'doc'
base_query['tag'] = 'query'

# junção das bases 
base = pd.concat([base_docs, base_query])

del base_docs, base_query, database_docs, database_query
gc.collect()


35

<div></div> 

## Processamento de Texto

<div></div> 

### Transformação de minúsculos

<div></div> 

In [None]:
import ir 

In [None]:
# (\[a-z]): para encontrar todos os caracteres que começam com uma barra invertida () seguida por uma letra minúscula (a-z);
# ([^\w\]): para encontrar todos os caracteres que não são letras, números ou barras invertidas ();
# (\S+\d\S+): para encontrar todos os trechos de texto que contêm um ou mais caracteres não brancos (\S), seguidos por um dígito (\d), seguidos por mais um ou mais caracteres não brancos (\S).
base['post'] = base['text'].replace(r'(\\[a-z])|([^\w\\])|(\S+\d\S+)', ' ', regex=True)


# Aplicando as funções str.lower() e str.strip() simultaneamente
base['post'] = base['post'].apply(lambda x: x.lower().strip())


In [None]:
from ir.preprocessing import PreProcessing as pp 

In [None]:
pp.tfidf(base, 'post')

AttributeError: type object 'PreProcessing' has no attribute 'tfidf'

In [22]:
tf_idf.tfidf(base.query('tag=="doc"'), 'post').fillna(0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
edu,0.769037,0.769037,0.553274,0.612645,0.702170,0.702170,0.476731,0.476731,0.702170,0.368849,...,0.661155,0.476731,0.702170,0.368849,0.368849,0.553274,0.661155,0.702170,0.661155,0.184425
wpi,23.817975,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
to,0.392922,0.000000,0.578730,0.152003,0.456009,0.152003,0.152003,0.304006,0.392922,0.152003,...,0.304006,0.392922,0.544925,0.000000,0.000000,0.152003,0.152003,0.152003,0.504943,0.152003
cs,1.023461,0.395929,0.395929,0.395929,0.791857,0.395929,0.395929,0.000000,0.791857,0.395929,...,0.395929,0.000000,0.791857,0.000000,0.000000,0.791857,0.395929,0.791857,0.395929,0.395929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
msrp,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,6.643856
scotts,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,6.643856
bbking,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,6.643856
sherman,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,6.643856


<div></div> 

### Tokenização e Lemmatizer

<div></div>

In [26]:
base['post'].apply(lambda x: ' '.join([lemmatize_word(word.lower()) for word in x.split()]))

0     path cantaloupe srv cs cmu edu rochester udel ...
1     path cantaloupe srv cs cmu edu da new harvard ...
2     newsgroup misc forsale path cantaloupe srv cs ...
3     path cantaloupe srv cs cmu edu rochester corne...
4     xref cantaloupe srv cs cmu edu path cantaloupe...
                            ...                        
95    xref cantaloupe srv cs cmu edu newsgroup misc ...
96    newsgroup misc forsale subject want lcd overhe...
97    newsgroup ingr forsale hsv forsale misc forsal...
98    newsgroup misc forsale path cantaloupe srv cs ...
99    xref cantaloupe srv cs cmu edu path from scott...
Name: post, Length: 200, dtype: object

In [None]:
def lemmatize_text(df, input_col):
    vowels = ['a', 'e', 'i', 'o', 'u']

a    def lemmatize_word(word):
        if len(word) <= 2:
            return word
        
        if word.endswith('ns'):
            return word[:-2]
        
        if word.endswith('s'):
            return word[:-1]
        
        if word.endswith('ing') and len(word) > 5:
            if word[-4] == word[-5] and word[-5] not in vowels:
                return word[:-4] + word[-3:]
            elif word[-3] in vowels:
                return word[:-3]
            else:
                return word[:-2]
        
        if word.endswith('ly') and len(word) > 4:
            return word[:-2]
        
        if word.endswith('ed') and len(word) > 3:
            if word[-3] == word[-4] and word[-4] not in vowels:
                return word[:-3] + word[-2:]
            else:
                return word[:-2]
        
        return word

    df[output_col] = df[input_col].apply(lambda x: ' '.join([lemmatize_word(word.lower()) for word in x.split()]))
    return df


In [4]:
nltk.download('wordnet')    # faz o download do recurso 'wordnet' do nltk
nltk.download('punkt')     # faz o download do recurso 'punkt' do nltk

# Cria um objeto 'w_tokenizer' da classe 'WhitespaceTokenizer' do nltk para tokenizar o texto por espaços em branco
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

# Cria um objeto 'lemmatizer' da classe 'WordNetLemmatizer' do nltk para realizar a lematização das palavras
lemmatizer = nltk.WordNetLemmatizer()

# Define a função 'lemmatizer_text' que recebe um texto como entrada, tokeniza o texto em palavras e lematiza cada palavra
def lemmatizer_text(text): 
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

# Cria uma nova coluna 'tokens' na tabela 'base_inicial' que contém uma lista de tokens lematizados para cada texto
base_inicial['tokens'] = base_inicial['text'].map(lemmatizer_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<div></div> 

### Remoção de Stopwords

<div></div>

In [5]:
# Faz o download do recurso 'stopwords' do nltk
nltk.download('stopwords')

# Define a lista de stopwords em inglês usando o módulo stopwords do nltk
stopwords = stopwords.words('english')

# Aplica a função lambda em cada linha da coluna 'text' da tabela 'base_inicial'
# A função lambda realiza a tokenização do texto, transforma as palavras em minúsculas e remove as stopwords
base_inicial['text'] = base_inicial['text'].apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stopwords))
base_inicial['tokens'] = base_inicial['tokens'].apply(lambda words: [word.lower() for word in words if word not in stopwords])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# # Cria uma lista de palavras a partir da lista de tokens
# w = [j for i in list(itertools.chain(base_inicial['tokens'])) for j in i]

# # Instancia um objeto SpellChecker para correção ortográfica
# spell = SpellChecker()

# if !os.path.isfile(): 
#     # Cria um dicionário com as palavras únicas da lista, faz a correção ortográfica e associa com a palavra original
#     spell_checked = {word: spell.correction(word) for word in pd.Series(w).unique()}

#     # Define o caminho do arquivo que irá armazenar o dicionário serializado
#     path = '../references/spellcheck.pickle'

#     # Abre o arquivo para gravação em modo binário e escreve o objeto serializado
#     with open(path, 'wb') as file: 
#         pickle.dump(spell_checked, file)
# else: 
#     pass

### Export da base

In [7]:
path = '../data/processed/base_processed.parquet.gzip'

if os.path.isfile(path): 
    answer = input('File already exists, do you want to overwrite? (y/n)')
    if answer.lower() in ['s', 'y']:
        base_inicial.to_parquet(path, compression='gzip')
    else:
        raise FileExistsError('File already exists')
else: 
    base_inicial.to_parquet(path, compression='gzip')