# Pré-Processamento dos Documentos

A limpeza dos dados é um processo essencial para garantir a qualidade e a confiabilidade das informações armazenadas em um banco de dados. A limpeza dos dados envolve a identificação e a correção de erros, inconsistências, duplicidades e valores ausentes nos dados. A arquitetura do armazenamento é a forma como os dados são organizados, estruturados e acessados em um banco de dados. Uma das opções de arquitetura é o formato YAML, que significa YAML Ain't Markup Language. O YAML é um formato de serialização de dados que usa uma sintaxe simples e legível para representar estruturas de dados como listas, mapas, sequências e escalares. O YAML é compatível com diversas linguagens de programação e pode ser usado para armazenar dados de forma hierárquica e flexível.

<!-- <hr style="border-width: 1px" width="95%" > -->
<div></div> 

In [1]:
# Importa os módulos necessários
import os    # Módulo para lidar com funções do sistema operacional
import gc    # Módulo para realizar coleta de lixo e gerenciamento de memória

import numpy as np   # Módulo para trabalhar com matrizes e funções matemáticas
import pandas as pd  # Módulo para trabalhar com dataframes e séries em Python


<div></div> 

## Estruturação dos Arquivos

<div></div> 

In [31]:
# caminho das queries 
query_path = '../data/emails/mini_newsgroups/'

# caminho dos documentos
docs_path = '../data/emails/20_newsgroups/'

# Iterate over each file in the directory and its subdirectories
def process_files(doc_dir: str): 
    
    database = [] 
    
    for filepath in os.listdir(doc_dir): 
        
        for filename in os.listdir(f'{doc_dir}{filepath}'):

            # Open each file individually and read its contents
            with open(os.path.join(doc_dir, filepath, filename), 'r') as f:
                text_data = f.read().strip()

            # Split the header and body of the email
            try:
                header, body = text_data.split('\n\n', maxsplit=1)
            except:
                continue

            # Convert header to a dictionary
            # header_dict = {}
            # for line in header.split('\n'):
            #     try:
            #         # Split the key and value in each header field and store them in a dictionary
            #         key, value = line.strip().split(': ', maxsplit=1)
            #         header_dict[key] = value
            #     except:
            #         # If a header field cannot be split properly, skip it and continue
            #         continue

            # Append the processed data to the list

            database.append({'filepath': filepath, 
                            'filename': filename,
                            'body': body, 
                            # **header_dict,
                            # 'text': text_data
                            })
    return database

# tranformation from dict -> dataframe
base_doc = pd.DataFrame(process_files(docs))

base_doc = pd.DataFrame(process_files(doc_dir))

# remove database from memory
gc.collect()

0

In [5]:
# caminho das queries 
query_path = '../data/emails/mini_newsgroups/'

# caminho dos documentos
docs_path = '../data/emails/20_newsgroups/'

# Import das bases
database_docs = read_files(docs_path)
database_query = read_files(query_path)

base_docs = pd.DataFrame(database_docs)
base_query = pd.DataFrame(database_query)

# Marcação das bases
base_docs['tag'] = 'doc'
base_query['tag'] = 'query'

# junção das bases 
base = pd.concat([base_docs, base_query])
base.reset_index(drop=True, inplace=True)

del base_docs, base_query, database_docs, database_query
gc.collect()


PermissionError: [Errno 13] Permission denied: '../data/emails/20_newsgroups/alt.atheism'

In [4]:
base

Unnamed: 0,filepath,filename,text,tag
0,../data/emails/20_newsgroups/misc.forsale/,70337,Path: cantaloupe.srv.cs.cmu.edu!rochester!udel...,doc
1,../data/emails/20_newsgroups/misc.forsale/,74150,Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv....,doc
2,../data/emails/20_newsgroups/misc.forsale/,74720,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,doc
3,../data/emails/20_newsgroups/misc.forsale/,74721,Newsgroups: misc.forsale\nPath: cantaloupe.srv...,doc
4,../data/emails/20_newsgroups/misc.forsale/,74722,Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv....,doc
...,...,...,...,...
1095,../data/emails/mini_newsgroups/misc.forsale/,76927,Xref: cantaloupe.srv.cs.cmu.edu misc.wanted:31...,query
1096,../data/emails/mini_newsgroups/misc.forsale/,76936,Newsgroups: misc.forsale\nSubject: WANTED LCD ...,query
1097,../data/emails/mini_newsgroups/misc.forsale/,76937,"Newsgroups: ingr.forsale,hsv.forsale,misc.fors...",query
1098,../data/emails/mini_newsgroups/misc.forsale/,76940,Newsgroups: misc.forsale\nPath: cantaloupe.srv...,query


<div></div> 

## Processamento de Texto

<div></div> 

### Transformação de minúsculos

<div></div> 

In [4]:
# (\[a-z]): para encontrar todos os caracteres que começam com uma barra invertida () seguida por uma letra minúscula (a-z);
# ([^\w\]): para encontrar todos os caracteres que não são letras, números ou barras invertidas ();
# (\S+\d\S+): para encontrar todos os trechos de texto que contêm um ou mais caracteres não brancos (\S), 
# seguidos por um dígito (\d), seguidos por mais um ou mais caracteres não brancos (\S).
base['post'] = base['text'].replace(r'(\\[a-z])|([^\w\\])|(\S+\d\S+)', ' ', regex=True)


# Aplicando as funções str.lower() e str.strip() simultaneamente
base['post'] = base['post'].apply(lambda x: x.lower().strip())


<div></div> 

### Tokenização e Lemmatizer

<div></div>

In [5]:
from ir.preprocessing import lemmatize_word

base['post'].apply(lambda x: ' '.join([lemmatize_word(word.lower()) for word in x.split()]))

0       path cantaloupe srv c cmu edu rochester udel g...
1       path from myoakam ci ohio state edu micah r yo...
2       path from maureen l eagle newsgroup misc forsa...
3       newsgroup misc forsale path from mike diack mi...
4       path from jvinson xsoft xerox com jeffrey a vi...
                              ...                        
1095    xref cantaloupe srv c cmu edu newsgroup misc w...
1096    newsgroup misc forsale subject want lcd overhe...
1097    newsgroup ingr forsale hsv forsale misc forsal...
1098    newsgroup misc forsale path cantaloupe srv c c...
1099    xref cantaloupe srv c cmu edu path from scott ...
Name: post, Length: 1100, dtype: object

In [6]:
from ir import tf_idf

weight = tf_idf.tfidf(base, 'post').iloc[1:]


### Identificação das query / docs

In [36]:
d_index = base.query('tag=="doc"').index
q_index = base.query('tag=="query"').index

In [145]:
import itertools

similarity = dict()
index_matrix = dict()
rank_matrix = dict()

for j in q_index:
    for i in d_index: 
        numerator = np.sum( weight.loc[:,i] * weight.loc[: , j])
        denominator = np.linalg.norm(weight.loc[:,i])*np.linalg.norm(weight.loc[:,1019])
        similarity[i] = numerator/denominator


    rank_matrix[j] = pd.DataFrame(similarity.values(), columns=['rank']).sort_values(by='rank', ascending=False).head(10).values.tolist()
    index_matrix[j] = pd.DataFrame(similarity.values(), columns=['rank']).sort_values(by='rank', ascending=False).head(10).index.to_list()

    


In [153]:
pd.DataFrame(rank_matrix)

Unnamed: 0,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,...,1090,1091,1092,1093,1094,1095,1096,1097,1098,1099
0,[0.6422584945103841],[0.668743974268336],[1.008422835773131],[0.6661363151210968],[0.8642602181331219],[0.7417012424961686],[0.7315442686011049],[0.9460833202189584],[0.9746037822064916],[0.6869123604148133],...,[1.1523833456270243],[0.7956243694544436],[1.7802942925566625],[0.5158809769289813],[0.5234239788184921],[0.8038496213979783],[0.6360279187397326],[1.1259044435676868],[1.0616162379284384],[0.6784846202202287]
1,[0.2873172889001637],[0.17456679249116952],[0.6332470445844255],[0.09678526898308248],[0.537824913541413],[0.2820702175096787],[0.6513139955405113],[0.3597970619791958],[0.24900044829809295],[0.14368626326554504],...,[0.42456045151374616],[0.13839258426540738],[0.1920094056436893],[0.4608108432879023],[0.46754863207460967],[0.635942460180357],[0.2673113797648495],[0.09064837068978648],[0.19426228087688888],[0.17427577670397354]
2,[0.19584221062188678],[0.1319897299803032],[0.08491363433325547],[0.08737476357264361],[0.47799964699339453],[0.20691840972537448],[0.0713300455225047],[0.18078764107798664],[0.11093369090167131],[0.09580043962698802],...,[0.39006994534248846],[0.09497709201044909],[0.17705994384084703],[0.1911406606344065],[0.19471247373710213],[0.6126542869398587],[0.06788233871189489],[0.07823702305427702],[0.17319361461646068],[0.07447944169226609]
3,[0.13903116434341475],[0.10146093652625415],[0.08181050400129704],[0.07523129301857605],[0.18045913119908524],[0.20260530333012666],[0.054813355262948726],[0.16776960486431058],[0.10031171790840715],[0.07826399750895882],...,[0.2509003350905366],[0.08889171346193057],[0.16005754765559338],[0.053302833660593664],[0.053302833660593664],[0.2514441707424639],[0.06610690581400588],[0.07732220562999899],[0.17148587761263784],[0.04757751873987604]
4,[0.1129173777348391],[0.08606053698831793],[0.07547569629425901],[0.06970411933742698],[0.17135272355687187],[0.20091765863852995],[0.05469328989879712],[0.16743762657989103],[0.09811418008672529],[0.07488273104320195],...,[0.1456676671253771],[0.08031528425526144],[0.1591723994058402],[0.05187299077727413],[0.04876597436921601],[0.24540764374264745],[0.064867860088572],[0.07458882712067232],[0.1268769662336863],[0.04675221931850179]
5,[0.11038217708080131],[0.08212202043548315],[0.07432402899651668],[0.06957535331034335],[0.11414112796082569],[0.09766529639479365],[0.05342228639423025],[0.11293055713592527],[0.09499508593239105],[0.07351917900522721],...,[0.1356468036645224],[0.07846954905194828],[0.15849239426755599],[0.04462161696830591],[0.04472544383210234],[0.2051558258313172],[0.06082789393903154],[0.06855422701443233],[0.08964124013628569],[0.043706874358644336]
6,[0.10833574384967115],[0.08201334401786602],[0.07412645164725896],[0.06759708576024981],[0.09130509649851562],[0.08716787810262448],[0.0497899252125642],[0.09487498299853164],[0.08477817264550953],[0.06935710726422203],...,[0.13395484290588885],[0.07651073480529516],[0.14400058142958222],[0.04329534404644248],[0.04462161696830591],[0.10677052218571517],[0.05830025911923057],[0.06718300838215913],[0.08292563259802246],[0.04288337427490202]
7,[0.09458970277908005],[0.08147133735902357],[0.07118734960890093],[0.06755116098175724],[0.08423578168208135],[0.08564850087282809],[0.04712198065394311],[0.08900364470414873],[0.0845410509094711],[0.06696048135598477],...,[0.12832106795670173],[0.0742473891955823],[0.1337598250032541],[0.03987922316585672],[0.04329534404644248],[0.08913041300626738],[0.05705502660231897],[0.06310045914518085],[0.07445114613146177],[0.04255318648575154]
8,[0.06683185055384143],[0.08107329389309223],[0.06992507760853946],[0.0665144106585746],[0.07732918671239193],[0.07767329982049115],[0.04496152399683175],[0.07886393345966285],[0.08159272098702083],[0.0655222722939256],...,[0.12560417304820823],[0.07411198929338968],[0.1331408833577327],[0.03857993879037075],[0.038711425869337984],[0.08897777442428674],[0.05560594182068467],[0.0626845924656044],[0.07237739511197822],[0.041613779283231046]
9,[0.06139453131422424],[0.07948227337943198],[0.06689020592169981],[0.05955785852513209],[0.07517277782794342],[0.07765247541664737],[0.04318953773752061],[0.07531732464870161],[0.08098184954929952],[0.06458414852590263],...,[0.12278663409161311],[0.07365677105558269],[0.13062404036287226],[0.03855871951859559],[0.03855871951859559],[0.08461722594983144],[0.053320319118708596],[0.06258022212397257],[0.07103356433858925],[0.04145958334073781]
