# Identificador de Arquivos Relevantes

Identifica os principais tópicos de uma lista de arquivos PDF e define se eles são relevantes ou não para um contexto predefinido.

**Requisitos**: 
* Arquivo de configuração: config.ini

## Sumário
1. [Importações e configurações](#p1)
2. [Lista arquivos PDF a serem processados](#p2)
3. [Função de pré-processamento](#p3)
4. [Processa os arquivos PDF](#p4)
5. [Modelador de tópico](#p5)
6. [Busca de contexto](#p6)
7. [Resultado](#p7)

<a id="p1"></a>
## 1. Importações e configurações

In [None]:
# Imports

import configparser
import os
import PyPDF2
import csv
import spacy
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Settings

# INI file that holds the values read by the rest of this notebook
# (all of them under the "Settings" section).
CONFIG_FILE = "config.ini"

if not os.path.isfile(CONFIG_FILE):
    print(f"ERROR! Configuration file not found: {CONFIG_FILE}")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # added by the `site` module for interactive use and is not guaranteed to
    # be available in every environment this code may run in.
    raise SystemExit(1)

config = configparser.ConfigParser()
config.read(CONFIG_FILE)

# Usage example:
#Example = config["Settings"]["PdfFilesPath"]

<a id="p2"></a>
## 2. Lista arquivos PDF a serem processados

In [None]:
# Find all PDFs from path
pdf_search_path = config["Settings"]["PdfFilesPath"]

# Keep only regular files whose name ends with ".pdf"; every entry is the
# file name joined with the search path.
pdf_list = [
    os.path.join(pdf_search_path, f)
    for f in os.listdir(pdf_search_path)
    if f.endswith(".pdf") and os.path.isfile(os.path.join(pdf_search_path, f))
]

# Read processed files
processed_pdf_file = config["Settings"]["ProcessedPdfList"]
processed_pdf_list = list()
# The processed-files list may not exist yet (e.g. on the very first run):
# treat that as "nothing processed so far" instead of failing with
# FileNotFoundError.
if os.path.isfile(processed_pdf_file):
    # newline="" is what the csv module asks for when reading a file.
    with open(processed_pdf_file, newline="") as csvfile:
        csvreader = csv.reader(csvfile)
        # csv.reader yields an empty list for a blank line; skip those rows
        # instead of raising IndexError on row[0].
        processed_pdf_list = [row[0] for row in csvreader if row]

# Smaller sample
max_processed_files = int(config["Settings"]["MaxPdfFileProcessing"])

# A set keeps the "already processed?" check O(1) per candidate file.
already_processed = set(processed_pdf_list)
pdf_list = [pdf_file for pdf_file in pdf_list if pdf_file not in already_processed]
pdf_list = pdf_list[:max_processed_files]

In [None]:
# Display the PDF files selected for this run
pdf_list

<a id="p3"></a>
## 3. Função de pré-processamento

In [None]:
# Load the spaCy pipeline registered under the name 'pt'; it is used to
# tokenize the text extracted from each PDF.
nlp = spacy.load('pt')

# Custom spacy tokenizer
def process_tokens(tokens):
    """Reduce an iterable of spaCy-like tokens to one normalised string.

    A token is dropped when it is punctuation, a stop word, URL-like,
    e-mail-like, number-like, tagged NUM/SYM/SPACE, part of a DATE/TIME
    entity, or one of a fixed set of stray single characters. Each remaining
    token contributes its lower-cased, stripped lemma, except tokens whose
    lemma is the "-PRON-" placeholder, which contribute their lower-cased
    text instead.

    Returns the resulting words joined by single spaces.
    """
    # Stray single characters that must never reach the output
    unwanted_chars = set("%&!'()*#$˛ˇ\"?+-/˚ˇˆ˙˝˛˝˚˙˜˘")
    rejected_pos = ("NUM", "SYM", "SPACE")
    rejected_entities = ("DATE", "TIME")

    words = []
    for tok in tokens:
        # Flag-based rejections: punctuation, stop words, URLs, e-mails, numbers
        if tok.is_punct or tok.is_stop or tok.like_url or tok.like_email or tok.like_num:
            continue
        # Tag-based rejections: part of speech and entity type
        if tok.pos_ in rejected_pos or tok.ent_type_ in rejected_entities:
            continue
        if tok.text in unwanted_chars:
            continue

        # "-PRON-" is a placeholder lemma, so fall back to the token's own text
        if tok.lemma_ == "-PRON-":
            words.append(tok.lower_)
        else:
            words.append(tok.lemma_.lower().strip())

    return " ".join(words)

<a id="p4"></a>
## 4. Processa os arquivos PDF

In [None]:
# Extract and pre-process the text of every selected PDF.
# Each entry of pdfs_content is [pdf path, pre-processed text].
pdfs_content = []
for pdf in pdf_list:
    try:
        # The context manager closes the file on every path, including when
        # parsing fails. Opening inside the try also lets an unreadable file
        # be reported and skipped like any other failing PDF.
        with open(pdf, "rb") as pdf_stream:
            pdf_reader = PyPDF2.PdfFileReader(pdf_stream)

            # Collect the page texts and join them once instead of growing a
            # string with += on every page.
            pdf_text = "".join(
                pdf_reader.getPage(p).extractText()
                for p in range(pdf_reader.numPages)
            )

        tokens = nlp(pdf_text)
        pdfs_content.append([pdf, process_tokens(tokens)])
    except Exception as ex:
        # TODO: Better process this errors
        # Best effort: report the failing file and carry on with the others.
        print(f"PdfReadError! {pdf} - {type(ex).__name__}")

In [None]:
#print(pdfs_content)

In [None]:
# Convert to data frame
# One row per entry of pdfs_content: 'PdfFile' holds the first element of the
# entry and 'Content' the second.
df = pd.DataFrame(pdfs_content, columns = ['PdfFile', 'Content'])

In [None]:
# Preview the first rows of the data frame. head has to be *called*: printing
# the bare attribute shows the bound-method repr instead of the preview.
print(df.head())

<a id="p5"></a>
## 5. Modelador de tópico

In [None]:
# Bag-of-words model: max_df=0.95 / min_df=2 discard terms that occur in more
# than 95% of the documents or in fewer than 2 documents, and the spaCy
# Portuguese stop words are removed. The stop words are passed as a list,
# which is the collection type scikit-learn documents for this parameter.
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words=sorted(spacy.lang.pt.stop_words.STOP_WORDS))

dtm = cv.fit_transform(df['Content'])

num_topics = int(config["Settings"]["NumberOfTopics"])
random_state = int(config["Settings"]["RandomState"])

lda = LatentDirichletAllocation(n_components=num_topics,random_state=random_state)
lda.fit(dtm)

# Resolve the vocabulary once, outside the loop, instead of rebuilding it for
# every term of every topic. get_feature_names_out() is the current
# scikit-learn name; get_feature_names() is the fallback for older releases
# that only provide that method.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# For every topic keep its 15 highest-weighted terms as a set of words.
top_topics = [
    {str(feature_names[i]) for i in topic.argsort()[-15:]}
    for topic in lda.components_
]

In [None]:
# Check found topics
# Print the index of every topic next to its set of top terms.
for topic_index, topic in enumerate(top_topics):
    print(topic_index, topic)

<a id="p6"></a>
## 6. Busca de contexto

In [None]:
# Whitespace-separated key words that characterise the relevant context.
# They are lower-cased because the terms they are compared with are lower
# case (CountVectorizer lower-cases its input by default).
searched_topics = set(config["Settings"]["RelevantTopicKeyWords"].lower().split())

# Index of the topic whose top terms contain every key word; None when no
# topic qualifies. If several topics qualify, the last one wins.
found_topic_index = None

# An empty key-word set is a subset of every topic, so without this guard an
# empty setting would silently select the last topic as the relevant one.
if searched_topics:
    for topic_index, topic in enumerate(top_topics):
        if searched_topics.issubset(topic):
            found_topic_index = topic_index

# Label every document with the topic that has the highest score for it.
topic_results = lda.transform(dtm)
df['Topic'] = topic_results.argmax(axis=1)

In [None]:
# Index of the topic that matched the searched key words, or None if no topic matched
print(found_topic_index)

In [None]:
# Preview the first rows, now including the 'Topic' column. head has to be
# *called*: printing the bare attribute shows the bound-method repr instead.
print(df.head())

In [None]:
# Aux function to set relevant topic
def set_relevant(row, target_topic):
    """Return 1 when the row belongs to the target topic, otherwise 0.

    Args:
        row: Mapping-like object with a 'Topic' entry (e.g. a data frame row).
        target_topic: Index of the relevant topic, or None when no relevant
            topic was found.

    Returns:
        1 if target_topic is not None and equals row['Topic'], else 0.
    """
    # target topic not found. Compare against None explicitly: topic index 0
    # is a valid topic and must not be mistaken for "not found".
    if target_topic is None:
        return 0

    # target topic found
    return 1 if row['Topic'] == target_topic else 0

<a id="p7"></a>
## 7. Resultado

In [None]:
# Add the "Relevant" column: set_relevant is applied to every row together
# with the matched topic index and yields 1 (matched) or 0 (not matched).
df['Relevant'] = df.apply(set_relevant, axis=1, args=(found_topic_index,))

In [None]:
# Render the "PdfFile" and "Relevant" columns as CSV text (no index, no
# header, non-numeric values quoted) and print that text.
output_columns = ["PdfFile", "Relevant"]
formatted_df_output = df.to_csv(
    columns=output_columns,
    index=False,
    header=False,
    quoting=csv.QUOTE_NONNUMERIC,
)
print(formatted_df_output)

In [None]:
# Display the file / relevance pairs as a table
df[['PdfFile','Relevant']]

In [None]:
# Append to file
# newline="" turns off newline translation, so the line endings already
# present in the CSV text are written unchanged instead of being translated a
# second time (which would produce "\r\r\n" where the text uses "\r\n").
with open(processed_pdf_file, "a", newline="") as output_file:
    output_file.write(formatted_df_output)