In [3]:
# !pip install PyMuPDF
# !pip install langchain-community
# !pip install nltk
# !pip install spacy
# !pip install pdfplumber

In [4]:
from langchain_community.document_loaders import PyMuPDFLoader
import os
import unicodedata
import re
import nltk
from nltk.corpus import stopwords
import spacy
import pdfplumber

In [None]:
##### RUN THIS COMMAND JUST ONE TIME LOCALLY ########################################
# nltk.download('stopwords')

In [None]:
##### RUN THIS COMMAND JUST ONE TIME LOCALLY ########################################
# !python -m spacy download pt_core_news_sm

### Bronze to Silver0

In [6]:
def list_files_in_directory2(directory_path):
    """Return the paths of the regular files directly inside ``directory_path``.

    Each returned path is ``directory_path`` joined with the file name, so the
    result can be passed straight to ``open``. Sub-directories are skipped.

    Args:
        directory_path: Folder to list, with or without a trailing separator.

    Returns:
        A list of file paths, in the order reported by ``os.listdir``.
    """
    file_paths = []
    for entry in os.listdir(directory_path):
        # os.path.join inserts the separator only when it is missing, so the
        # path is valid whether or not the caller passed a trailing slash.
        full_path = os.path.join(directory_path, entry)
        if os.path.isfile(full_path):
            file_paths.append(full_path)
    return file_paths

def list_files_in_directory(directory_path):
    """Return the names (not the paths) of the regular files in ``directory_path``.

    Sub-directories are skipped. Names come back in ``os.listdir`` order.
    """
    file_names = []
    for entry in os.listdir(directory_path):
        # Keep the bare entry name; the joined path is only used for the check
        if os.path.isfile(os.path.join(directory_path, entry)):
            file_names.append(entry)
    return file_names

def pdf_to_txt(file_name):
    """Extract the text of one bronze-layer PDF into a silver0 ``.txt`` file.

    The input folder is read from the module-level ``bronze_path`` variable,
    which must be defined before this function is called. The output is
    written to ``../medallion/silver0/<base name>.txt`` as UTF-8.

    Args:
        file_name: Name of the file inside ``bronze_path`` (a name, not a path).
    """
    loader = PyMuPDFLoader(os.path.join(bronze_path, file_name))
    # load() returns a list of document objects; the text of each one is
    # exposed through its `page_content` attribute.
    documents = loader.load()

    # Concatenate everything into a single string in one pass
    text = ''.join(document.page_content for document in documents)

    # splitext removes the real extension whatever its length, whereas a
    # fixed [:-4] slice only works for three-letter extensions such as '.pdf'.
    base_name = os.path.splitext(file_name)[0]
    silver_path = f'../medallion/silver0/{base_name}.txt'

    with open(silver_path, 'w', encoding='utf-8') as file:
        file.write(text)

def _to_plain_ascii_lowercase(text):
    """Strip accents, drop everything but letters/digits/whitespace, lowercase."""
    # NFKD splits accented letters into base letter + combining mark ...
    decomposed = unicodedata.normalize('NFKD', text)
    # ... and the ASCII round-trip then discards the marks (and any other
    # character that has no ASCII representation).
    ascii_only = decomposed.encode('ASCII', 'ignore').decode('ascii')
    # Whitespace is kept on purpose so words and line breaks survive
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9\s]', '', ascii_only)
    return alphanumeric_only.lower()


def remove_accents_and_special_characters(file0):
    """Write an accent-free, lowercase, alphanumeric-only copy of a text file.

    The input is read as UTF-8. The output path is ``file0`` with every
    occurrence of ``'silver0'`` replaced by ``'silver1'``; the destination
    folder must already exist.

    NOTE(review): if ``file0`` does not contain ``'silver0'`` the output path
    equals the input path and the source file is overwritten.

    Args:
        file0: Path of the text file to clean.
    """
    with open(file0, 'r', encoding='utf-8') as file:
        text = file.read()

    clean_text = _to_plain_ascii_lowercase(text)

    silver1_path = file0.replace('silver0', 'silver1')

    # Explicit encoding keeps the output identical across platforms and
    # consistent with how the file was read.
    with open(silver1_path, 'w', encoding='utf-8') as file:
        file.write(clean_text)

def convert_to_markdown(pdf_path, markdown_path):
    """Dump the text of every page of a PDF into a single text/Markdown file.

    Each page's text is preceded by a blank line (``"\\n\\n"``). No Markdown
    formatting (headings, lists, ...) is applied; the text is written as
    extracted.

    Args:
        pdf_path: Path of the PDF to read with pdfplumber.
        markdown_path: Path of the UTF-8 file to write.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # A page without extractable text may yield None instead of a
            # string; fall back to '' so the concatenation below cannot fail.
            page_texts.append(page.extract_text() or "")

    markdown_content = "".join("\n\n" + page_text for page_text in page_texts)

    # Save the collected content to the target file
    with open(markdown_path, 'w', encoding='utf-8') as md_file:
        md_file.write(markdown_content)

In [49]:
# Bronze -> Silver0: extract the text of every file found in the bronze folder.
# `bronze_path` must stay a module-level name because pdf_to_txt reads it as a
# global; the trailing slash is kept as part of the path.
bronze_path = '../medallion/bronze/'
file_list = list_files_in_directory(bronze_path)

# list_files_in_directory returns bare file names, which is what pdf_to_txt expects
for file in file_list:
    pdf_to_txt(file)



### Silver0: pdf to txt

In [64]:
# Show the paths of the text files currently in the silver0 folder
# (the bare last expression is rendered as the cell output).
silver0_path = '../medallion/silver0/'
list_files_in_directory2(silver0_path)

['../medallion/silver0/plano-acao-adaptacao-climatica-nacional.txt',
 '../medallion/silver0/plano-acao-climatica-agro.txt',
 '../medallion/silver0/plano-acao-climatica-curitiba.txt',
 '../medallion/silver0/plano-acao-climatica-federal.txt',
 '../medallion/silver0/plano-acao-climatica-itabirito.txt',
 '../medallion/silver0/plano-acao-climatica-joao-pessoa.txt',
 '../medallion/silver0/plano-acao-climatica-sp-regiao.txt',
 '../medallion/silver0/plano-enfrentamento-mudanca-climatica-nacional.txt']

In [52]:
def read_and_convert_to_lowercase(filename):
    """Read a UTF-8 text file and return its whole content in lowercase."""
    with open(filename, mode='r', encoding='utf-8') as handle:
        return handle.read().lower()

### Silver 1: remove accents and special characters

In [8]:
# Silver0 -> Silver1: clean every text file in the `silver0_manual` folder.
# NOTE(review): this rebinds `silver0_path` to a different folder than the
# listing cell above; presumably a manually curated copy of silver0 - confirm.
# Because the cleaning function swaps 'silver0' for 'silver1' in the path,
# the results land in '../medallion/silver1_manual/'.
silver0_path = '../medallion/silver0_manual/'
file_list = list_files_in_directory2(silver0_path)

for file0 in file_list:
    remove_accents_and_special_characters(file0)

### Silver 2: stopwords

In [9]:
import nltk
from nltk.corpus import stopwords

def remove_stopwords(file0):
    """Write a copy of a text file with Portuguese stopwords removed.

    The input is read as UTF-8 and split on whitespace; the surviving words
    are re-joined with single spaces, so the original line breaks are not
    preserved. The output path is ``file0`` with ``'silver1'`` replaced by
    ``'silver2'``; the destination folder must already exist.

    Requires the NLTK ``stopwords`` corpus to be downloaded.

    Args:
        file0: Path of the text file to filter.
    """

    def _strip_accents(word):
        # Same accent folding as the silver1 cleaning step
        decomposed = unicodedata.normalize('NFKD', word)
        return decomposed.encode('ASCII', 'ignore').decode('ascii')

    # Open the file and read all the content
    with open(file0, 'r', encoding='utf-8') as file:
        text = file.read()

    # Get the list of Portuguese stopwords
    stop_words = set(stopwords.words('portuguese'))
    # The silver1 input has already lost its accents, so accented stopwords
    # from the corpus would never match; add their accent-free variants too.
    stop_words |= {_strip_accents(word) for word in stop_words}

    # Keep only the words that are not stopwords (case-insensitive)
    kept_words = [word for word in text.split() if word.lower() not in stop_words]

    # Join the filtered words back into a string
    result = ' '.join(kept_words)

    # Replace 'silver1' with 'silver2' to get the output path
    silver2_path = file0.replace('silver1', 'silver2')

    # Explicit encoding keeps the output consistent with how it was read
    with open(silver2_path, 'w', encoding='utf-8') as file:
        file.write(result)

In [11]:
# Silver1 -> Silver2: remove stopwords from every file in `silver1_manual`.
# remove_stopwords swaps 'silver1' for 'silver2' in the path, so the results
# land in '../medallion/silver2_manual/'.
silver1_path = '../medallion/silver1_manual/'
file_list = list_files_in_directory2(silver1_path)

for file0 in file_list:
    remove_stopwords(file0)

### Silver 3: Lemmatization

In [12]:
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name):
    """Load a spaCy pipeline on first use and reuse it on later calls."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def lemmatize(file0):
    """Write a copy of a text file in which every token is replaced by its lemma.

    The input is read as UTF-8, processed with the ``pt_core_news_sm`` spaCy
    model, and the lemmas are joined with single spaces. The output path is
    ``file0`` with ``'silver2'`` replaced by ``'silver3'``; it is printed and
    then written as UTF-8. The destination folder must already exist.

    Args:
        file0: Path of the text file to lemmatize.
    """
    # Open the file and read all the content
    with open(file0, 'r', encoding='utf-8') as file:
        text = file.read()

    # Loading the model is slow; the cached pipeline avoids reloading it for
    # every file when this function is called in a loop.
    nlp = _get_spacy_pipeline("pt_core_news_sm")

    # Process the text with the spaCy NLP model
    doc = nlp(text)

    # Extract the lemmatized tokens
    lemmatized_text = ' '.join(token.lemma_ for token in doc)

    # Replace 'silver2' with 'silver3' to get the output path
    silver3_path = file0.replace('silver2', 'silver3')
    print(silver3_path)

    # Explicit encoding: lemmas are not guaranteed to be plain ASCII, and the
    # output should not depend on the platform default.
    with open(silver3_path, 'w', encoding='utf-8') as file:
        file.write(lemmatized_text)

In [13]:
# Silver2 -> Silver3: lemmatize every file in `silver2_manual`.
# lemmatize swaps 'silver2' for 'silver3' in the path and prints each output
# path, which is what the cell output below lists.
silver2_path = '../medallion/silver2_manual/'
file_list = list_files_in_directory2(silver2_path)

for file0 in file_list:
    lemmatize(file0)

../medallion/silver3_manual/plano-acao-adaptacao-climatica-nacional.txt
../medallion/silver3_manual/plano-acao-climatica-agro.txt
../medallion/silver3_manual/plano-acao-climatica-curitiba.txt
../medallion/silver3_manual/plano-acao-climatica-federal.txt
../medallion/silver3_manual/plano-acao-climatica-itabirito.txt
../medallion/silver3_manual/plano-acao-climatica-joao-pessoa.txt
../medallion/silver3_manual/plano-acao-climatica-sp-regiao.txt
../medallion/silver3_manual/plano-enfrentamento-mudanca-climatica-nacional.txt
