### Importing Libraries and Constants Definition

In [1]:
!pip install gensim

import pandas as pd
import numpy as np
import logging
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
from transformers import AutoModel, AutoTokenizer
from gensim.corpora import Dictionary
import torch

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

DATA_PATH = '../data/public/'
PROPOSALS_FILE = DATA_PATH + 'brasilparticipativo.presidencia.gov.br-open-data-proposals.csv'
VOCAB_FILE = DATA_PATH + 'vocabulario-controlado-basico-vcb-lista-alfabetica.txt'
TOP_N_TOPICS = 5

Collecting gensim
  Using cached gensim-4.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
Collecting smart-open>=1.8.1
  Downloading smart_open-6.4.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 1.1 MB/s eta 0:00:011
[?25hInstalling collected packages: smart-open, gensim
Successfully installed gensim-4.3.2 smart-open-6.4.0


  from .autonotebook import tqdm as notebook_tqdm


### Set up GPU

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Load BERT model and tokenizer

In [3]:
tokenizer = BertTokenizer.from_pretrained("neuralmind/bert-large-portuguese-cased")
model = BertModel.from_pretrained("neuralmind/bert-large-portuguese-cased").to(device)

### Functions Definition

In [5]:
def process_vocab_file(vocab_file):
    ROWS_TO_SKIP = 8
    vocab_dict = {}
    with open(vocab_file, 'r', encoding='utf-8') as file:
        current_term = ''
        for line in file.read().splitlines()[ROWS_TO_SKIP:]:
            if not line.startswith('\t') and line != '':
                current_term = line
                vocab_dict[current_term] = {
                    'USE': [], # Use
                    'DF': [],  # Definição
                    'UP': [],  # Termos não preferenciais
                    'TG': [],  # Termo geral
                    'TR': [],  # Termos relacionados
                    'TE': [],  # Termos específicos
                    'EQ': [],  # Equivalente
                    'Nota de escopo': [],
                }
            elif line != '':
                property = line.strip().split(':')
                key = property[0].strip()
                value = property[1].strip()
                vocab_dict[current_term][key].append(value)
    return vocab_dict

def process_vocab(vocab_file, exclusion_terms):
    vocab_json = process_vocab_file(vocab_file)
    vocab_list_processed = [key for key in vocab_json if key.lower() not in exclusion_terms]
    return vocab_list_processed

def load_and_preprocess_proposals(file_path):
    df = pd.read_csv(file_path, delimiter=';')
    
    # Remove rejected and withdrawn proposals
    # df = df[~df['state'].isin(['rejected', 'withdrawn'])]
    
    # Replace <br> or <br/> or <br /> with spaces
    df['body/pt-BR'] = df['body/pt-BR'].str.replace(r'<br\s*/?>', ' ', regex=True)
    # Remove other HTML tags
    df['body/pt-BR'] = df['body/pt-BR'].str.replace(r'<[^>]*>', '', regex=True)
    # Remove text after "Orgão Responsável"
    df['body/pt-BR'] = df['body/pt-BR'].apply(lambda x: x.split('Órgão Responsável:')[0])

    relevant_cols = [
        'id',
        'category/id',
        'category/name/pt-BR',
        'title/pt-BR',
        'body/pt-BR',
        'supports',
        'followers',
        'comments',
        'published_at',
        'state', # REMOVE
        'url',
        'participatory_space/url',
    ]
    return df[relevant_cols]

def get_unique_themes(df):
    themes = df['category/name/pt-BR'].str.lower().unique().tolist()
    themes = [str(theme) for theme in themes if theme is not np.nan]
    themes = set([theme.split('- direito à')[-1].strip() for theme in themes])
    themes = set([theme.split('- direito ao')[-1].strip() for theme in themes])
    return themes


def get_embedding(sentence):
    print(sentence)
    tokens = tokenizer.tokenize(sentence)
    tokens = ['[CLS]'] + tokens + ['[SEP]']
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = torch.tensor([input_ids]).to(device)

    with torch.no_grad():
        output = model(input_ids)
        embeddings = output.last_hidden_state.mean(dim=1)
    return embeddings

def classify_topics(sentences, topics, top_n=TOP_N_TOPICS):
    # Convert sentences and topics to BERT embeddings
    print("sentence_embeddings = [get_embedding(sentence) for sentence in sentences]")
    sentence_embeddings = [get_embedding(sentence) for sentence in sentences]
    print("topic_embeddings = [get_embedding(topic) for topic in topics]")
    topic_embeddings = [get_embedding(topic) for topic in topics]

    classified_data = []

    for i, sentence_embedding in enumerate(sentence_embeddings):
        print(i, len(sentence_embeddings))
        cosine_similarities = cosine_similarity(sentence_embedding.cpu().numpy(), np.array([t.cpu().numpy() for t in topic_embeddings]).squeeze(1))
        top_n_indices = np.argsort(cosine_similarities[0])[-top_n:][::-1]
        top_n_topic_similarities = [(topics[idx], cosine_similarities[0][idx]) for idx in top_n_indices]
        classified_data.append((sentences[i], top_n_topic_similarities))

    return classified_data

def classify_topics_lda(corpus, lda_model, num_topics=TOP_N_TOPICS):
    topics_data = []

    for doc in corpus:
        doc_topics = lda_model.get_document_topics(doc)
        doc_topics = sorted(doc_topics, key=lambda x: x[1], reverse=True)[:num_topics]
        topics_data.append(doc_topics)

    return topics_data

### Loading and preprocessing proposals

In [6]:
df_propostas = load_and_preprocess_proposals(PROPOSALS_FILE)
df_propostas.head(5)

Unnamed: 0,id,category/id,category/name/pt-BR,title/pt-BR,body/pt-BR,supports,followers,comments,published_at,state,url,participatory_space/url
0,1,30.0,Turismo,Turismo: esse é o Destino,Objetivo: Posicionar o turismo como vetor de d...,1,1,0,2023-05-10 10:03:41 -0300,,http://brasilparticipativo.presidencia.gov.br/...,http://brasilparticipativo.presidencia.gov.br/...
1,8,31.0,Desenvolvimento Agrário e Agricultura Familiar,Agricultura Familiar e Agroecologia,Objetivo: Fortalecer a agricultura familiar em...,1,0,0,2023-05-10 16:22:51 -0300,,http://brasilparticipativo.presidencia.gov.br/...,http://brasilparticipativo.presidencia.gov.br/...
2,9,1.0,Agricultura e Pecuária,Agropecuária Sustentável,Objetivo: Contribuir para o desenvolvimento do...,2,0,0,2023-05-10 16:35:47 -0300,,http://brasilparticipativo.presidencia.gov.br/...,http://brasilparticipativo.presidencia.gov.br/...
3,10,27.0,Saúde,Atenção Primária à Saúde,"Fortalecer a Atenção Primária à Saúde, amplian...",20427,515,0,2023-05-10 16:42:43 -0300,,http://brasilparticipativo.presidencia.gov.br/...,http://brasilparticipativo.presidencia.gov.br/...
4,11,27.0,Saúde,Atenção Especializada à Saúde,Ampliar o acesso às ações e serviços da Atençã...,18786,383,0,2023-05-10 16:41:01 -0300,,http://brasilparticipativo.presidencia.gov.br/...,http://brasilparticipativo.presidencia.gov.br/...


### Extracting sentences from proposals and titles

In [9]:
!pip install nltk

import nltk
from nltk.corpus import stopwords
from gensim.models import LdaModel

nltk.download('stopwords')
stop_words_pt = set(stopwords.words('portuguese'))

Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting click
  Downloading click-8.1.7-py3-none-any.whl (97 kB)
[K     |████████████████████████████████| 97 kB 1.6 MB/s eta 0:00:01
Installing collected packages: click, nltk
Successfully installed click-8.1.7 nltk-3.8.1


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/anapaula/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
themes = get_unique_themes(df_propostas)
def flatten(lst):
    return [item for sublist in lst for item in sublist]
stop_words_pt.update(flatten([theme.split(' ') for theme in themes]))
stop_words_pt

{'-',
 'a',
 'acesso',
 'advocacia-geral',
 'aeroportos',
 'agricultura',
 'agrário',
 'ambiente',
 'ao',
 'aos',
 'aquela',
 'aquelas',
 'aquele',
 'aqueles',
 'aquicultura',
 'aquilo',
 'as',
 'assistência',
 'até',
 'banco',
 'casa',
 'central',
 'cidadania',
 'cidadania,',
 'cidades',
 'civil',
 'ciência,',
 'clima',
 'com',
 'combate',
 'como',
 'comunicação',
 'comunicações',
 'comércio',
 'controladoria-geral',
 'cultura',
 'da',
 'das',
 'de',
 'defesa',
 'dela',
 'delas',
 'dele',
 'deles',
 'depois',
 'desenvolvimento',
 'desenvolvimento,',
 'desporto',
 'direitos',
 'diversidade',
 'do',
 'dos',
 'e',
 'educação',
 'ela',
 'elas',
 'ele',
 'eles',
 'em',
 'emprego',
 'energia',
 'entre',
 'era',
 'eram',
 'esporte',
 'essa',
 'essas',
 'esse',
 'esses',
 'esta',
 'estamos',
 'estar',
 'estas',
 'estava',
 'estavam',
 'este',
 'esteja',
 'estejam',
 'estejamos',
 'estes',
 'esteve',
 'estive',
 'estivemos',
 'estiver',
 'estivera',
 'estiveram',
 'estiverem',
 'estivermos',
 

In [30]:
df_propostas['sentence'] = df_propostas['title/pt-BR'].str.lower() + ' ' + df_propostas['body/pt-BR'].str.lower()
sentences = df_propostas['sentence'].tolist()
sentences[-10:-1]

# Tokenize as sentenças
print("Tokenize as sentenças")
def tokenize_and_remove_stopwords(sentence):
    tokens = [word for word in sentence.split() if word.lower() not in stop_words_pt]
    return tokens

df_propostas['tokens'] = df_propostas['sentence'].apply(tokenize_and_remove_stopwords)

# Crie um dicionário a partir dos tokens
print("Crie um dicionário a partir dos tokens")
dictionary = Dictionary(df_propostas['tokens'])
corpus = [dictionary.doc2bow(text) for text in df_propostas['tokens']]

NUM_TOPICS = 2
lda = LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=15)

classified_topics_lda = classify_topics_lda(corpus, lda)
classified_topics_processed_lda = []

for idx, topics in enumerate(classified_topics_lda):
    print(idx, topics)
    topic_strings = []
    for topic_id, prop in topics:
        words_in_topic = " ".join([word[0] for word in lda.show_topic(topic_id, topn=2)])
        topic_strings.append(f"{words_in_topic} ({round(prop*100)}%)")
    classified_topics_processed_lda.append([sentences[idx]] + topic_strings)

df_classified_lda = pd.DataFrame(classified_topics_processed_lda)
df_classified_lda

2023-10-23 16:16:39,031 - INFO - adding document #0 to Dictionary<0 unique tokens: []>


Tokenize as sentenças
Crie um dicionário a partir dos tokens


2023-10-23 16:16:39,210 - INFO - built Dictionary<44793 unique tokens: ['atividade', 'aumentar', 'brasileiros,', 'brasileiros.', 'cidadãos']...> from 8800 documents (total 293108 corpus positions)
2023-10-23 16:16:39,211 - INFO - Dictionary lifecycle event {'msg': "built Dictionary<44793 unique tokens: ['atividade', 'aumentar', 'brasileiros,', 'brasileiros.', 'cidadãos']...> from 8800 documents (total 293108 corpus positions)", 'datetime': '2023-10-23T16:16:39.211254', 'gensim': '4.3.2', 'python': '3.8.10 (default, Mar 15 2022, 12:22:08) \n[GCC 9.4.0]', 'platform': 'Linux-5.10.102.1-microsoft-standard-WSL2-x86_64-with-glibc2.29', 'event': 'created'}
2023-10-23 16:16:39,319 - INFO - using symmetric alpha at 0.5
2023-10-23 16:16:39,320 - INFO - using symmetric eta at 0.5
2023-10-23 16:16:39,324 - INFO - using serial LDA version on this node
2023-10-23 16:16:39,329 - INFO - running online (multi-pass) LDA training, 2 topics, 15 passes over the supplied corpus of 8800 documents, updating m

0 [(0, 0.9646116), (1, 0.035388403)]
1 [(0, 0.97721803), (1, 0.02278202)]
2 [(0, 0.97603905), (1, 0.02396099)]
3 [(0, 0.97968346), (1, 0.020316571)]
4 [(0, 0.97299653), (1, 0.027003441)]
5 [(0, 0.9652654), (1, 0.034734633)]
6 [(0, 0.9662099), (1, 0.033790138)]
7 [(0, 0.5699939), (1, 0.43000612)]
8 [(0, 0.96545374), (1, 0.034546204)]
9 [(0, 0.9775804), (1, 0.022419583)]
10 [(0, 0.9843485), (1, 0.015651543)]
11 [(0, 0.93152046), (1, 0.068479575)]
12 [(0, 0.9716853), (1, 0.02831463)]
13 [(0, 0.96349424), (1, 0.036505803)]
14 [(0, 0.9752726), (1, 0.024727441)]
15 [(0, 0.9724818), (1, 0.027518187)]
16 [(0, 0.9225402), (1, 0.077459775)]
17 [(0, 0.9856615), (1, 0.014338473)]
18 [(0, 0.98566335), (1, 0.014336589)]
19 [(0, 0.9696197), (1, 0.030380331)]
20 [(0, 0.5039364), (1, 0.49606356)]
21 [(0, 0.95788276), (1, 0.04211723)]
22 [(0, 0.98579437), (1, 0.014205622)]
23 [(0, 0.9840285), (1, 0.015971527)]
24 [(0, 0.9729774), (1, 0.027022598)]
25 [(0, 0.978373), (1, 0.021627054)]
26 [(0, 0.974349), 

Unnamed: 0,0,1,2
0,turismo: esse é o destino objetivo: posicionar...,criação programa (96%),piso todos (4%)
1,agricultura familiar e agroecologia objetivo: ...,criação programa (98%),piso todos (2%)
2,agropecuária sustentável objetivo: contribuir ...,criação programa (98%),piso todos (2%)
3,atenção primária à saúde fortalecer a atenção ...,criação programa (98%),piso todos (2%)
4,atenção especializada à saúde ampliar o acesso...,criação programa (97%),piso todos (3%)
...,...,...,...
8795,direitos humano introduzir logo nos primeiros ...,criação programa (96%),piso todos (4%)
8796,"mobimove: não perca seu ônibus, não perca seu ...",piso todos (92%),criação programa (8%)
8797,política nacional de saúde mental criar uma p...,criação programa (74%),piso todos (26%)
8798,justiça e segurança pública proposta de revisã...,criação programa (54%),piso todos (46%)


In [29]:
OUTPUT_FILE = DATA_PATH + 'results_TopicClassification_LDA.csv'
df_classified_lda = df_classified_lda.drop_duplicates()
df_classified_lda.to_csv(OUTPUT_FILE, index=False)