### Importing Libraries and Constants Definition

In [1]:
!pip install gensim

import pandas as pd
import numpy as np
import logging
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
from transformers import AutoModel, AutoTokenizer
from gensim.corpora import Dictionary
import torch

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

DATA_PATH = '../data/public/'
PROPOSALS_FILE = DATA_PATH + 'brasilparticipativo.presidencia.gov.br-open-data-proposals.csv'
VOCAB_FILE = DATA_PATH + 'vocabulario-controlado-basico-vcb-lista-alfabetica.txt'
TOP_N_TOPICS = 5

Defaulting to user installation because normal site-packages is not writeable
[33mDEPRECATION: distro-info 0.23ubuntu1 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m[33mDEPRECATION: python-debian 0.1.36ubuntu1 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of python-debian or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

  from .autonotebook import tqdm as notebook_tqdm
2024-02-19 14:14:10.880720: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-19 14:14:10.958327: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-02-19 14:14:10.958342: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-02-19 14:14:11.447115: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open s

### Set up GPU

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Load BERT model and tokenizer

In [3]:
tokenizer = BertTokenizer.from_pretrained("neuralmind/bert-large-portuguese-cased")
model = BertModel.from_pretrained("neuralmind/bert-large-portuguese-cased").to(device)

Some weights of the model checkpoint at neuralmind/bert-large-portuguese-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Functions Definition

In [4]:
def process_vocab_file(vocab_file):
    ROWS_TO_SKIP = 8
    vocab_dict = {}
    with open(vocab_file, 'r', encoding='utf-8') as file:
        current_term = ''
        for line in file.read().splitlines()[ROWS_TO_SKIP:]:
            if not line.startswith('\t') and line != '':
                current_term = line
                vocab_dict[current_term] = {
                    'USE': [], # Use
                    'DF': [],  # Definição
                    'UP': [],  # Termos não preferenciais
                    'TG': [],  # Termo geral
                    'TR': [],  # Termos relacionados
                    'TE': [],  # Termos específicos
                    'EQ': [],  # Equivalente
                    'Nota de escopo': [],
                }
            elif line != '':
                property = line.strip().split(':')
                key = property[0].strip()
                value = property[1].strip()
                vocab_dict[current_term][key].append(value)
    return vocab_dict

def process_vocab(vocab_file, exclusion_terms):
    vocab_json = process_vocab_file(vocab_file)
    vocab_list_processed = [key for key in vocab_json if key.lower() not in exclusion_terms]
    return vocab_list_processed

def load_and_preprocess_proposals(file_path):
    df = pd.read_csv(file_path, delimiter=';')
    
    # Remove rejected and withdrawn proposals
    # df = df[~df['state'].isin(['rejected', 'withdrawn'])]
    
    # Replace <br> or <br/> or <br /> with spaces
    df['body/pt-BR'] = df['body/pt-BR'].str.replace(r'<br\s*/?>', ' ', regex=True)
    # Remove other HTML tags
    df['body/pt-BR'] = df['body/pt-BR'].str.replace(r'<[^>]*>', '', regex=True)
    # Remove text after "Orgão Responsável"
    df['body/pt-BR'] = df['body/pt-BR'].apply(lambda x: x.split('Órgão Responsável:')[0])

    relevant_cols = [
        'id',
        'category/id',
        'category/name/pt-BR',
        'title/pt-BR',
        'body/pt-BR',
        'supports',
        'followers',
        'comments',
        'published_at',
        'state', # REMOVE
        'url',
        'participatory_space/url',
    ]
    return df[relevant_cols]

def get_unique_themes(df):
    themes = df['category/name/pt-BR'].str.lower().unique().tolist()
    themes = [str(theme) for theme in themes if theme is not np.nan]
    themes = set([theme.split('- direito à')[-1].strip() for theme in themes])
    themes = set([theme.split('- direito ao')[-1].strip() for theme in themes])
    return themes


def get_embedding(sentence):
    print(sentence)
    tokens = tokenizer.tokenize(sentence)
    tokens = ['[CLS]'] + tokens + ['[SEP]']
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = torch.tensor([input_ids]).to(device)

    with torch.no_grad():
        output = model(input_ids)
        embeddings = output.last_hidden_state.mean(dim=1)
    return embeddings

def classify_topics_lda(corpus, lda_model, num_topics=TOP_N_TOPICS):
    topics_data = []

    for doc in corpus:
        doc_topics = lda_model.get_document_topics(doc)
        doc_topics = sorted(doc_topics, key=lambda x: x[1], reverse=True)[:num_topics]
        topics_data.append(doc_topics)

    return topics_data

### Loading and preprocessing proposals

In [5]:
df_propostas = load_and_preprocess_proposals(PROPOSALS_FILE)
df_propostas.head(5)

Unnamed: 0,id,category/id,category/name/pt-BR,title/pt-BR,body/pt-BR,supports,followers,comments,published_at,state,url,participatory_space/url
0,1,30.0,Turismo,Turismo: esse é o Destino,Objetivo: Posicionar o turismo como vetor de d...,1,1,0,2023-05-10 10:03:41 -0300,,http://brasilparticipativo.presidencia.gov.br/...,http://brasilparticipativo.presidencia.gov.br/...
1,8,31.0,Desenvolvimento Agrário e Agricultura Familiar,Agricultura Familiar e Agroecologia,Objetivo: Fortalecer a agricultura familiar em...,1,0,0,2023-05-10 16:22:51 -0300,,http://brasilparticipativo.presidencia.gov.br/...,http://brasilparticipativo.presidencia.gov.br/...
2,9,1.0,Agricultura e Pecuária,Agropecuária Sustentável,Objetivo: Contribuir para o desenvolvimento do...,2,0,0,2023-05-10 16:35:47 -0300,,http://brasilparticipativo.presidencia.gov.br/...,http://brasilparticipativo.presidencia.gov.br/...
3,10,27.0,Saúde,Atenção Primária à Saúde,"Fortalecer a Atenção Primária à Saúde, amplian...",20427,516,0,2023-05-10 16:42:43 -0300,,http://brasilparticipativo.presidencia.gov.br/...,http://brasilparticipativo.presidencia.gov.br/...
4,11,27.0,Saúde,Atenção Especializada à Saúde,Ampliar o acesso às ações e serviços da Atençã...,18786,385,0,2023-05-10 16:41:01 -0300,,http://brasilparticipativo.presidencia.gov.br/...,http://brasilparticipativo.presidencia.gov.br/...


### Extracting sentences from proposals and titles

In [6]:
!pip install nltk

import nltk
from nltk.corpus import stopwords
from gensim.models import LdaModel

nltk.download('stopwords')
stop_words_pt = set(stopwords.words('portuguese'))

Defaulting to user installation because normal site-packages is not writeable
[33mDEPRECATION: distro-info 0.23ubuntu1 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m[33mDEPRECATION: python-debian 0.1.36ubuntu1 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of python-debian or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/anapaula/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
themes = get_unique_themes(df_propostas)
def flatten(lst):
    return [item for sublist in lst for item in sublist]
stop_words_pt.update(flatten([theme.split(' ') for theme in themes]))
stop_words_pt

{'-',
 'a',
 'acesso',
 'advocacia-geral',
 'aeroportos',
 'agricultura',
 'agrário',
 'ambiente',
 'ao',
 'aos',
 'aquela',
 'aquelas',
 'aquele',
 'aqueles',
 'aquicultura',
 'aquilo',
 'as',
 'assistência',
 'até',
 'banco',
 'casa',
 'central',
 'cidadania',
 'cidadania,',
 'cidades',
 'civil',
 'ciência,',
 'clima',
 'com',
 'combate',
 'como',
 'comunicação',
 'comunicações',
 'comércio',
 'controladoria-geral',
 'cultura',
 'da',
 'das',
 'de',
 'defesa',
 'dela',
 'delas',
 'dele',
 'deles',
 'depois',
 'desenvolvimento',
 'desenvolvimento,',
 'desporto',
 'direitos',
 'diversidade',
 'do',
 'dos',
 'e',
 'educação',
 'ela',
 'elas',
 'ele',
 'eles',
 'em',
 'emprego',
 'energia',
 'entre',
 'era',
 'eram',
 'esporte',
 'essa',
 'essas',
 'esse',
 'esses',
 'esta',
 'estamos',
 'estar',
 'estas',
 'estava',
 'estavam',
 'este',
 'esteja',
 'estejam',
 'estejamos',
 'estes',
 'esteve',
 'estive',
 'estivemos',
 'estiver',
 'estivera',
 'estiveram',
 'estiverem',
 'estivermos',
 

In [8]:
df_propostas['sentence'] = df_propostas['title/pt-BR'].str.lower() + ' ' + df_propostas['body/pt-BR'].str.lower()
sentences = df_propostas['sentence'].tolist()
sentences[-10:-1]

# Tokenize as sentenças
print("Tokenize as sentenças")
def tokenize_and_remove_stopwords(sentence):
    tokens = [word for word in sentence.split() if word.lower() not in stop_words_pt]
    return tokens

df_propostas['tokens'] = df_propostas['sentence'].apply(tokenize_and_remove_stopwords)

# Crie um dicionário a partir dos tokens
print("Crie um dicionário a partir dos tokens")
dictionary = Dictionary(df_propostas['tokens'])
corpus = [dictionary.doc2bow(text) for text in df_propostas['tokens']]

NUM_TOPICS = 2
lda = LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=15)

classified_topics_lda = classify_topics_lda(corpus, lda)
classified_topics_processed_lda = []

for idx, topics in enumerate(classified_topics_lda):
    print(idx, topics)
    topic_strings = []
    for topic_id, prop in topics:
        words_in_topic = " ".join([word[0] for word in lda.show_topic(topic_id, topn=2)])
        topic_strings.append(f"{words_in_topic} ({round(prop*100)}%)")
    classified_topics_processed_lda.append([sentences[idx]] + topic_strings)

df_classified_lda = pd.DataFrame(classified_topics_processed_lda)
df_classified_lda

2024-02-19 14:14:18,043 - INFO - adding document #0 to Dictionary(0 unique tokens: [])


Tokenize as sentenças
Crie um dicionário a partir dos tokens


2024-02-19 14:14:18,220 - INFO - adding document #10000 to Dictionary(47178 unique tokens: ['atividade', 'aumentar', 'brasileiros,', 'brasileiros.', 'cidadãos']...)
2024-02-19 14:14:18,235 - INFO - built Dictionary(48072 unique tokens: ['atividade', 'aumentar', 'brasileiros,', 'brasileiros.', 'cidadãos']...) from 10752 documents (total 349351 corpus positions)
2024-02-19 14:14:18,352 - INFO - using symmetric alpha at 0.5
2024-02-19 14:14:18,353 - INFO - using symmetric eta at 0.5
2024-02-19 14:14:18,358 - INFO - using serial LDA version on this node
2024-02-19 14:14:18,363 - INFO - running online (multi-pass) LDA training, 2 topics, 15 passes over the supplied corpus of 10752 documents, updating model once every 2000 documents, evaluating perplexity every 10752 documents, iterating 50x with a convergence threshold of 0.001000
2024-02-19 14:14:18,363 - INFO - PROGRESS: pass 0, at document #2000/10752
2024-02-19 14:14:18,824 - INFO - merging changes from 2000 documents into a model of 10

0 [(0, 0.9666862), (1, 0.03331383)]
1 [(0, 0.97778904), (1, 0.022210961)]
2 [(0, 0.97303826), (1, 0.026961736)]
3 [(0, 0.9761845), (1, 0.023815503)]
4 [(0, 0.970693), (1, 0.029307028)]
5 [(0, 0.9598432), (1, 0.04015679)]
6 [(0, 0.89058733), (1, 0.10941265)]
7 [(0, 0.9769215), (1, 0.023078546)]
8 [(0, 0.8882982), (1, 0.11170179)]
9 [(0, 0.9743603), (1, 0.025639692)]
10 [(0, 0.98187536), (1, 0.018124653)]
11 [(0, 0.9767522), (1, 0.023247752)]
12 [(0, 0.96386236), (1, 0.036137607)]
13 [(0, 0.9607572), (1, 0.039242808)]
14 [(0, 0.9756281), (1, 0.024371905)]
15 [(0, 0.97242934), (1, 0.02757069)]
16 [(0, 0.8948055), (1, 0.105194524)]
17 [(0, 0.6840303), (1, 0.31596968)]
18 [(0, 0.9829333), (1, 0.017066767)]
19 [(0, 0.971867), (1, 0.028133009)]
20 [(0, 0.9620658), (1, 0.037934195)]
21 [(0, 0.805854), (1, 0.19414595)]
22 [(0, 0.98405105), (1, 0.015948981)]
23 [(0, 0.9074603), (1, 0.09253967)]
24 [(0, 0.951678), (1, 0.04832203)]
25 [(0, 0.97703046), (1, 0.022969546)]
26 [(0, 0.9752753), (1, 0.0

Unnamed: 0,0,1,2
0,turismo: esse é o destino objetivo: posicionar...,jovens criação (97%),ensino pessoas (3%)
1,agricultura familiar e agroecologia objetivo: ...,jovens criação (98%),ensino pessoas (2%)
2,agropecuária sustentável objetivo: contribuir ...,jovens criação (97%),ensino pessoas (3%)
3,atenção primária à saúde fortalecer a atenção ...,jovens criação (98%),ensino pessoas (2%)
4,atenção especializada à saúde ampliar o acesso...,jovens criação (97%),ensino pessoas (3%)
...,...,...,...
10747,bulling zero nas escolas o período escolar não...,ensino pessoas (86%),jovens criação (14%)
10748,centros culturais para jovens a cultura é dire...,jovens criação (90%),ensino pessoas (10%)
10749,passe cultural para estudantes programa que d...,jovens criação (87%),ensino pessoas (13%)
10750,políticos e familiares serem obrigados a usare...,ensino pessoas (99%),jovens criação (1%)


In [9]:
OUTPUT_FILE = DATA_PATH + 'results_TopicClassification_LDA.csv'
df_classified_lda = df_classified_lda.drop_duplicates()
df_classified_lda.to_csv(OUTPUT_FILE, index=False)