<a href="https://colab.research.google.com/github/jeffersonramelo/workshop-ufu/blob/main/Analise_de_topicos_LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import nltk
nltk.download('stopwords') # Carregar stopwords e inicializar lemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Função para pré-processamento de texto
def preprocess_text(text):
    # Converter para minúsculas
    text = text.lower()
    
    # Remover caracteres especiais e números
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    
    # Tokenização e lematização
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Remover stopwords
    tokens = [token for token in tokens if token not in stop_words]
    
    # Juntar os tokens em uma string novamente
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

dados = pd.read_excel('/content/atascopomEN_ML.xlsx')

import nltk
nltk.download('wordnet')

# Pré-processamento dos textos
dados['texto_preprocessado'] = dados['texto'].apply(preprocess_text)

# Vetorização dos textos
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(dados['texto_preprocessado'])

# Obter os nomes das características
feature_names = vectorizer.get_feature_names_out()

# Aplicar modelo LDA
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_model.fit(X)

# Extração dos tópicos e palavras-chave
num_top_words = 10

for topic_idx, topic in enumerate(lda_model.components_):
    print("Tópico #%d:" % topic_idx)
    print(" ".join([feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]))
    print()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Tópico #0:
month price inflation rate increase year increased compared copom wa

Tópico #1:
inflation price month year copom increased monetary policy according market

Tópico #2:
price inflation month growth rate year index committee copom market

Tópico #3:
inflation copom monetary policy committee scenario economic rate risk economy

Tópico #4:
price rate month increase wa billion year exchange inflation central



In [3]:
dados.head()

Unnamed: 0,DataReferencia,ImagemCapa,Titulo,Url,LinkPagina,EsconderDataReferencia,ata,link,texto,data,Meta SELIC,sentimento,texto_preprocessado
0,2023-05-03T03:00:00Z,/content/publications/PublishingImages/Capas/c...,"254th Meeting - May 2-3, 2023",/content/copom/copomminutes/MINUTES 254.pdf,/en/publications/copomminutes/03052023,1.0,254,https://www.bcb.gov.br/content/copom/copomminu...,th_x000D_\n ...,2023-05-03,13.75,negativo,th x minute meeting x monetary policy committe...
1,2023-03-22T03:00:00Z,/content/publications/PublishingImages/Capas/c...,"253rd Meeting - March 21-22, 2023",/content/copom/copomminutes/MINUTES 253.pdf,/en/publications/copomminutes/22032023,1.0,253,https://www.bcb.gov.br/content/copom/copomminu...,rd_x000D_\n ...,2023-03-22,13.75,negativo,rd x minute meeting x monetary policy committe...
2,2023-02-01T03:00:00Z,/content/publications/PublishingImages/Capas/c...,"252nd Meeting - January 31 - February 1, 2023",/content/copom/copomminutes/MINUTES 252.pdf,/en/publications/copomminutes/01022023,1.0,252,https://www.bcb.gov.br/content/copom/copomminu...,nd_x000D_\n 2...,2023-02-01,13.75,negativo,nd x minute meeting x monetary policy committe...
3,2022-12-07T03:00:00Z,/content/publications/PublishingImages/Capas/c...,"251st Meeting - December 6-7, 2022",/content/copom/copomminutes/Minutes 251.pdf,/en/publications/copomminutes/07122022,1.0,251,https://www.bcb.gov.br/content/copom/copomminu...,st_x000D_\n ...,2022-12-07,13.75,negativo,st x minute meeting x monetary policy committe...
4,2022-10-26T03:00:00Z,/content/publications/PublishingImages/Capas/c...,"250th Meeting - October 25-26, 2022",/content/copom/copomminutes/Minutes 250.pdf,/en/publications/copomminutes/26102022,1.0,250,https://www.bcb.gov.br/content/copom/copomminu...,th_x000D_\n ...,2022-10-26,13.75,negativo,th x minute meeting x monetary policy committe...


In [4]:
#saber quanto de Tópico #0 tem em cada coluna ata
# Calcular a proporção do Tópico #0 em cada coluna "ata"
topic_0_proportions = []

for i in range(len(dados['texto_preprocessado'])):
    text = dados['texto_preprocessado'][i]
    preprocessed_text = preprocess_text(text)
    vectorized_text = vectorizer.transform([preprocessed_text])
    topic_distribution = lda_model.transform(vectorized_text)
    topic_0_proportion = topic_distribution[0][0]  # Proporção do Tópico #0
    topic_0_proportions.append(topic_0_proportion)

# Adicionar as proporções como uma nova coluna no DataFrame
dados['topic_0_proportion'] = topic_0_proportions

# Exibir o DataFrame com a nova coluna
print(dados[['ata', 'topic_0_proportion']])


     ata  topic_0_proportion
0    254            0.000111
1    253            0.000103
2    252            0.000107
3    251            0.000126
4    250            0.000142
..   ...                 ...
209   45            0.019436
210   45            0.019436
211   44            0.000080
212   43            0.083971
213   42            0.002621

[214 rows x 2 columns]
