In [1]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from unidecode import unidecode
from tqdm import tqdm

from umap import UMAP
import hdbscan

from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from collections import Counter


  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'hdbscan'

In [None]:

# Load the dataset; the first CSV column becomes the index.
df = pd.read_csv('../datasets/fakeWhatsApp.BR_2022.csv', delimiter=',', index_col=0)

# Values of `id_member_anonymous` whose rows are removed below.
ids_para_remover = [
    '6ef561ec0f448afcd7b3751124bb0712',
    'd3e678a0ba0e1485548260a7c4599152',
    'ac5703154484de05336af617455ca55e',
    'a39edbd64d378226ffa60433649a0acf',
    'c66d0d4ae5a4b281bff67e1fa4fbd6ba',
    '819bbc872ed6d81f44d746b710eecf06',
    'ec94da4d54f9a5693e88fa582926be53',
    '6a38c72316d87c028dfd66c10442476b',
    '94099e1e46f129856541e2b3640896d1',
    '8f367d1693fff47218603fa47ded525c',
    'c09caffee0d1bd30926dea9df25dc88f',
]

# Three row filters, applied in sequence:
#   1. drop rows whose member id is in the removal list,
#   2. drop rows with no pre-processed text,
#   3. keep only rows where `trava_zap` equals False.
df_filtrado = (
    df
    .loc[lambda frame: ~frame['id_member_anonymous'].isin(ids_para_remover)]
    .dropna(subset=['pre_processed_text'])
    .loc[lambda frame: frame['trava_zap'] == False]
)


In [None]:

# Fetch the NLTK resources requested by this notebook.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# NOTE(review): `lemmatizer` is not referenced by any cell visible in this
# notebook; it is kept in case other code relies on it.
lemmatizer = WordNetLemmatizer()

# preprocess_text() lowercases the text and runs it through unidecode() before
# looking tokens up in `stop_words`. The NLTK Portuguese list keeps its accents
# ("não", "é", "já", ...), so those entries could never match the
# accent-stripped tokens. Add the unidecoded form of every stop word so the
# lookup matches the text as it is actually compared; the original accented
# entries are kept for backward compatibility.
_stopwords_pt = stopwords.words('portuguese')
stop_words = set(_stopwords_pt) | {unidecode(word) for word in _stopwords_pt}

def preprocess_text(text):
    """Normalise one message into a space-separated string of tokens.

    Steps, in order:
      1. Replace each URL (``http(s)://...`` or ``www....``) with the first
         dot-separated label of its host.
      2. Lowercase and transliterate with ``unidecode``.
      3. Remove every character in ``string.punctuation``.
      4. Collapse repeated characters, whitespace and laughter runs
         ("kkkk" -> "kk", "hahaha" -> "haha", "kakaka" -> "kaka").
      5. Drop tokens found in the module-level ``stop_words`` set.

    Args:
        text: The raw message text (a ``str``).

    Returns:
        The cleaned text; may be an empty string if every token is removed.
    """
    def _host_label(match):
        # Strip the scheme / "www." prefix, cut at the first path, query or
        # fragment delimiter, then keep only the first label of the host.
        host = re.sub(r'^https?://(?:www\.)?|www\.', '', match.group(0))
        host = re.split(r'[/?#]', host)[0]
        return host.partition('.')[0]

    cleaned = re.sub(r'https?://(?:www\.)?\S+|www\.\S+', _host_label, text)
    cleaned = unidecode(cleaned.lower())

    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = cleaned.translate(punctuation_table)

    # Collapse runs of the same astral-plane character (e.g. emoji).
    # NOTE(review): this runs after unidecode(), which presumably leaves only
    # ASCII, so the pattern may never match here — confirm the intended order.
    cleaned = re.sub(r'([\U00010000-\U0010FFFF])\1+', r'\1', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # Laughter normalisation. The text is already lowercase at this point, so
    # the uppercase alternative and the IGNORECASE flags have nothing extra to match.
    cleaned = re.sub(r'k{2,}|K{2,}', 'kk', cleaned)
    cleaned = re.sub(r'(ha){2,}', 'haha', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'(ka){2,}', 'kaka', cleaned, flags=re.IGNORECASE)

    kept_tokens = (token for token in cleaned.split() if token not in stop_words)
    return ' '.join(kept_tokens)


In [None]:

# Clean every message on a copy so `df_filtrado` is left untouched.
df_geral = df_filtrado.copy()
df_geral['text_processed'] = df_geral['pre_processed_text'].apply(preprocess_text)

# preprocess_text() can return "" (e.g. a message made only of stop words or
# punctuation). Such documents carry no tokens, so they are dropped here rather
# than embedded and clustered, where they could form a cluster with no top words.
df_geral = df_geral[df_geral['text_processed'] != '']

# Plain list of cleaned documents, in the same row order as `df_geral`.
docs = df_geral['text_processed'].tolist()


In [None]:

# Encode every cleaned document with the pretrained sentence-transformer model.
EMBEDDING_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
embeddings = model.encode(
    docs,
    show_progress_bar=True,
)


In [None]:

# Step 1: project the embeddings down to 5 dimensions with UMAP
# (cosine metric, fixed random_state).
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    metric='cosine',
    random_state=42,
)
reduced_embeddings = umap_model.fit_transform(embeddings)

# Step 2: cluster the reduced vectors with HDBSCAN; `labels` holds one
# cluster id per document.
cluster_model = hdbscan.HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    prediction_data=True,
)
labels = cluster_model.fit_predict(reduced_embeddings)


In [None]:

# Pair each document with its cluster label.
df_clustered = pd.DataFrame({'text': docs, 'label': labels})

# One entry per cluster: its 10 most frequent tokens, most frequent first.
topics_words = []

# A single groupby pass replaces re-filtering the whole frame once per label.
# Groups come out sorted by label, matching the previous sorted(set(labels)).
for label, cluster_texts in df_clustered.groupby('label')['text']:
    if label == -1:
        # Label -1 is skipped (not treated as a topic).
        continue
    word_freq = Counter(" ".join(cluster_texts).split())
    top_words = [word for word, _ in word_freq.most_common(10)]
    if top_words:
        # A cluster made only of empty documents has no words to describe it;
        # it is left out so no empty topic reaches the metrics below.
        topics_words.append(top_words)


In [None]:

# Tokenised corpus plus the gensim dictionary / bag-of-words views of it.
texts = [doc.split() for doc in docs]
id2word = Dictionary(texts)
corpus = [id2word.doc2bow(tokens) for tokens in texts]


def _coherence(measure, **corpus_inputs):
    """Score `topics_words` with one gensim coherence measure.

    `corpus_inputs` carries whichever corpus view is passed for that measure
    (`texts=` or `corpus=`); it is forwarded to CoherenceModel unchanged.
    """
    scorer = CoherenceModel(
        topics=topics_words,
        dictionary=id2word,
        coherence=measure,
        **corpus_inputs,
    )
    return scorer.get_coherence()


# The first three measures are given the tokenised texts; u_mass is given the
# bag-of-words corpus.
coh_cv = _coherence('c_v', texts=texts)
coh_npmi = _coherence('c_npmi', texts=texts)
coh_uci = _coherence('c_uci', texts=texts)
coh_umass = _coherence('u_mass', corpus=corpus)

def topic_diversity(topics_words, top_n=10):
    """Fraction of unique words among the top words of all topics.

    Args:
        topics_words: List of topics, each a list of words ordered by rank.
        top_n: How many leading words of each topic to consider.

    Returns:
        A float in [0, 1]: 1.0 when no word is shared between topics, lower
        when topics repeat words. Returns 0.0 when there are no words at all
        (no topics, or only empty topics) instead of dividing by zero.
    """
    words = [word for topic in topics_words for word in topic[:top_n]]
    if not words:
        return 0.0
    # Divide by the number of words actually considered. Using
    # len(topics_words) * top_n would under-report diversity whenever a topic
    # has fewer than top_n words; for full-length topics the two are equal.
    return len(set(words)) / len(words)

# Topic diversity over the top 10 words of every topic.
div = topic_diversity(topics_words, top_n=10)

def irbo(topics, topk=10, p=0.9):
    """Inverted rank-biased overlap (IRBO) between all pairs of topics.

    For every pair of topics the rank-biased overlap (RBO) of their top
    ``topk`` words is computed; the result is ``1 - mean(RBO)``, so 0.0 means
    all topics rank the same words identically and 1.0 means no topic shares
    a word with another.

    Args:
        topics: List of topics, each a list of hashable words ordered by rank.
        topk: Number of leading words of each topic to compare.
        p: RBO persistence parameter; smaller values weight the top ranks
            more heavily.

    Returns:
        The IRBO score as a float, or 0.0 when fewer than two topics are given.
    """
    def rbo(list1, list2, p):
        # Extrapolated RBO for two rankings cut at the same depth:
        #   X_k/k * p^k + (1 - p) * sum_{d=1..k} (X_d/d) * p^(d-1)
        # where X_d is the size of the intersection of the two depth-d
        # prefixes. This yields exactly 1.0 for identical rankings and 0.0
        # for disjoint ones.
        depth = min(len(list1), len(list2))
        if depth == 0:
            return 0.0
        seen1, seen2 = set(), set()
        overlap = 0
        weighted_agreement = 0.0
        for d in range(1, depth + 1):
            seen1.add(list1[d - 1])
            seen2.add(list2[d - 1])
            # Count every item common to both prefixes, including an item
            # that only now meets a match revealed earlier in the other list.
            overlap = len(seen1 & seen2)
            weighted_agreement += overlap / d * p ** (d - 1)
        return overlap / depth * p ** depth + (1 - p) * weighted_agreement

    scores = [
        rbo(topics[i][:topk], topics[j][:topk], p)
        for i in range(len(topics))
        for j in range(i + 1, len(topics))
    ]
    return float(1 - np.mean(scores)) if scores else 0.0

# Inverted RBO over the top 10 words of each topic, with persistence p = 0.9.
irbo_score = irbo(topics_words, topk=10, p=0.9)


In [None]:

# Summary of all topic-quality metrics, each shown with four decimals.
report_lines = [
    "\n📊 Avaliação dos Tópicos:",
    f"Coerência c_v:     {coh_cv:.4f}",
    f"Coerência c_npmi:  {coh_npmi:.4f}",
    f"Coerência c_uci:   {coh_uci:.4f}",
    f"Coerência u_mass:  {coh_umass:.4f}",
    f"Diversidade:       {div:.4f}",
    f"IRBO:              {irbo_score:.4f}",
]
print("\n".join(report_lines))
