#### Importações e Funções de Pré-processamento

In [1]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
import spacy
import tomotopy as tp
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import pyLDAvis
from gensim import corpora
from gensim.models import CoherenceModel
import scipy.stats as stats
from tqdm import tqdm

import string
import emoji

# Punctuation and emoji lookup tables used by processEmojisPunctuation.
# Characters listed in keep_punct are left out of the punctuation list.
keep_punct = ["-"]
punct = [mark for mark in [*string.punctuation, "\n"] if mark not in keep_punct]

# Every key of emoji.EMOJI_DATA, plus the newline character.
emojis_list = [*emoji.EMOJI_DATA, "\n"]
emojis_punct = emojis_list + punct

def processEmojisPunctuation(text, remove_punct=False, remove_emoji=False):
    """
    Isolate or drop emojis and optionally drop punctuation.

    Each distinct character of ``text`` is looked up in the module-level
    ``punct`` and ``emojis_list`` tables:

    * with ``remove_punct``, characters found in ``punct`` become a space;
    * characters found in ``emojis_list`` become a space when
      ``remove_emoji`` is set, otherwise they are padded with one space on
      each side.

    Runs of spaces are collapsed to a single space before returning.
    """
    replacements = {}
    for char in set(text):
        if remove_punct and char in punct:
            # Punctuation removal wins for characters present in both tables.
            replacements[ord(char)] = " "
        elif char in emojis_list:
            replacements[ord(char)] = " " if remove_emoji else " " + char + " "

    spaced = text.translate(replacements)
    return re.sub(" +", " ", spaced)

# Stop-word tables: NLTK's Portuguese list extended with extra entries.
stop_words = list(stopwords.words("portuguese"))
new_stopwords = [
    "aí", "pra", "vão", "vou", "onde",
    "tá", "pois", "so", "deu", "ai",
    "ta", "alguem", "ne", "cara", "to",
    "mim", "la", "vcs", "tbm", "tudo",
    "a",
    # NOTE(review): uppercase entry; preprocess() lowercases the text before
    # removeStopwords() runs, so this one cannot match on that path.
    "O",
    "uma", "de", "que",
]
stop_words = stop_words + new_stopwords

# Each stop word padded with one space on both sides.
final_stop_words = [" " + word + " " for word in stop_words]

def removeStopwords(text):
    """
    Remove every space-delimited stop word from ``text``.

    The text is split on single spaces and tokens equal to a stop word are
    dropped, so stop words at the very start or end of the text and
    consecutive repeats (``"de de"``) are removed as well. Chained
    ``str.replace(" sw ", " ")`` calls miss those cases because the padding
    space is consumed by the previous match or is simply absent at the edges.

    Input: a string
    Output: the string without stop-word tokens; other tokens and the
    spacing between them are kept as they were.
    """
    # final_stop_words entries are space-padded (" de "); compare bare words.
    bare_stop_words = {sw.strip() for sw in final_stop_words}
    kept = [token for token in text.split(" ") if token not in bare_stop_words]
    return " ".join(kept)

# lemmatization
# Load the "pt_core_news_sm" spaCy pipeline once at module level; the
# resulting `nlp` callable is shared by the code below.
nlp = spacy.load("pt_core_news_sm")

def lemmatization(text):
    """
    Replace each token of ``text`` with its lemma.

    The result is rebuilt token by token (lemma plus the token's own trailing
    whitespace) instead of calling ``text.replace(token.text, token.lemma_)``.
    A global replace also rewrites the token wherever it occurs as a
    substring of another word, and re-applies on every later occurrence.

    Input: a string
    Output: the string with every token substituted by its lemma; the
    whitespace that followed each token is kept.
    """
    doc = nlp(text)
    return "".join(token.lemma_ + token.whitespace_ for token in doc)

# Any whitespace-delimited chunk containing "http://" / "https://" (any case).
_URL_RE = re.compile(r"[^\s]*https*://[^\s]*", flags=re.IGNORECASE)


def domainUrl(text):
    """
    Substitutes an URL in a text for the domain of this URL
    Input: an string
    Output: the string with the modified URL

    Every chunk matched by the URL pattern is replaced, in a single pass, by
    the part between the first "//" and the next "/". Matching is
    case-insensitive for both detection and replacement, and the domain is
    inserted literally (backslashes in it are not treated as regex escapes).
    """
    def _to_domain(match):
        # The pattern guarantees "://" is present, so index 1 always exists.
        return match.group(0).split("//")[1].split("/")[0]

    return _URL_RE.sub(_to_domain, text)

def processLoL(text):
    """
    Normalize laughter: every run of two or more "k" (any case) becomes "kkk".
    """
    laugh_pattern = re.compile(r"kkk*", flags=re.IGNORECASE)
    return laugh_pattern.sub("kkk", text)

def preprocess(text, semi=False, rpunct=False, remoji=False):
    """
    Run the text-cleaning pipeline on ``text``.

    Steps, in order: lowercase and strip, URL -> domain, laughter
    normalization, emoji/punctuation handling (``rpunct`` and ``remoji`` are
    forwarded as ``remove_punct`` / ``remove_emoji``). Unless ``semi`` is
    true, stop-word removal and lemmatization follow.
    """
    cleaned = processLoL(domainUrl(text.lower().strip()))
    cleaned = processEmojisPunctuation(
        cleaned, remove_punct=rpunct, remove_emoji=remoji
    )
    if not semi:
        cleaned = lemmatization(removeStopwords(cleaned))
    return cleaned


#### Carregar e Pré-processar o Dataset

In [2]:
# Load the dataset
dataset = pd.read_csv('../transcricoes_medium/m_transc_Bernardo_P_Küster.csv')

# Keep the first 3000 rows and drop missing transcriptions BEFORE cleaning:
# preprocess() calls text.lower(), which fails on the NaN value pandas uses
# for empty cells. copy() makes the column assignment below act on an
# independent frame instead of a slice of the loaded one.
dataset = dataset[:3000].dropna(subset=['Transcription']).copy()
dataset['Transcription'] = dataset['Transcription'].apply(lambda x: preprocess(x, semi=True, rpunct=True, remoji=True))

# Replacing punctuation with spaces can leave a space at either end of a
# message; strip it before tokenizing.
clean_messages = dataset['Transcription'].str.strip()

# Tokenize and collect the token texts of every message
token_messages = [nlp(x) for x in clean_messages]
texts = [[token.text for token in doc] for doc in token_messages]

# Build the tomotopy corpus
corpus = tp.utils.Corpus()
for text in texts:
    corpus.add_doc(text)


FileNotFoundError: [Errno 2] No such file or directory: '../transcricoes_medium/m_transc_Bernardo_P_Küster.csv'

#### Definir Funções para Diversidade de Tópicos, iRBO e Jaccard

In [None]:
def proportion_unique_words(topics, topk=10):
    """
    Compute the proportion of unique words across topics.

    Parameters
    ----------
    topics: list of lists of words
    topk: number of leading words of each topic taken into account

    Returns
    -------
    Number of distinct words among the first ``topk`` words of every topic,
    divided by ``topk * len(topics)``.

    Raises
    ------
    ValueError
        If ``topk`` is not positive, ``topics`` is empty, or any topic (not
        only the first one) has fewer than ``topk`` words. A short topic
        would otherwise silently lower the score, because the denominator
        assumes ``topk`` words per topic.
    """
    if topk <= 0:
        raise ValueError('topk must be a positive integer, got ' + str(topk))
    if len(topics) == 0:
        raise ValueError('topics must contain at least one topic')
    if any(len(topic) < topk for topic in topics):
        raise ValueError('Words in topics are less than ' + str(topk))

    unique_words = set()
    for topic in topics:
        unique_words.update(topic[:topk])
    return len(unique_words) / (topk * len(topics))

def calculate_irbo(pt_model, topk=10):
    """
    Average the pairwise rank-biased-overlap score over all topic pairs.

    The top ``topk`` words of each of the model's ``k`` topics are compared
    pairwise with ``irbo.rank_biased_overlap(..., p=0.9)`` and the mean of
    those values is returned.

    NOTE(review): the value is the plain mean of whatever
    ``irbo.rank_biased_overlap`` returns; whether that is already "inverted"
    cannot be told from this file - confirm against the ``irbo`` module.
    """
    import irbo

    ranked_words = [
        [word for word, _ in pt_model.get_topic_words(topic_id, top_n=topk)]
        for topic_id in range(pt_model.k)
    ]
    scores = [
        irbo.rank_biased_overlap(first, second, p=0.9)
        for position, first in enumerate(ranked_words)
        for second in ranked_words[position + 1:]
    ]
    return np.mean(scores)

def calculate_jaccard_distance(pt_model, topk=10):
    """
    Average Jaccard distance between the top-word sets of every topic pair.

    For each pair of topics the distance is ``1 - |A & B| / |A | B|`` where
    A and B are the sets of the top ``topk`` words; the mean over all pairs
    is returned.
    """
    word_sets = [
        {word for word, _ in pt_model.get_topic_words(topic_id, top_n=topk)}
        for topic_id in range(pt_model.k)
    ]

    distances = []
    for position, left in enumerate(word_sets):
        for right in word_sets[position + 1:]:
            shared = len(left & right)
            combined = len(left | right)
            distances.append(1 - shared / combined)
    return np.mean(distances)


#### Treinar Modelos e Calcular Métricas

In [None]:
# Range of topic counts to evaluate
start = 2
limit = 30
step = 2

metric_values = []

# Train one model per topic count and collect its metrics
for num_topics in tqdm(range(start, limit, step)):
    pt_model = tp.PTModel(p=1000, k=num_topics, seed=42)
    # A corpus Document's `words` holds vocabulary ids rather than strings,
    # so it must not be fed back to add_doc(); add_corpus() copies the
    # documents of the prepared corpus into the model.
    pt_model.add_corpus(corpus)
    pt_model.train(100)  # iteration count is a hyperparameter; may change later

    metric_value_row = {'num_topics': num_topics}  # one row per topic count

    # Coherence scores, one column per preset
    for preset in ('u_mass', 'c_uci', 'c_npmi', 'c_v'):
        coh = tp.coherence.Coherence(pt_model, coherence=preset)
        average_coherence = coh.get_score()
        metric_value_row[preset] = average_coherence

    # Perplexity
    metric_value_row['perplexity'] = pt_model.perplexity

    # Topic diversity over the top 10 words of each topic
    topics = [pt_model.get_topic_words(k, top_n=10) for k in range(num_topics)]
    topics = [[word for word, _ in topic] for topic in topics]
    metric_value_row['diversity'] = proportion_unique_words(topics, topk=10)

    # Pairwise rank-biased-overlap score between topics
    metric_value_row['irbo'] = calculate_irbo(pt_model, topk=10)

    # Pairwise Jaccard distance between topics
    metric_value_row['jaccard'] = calculate_jaccard_distance(pt_model, topk=10)

    metric_values.append(metric_value_row)

# Save the metrics to a CSV file
metrics_df = pd.DataFrame(metric_values)
metrics_df.to_csv('topic_model_metrics.csv', index=False)


#### Visualizar as Métricas

In [None]:
def _plot_metric(x_values, y_values, ylabel, label, filename):
    """Plot one metric against the number of topics, save it and show it.

    The legend text is attached to the line through ``label=``. Passing a
    parenthesised string such as ``plt.legend(("text"))`` hands matplotlib a
    plain ``str``, which it iterates character by character, so only the
    first letter would be shown.
    """
    plt.plot(x_values, y_values, label=label)
    plt.xlabel("Num Tópicos")
    plt.ylabel(ylabel)
    plt.legend(loc='best')
    plt.savefig(filename)
    plt.show()


# Topic counts on the x axis (same range the training loop iterated over)
x = range(start, limit, step)

# Topic coherence (c_v)
coherence_values = [row['c_v'] for row in metric_values]
_plot_metric(x, coherence_values, "Score de Coerência", "Valores de Coerência", "coherence_score.png")

# Topic diversity
diversity_values = [row['diversity'] for row in metric_values]
_plot_metric(x, diversity_values, "Diversidade dos Tópicos", "Diversidade dos Tópicos", "diversity_score.png")

# Rank-biased-overlap score between topics
irbo_values = [row['irbo'] for row in metric_values]
_plot_metric(x, irbo_values, "iRBO", "iRBO", "irbo_score.png")

# Jaccard distance between topics
jaccard_values = [row['jaccard'] for row in metric_values]
_plot_metric(x, jaccard_values, "Distância de Jaccard", "Distância de Jaccard", "jaccard_score.png")


#### Salvar e Visualizar Tópicos

In [None]:
# Print the top words of each topic and save them to a CSV file
def save_and_print_topics(pt_model, num_topics, num_words, filename):
    """
    Print and export the top words of topics ``0 .. num_topics - 1``.

    For each topic the ``num_words`` top words are read from
    ``pt_model.get_topic_words`` and printed. The CSV written to ``filename``
    has one row per topic: a leading 'Tópico' column with the topic index
    followed by one column per word.
    """
    rows = []
    for topic_id in range(num_topics):
        top_words = [
            word
            for word, _ in pt_model.get_topic_words(topic_id, top_n=num_words)
        ]
        rows.append(top_words)
        print(f'Tópico {topic_id}: {top_words}')

    table = pd.DataFrame(rows)
    table.insert(0, 'Tópico', range(num_topics))
    table.to_csv(filename, index=False)

# Export every topic of the current model: pt_model.k is its topic count, so
# the listing cannot be truncated by a hard-coded number that differs from it.
save_and_print_topics(pt_model, num_topics=pt_model.k, num_words=30, filename='bernardo_topicos_tomotopy.csv')


#### Nuvem de Palavras e Visualização Interativa

In [None]:
# Single combined word cloud
def plot_combined_word_cloud(model, num_topics, filename):
    """
    Draw one word cloud from the top 30 words of topics ``0 .. num_topics - 1``.

    Weights of a word that appears in several topics are added together.
    The figure is saved to ``filename`` and then shown.
    """
    word_weights = {}
    for topic_id in range(num_topics):
        for word, weight in model.get_topic_words(topic_id, top_n=30):
            if word not in word_weights:
                word_weights[word] = weight
            else:
                word_weights[word] += weight

    plt.figure()
    cloud = WordCloud().fit_words(word_weights)
    plt.imshow(cloud)
    plt.axis("off")
    plt.title("Nuvem de Palavras Combinada")
    plt.savefig(filename)  # save the word cloud image
    plt.show()

# Draw the combined word cloud over all pt_model.k topics of the current model.
plot_combined_word_cloud(pt_model, num_topics=pt_model.k, filename='word_cloud.png')

# Prepare the data for the pyLDAvis visualization
# One row per topic: that topic's word distribution.
topic_term_dists = np.stack([pt_model.get_topic_word_dist(k) for k in range(pt_model.k)])
# One row per document: that document's topic distribution.
doc_topic_dists = np.stack([doc.get_topic_dist() for doc in pt_model.docs])
# Rescale every row in place so that it sums to 1.
doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)
# Number of entries in each document's `words`.
doc_lengths = np.array([len(doc.words) for doc in pt_model.docs])
# NOTE(review): presumably vocab and term_frequency are aligned index by index
# with the columns of topic_term_dists - confirm against the tomotopy docs.
vocab = list(pt_model.used_vocabs)
term_frequency = pt_model.used_vocab_freq

prepared_data = pyLDAvis.prepare(
    topic_term_dists=topic_term_dists,
    doc_topic_dists=doc_topic_dists,
    doc_lengths=doc_lengths,
    vocab=vocab,
    term_frequency=term_frequency
)

# Save the visualization to an HTML file, then display it
pyLDAvis.save_html(prepared_data, 'lda_vis.html')
pyLDAvis.show(prepared_data)
