# Análise de Tópicos

#### Importação de Bibliotecas

In [None]:
import pandas as pd
import spacy
from bertopic import BERTopic

from collections import Counter

import re
from datetime import datetime

from gensim.utils import simple_preprocess

import nltk
nltk.download(["stopwords", "rslp"])
stopwords = nltk.corpus.stopwords.words("portuguese")

from transformers import AutoTokenizer  # Or BertTokenizer
from transformers import AutoModelForSequenceClassification  # Or BertForPreTraining for loading pretraining heads
from transformers import AutoModel  # or BertModel, for BERT without pretraining heads
from transformers import pipeline
import torch

import warnings
warnings.filterwarnings("ignore")

#### Leitura do arquivo

In [None]:
df_reviews = pd.read_csv("guacamole_reviews.csv")
df_reviews.head()

In [None]:
df_comments = df_reviews["Review"]
df_dates = df_reviews["Date"]

#### Limpeza dos dados

In [None]:
def remove_stopwords(text):
    """Tokenize *text* and drop every token found in the ``stopwords`` list.

    Tokenization is delegated to gensim's ``simple_preprocess``. The tokens
    that survive the filter are re-joined with single spaces and the result
    is lower-cased before being returned.
    """
    kept_tokens = []
    for token in simple_preprocess(text):
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens).lower()

spacy_lemma = spacy.load("pt_core_news_sm")

def lemmatizer(text, postags_permit=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Return the lemmas of the tokens in *text* whose POS tag is allowed.

    The text is lower-cased and processed by the module-level spaCy pipeline
    ``spacy_lemma``; only tokens whose part-of-speech tag (``token.pos_``)
    is contained in *postags_permit* contribute their lemma.

    Parameters:
    - text: Input string to lemmatize.
    - postags_permit: Collection of POS tags to keep. The default is a tuple
      (immutable) so it cannot be accidentally modified between calls.

    Returns:
    A single string with the selected lemmas separated by spaces.

    Tag reference: https://spacy.io/api/annotation
    """
    doc = spacy_lemma(text.lower())
    return " ".join(token.lemma_ for token in doc if token.pos_ in postags_permit)

In [None]:
def parse_date(date_str):
    """Convert a Portuguese long-form date to an ISO ``YYYY-MM-DD`` string.

    Parameters:
    - date_str: Text starting with "<day> de <month name> de <year>",
      e.g. "10 de março de 2023". The month name is matched
      case-insensitively.

    Returns:
    The date formatted as "%Y-%m-%d", or None when the value is not a
    string (e.g. a missing cell), does not match the expected pattern,
    names an unknown month, or describes an impossible calendar date.
    """
    month_numbers = {
        'janeiro': 1, 'fevereiro': 2, 'março': 3, 'abril': 4,
        'maio': 5, 'junho': 6, 'julho': 7, 'agosto': 8,
        'setembro': 9, 'outubro': 10, 'novembro': 11, 'dezembro': 12,
    }

    # Missing cells arrive as None/NaN when this is applied over a column;
    # re.match would raise TypeError on them.
    if not isinstance(date_str, str):
        return None

    match = re.match(r'(\d+) de (\w+) de (\d+)', date_str)
    if not match:
        return None

    day, month_name, year = match.groups()
    month = month_numbers.get(month_name.lower())
    if month is None:
        # Unknown or misspelled month name: treat like any other unparseable date.
        return None

    try:
        return datetime(int(year), month, int(day)).strftime("%Y-%m-%d")
    except ValueError:
        # Day or year out of range (e.g. "31 de fevereiro de 2023").
        return None

In [None]:
# Clean the raw reviews in two chained steps: drop stopwords first, then
# lemmatize the stopword-free text. The second step must read the
# "Review Lemma" column produced by the first one; reading "Review" again
# would overwrite (and thus discard) the stopword removal.
df_reviews["Review Lemma"] = df_reviews["Review"].map(remove_stopwords)
df_reviews["Review Lemma"] = df_reviews["Review Lemma"].map(lemmatizer)

In [None]:
df_reviews["Date Formatted"] = df_reviews["Date"].apply(parse_date)

In [None]:
df_reviews.head()

In [None]:
# Take the cleaned text column and turn it into a plain Python list,
# which is the form passed to the topic model below.
df_comments = df_reviews["Review Lemma"]
doc_comments = df_comments.to_list()
# Show the first five cleaned documents as a sanity check.
doc_comments[:5]

#### BERTopic

In [None]:
# Create a BERTopic model configured for Portuguese, with topic
# probabilities enabled and verbose logging, then fit it on the cleaned
# reviews. `topics` is later stored as one value per review row;
# `probs` is presumably the matching probability output — confirm
# against the BERTopic documentation.
topic_model = BERTopic(language="portuguese", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(doc_comments)

In [None]:
freq = topic_model.get_topic_info(); freq.head(5)

In [None]:
topic_model.get_topic(0)

#### Visualização dos tópicos

In [None]:
topic_model.visualize_barchart(top_n_topics=20)

In [None]:
topic_model.visualize_hierarchy()

In [None]:
dates_reviews = df_reviews["Date Formatted"].to_list()

In [None]:
# Build the topics-over-time table from the cleaned documents and their
# "%Y-%m-%d" date strings, grouped into 20 bins.
# NOTE(review): dates_reviews comes from parse_date, which can return None
# for dates it cannot parse — confirm topics_over_time tolerates missing
# timestamps or filter them out beforehand.
topics_over_time = topic_model.topics_over_time(doc_comments, dates_reviews, datetime_format="%Y-%m-%d", nr_bins=20)

In [None]:
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

#### DataFrame com informações completas

In [None]:
# Work on a copy so the topic columns are not added to df_reviews itself.
df_topics = df_reviews.copy()

# One topic id per review, in the same row order as the documents that
# were passed to fit_transform.
df_topics["Topics"] = topics

# Lookup table keyed by topic id, derived from the get_topic_info() output
# with the columns renamed to match df_topics.
topic_name = freq.drop(columns=["Count"]).rename(columns={"Topic": "Topics", "Name": "Names"})
# Join explicitly on the topic id. Without `on=`, merge() joins on every
# column name the two frames happen to share, which silently changes the
# join key if the reviews file ever contains a column with the same name
# as one of the topic-info columns.
df_topics = df_topics.merge(topic_name, how="left", on="Topics")

df_topics.head()

# Análise de Sentimentos

In [None]:
# Load the pretrained multilingual DistilBERT sentiment classifier and its
# tokenizer (lower-casing disabled), then wrap both in a
# "sentiment-analysis" pipeline that is called directly on review text.
model = AutoModelForSequenceClassification.from_pretrained('lxyuan/distilbert-base-multilingual-cased-sentiments-student')
tokenizer = AutoTokenizer.from_pretrained('lxyuan/distilbert-base-multilingual-cased-sentiments-student', do_lower_case=False)
sentiment_task = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [None]:
sentiment_task("Eu sou feliz")

#### Classificação das avaliações e salvamento dos resultados no DataFrame

In [None]:
%%time

# Run the sentiment model once per review and reuse the result for both
# columns; calling the pipeline separately for the label and for the score
# doubles the inference time while producing the same predictions.
sentiment_results = [sentiment_task(review)[0] for review in df_topics["Review"]]
df_topics["Sentiment"] = [result["label"] for result in sentiment_results]
df_topics["Sentiment Score"] = [result["score"] for result in sentiment_results]

In [None]:
df_topics.head()

In [None]:
df_topics.to_csv('guacamole_topics_sentiment.csv', index=False)

In [None]:
count_sentiment = df_topics["Sentiment"].value_counts()
count_sentiment

# Extração de Entidade Nomeada

In [None]:
# AutoModelForTokenClassification is not among the notebook's top-level
# imports, so it is imported here; without it this cell fails with a
# NameError.
from transformers import AutoModelForTokenClassification

# Load the pretrained NER model and its tokenizer, and also expose them as
# a ready-made "ner" pipeline.
model_ner = AutoModelForTokenClassification.from_pretrained('51la5/roberta-large-NER')
tokenizer_ner = AutoTokenizer.from_pretrained('51la5/roberta-large-NER', do_lower_case=False)
ner_task = pipeline("ner", model=model_ner, tokenizer=tokenizer_ner)

In [None]:
ner_task("Julia não gosta de Londres nem Berlim")

In [None]:
def classify_ner(text):
    """
    Tag the tokens of a text with the pretrained NER model and keep the entities.

    The text is tokenized with ``tokenizer_ner`` (truncated to at most 512
    tokens), scored with ``model_ner``, and each token receives the label
    with the highest logit. Tokens whose label is 'O' are discarded.

    Parameters:
    - text: Input text to be tokenized and classified.

    Returns:
    List of (token, label) tuples for the tokens whose predicted label is
    not 'O'. Tokens are the tokenizer's raw pieces, so a word may be split
    across several entries.

    Example (illustrative; actual pieces and labels depend on the model):
    Input:  "Julia não gosta de Londres"
    Output: [('▁Julia', 'I-PER'), ('▁Londres', 'I-LOC')]
    """

    inputs = tokenizer_ner(text, max_length=512, truncation=True, return_tensors="pt")
    tokens = inputs.tokens()

    # Inference only: disable autograd so no gradient graph is built.
    with torch.no_grad():
        logits = model_ner(**inputs).logits

    # Single input, so take batch item 0. tolist() yields plain Python ints,
    # which match the integer keys of id2label.
    predictions = torch.argmax(logits, dim=2)[0].tolist()

    id2label = model_ner.config.id2label
    results = []
    for token, prediction in zip(tokens, predictions):
        label = id2label.get(prediction, 'O')
        if label != 'O':
            results.append((token, label))

    return results

In [None]:
df_topics["Token Predictions"] = df_topics["Review"].apply(classify_ner)

In [None]:
df_topics["Token Predictions"]

In [None]:
def merge_tokens(token_predictions):
    """
    Glue sub-word pieces back into whole words.

    A piece that starts with "▁" opens a new word (the marker itself is
    stripped); any other piece is appended to the word currently being
    built. The label reported for a word is the label of its last piece.
    Words that end up empty are not emitted.

    Parameters:
    - token_predictions: List of (token, label) tuples.

    Returns:
    List of (word, label) tuples, in input order.

    Example:
    Input:  [('▁Mathe', 'I-PER'), ('us', 'I-PER')]
    Output: [('Matheus', 'I-PER')]
    """
    words = []  # mutable [text, label] pairs, one per word under construction

    for piece, tag in token_predictions:
        if piece.startswith("▁"):
            # Word boundary: start a fresh entry without the marker.
            words.append([piece[1:], tag])
        elif words:
            # Continuation: extend the latest word and adopt this piece's label.
            words[-1][0] += piece
            words[-1][1] = tag
        else:
            # Continuation with nothing before it: it starts the first word.
            words.append([piece, tag])

    return [(text, tag) for text, tag in words if text]

In [None]:
df_topics["Token Predictions Corrected"] = df_topics["Token Predictions"].apply(merge_tokens)

In [None]:
df_topics["Token Predictions Corrected"]

In [None]:
# Column holding, for each review, its list of merged (token, label) pairs.
token_predictions_corrected = df_topics["Token Predictions Corrected"]

# Flatten the per-review lists into one list of (token, label) pairs.
flat_list = [item for sublist in token_predictions_corrected for item in sublist]

# Count how often each distinct (token, label) pair occurs across all reviews.
counter = Counter(flat_list)

# Keep the ten most frequent pairs together with their counts.
most_common_elements = counter.most_common(10)

most_common_elements