In [1]:
import pandas as pd
import re
from textblob import TextBlob
from unidecode import unidecode
import emoji
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stopword corpus is available locally
nltk.download('stopwords')

# Load the dataset
# NOTE(review): the recorded cell output echoes this line, which suggests pandas
# emitted a warning here (possibly a DtypeWarning about mixed-type columns) —
# confirm, and consider passing dtype= explicitly if so.
df = pd.read_csv("dataset_zap_1.csv")  # or dataset_zap_2.csv

# Drop rows that have no message text
df = df.dropna(subset=["text_content_anonymous"])

# Drop repeated messages, keeping the first occurrence
df = df.drop_duplicates(subset=["text_content_anonymous"])

# Extended stopword list: NLTK's Portuguese stopwords plus greetings, religious
# formulas, laughter, short acknowledgements, punctuation and common emojis.
# The accented words and emojis are written with their real characters; the
# previous literals were mis-encoded (UTF-8 bytes decoded as Mac Roman).
stop_words = set(stopwords.words('portuguese'))
stop_words.update([
    "bom dia", "boa noite", "boa tarde", "amém", "amem", "deus é fiel", ".", ",", "👍", "🙏", "🔥", "👏", "💪",
    "kkk", "kkkk", "sim", "não", "grato", "grata", "obrigado", "obrigada", "valeu", "ok", "tá", "eh", "ah", "oie"
])

# Preprocessing helpers and entry point

# Matches anything that is not an ASCII letter, a Latin-1 accented letter, or
# whitespace. The Latin-1 ranges skip U+00D7 and U+00F7 (multiplication and
# division signs), which are symbols rather than letters. Escapes are used so
# the pattern survives any re-encoding of this file.
_NAO_LETRA = re.compile(r"[^a-zA-Z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\s]")
_ESPACOS = re.compile(r"\s+")

# Compiled stopword patterns, keyed by the frozen stopword collection so the
# pattern is rebuilt only when the stopword collection actually changes.
_PADROES_STOPWORDS = {}


def _normalizar(texto):
    """Keep letters only, transliterate accents to ASCII, lowercase, tidy spaces."""
    texto = _NAO_LETRA.sub("", texto)
    texto = unidecode(texto).lower()
    return _ESPACOS.sub(" ", texto).strip()


def _padrao_stopwords(palavras):
    """Return a compiled regex matching any stopword as a whole word or phrase."""
    chave = frozenset(palavras)
    padrao = _PADROES_STOPWORDS.get(chave)
    if padrao is None:
        # Stopwords go through the same normalization as the text; otherwise
        # accented entries could never match the already-unaccented message.
        normalizadas = {_normalizar(p) for p in chave}
        normalizadas.discard("")  # emojis and punctuation normalize to nothing
        if normalizadas:
            # Longest first, so a phrase wins over the words it is made of;
            # the secondary key keeps the pattern deterministic.
            ordenadas = sorted(normalizadas, key=lambda p: (-len(p), p))
            alternativas = "|".join(re.escape(p) for p in ordenadas)
            padrao = re.compile(r"\b(?:" + alternativas + r")\b")
        else:
            padrao = re.compile(r"(?!)")  # never matches
        _PADROES_STOPWORDS[chave] = padrao
    return padrao


def preprocessar(texto):
    """Normalize a raw message into lowercase, accent-free, stopword-free text.

    Steps, in order: strip emojis, drop every character that is not a Latin
    letter or whitespace, transliterate accents to ASCII, lowercase, collapse
    whitespace, remove the entries of the module-level ``stop_words``
    collection, and collapse whitespace again.

    Stopwords are removed only as whole words or phrases. A plain substring
    replacement would also delete one-letter stopwords such as "a", "e" and
    "o" from inside every other word.

    Args:
        texto: The raw message. Missing values (as judged by ``pd.isna``) are
            accepted; other non-string values are converted with ``str``.

    Returns:
        The cleaned text, or ``""`` when ``texto`` is missing or nothing
        survives the cleaning.
    """
    if pd.isna(texto):
        return ""
    # Remove emojis before the letter filter
    texto = emoji.replace_emoji(str(texto), replace='')
    # Letters only, no accents, lowercase, single spaces
    texto = _normalizar(texto)
    # Remove stopwords as whole words/phrases
    texto = _padrao_stopwords(stop_words).sub("", texto)
    # Removing words leaves gaps; collapse them
    return _ESPACOS.sub(" ", texto).strip()

# Clean every message into a new column
df["texto_limpo"] = df["text_content_anonymous"].apply(preprocessar)

# Keep only the rows whose cleaned text is non-empty
mascara_nao_vazia = df["texto_limpo"].str.strip().astype(bool)
df = df[mascara_nao_vazia]

# Polarity score from TextBlob for each cleaned message
# NOTE(review): TextBlob's default sentiment analyzer is presumably built on an
# English lexicon — confirm the scores are meaningful for Portuguese text.
df["score_sentiment"] = df["texto_limpo"].apply(
    lambda limpo: TextBlob(limpo).sentiment.polarity
)

# Ten highest-scoring and ten lowest-scoring messages
top_positivas = df.sort_values("score_sentiment", ascending=False).iloc[:10]
top_negativas = df.sort_values("score_sentiment").iloc[:10]

# Columns to display; the misinformation score is optional in the dataset
colunas = ["text_content_anonymous", "score_sentiment"]
colunas += ["score_misinformation"] if "score_misinformation" in df.columns else []

# Stack both samples (positives first) and print them without the index
amostras = pd.concat([parte[colunas] for parte in (top_positivas, top_negativas)])
print(amostras.to_string(index=False))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nasci\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
  df = pd.read_csv("dataset_zap_1.csv")  # ou dataset_zap_2.csv


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        