In [1]:
import pandas as pd
import spacy
from collections import Counter
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from unidecode import unidecode
from transformers import pipeline

# Carregar o modelo de português do spaCy
nlp = spacy.load("pt_core_news_sm")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('portuguese'))



custom_stopwords = {
    't', 'be', 'nao', 'youtu', 'vai', 'av', 'ja', 'to', 'the', 'this', 'i', 'and', 
    'you', 'y', 'www', 'sao', 'pois', 'contra', 'user', 'ai', 'so', 'gente', 'voce', 'of', 
    'ola', 'gift', 'card', 'kwaivideo', 'r', 'bom', 'q', 'vc', 'vcs', 'pra','ta', 'phone', 'ok', 'la',
    'sera', 'ha', 'aqui', 'ate', 'dia', 'mc', 'im', 'tmj', 'pix', 'g', 'diz', 'ti', 'etc', 'tudo', 
    'todo', 'toda', 'youtube', 'g1', 'm', 'instagram', 'fb', 'in', 'link', 'was', 'blocked', 'kk'
}

stop_words.update(custom_stopwords)

def preprocess_text(text):

    # Função para extrair e substituir o domínio da URL
    def substituir_dominios(texto):
        # Função para extrair e substituir o domínio da URL
        def extrair_dominio(url):
            # Remove o protocolo (http://, https://, etc.) e o "www." se presente
            dominio = re.sub(r'^https?://(?:www\.)?|www\.', '', url)
            # Remove o caminho e parâmetros da URL
            dominio = re.split(r'[/?#]', dominio)[0]
            # Retorna a parte principal do domínio (antes do primeiro ponto)
            return dominio.split('.')[0]

        # Substitui URLs por seus domínios principais
        return re.sub(r'https?://(?:www\.)?\S+|www\.\S+', lambda match: extrair_dominio(match.group(0)), texto)

    # Substituir domínios
    text = substituir_dominios(text)

    # Converte para minúsculas
    text = text.lower()

    # Remove acentos
    text = unidecode(text)

    #Remover Pontuação
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove URLs e menções
    #text = re.sub(r'http\S+|www\S+|https\S+|@\w+', '', text)

    # Substitui emojis repetidos por apenas um
    text = re.sub(r'([\U00010000-\U0010FFFF])\1+', r'\1', text)
    text = re.sub(r'([\U0001F600-\U0001F64F]|[\U0001F300-\U0001F5FF]|[\U0001F680-\U0001F6FF]|[\U0001F700-\U0001F77F]|[\U0001F780-\U0001F7FF]|[\U0001F800-\U0001F8FF]|[\U0001F900-\U0001F9FF]|[\U0001FA00-\U0001FA6F]|[\U0001FA70-\U0001FAFF])\1+', r'\1', text)


    # Remove espaços em branco extras (início ou final) e múltiplos espaços no meio do texto
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove pontuações e caracteres especiais
    #text = re.sub(r'[^\w\s]', '', text)

    # Ajusta risadas "kkk" ou mais para "kk"
    text = re.sub(r'k{2,}|K{2,}', 'kk', text)

    # Ajusta risadas "haha" ou mais para "haha"
    text = re.sub(r'(ha){2,}', 'haha', text, flags=re.IGNORECASE)

    # Ajusta risadas "kaka" ou mais para "kaka"
    text = re.sub(r'(ka){2,}', 'kaka', text, flags=re.IGNORECASE)

    # Remove as stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])


    return text


[nltk_data] Downloading package punkt to C:\Users\Melissa
[nltk_data]     Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Melissa
[nltk_data]     Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Melissa
[nltk_data]     Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Telegram

In [5]:
df_telegram = pd.read_csv('../datasets/fakeTelegram.BR_2022.csv')
df_telegram.head()

Unnamed: 0,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,trava_zap,text_content_anonymous,dataset_info_id,date_system,score_sentiment,score_misinformation,id_message,id_persona,message_type,messenger,media_name,media_md5
0,2022-10-05 06:25:04,1078cc958f0febe28f4d03207660715f,12283e08a2eb5789201e105b34489ee7,,,,False,False,False,Então é Fato Renato o áudio que eu ouvi no wha...,5,2022-10-05 06:25:28.863641,0.0,,16385,Wanda Silva,Texto,telegram,,
1,2022-10-05 06:25:08,,12283e08a2eb5789201e105b34489ee7,,,,False,False,False,"Saiu no YouTube do presidente a 8 horas atrás,...",5,2022-10-05 06:25:28.926311,0.0644,,16386,Wanda Silva,Texto,telegram,,
2,2022-10-05 06:26:28,92a2d8fd7144074f659d1d29dc3751da,9f2d7394334eb224c061c9740b5748fc,,,,False,False,False,"É isso, nossa parte já foi quase toda feita. N...",5,2022-10-05 06:26:29.361949,-0.3551,0.157242,16366,Wanda Silva,Texto,telegram,,
3,2022-10-05 06:27:28,d60aa38f62b4977426b70944af4aff72,c8f2de56550ed0bf85249608b7ead93d,94dca4cda503100ebfda7ce2bcc060eb.jpg,image/jpg,,True,False,False,GENTE ACHEI ELES EM UMA SEITA MAÇONÁRICA,5,2022-10-05 06:27:29.935624,0.0,,19281,Wanda Silva,Imagem,telegram,,94dca4cda503100ebfda7ce2bcc060eb
4,2022-10-05 06:27:44,cd6979b0b5265f08468fa1689b6300ce,e56ec342fc599ebb4ed89655eb6f03aa,5ad5c8bbe9da93a37fecf3e5aa5b0637.jpg,image/jpg,,True,False,False,,5,2022-10-05 06:28:29.316325,,,507185,Wanda Silva,Imagem,telegram,,5ad5c8bbe9da93a37fecf3e5aa5b0637


In [6]:
ids_para_remover = ['b73c0b674b28d87375d78dcae2f1a21f','62b3aa83e4df122e48cb1f97583e403f',
                    '893874da7344daaa00b8b5b3dc295b59','38737f3c7a07586c4efbfa39aa345705',
                    'da4cbbc1b7b1192ad83bcdae613ac2c3', 'bf66a467459264e0d33e4c8518a82827',
                    '46e009035e6150d656d0c194db88fb07','811541f6ce49347d1798e8dd2d3cbf83',
                    '1d2a0a63333d12ad188e9e6f7f2e0419','7c379613d4da713fe959d8c7ddc11ce2',
                    'debe17fd10d504076d2df8682f63bfc6','e9713ae04a02a810d6f33dd956f42794',
                    'd6c77928db26721ce46aca2d549780f0','8e53464d3b01eea3e39d07e51ecbb1b4',
                    'bacd40da04dd7d13f646993bdcf8e79d','2273d1167a6212812d95dc8fadbae78e',
                    'cd3bb1bdf75be7595e6373171a5c2225','add8c1ba533c5e5450d92c061a5ee7bf',
                    'e198f90df1995528531dd43db0c935ea', '06dc9ac55ed64caab2bd97e9ab717302',
                    'c0110feb539d212836605b66192722dd','f71912700ac5331415408ce229681359',
                    '2ee692357a9c948351c43a9540e859ae', '25b66278176dabe814dfc25a405a2470',
                    '08b21cb1e7de74ef5fe1085230075523']

In [7]:
# Filtrar o DataFrame para remover as linhas com esses IDs
df_filtrado_telegram = df_telegram[~df_telegram['id_member_anonymous'].isin(ids_para_remover)]
df_filtrado_telegram

Unnamed: 0,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,trava_zap,text_content_anonymous,dataset_info_id,date_system,score_sentiment,score_misinformation,id_message,id_persona,message_type,messenger,media_name,media_md5
0,2022-10-05 06:25:04,1078cc958f0febe28f4d03207660715f,12283e08a2eb5789201e105b34489ee7,,,,False,False,False,Então é Fato Renato o áudio que eu ouvi no wha...,5,2022-10-05 06:25:28.863641,0.0000,,16385,Wanda Silva,Texto,telegram,,
1,2022-10-05 06:25:08,,12283e08a2eb5789201e105b34489ee7,,,,False,False,False,"Saiu no YouTube do presidente a 8 horas atrás,...",5,2022-10-05 06:25:28.926311,0.0644,,16386,Wanda Silva,Texto,telegram,,
2,2022-10-05 06:26:28,92a2d8fd7144074f659d1d29dc3751da,9f2d7394334eb224c061c9740b5748fc,,,,False,False,False,"É isso, nossa parte já foi quase toda feita. N...",5,2022-10-05 06:26:29.361949,-0.3551,0.157242,16366,Wanda Silva,Texto,telegram,,
3,2022-10-05 06:27:28,d60aa38f62b4977426b70944af4aff72,c8f2de56550ed0bf85249608b7ead93d,94dca4cda503100ebfda7ce2bcc060eb.jpg,image/jpg,,True,False,False,GENTE ACHEI ELES EM UMA SEITA MAÇONÁRICA,5,2022-10-05 06:27:29.935624,0.0000,,19281,Wanda Silva,Imagem,telegram,,94dca4cda503100ebfda7ce2bcc060eb
4,2022-10-05 06:27:44,cd6979b0b5265f08468fa1689b6300ce,e56ec342fc599ebb4ed89655eb6f03aa,5ad5c8bbe9da93a37fecf3e5aa5b0637.jpg,image/jpg,,True,False,False,,5,2022-10-05 06:28:29.316325,,,507185,Wanda Silva,Imagem,telegram,,5ad5c8bbe9da93a37fecf3e5aa5b0637
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
557581,2022-11-11 12:06:15,333e9869f23dbd4682d1be382d9c1e59,e56ec342fc599ebb4ed89655eb6f03aa,25e43b6a58b848c43ad5b5f9e979822a.jpg,url,https://terrabrasilnoticias.com/2022/11/bndes-...,True,True,False,"BNDES tem lucro de R$ 9,6 bilhões no terceiro ...",5,2022-11-16 14:49:39.146502,0.1027,,575796,Wanda Silva,Url,telegram,,25e43b6a58b848c43ad5b5f9e979822a
557582,2022-11-11 12:09:08,,5b10d7739171149be6d9961e3350c071,657949d03e4088f6b332e2686ccd3221.jpg,url,https://youtu.be/8g1Vz9_0xVk,True,True,False,https://youtu.be/8g1Vz9_0xVk,5,2022-11-16 14:49:39.847434,0.0000,,1286443,Wanda Silva,Url,telegram,,657949d03e4088f6b332e2686ccd3221
557583,2022-11-11 12:09:47,,1590a03f43b5ba4b6147a1c5e1dd357b,a21848a61045380a6483866daed0ca0e.jpg,image/jpg,https://t.me/vemprasruas,True,True,False,"Empresários, demitam os petistas primeiro.\n\n...",5,2022-11-16 14:49:39.922279,0.0000,,13294,Wanda Silva,Imagem,telegram,,a21848a61045380a6483866daed0ca0e
557584,2022-11-11 12:09:46,,5b10d7739171149be6d9961e3350c071,a21848a61045380a6483866daed0ca0e.jpg,image/jpg,https://t.me/vemprasruas,True,True,False,"Empresários, demitam os petistas primeiro.\n\n...",5,2022-11-16 14:49:39.992932,0.0000,,1286444,Wanda Silva,Imagem,telegram,,a21848a61045380a6483866daed0ca0e


In [8]:
# Removendo linhas com valores NaN na coluna 'text_content_anonymous'
df_filtrado_telegram = df_filtrado_telegram.dropna(subset=['text_content_anonymous'])

# Removendo trava_zap
df_filtrado_telegram = df_filtrado_telegram[df_filtrado_telegram['trava_zap'] == False]


len(df_filtrado_telegram)

443759

In [9]:

df_geral_telegram = df_filtrado_telegram.copy()

# Aplicar o pré-processamento à coluna de texto
df_geral_telegram['text_processed'] = df_geral_telegram['text_content_anonymous'].apply(preprocess_text)

In [10]:
palavras_religiosas = [
    "deus", "jesus", "misericordia", "davi",
    "salomao", "reino", "templo", "conservador",
    "pentecostal", "rcc", "renovacao", "carismatic",
    "paulo ricardo", "bernardo kuster", "herege", "ateu",
    "jerico", "heresia"
]

# Função para verificar se uma palavra está relacionada à religião
def relacionada_religiao(word):
    word_lower = word.lower()
    palavras_religiosas_lower = [palavra.lower() for palavra in palavras_religiosas]

    # Verificando se alguma palavra da lista de palavras religiosas está presente
    return any(palavra in word_lower for palavra in palavras_religiosas_lower)


df_religiao_telegram = df_geral_telegram[df_geral_telegram['text_processed'].apply(lambda x: relacionada_religiao(x))]


In [11]:
palavras_politicas = [ "lula", "bolsonaro", "pt", "pl", "stf", "patria", "55", "22", "13", "senadores", "lulaladrao",
                       "urnas", "alexandre", "moraes", "comunismo", "eleicao", "eleicoes", "esquerda", "direita",
                         "presidente", "tse", "fraude", "voto", "turno", "ministro"]

def retirar_mensagens_com_palavras_politicas(word):
    word_lower = word.lower()
    palavras_politica_lower = [palavra.lower() for palavra in palavras_politicas]

    return any(palavra in word_lower for palavra in palavras_politica_lower)


df_politico_telegram = df_religiao_telegram[~df_religiao_telegram['text_processed'].apply(lambda x: retirar_mensagens_com_palavras_politicas(x))]

In [12]:
df_politico_telegram.shape

(8526, 21)

In [13]:

# Inicializar um contador para entidades
entidade_counter = Counter()

# Iterar sobre a coluna com textos
for doc_text in df_politico_telegram['text_processed']:
    doc = nlp(doc_text)
    for ent in doc.ents:
        entidade_counter[ent.text] += 1

# Obter as 10 entidades mais frequentes
top_10_entidades = entidade_counter.most_common(10)

# Mostrar os resultados
for entidade, frequencia in top_10_entidades:
    print(f"{entidade}: {frequencia}")


brasil: 668
jesus: 607
jesus cristo: 179
reino unido: 176
deus: 132
jesus salva leia biblia: 87
eua: 61
brazil: 51
venezuela: 47
edson: 44


### xlm - roberta

In [19]:
from transformers import pipeline
from collections import Counter

ner = pipeline(
    "ner",
    model="Davlan/xlm-roberta-base-ner-hrl",
    aggregation_strategy="simple",
    device=0  # usa GPU
)

entidade_counter = Counter()

for text in df_politico_telegram["text_processed"].dropna():
    if not isinstance(text, str) or text.strip() == "":
        continue

    # evita estouro de memória / tokens
    text = text[:1000]

    entities = ner(text)
    for ent in entities:
        entidade_counter[ent["word"]] += 1

for entidade, freq in entidade_counter.most_common(10):
    print(f"{entidade}: {freq}")


Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


brasil: 942
je: 600
jesus: 596
sus: 244
da: 95
: 90
brazil: 86
israel: 56
mari: 47
venezuela: 42


## XLM-ROBERTA

In [20]:
ner = pipeline(
    "ner",
    model="Davlan/xlm-roberta-base-ner-hrl",
    aggregation_strategy="simple"
)

entidade_counter_xlm = Counter()

for text in df_politico_telegram["text_processed"]:
    entities = ner(text)
    for ent in entities:
        entidade_counter_xlm[ent["word"]] += 1

top_10_entidades = entidade_counter_xlm.most_common(10)

for entidade, freq in top_10_entidades:
    print(f"{entidade}: {freq}")

Device set to use cuda:0


brasil: 941
jesus: 615
je: 603
sus: 244
da: 95
: 90
brazil: 86
israel: 56
mari: 46
deus: 42


## WhatsApp

In [3]:
df_whatsapp = pd.read_csv('../datasets/fakeWhatsApp.BR_2022.csv')
df_whatsapp.head()

Unnamed: 0,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,trava_zap,text_content_anonymous,dataset_info_id,date_system,score_sentiment,score_misinformation,id_message,id_persona,message_type,messenger,media_name,media_md5
0,2022-10-10 18:20:24,9d737b3c9387855139bbad2311cc5709,0638569ee76dac58f59dcac20463c955,,,,False,False,False,☝️\nHoje que eu fique sabendo do CANAL LULA FL...,5,2022-10-10 18:20:25.000937,-0.7003,0.843775,F7023FFB06C429A2C166922849A35ED8,558594228826.0:12@s.whatsapp.net,Texto,whatsapp,,
1,2022-10-10 22:02:58,1660a60f661754d2802ca53296e25be8,a5910d5cc1c830ade9eb4dd00f15ff6a,,,,False,False,False,Mais pra que isso não aconteça nois temos quê ...,5,2022-10-10 22:02:58.4682,-0.296,,A9FAC78070C144890D181EF415B90CAD,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,
2,2022-10-11 00:39:31,c882172d447798d74915973ac83eba68,b84dfe2d1599b82768dcdecce7e6bb23,d2e0ec59ffd9f84764f5b147725d7196.oga,audio/ogg; codecs=opus,,True,False,False,,5,2022-10-11 00:39:33.445125,,,737948BE86D450A426470794F91BC80D,558594228826.0:12@s.whatsapp.net,Audio,whatsapp,,d2e0ec59ffd9f84764f5b147725d7196
3,2022-10-10 23:36:19,91e3c22c08b24ba01ac4524d77bcb1da,addb88a34374d43aa9ecd4df7359ce39,,,,False,False,False,‎Acesse este link para entrar no meu grupo do ...,5,2022-10-10 23:36:19.724987,-0.1531,,439A91ADD8F355CD23C4BB107A5E88BB,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,
4,2022-10-10 23:40:12,77c1a8a31fee269db258a028a61f0b88,a5831b109d4d95fae8ee0ce464e48a6a,bb80cd530332bb6b95c34e719992d249.f4v,video/mp4,,True,False,False,,5,2022-10-10 23:40:14.397495,,,092203A082AC3DFB2A4933F60453AEB8,558594228826.0:12@s.whatsapp.net,Video,whatsapp,,bb80cd530332bb6b95c34e719992d249


In [4]:
ids_para_remover = ['6ef561ec0f448afcd7b3751124bb0712','d3e678a0ba0e1485548260a7c4599152',
                    'ac5703154484de05336af617455ca55e','a39edbd64d378226ffa60433649a0acf',
                    'c66d0d4ae5a4b281bff67e1fa4fbd6ba','819bbc872ed6d81f44d746b710eecf06',
                    'ec94da4d54f9a5693e88fa582926be53','6a38c72316d87c028dfd66c10442476b',
                    '94099e1e46f129856541e2b3640896d1','8f367d1693fff47218603fa47ded525c',
                    'c09caffee0d1bd30926dea9df25dc88f']

In [5]:
# Filtrar o DataFrame para remover as linhas com esses IDs
df_filtrado_whatsapp = df_whatsapp[~df_whatsapp['id_member_anonymous'].isin(ids_para_remover)]
df_filtrado_whatsapp

Unnamed: 0,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,trava_zap,text_content_anonymous,dataset_info_id,date_system,score_sentiment,score_misinformation,id_message,id_persona,message_type,messenger,media_name,media_md5
0,2022-10-10 18:20:24,9d737b3c9387855139bbad2311cc5709,0638569ee76dac58f59dcac20463c955,,,,False,False,False,☝️\nHoje que eu fique sabendo do CANAL LULA FL...,5,2022-10-10 18:20:25.000937,-0.7003,0.843775,F7023FFB06C429A2C166922849A35ED8,558594228826.0:12@s.whatsapp.net,Texto,whatsapp,,
1,2022-10-10 22:02:58,1660a60f661754d2802ca53296e25be8,a5910d5cc1c830ade9eb4dd00f15ff6a,,,,False,False,False,Mais pra que isso não aconteça nois temos quê ...,5,2022-10-10 22:02:58.4682,-0.2960,,A9FAC78070C144890D181EF415B90CAD,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,
2,2022-10-11 00:39:31,c882172d447798d74915973ac83eba68,b84dfe2d1599b82768dcdecce7e6bb23,d2e0ec59ffd9f84764f5b147725d7196.oga,audio/ogg; codecs=opus,,True,False,False,,5,2022-10-11 00:39:33.445125,,,737948BE86D450A426470794F91BC80D,558594228826.0:12@s.whatsapp.net,Audio,whatsapp,,d2e0ec59ffd9f84764f5b147725d7196
3,2022-10-10 23:36:19,91e3c22c08b24ba01ac4524d77bcb1da,addb88a34374d43aa9ecd4df7359ce39,,,,False,False,False,‎Acesse este link para entrar no meu grupo do ...,5,2022-10-10 23:36:19.724987,-0.1531,,439A91ADD8F355CD23C4BB107A5E88BB,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,
4,2022-10-10 23:40:12,77c1a8a31fee269db258a028a61f0b88,a5831b109d4d95fae8ee0ce464e48a6a,bb80cd530332bb6b95c34e719992d249.f4v,video/mp4,,True,False,False,,5,2022-10-10 23:40:14.397495,,,092203A082AC3DFB2A4933F60453AEB8,558594228826.0:12@s.whatsapp.net,Video,whatsapp,,bb80cd530332bb6b95c34e719992d249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598966,2022-11-13 00:46:56,b1b419f54664f6cd4fcdb40425862d36,b1b419f54664f6cd4fcdb40425862d36,7719ba57cbe74eb5c8059d6b1a14d1ca.f4v,video/mp4,,True,False,False,,5,2023-01-30 18:20:22.262572,,,08E682716448AB06DEE126A30E7AAA77,558594228826.0:17@s.whatsapp.net,Video,whatsapp,,7719ba57cbe74eb5c8059d6b1a14d1ca
598967,2022-11-13 13:37:03,2ad743e9ff831a75911a983b42205845,64c9a7c8ac338faa258e5aea9a1c2ae3,c6d59f0b86e63dbcce7fc682a1eaea50.f4v,video/mp4,,True,False,False,,5,2023-01-30 18:20:22.269632,,,3EB0259ED9B8FACF18A9,558594228826.0:17@s.whatsapp.net,Video,whatsapp,,c6d59f0b86e63dbcce7fc682a1eaea50
598968,2022-11-13 17:43:22,b1b419f54664f6cd4fcdb40425862d36,b1b419f54664f6cd4fcdb40425862d36,003ace942d9665adc9647df9d9526c0c.jpeg,image/jpeg,,True,False,False,,5,2023-01-30 18:20:22.277283,,,5CD9F23CD07BEE38693E68E11F9C6A55,558594228826.0:17@s.whatsapp.net,Imagem,whatsapp,,003ace942d9665adc9647df9d9526c0c
598969,2022-11-13 21:40:47,b1b419f54664f6cd4fcdb40425862d36,b1b419f54664f6cd4fcdb40425862d36,bab5653a94b097404739a061dbbc7407.f4v,video/mp4,,True,False,False,,5,2023-01-30 18:20:22.284021,,,D3BDE4A5C68E7148C0925F6C582C99A9,558594228826.0:17@s.whatsapp.net,Video,whatsapp,,bab5653a94b097404739a061dbbc7407


In [6]:
# Removendo linhas com valores NaN na coluna 'text_content_anonymous'
df_filtrado_whatsapp = df_filtrado_whatsapp.dropna(subset=['text_content_anonymous'])

# Removendo trava_zap
df_filtrado_whatsapp = df_filtrado_whatsapp[df_filtrado_whatsapp['trava_zap'] == False]


len(df_filtrado_whatsapp)

254480

In [7]:

df_geral_whatsapp = df_filtrado_whatsapp.copy()

# Aplicar o pré-processamento à coluna de texto
df_geral_whatsapp['text_processed'] = df_geral_whatsapp['text_content_anonymous'].apply(preprocess_text)

In [8]:
palavras_religiosas = [
    "deus", "jesus", "misericordia", "davi",
    "salomao", "reino", "templo", "conservador",
    "pentecostal", "rcc", "renovacao", "carismatic",
    "paulo ricardo", "bernardo kuster", "herege", "ateu",
    "jerico", "heresia"
]

# Função para verificar se uma palavra está relacionada à religião
def relacionada_religiao(word):
    word_lower = word.lower()
    palavras_religiosas_lower = [palavra.lower() for palavra in palavras_religiosas]

    # Verificando se alguma palavra da lista de palavras religiosas está presente
    return any(palavra in word_lower for palavra in palavras_religiosas_lower)


df_religiao_whatsapp = df_geral_whatsapp[df_geral_whatsapp['text_processed'].apply(lambda x: relacionada_religiao(x))]


In [9]:
palavras_politicas = [ "lula", "bolsonaro", "pt", "pl", "stf", "patria", "55", "22", "13", "senadores", "lulaladrao",
                       "urnas", "alexandre", "moraes", "comunismo", "eleicao", "eleicoes", "esquerda", "direita",
                         "presidente", "tse", "fraude", "voto", "turno", "ministro"]

def retirar_mensagens_com_palavras_politicas(word):
    word_lower = word.lower()
    palavras_politica_lower = [palavra.lower() for palavra in palavras_politicas]

    return any(palavra in word_lower for palavra in palavras_politica_lower)


df_politico_whatsapp = df_religiao_whatsapp[~df_religiao_whatsapp['text_processed'].apply(lambda x: retirar_mensagens_com_palavras_politicas(x))]

In [10]:
df_politico_whatsapp.shape

(5167, 21)

In [None]:

# Inicializar um contador para entidades
entidade_counter = Counter()

# Iterar sobre a coluna com textos
for doc_text in df_politico_whatsapp['text_processed']:
    doc = nlp(doc_text)
    for ent in doc.ents:
        entidade_counter[ent.text] += 1

# Obter as 10 entidades mais frequentes
top_10_entidades = entidade_counter.most_common(10)

# Mostrar os resultados
for entidade, frequencia in top_10_entidades:
    print(f"{entidade}: {frequencia}")


jesus: 661
brasil: 471
jesus cristo: 178
deus: 115
ifood: 63
chirley: 57
hostilio souza: 57
misericordia: 55
acredito: 39
familia: 33


In [11]:
from transformers import pipeline
from collections import Counter

ner = pipeline(
    "ner",
    model="Davlan/xlm-roberta-base-ner-hrl",
    aggregation_strategy="simple",
    device=0  # usa GPU
)

entidade_counter = Counter()

for text in df_politico_whatsapp["text_processed"].dropna():
    if not isinstance(text, str) or text.strip() == "":
        continue

    # evita estouro de memória / tokens
    text = text[:1000]

    entities = ner(text)
    for ent in entities:
        entidade_counter[ent["word"]] += 1

for entidade, freq in entidade_counter.most_common(10):
    print(f"{entidade}: {freq}")


Device set to use cuda:0
  attn_output = torch.nn.functional.scaled_dot_product_attention(
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


brasil: 615
je: 561
jesus: 549
sus: 282
israel: 54
ley: 54
: 31
galile: 28
jesus cristo: 19
reginaldo manzotti: 17
